Use of org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData in project hive by apache.
The class IncrementalLoadTasksBuilder, method build().
public Task<?> build(Context context, Hive hive, Logger log, TaskTracker tracker) throws Exception {
  long builderStartTime = System.currentTimeMillis();
  Task<?> evTaskRoot = TaskFactory.get(new DependencyCollectionWork());
  Task<?> taskChainTail = evTaskRoot;
  Long lastReplayedEvent = null;
  this.log = log;
  numIteration++;
  this.log.debug("Iteration num " + numIteration);
  while (iterator.hasNext() && tracker.canAddMoreTasks()) {
    FileStatus dir = iterator.next();
    String location = dir.getPath().toUri().toString();
    DumpMetaData eventDmd = new DumpMetaData(new Path(location), conf);
    if (!shouldReplayEvent(dir, eventDmd.getDumpType(), dbName)) {
      this.log.debug("Skipping event {} from {} for DB {} maxTasks: {}", eventDmd.getDumpType(), dir.getPath().toUri(), dbName, tracker.numberOfTasks());
      continue;
    }
    this.log.debug("Loading event {} from {} for DB {} maxTasks: {}", eventDmd.getDumpType(), dir.getPath().toUri(), dbName, tracker.numberOfTasks());
    // Event loads behave similar to table loads, with one crucial difference:
    // precursor order is strict, and each event must be processed after the previous one.
    // The way we handle this strict order is as follows:
    // First, we start with a taskChainTail which is a dummy noop task (a DependencyCollectionTask)
    // at the head of our event chain. For each event we process, we tell analyzeTableLoad to
    // create tasks that use the taskChainTail as a dependency. Then, we collect all those tasks
    // and introduce a new barrier task (also a DependencyCollectionTask) which depends on all
    // these tasks. Then, this barrier task becomes our new taskChainTail. Thus, we get a set of
    // tasks as follows:
    //
    //                 --->ev1.task1--                          --->ev2.task1--
    //                /               \                        /               \
    //  evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
    //                \               /
    //                 --->ev1.task3--
    //
    // Once this entire chain is generated, we add evTaskRoot to rootTasks, so as to execute the
    // entire chain.
    MessageHandler.Context mhContext = new MessageHandler.Context(dbName, location, taskChainTail, eventDmd, conf, hive, context, this.log, dumpDirectory, metricCollector);
    List<Task<?>> evTasks = analyzeEventLoad(mhContext);
    if ((evTasks != null) && (!evTasks.isEmpty())) {
      ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, metricCollector, dir.getPath().getName(), eventDmd.getDumpType().toString(), dumpDirectory);
      Task<?> barrierTask = TaskFactory.get(replStateLogWork, conf);
      AddDependencyToLeaves function = new AddDependencyToLeaves(barrierTask);
      DAGTraversal.traverse(evTasks, function);
      this.log.debug("Updated taskChainTail from {}:{} to {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
      tracker.addTaskList(taskChainTail.getChildTasks());
      taskChainTail = barrierTask;
    }
    lastReplayedEvent = eventDmd.getEventTo();
  }
  if (!hasMoreWork()) {
    ReplRemoveFirstIncLoadPendFlagDesc desc = new ReplRemoveFirstIncLoadPendFlagDesc(dbName);
    Task<?> updateIncPendTask = TaskFactory.get(new DDLWork(inputs, outputs, desc, true, dumpDirectory, this.metricCollector), conf);
    taskChainTail.addDependentTask(updateIncPendTask);
    taskChainTail = updateIncPendTask;
    Map<String, String> dbProps = new HashMap<>();
    dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID_SOURCE.toString(), String.valueOf(lastReplayedEvent));
    ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps, dumpDirectory, metricCollector, shouldFailover);
    Task<?> barrierTask = TaskFactory.get(replStateLogWork, conf);
    taskChainTail.addDependentTask(barrierTask);
    this.log.debug("Added {}:{} as a precursor of barrier task {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
  }
  this.log.info("REPL_INCREMENTAL_LOAD task-builder iteration #{}, duration : {} ms", numIteration, System.currentTimeMillis() - builderStartTime);
  return evTaskRoot;
}
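The loop above reads a DumpMetaData from each event sub-directory only to learn the event's dump type and its highest event id. Below is a minimal sketch of that read pattern in isolation, assuming hive-exec is on the classpath and a layout of <incrementalDumpDir>/<eventId>/; the class and method names here are illustrative, not part of Hive.

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData;

public class EventDumpInspector {
  /** Returns the highest event id found across the event sub-directories, or null if none. Illustrative helper. */
  static Long lastDumpedEventId(Path incrementalDumpDir, HiveConf conf) throws Exception {
    FileSystem fs = incrementalDumpDir.getFileSystem(conf);
    Long lastEventId = null;
    for (FileStatus eventDir : fs.listStatus(incrementalDumpDir)) {
      if (!eventDir.isDirectory()) {
        continue; // only event sub-directories carry a per-event _dumpmetadata file
      }
      DumpMetaData eventDmd = new DumpMetaData(eventDir.getPath(), conf);
      // getDumpType() and getEventTo() read the _dumpmetadata file under the event path
      System.out.println(eventDir.getPath().getName() + " -> " + eventDmd.getDumpType());
      lastEventId = eventDmd.getEventTo();
    }
    return lastEventId;
  }
}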
Use of org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData in project hive by apache.
The class ReplicationSemanticAnalyzer, method analyzeReplLoad().
/*
 * Example dump dirs we need to be able to handle :
 *
 * for: hive.repl.rootdir = staging/
 * Then, repl dumps will be created in staging/<dumpdir>
 *
 * single-db-dump: staging/blah12345 will contain a db dir for the db specified
 *  blah12345/
 *   default/
 *    _metadata
 *    tbl1/
 *     _metadata
 *     dt=20160907/
 *      _files
 *    tbl2/
 *    tbl3/
 *    unptn_tbl/
 *     _metadata
 *     _files
 *
 * multi-db-dump: staging/bar12347 will contain dirs for each db covered
 * staging/
 *  bar12347/
 *   default/
 *    ...
 *   sales/
 *    ...
 *
 * single table-dump: staging/baz123 will contain a table object dump inside
 * staging/
 *  baz123/
 *   _metadata
 *   dt=20150931/
 *    _files
 *
 * incremental dump : staging/blue123 will contain dirs for each event inside.
 * staging/
 *  blue123/
 *   34/
 *   35/
 *   36/
 */
private void analyzeReplLoad(ASTNode ast) throws SemanticException {
  try {
    initReplLoad(ast);
  } catch (HiveException e) {
    throw new SemanticException(e);
  }
  // import job in its place.
  try {
    assert (sourceDbNameOrPattern != null);
    Path loadPath = getCurrentLoadPath();
    // If repl status of target is greater than dumps, don't do anything as the load for the latest dump is done
    if (ReplUtils.failedWithNonRecoverableError(ReplUtils.getLatestDumpPath(ReplUtils.getEncodedDumpRootPath(conf, sourceDbNameOrPattern.toLowerCase()), conf), conf)) {
      throw new Exception(ErrorMsg.REPL_FAILED_WITH_NON_RECOVERABLE_ERROR.getMsg());
    }
    if (loadPath != null) {
      DumpMetaData dmd = new DumpMetaData(loadPath, conf);
      boolean evDump = false;
      // we will decide what hdfs locations need to be copied over here as well.
      if (dmd.isIncrementalDump()) {
        LOG.debug("{} contains an incremental dump", loadPath);
        evDump = true;
      } else {
        LOG.debug("{} contains a bootstrap dump", loadPath);
      }
      ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), sourceDbNameOrPattern, replScope.getDbName(), dmd.getReplScope(), queryState.getLineageState(), evDump, dmd.getEventTo(), dmd.getDumpExecutionId(), initMetricCollection(!evDump, loadPath.toString(), replScope.getDbName(), dmd.getDumpExecutionId()), dmd.isReplScopeModified());
      rootTasks.add(TaskFactory.get(replLoadWork, conf));
    } else {
      LOG.warn("Previous Dump Already Loaded");
    }
  } catch (Exception e) {
    // TODO : simple wrap & rethrow for now, clean up with error codes
    throw new SemanticException(e.getMessage(), e);
  }
}
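The isIncrementalDump() check above is what routes a REPL LOAD between bootstrap and incremental handling. Below is a minimal sketch of that decision on its own, assuming loadPath points at a dump directory such as staging/blue123 from the layout comment above, with its _dumpmetadata already written; the helper name describeDump is made up for illustration.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData;

public class DumpKindCheck {
  /** Describes whether the dump at loadPath is a bootstrap or an incremental dump. Illustrative helper. */
  static String describeDump(Path loadPath, HiveConf conf) throws SemanticException {
    DumpMetaData dmd = new DumpMetaData(loadPath, conf);
    if (dmd.isIncrementalDump()) {
      // event-level replay: ReplLoadWork would be built with evDump = true and the last event id
      return "incremental dump up to event id " + dmd.getEventTo();
    }
    // full bootstrap of the replicated scope: evDump stays false
    return "bootstrap dump";
  }
}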
Use of org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData in project hive by apache.
The class TestReplDumpTask, method removeDBPropertyToPreventRenameWhenBootstrapDumpOfTableFails().
@Test(expected = TestException.class)
public void removeDBPropertyToPreventRenameWhenBootstrapDumpOfTableFails() throws Exception {
  List<String> tableList = Arrays.asList("a1", "a2");
  String dbRandomKey = "akeytoberandom";
  ReplScope replScope = new ReplScope("default");
  mockStatic(Utils.class);
  when(Utils.matchesDb(same(hive), eq("default"))).thenReturn(Collections.singletonList("default"));
  when(Utils.getAllTables(same(hive), eq("default"), eq(replScope))).thenReturn(tableList);
  when(Utils.setDbBootstrapDumpState(same(hive), eq("default"))).thenReturn(dbRandomKey);
  when(Utils.matchesTbl(same(hive), eq("default"), eq(replScope))).thenReturn(tableList);
  when(hive.getAllFunctions()).thenReturn(Collections.emptyList());
  when(queryState.getConf()).thenReturn(conf);
  when(conf.getLong("hive.repl.last.repl.id", -1L)).thenReturn(1L);
  when(conf.getBoolVar(HiveConf.ConfVars.REPL_INCLUDE_EXTERNAL_TABLES)).thenReturn(false);
  when(HiveConf.getVar(conf, HiveConf.ConfVars.REPL_BOOTSTRAP_DUMP_OPEN_TXN_TIMEOUT)).thenReturn("1h");
  whenNew(HiveWrapper.class).withAnyArguments().thenReturn(mock(HiveWrapper.class));

  ReplDumpTask task = new StubReplDumpTask() {
    private int tableDumpCount = 0;

    @Override
    void dumpTable(String dbName, String tblName, String validTxnList, Path dbRootMetadata, Path dbRootData,
        long lastReplId, Hive hiveDb, HiveWrapper.Tuple<Table> tuple, FileList managedTableDirFileList,
        boolean dataCopyAtLoad) throws Exception {
      tableDumpCount++;
      if (tableDumpCount > 1) {
        throw new TestException();
      }
    }
  };

  task.initialize(queryState, null, null, null);
  ReplDumpWork replDumpWork = new ReplDumpWork(replScope, "", "");
  replDumpWork.setMetricCollector(metricCollector);
  task.setWork(replDumpWork);
  try {
    task.bootStrapDump(new Path("mock"), new DumpMetaData(new Path("mock"), conf), mock(Path.class), hive);
  } finally {
    Utils.resetDbBootstrapDumpState(same(hive), eq("default"), eq(dbRandomKey));
  }
}
Use of org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData in project hive by apache.
The class AlterDatabaseHandler, method handle().
@Override
public void handle(Context withinContext) throws Exception {
  LOG.info("Processing#{} ALTER_DATABASE message : {}", fromEventId(), eventMessageAsJSON);
  DumpMetaData dmd = withinContext.createDmd(this);
  dmd.setPayload(eventMessageAsJSON);
  dmd.write();
}
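On the dump side, createDmd(), setPayload() and write() produce the per-event _dumpmetadata that the load side later reads back through the same class (as in IncrementalLoadTasksBuilder above). Below is a minimal read-back sketch for one event directory; it assumes DumpMetaData exposes getDumpType() and getPayload() accessors and is illustrative rather than taken from the handler.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData;

public class EventPayloadReader {
  /** Prints the dump type and the serialized event message stored for one dumped event. Illustrative helper. */
  static void printEvent(Path eventDir, HiveConf conf) throws SemanticException {
    DumpMetaData dmd = new DumpMetaData(eventDir, conf);
    System.out.println("type    : " + dmd.getDumpType());
    System.out.println("payload : " + dmd.getPayload()); // the eventMessageAsJSON written by the handler
  }
}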
Use of org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData in project hive by apache.
The class CommitTxnHandler, method handle().
@Override
public void handle(Context withinContext) throws Exception {
  if (!ReplUtils.includeAcidTableInDump(withinContext.hiveConf)) {
    return;
  }
  LOG.info("Processing#{} COMMIT_TXN message : {}", fromEventId(), eventMessageAsJSON);
  String payload = eventMessageAsJSON;
  if (!withinContext.hiveConf.getBoolVar(HiveConf.ConfVars.REPL_DUMP_METADATA_ONLY)) {
    boolean replicatingAcidEvents = true;
    if (withinContext.hiveConf.getBoolVar(HiveConf.ConfVars.REPL_BOOTSTRAP_ACID_TABLES)) {
      // We do not dump ACID table related events when taking a bootstrap dump of ACID tables as
      // part of an incremental dump. So we shouldn't be dumping any changes to ACID tables as
      // part of the commit. At the same time we need to dump the commit transaction event so
      // that replication can end a transaction opened when replaying the open transaction event.
      LOG.debug("writeEventsInfoList will be removed from commit message because we are "
          + "bootstrapping acid tables.");
      replicatingAcidEvents = false;
    } else if (!ReplUtils.includeAcidTableInDump(withinContext.hiveConf)) {
      // Similar to the above condition, only for testing purposes, if the config doesn't allow
      // ACID tables to be replicated, we don't dump any changes to the ACID tables as part of
      // the commit.
      LOG.debug("writeEventsInfoList will be removed from commit message because we are "
          + "not dumping acid tables.");
      replicatingAcidEvents = false;
    }
    List<WriteEventInfo> writeEventInfoList = null;
    if (replicatingAcidEvents) {
      writeEventInfoList = getAllWriteEventInfo(withinContext);
    }
    int numEntry = (writeEventInfoList != null ? writeEventInfoList.size() : 0);
    if (numEntry != 0) {
      eventMessage.addWriteEventInfo(writeEventInfoList);
      payload = jsonMessageEncoder.getSerializer().serialize(eventMessage);
      LOG.debug("payload for commit txn event : " + eventMessageAsJSON);
    }
    org.apache.hadoop.hive.ql.metadata.Table qlMdTablePrev = null;
    org.apache.hadoop.hive.ql.metadata.Table qlMdTable = null;
    List<Partition> qlPtns = new ArrayList<>();
    List<List<String>> filesTobeAdded = new ArrayList<>();
    // used during import, so we need not dump the latest table metadata.
    for (int idx = 0; idx < numEntry; idx++) {
      qlMdTable = new org.apache.hadoop.hive.ql.metadata.Table(eventMessage.getTableObj(idx));
      if (qlMdTablePrev == null) {
        qlMdTablePrev = qlMdTable;
      }
      // one dump directory per table
      if (!qlMdTablePrev.getCompleteName().equals(qlMdTable.getCompleteName())) {
        createDumpFileForTable(withinContext, qlMdTablePrev, qlPtns, filesTobeAdded);
        qlPtns = new ArrayList<>();
        filesTobeAdded = new ArrayList<>();
        qlMdTablePrev = qlMdTable;
      }
      if (qlMdTable.isPartitioned() && (null != eventMessage.getPartitionObj(idx))) {
        qlPtns.add(new org.apache.hadoop.hive.ql.metadata.Partition(qlMdTable, eventMessage.getPartitionObj(idx)));
      }
      filesTobeAdded.add(Lists.newArrayList(ReplChangeManager.getListFromSeparatedString(eventMessage.getFiles(idx))));
    }
    // Dump the last table in the list
    if (qlMdTablePrev != null) {
      createDumpFileForTable(withinContext, qlMdTablePrev, qlPtns, filesTobeAdded);
    }
  }
  DumpMetaData dmd = withinContext.createDmd(this);
  dmd.setPayload(payload);
  dmd.write();
}
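The per-table loop above relies on the write event info entries for the same table being contiguous: it accumulates partitions and files and flushes them through createDumpFileForTable whenever the table name changes, plus once more at the end. Below is the same flush-on-change idea in isolation, with plain collections standing in for the Hive metadata objects; all names here are illustrative.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class PerTableGrouping {
  /**
   * Groups (tableName, file) pairs into one file list per table, assuming the entries
   * for the same table are contiguous, mirroring the flush-on-table-change loop above.
   */
  static Map<String, List<String>> groupFilesByTable(List<String[]> tableFilePairs) {
    Map<String, List<String>> dumpsPerTable = new LinkedHashMap<>();
    String prevTable = null;
    List<String> filesToBeAdded = new ArrayList<>();
    for (String[] pair : tableFilePairs) {
      String table = pair[0];
      if (prevTable != null && !prevTable.equals(table)) {
        dumpsPerTable.put(prevTable, filesToBeAdded); // stands in for createDumpFileForTable
        filesToBeAdded = new ArrayList<>();
      }
      filesToBeAdded.add(pair[1]);
      prevTable = table;
    }
    if (prevTable != null) {
      dumpsPerTable.put(prevTable, filesToBeAdded); // dump the last table in the list
    }
    return dumpsPerTable;
  }
}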