Use of org.apache.hadoop.hive.ql.exec.repl.bootstrap.ReplLoadWork in project hive by apache.
The class ReplicationSemanticAnalyzer, method analyzeReplLoad.
/*
 * Example dump dirs we need to be able to handle :
 *
 * for: hive.repl.rootdir = staging/
 * Then, repl dumps will be created in staging/<dumpdir>
 *
 * single-db-dump: staging/blah12345 will contain a db dir for the db specified
 *  blah12345/
 *   default/
 *    _metadata
 *    tbl1/
 *      _metadata
 *      dt=20160907/
 *        _files
 *    tbl2/
 *    tbl3/
 *    unptn_tbl/
 *      _metadata
 *      _files
 *
 * multi-db-dump: staging/bar12347 will contain dirs for each db covered
 * staging/
 *  bar12347/
 *   default/
 *     ...
 *   sales/
 *     ...
 *
 * single table-dump: staging/baz123 will contain a table object dump inside
 * staging/
 *  baz123/
 *    _metadata
 *    dt=20150931/
 *      _files
 *
 * incremental dump : staging/blue123 will contain dirs for each event inside.
 * staging/
 *  blue123/
 *    34/
 *    35/
 *    36/
 */
/**
 * Plans the tasks needed to execute a REPL LOAD from the dump directory named by {@code path}.
 *
 * <p>The dump's {@code DumpMetaData} decides which of three plans is built:
 * <ul>
 *   <li><b>Bootstrap dump with a table name/pattern:</b> a single {@code ReplLoadWork} root task
 *       created with both {@code dbNameOrPattern} and {@code tblNameOrPattern}.</li>
 *   <li><b>Bootstrap dump without a table name:</b> a single {@code ReplLoadWork} root task
 *       created with {@code dbNameOrPattern} only. When {@code dbNameOrPattern} is set, the dump
 *       must contain exactly one directory.</li>
 *   <li><b>Incremental dump:</b> every sub-directory is one event. Events are sorted with
 *       {@code EventDumpDirComparator} and their tasks are chained so that each event runs only
 *       after the previous one has finished.</li>
 * </ul>
 *
 * <p>If {@code LoadSemanticAnalyzer.matchFilesOrDir} finds nothing under the path, a warning is
 * logged and no task is planned.
 *
 * @param ast the statement's syntax tree. It is not read in this method; the method works from
 *     {@code path}, {@code dbNameOrPattern} and {@code tblNameOrPattern}, which are presumably
 *     populated from the AST before this call (TODO confirm against the caller).
 * @throws SemanticException if the dump path does not exist, contains no directories to load,
 *     contains several directories although a single database target was named, or if any
 *     planning step fails. A {@code SemanticException} raised while planning is propagated
 *     unchanged; any other exception is attached as the cause.
 */
private void analyzeReplLoad(ASTNode ast) throws SemanticException {
  LOG.debug("ReplSemanticAnalyzer.analyzeReplLoad: {}.{} from {}",
      dbNameOrPattern, tblNameOrPattern, path);
  try {
    Path loadPath = new Path(path);
    final FileSystem fs = loadPath.getFileSystem(conf);
    if (!fs.exists(loadPath)) {
      // supposed dump path does not exist.
      throw new FileNotFoundException(loadPath.toUri().toString());
    }

    // Now, the dumped path can be one of three things:
    // a) It can be a db dump, in which case we expect a set of dirs, each with a
    //    db name, and with a _metadata file in each, and table dirs inside that.
    // b) It can be a table dump dir, in which case we expect a _metadata dump of
    //    a table in question in the dir, and individual ptn dir hierarchy.
    // c) A dump can be an incremental dump, which means we have several subdirs
    //    each of which have the evid as the dir name, and each of which correspond
    //    to an event-level dump. Currently, only CREATE_TABLE and ADD_PARTITION are
    //    handled, so all of these dumps will be at a table/ptn level.
    //    For incremental repl, we will have individual events which can
    //    be other things like roles and fns as well.
    // At this point, all dump dirs should contain a _dumpmetadata file that
    // tells us what is inside that dumpdir.
    DumpMetaData dmd = new DumpMetaData(loadPath, conf);
    final boolean evDump = dmd.isIncrementalDump();
    if (evDump) {
      LOG.debug("{} contains an incremental dump", loadPath);
    } else {
      LOG.debug("{} contains a bootstrap dump", loadPath);
    }

    // Bootstrap load restricted to a table name/pattern: one ReplLoadWork task, nothing else.
    if (!evDump && tblNameOrPattern != null && !tblNameOrPattern.isEmpty()) {
      ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
          tblNameOrPattern, queryState.getLineageState(),
          SessionState.get().getTxnMgr().getCurrentTxnId());
      rootTasks.add(TaskFactory.get(replLoadWork, conf));
      return;
    }

    FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
    if (srcs == null || srcs.length == 0) {
      LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
      return;
    }

    FileStatus[] dirsInLoadPath = fs.listStatus(loadPath, EximUtil.getDirectoryFilter(fs));
    if (dirsInLoadPath == null || dirsInLoadPath.length == 0) {
      throw new IllegalArgumentException("No data to load in path " + loadPath.toUri().toString());
    }

    if (!evDump) {
      // not an event dump, not a table dump - thus, a db dump
      if (dbNameOrPattern != null && dirsInLoadPath.length > 1) {
        LOG.debug("Found multiple dirs when we expected 1:");
        for (FileStatus d : dirsInLoadPath) {
          LOG.debug("> {}", d.getPath().toUri());
        }
        throw new IllegalArgumentException("Multiple dirs in " + loadPath.toUri().toString()
            + " does not correspond to REPL LOAD expecting to load to a singular destination point.");
      }
      ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
          queryState.getLineageState(), SessionState.get().getTxnMgr().getCurrentTxnId());
      rootTasks.add(TaskFactory.get(replLoadWork, conf));
    } else {
      // Event dump, each sub-dir is an individual event dump.
      // We need to guarantee that the directory listing we got is in order of evid.
      Arrays.sort(dirsInLoadPath, new EventDumpDirComparator());

      // Event loads behave like table loads with one crucial difference: precursor order is
      // strict, and each event must be processed after the previous one. To enforce that:
      //  - evTaskRoot is a no-op task (built from DependencyCollectionWork) heading the chain,
      //    and it is the initial taskChainTail.
      //  - For each event, analyzeEventLoad is given the current taskChainTail in its context
      //    and returns the tasks created for that event.
      //  - A barrier task (built from ReplStateLogWork) is made dependent on all of those
      //    tasks and becomes the new taskChainTail.
      // The resulting graph looks like this:
      //
      //                 --->ev1.task1--                          --->ev2.task1--
      //                /               \                        /               \
      //  evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
      //                \               /
      //                 --->ev1.task3--
      //
      // Once the entire chain is generated, evTaskRoot is added to rootTasks so that the
      // whole chain gets executed.
      Task<? extends Serializable> evTaskRoot = TaskFactory.get(new DependencyCollectionWork());
      Task<? extends Serializable> taskChainTail = evTaskRoot;
      ReplLogger replLogger =
          new IncrementalLoadLogger(dbNameOrPattern, loadPath.toString(), dirsInLoadPath.length);

      for (FileStatus dir : dirsInLoadPath) {
        LOG.debug("Loading event from {} to {}.{}",
            dir.getPath().toUri(), dbNameOrPattern, tblNameOrPattern);

        String locn = dir.getPath().toUri().toString();
        DumpMetaData eventDmd = new DumpMetaData(new Path(locn), conf);
        MessageHandler.Context context = new MessageHandler.Context(dbNameOrPattern,
            tblNameOrPattern, locn, taskChainTail, eventDmd, conf, db, ctx, LOG);
        List<Task<? extends Serializable>> evTasks = analyzeEventLoad(context);

        // An event that produced no tasks adds no barrier; the chain tail stays where it was.
        if (evTasks != null && !evTasks.isEmpty()) {
          ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger,
              dir.getPath().getName(), eventDmd.getDumpType().toString());
          Task<? extends Serializable> barrierTask = TaskFactory.get(replStateLogWork);
          for (Task<? extends Serializable> t : evTasks) {
            t.addDependentTask(barrierTask);
            LOG.debug("Added {}:{} as a precursor of barrier task {}:{}",
                t.getClass(), t.getId(), barrierTask.getClass(), barrierTask.getId());
          }
          LOG.debug("Updated taskChainTail from {}:{} to {}:{}",
              taskChainTail.getClass(), taskChainTail.getId(),
              barrierTask.getClass(), barrierTask.getId());
          taskChainTail = barrierTask;
        }
      }

      // If at least one event produced tasks (the tail moved past the root), append a final
      // state-log task carrying the dump's last event id, and emit the start log.
      if (!evTaskRoot.equals(taskChainTail)) {
        Map<String, String> dbProps = new HashMap<>();
        dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(),
            String.valueOf(dmd.getEventTo()));
        ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps);
        Task<? extends Serializable> barrierTask = TaskFactory.get(replStateLogWork);
        taskChainTail.addDependentTask(barrierTask);
        LOG.debug("Added {}:{} as a precursor of barrier task {}:{}",
            taskChainTail.getClass(), taskChainTail.getId(),
            barrierTask.getClass(), barrierTask.getId());
        replLogger.startLog();
      }
      rootTasks.add(evTaskRoot);
    }
  } catch (Exception e) {
    // Do not bury an existing SemanticException inside another one: rethrow it untouched so
    // its original message reaches the caller.
    if (e instanceof SemanticException) {
      throw (SemanticException) e;
    }
    // TODO : simple wrap & rethrow for now, clean up with error codes
    throw new SemanticException(e);
  }
}
Aggregations