Use of org.apache.hadoop.hive.ql.plan.DependencyCollectionWork in project hive by apache.
The class CreateFunctionHandler, method handle.
@Override
public List<Task<? extends Serializable>> handle(Context context) throws SemanticException {
  try {
    FunctionDescBuilder builder = new FunctionDescBuilder(context);
    CreateFunctionDesc descToLoad = builder.build();
    this.functionName = builder.metadata.function.getFunctionName();
    context.log.debug("Loading function desc : {}", descToLoad.toString());
    Task<FunctionWork> createTask = TaskFactory.get(new FunctionWork(descToLoad));
    context.log.debug("Added create function task : {}:{},{}",
        createTask.getId(), descToLoad.getFunctionName(), descToLoad.getClassName());
    // context.dmd can be null (for example during bootstrap, when there is no event id to record);
    // only track the event id in the updated metadata when it is available.
    if (context.dmd != null) {
      updatedMetadata.set(context.dmd.getEventTo().toString(), builder.destinationDbName, null, null);
    }
    readEntitySet.add(toReadEntity(new Path(context.location), context.hiveConf));
    if (builder.replCopyTasks.isEmpty()) {
      // repl copy only happens for jars on HDFS, not otherwise.
      return Collections.singletonList(createTask);
    } else {
      /**
       * This is to understand how task dependencies work.
       * All root tasks are executed in parallel. For bootstrap replication there should be only one
       * root task, which creates the db. Incremental can have multiple (have to verify).
       * A task has children, which are put in the queue for execution after the parent has finished.
       * A one-to-one dependency can be satisfied by adding children to a given task; do this
       * recursively wherever the relation holds.
       * For many-to-one, create a barrier task that is the child of every item in the 'many'
       * dependencies, and make the 'one' dependency a child of the barrier task.
       * Add the 'many' to the parent/root tasks. The execution environment will make sure that the
       * child barrier task does not get executed until all parents of the barrier task are complete,
       * which should only happen when the last of them finishes, at which point the child of the
       * barrier task is picked up.
       */
      Task<? extends Serializable> barrierTask = TaskFactory.get(new DependencyCollectionWork());
      builder.replCopyTasks.forEach(t -> t.addDependentTask(barrierTask));
      barrierTask.addDependentTask(createTask);
      return builder.replCopyTasks;
    }
  } catch (Exception e) {
    throw (e instanceof SemanticException) ? (SemanticException) e
        : new SemanticException("Error reading message members", e);
  }
}
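The many-to-one case described in the comment above can be reduced to a small helper. This is a sketch rather than Hive source; it assumes only the calls visible in the snippet (TaskFactory.get, addDependentTask, DependencyCollectionWork) plus the standard Java collections, and the method name is illustrative.

// Sketch: gate a single downstream task behind many parallel tasks via a barrier.
private static List<Task<? extends Serializable>> gateBehindBarrier(
    List<Task<? extends Serializable>> manyTasks, Task<? extends Serializable> oneTask) {
  Task<? extends Serializable> barrier = TaskFactory.get(new DependencyCollectionWork());
  // Each of the 'many' tasks must finish before the barrier fires.
  manyTasks.forEach(t -> t.addDependentTask(barrier));
  // The single downstream task runs only after the barrier has run.
  barrier.addDependentTask(oneTask);
  // The 'many' tasks are the roots handed back for scheduling; the rest follow as children.
  return manyTasks;
}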
Use of org.apache.hadoop.hive.ql.plan.DependencyCollectionWork in project hive by apache.
The class ReplicationSemanticAnalyzer, method analyzeReplLoad.
/*
 * Example dump dirs we need to be able to handle :
 *
 * for: hive.repl.rootdir = staging/
 * Then, repl dumps will be created in staging/<dumpdir>
 *
 * single-db-dump: staging/blah12345 will contain a db dir for the db specified
 *  blah12345/
 *   default/
 *    _metadata
 *    tbl1/
 *     _metadata
 *     dt=20160907/
 *      _files
 *    tbl2/
 *    tbl3/
 *    unptn_tbl/
 *     _metadata
 *     _files
 *
 * multi-db-dump: staging/bar12347 will contain dirs for each db covered
 * staging/
 *  bar12347/
 *   default/
 *    ...
 *   sales/
 *    ...
 *
 * single table-dump: staging/baz123 will contain a table object dump inside
 * staging/
 *  baz123/
 *   _metadata
 *   dt=20150931/
 *    _files
 *
 * incremental dump : staging/blue123 will contain dirs for each event inside.
 * staging/
 *  blue123/
 *   34/
 *   35/
 *   36/
 */
private void analyzeReplLoad(ASTNode ast) throws SemanticException {
  LOG.debug("ReplSemanticAnalyzer.analyzeReplLoad: " + String.valueOf(dbNameOrPattern) + "."
      + String.valueOf(tblNameOrPattern) + " from " + String.valueOf(path));
  try {
    Path loadPath = new Path(path);
    final FileSystem fs = loadPath.getFileSystem(conf);
    if (!fs.exists(loadPath)) {
      // supposed dump path does not exist.
      throw new FileNotFoundException(loadPath.toUri().toString());
    }
    // Now, the dumped path can be one of three things:
    // a) It can be a db dump, in which case we expect a set of dirs, each with a
    //    db name, a _metadata file in each, and table dirs inside that.
    // b) It can be a table dump dir, in which case we expect a _metadata dump of
    //    the table in question in the dir, and an individual ptn dir hierarchy.
    // c) A dump can be an incremental dump, which means we have several subdirs,
    //    each of which has the evid as the dir name, and each of which corresponds
    //    to an event-level dump. Currently, only CREATE_TABLE and ADD_PARTITION are
    //    handled, so all of these dumps will be at a table/ptn level.
    // For incremental repl, we will have individual events which can
    // be other things like roles and fns as well.
    // At this point, all dump dirs should contain a _dumpmetadata file that
    // tells us what is inside that dumpdir.
    DumpMetaData dmd = new DumpMetaData(loadPath, conf);
    boolean evDump = false;
    if (dmd.isIncrementalDump()) {
      LOG.debug("{} contains an incremental dump", loadPath);
      evDump = true;
    } else {
      LOG.debug("{} contains a bootstrap dump", loadPath);
    }
    if ((!evDump) && (tblNameOrPattern != null) && !(tblNameOrPattern.isEmpty())) {
      ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
          tblNameOrPattern, queryState.getLineageState(), SessionState.get().getTxnMgr().getCurrentTxnId());
      rootTasks.add(TaskFactory.get(replLoadWork, conf));
      return;
    }
    FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
    if (srcs == null || (srcs.length == 0)) {
      LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
      return;
    }
    FileStatus[] dirsInLoadPath = fs.listStatus(loadPath, EximUtil.getDirectoryFilter(fs));
    if ((dirsInLoadPath == null) || (dirsInLoadPath.length == 0)) {
      throw new IllegalArgumentException("No data to load in path " + loadPath.toUri().toString());
    }
    if (!evDump) {
      // not an event dump, not a table dump - thus, a db dump
      if ((dbNameOrPattern != null) && (dirsInLoadPath.length > 1)) {
        LOG.debug("Found multiple dirs when we expected 1:");
        for (FileStatus d : dirsInLoadPath) {
          LOG.debug("> " + d.getPath().toUri().toString());
        }
        throw new IllegalArgumentException("Multiple dirs in " + loadPath.toUri().toString()
            + " does not correspond to REPL LOAD expecting to load to a singular destination point.");
      }
      ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
          queryState.getLineageState(), SessionState.get().getTxnMgr().getCurrentTxnId());
      rootTasks.add(TaskFactory.get(replLoadWork, conf));
      //
      // for (FileStatus dir : dirsInLoadPath) {
      //   analyzeDatabaseLoad(dbNameOrPattern, fs, dir);
      // }
    } else {
      // Event dump, each sub-dir is an individual event dump.
      // We need to guarantee that the directory listing we got is in order of evid.
      Arrays.sort(dirsInLoadPath, new EventDumpDirComparator());
      Task<? extends Serializable> evTaskRoot = TaskFactory.get(new DependencyCollectionWork());
      Task<? extends Serializable> taskChainTail = evTaskRoot;
      ReplLogger replLogger = new IncrementalLoadLogger(dbNameOrPattern, loadPath.toString(), dirsInLoadPath.length);
      for (FileStatus dir : dirsInLoadPath) {
        LOG.debug("Loading event from {} to {}.{}", dir.getPath().toUri(), dbNameOrPattern, tblNameOrPattern);
        // Event loads behave similarly to table loads, with one crucial difference:
        // the precursor order is strict, and each event must be processed after the previous one.
        // The way we handle this strict order is as follows:
        // First, we start with taskChainTail, which is a dummy noop task (a DependencyCollectionTask)
        // at the head of our event chain. For each event we process, we tell analyzeTableLoad to
        // create tasks that use the taskChainTail as a dependency. Then, we collect all those tasks
        // and introduce a new barrier task (also a DependencyCollectionTask) which depends on all
        // these tasks. Then, this barrier task becomes our new taskChainTail. Thus, we get a set of
        // tasks as follows:
        //
        //                --->ev1.task1--                         --->ev2.task1--
        //               /               \                       /               \
        // evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
        //               \               /
        //                --->ev1.task3--
        //
        // Once this entire chain is generated, we add evTaskRoot to rootTasks, so as to execute the
        // entire chain.
        String locn = dir.getPath().toUri().toString();
        DumpMetaData eventDmd = new DumpMetaData(new Path(locn), conf);
        MessageHandler.Context context = new MessageHandler.Context(dbNameOrPattern, tblNameOrPattern,
            locn, taskChainTail, eventDmd, conf, db, ctx, LOG);
        List<Task<? extends Serializable>> evTasks = analyzeEventLoad(context);
        if ((evTasks != null) && (!evTasks.isEmpty())) {
          ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dir.getPath().getName(),
              eventDmd.getDumpType().toString());
          Task<? extends Serializable> barrierTask = TaskFactory.get(replStateLogWork);
          for (Task<? extends Serializable> t : evTasks) {
            t.addDependentTask(barrierTask);
            LOG.debug("Added {}:{} as a precursor of barrier task {}:{}",
                t.getClass(), t.getId(), barrierTask.getClass(), barrierTask.getId());
          }
          LOG.debug("Updated taskChainTail from {}:{} to {}:{}",
              taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
          taskChainTail = barrierTask;
        }
      }
      // If there is at least one event and the db name is known, then log the replication start and end states.
      if (!evTaskRoot.equals(taskChainTail)) {
        Map<String, String> dbProps = new HashMap<>();
        dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(), String.valueOf(dmd.getEventTo()));
        ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps);
        Task<? extends Serializable> barrierTask = TaskFactory.get(replStateLogWork);
        taskChainTail.addDependentTask(barrierTask);
        LOG.debug("Added {}:{} as a precursor of barrier task {}:{}",
            taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
        replLogger.startLog();
      }
      rootTasks.add(evTaskRoot);
    }
  } catch (Exception e) {
    // TODO : simple wrap & rethrow for now, clean up with error codes
    throw new SemanticException(e);
  }
}
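Stripped of logging and metadata handling, the ordering scheme in the loop above reduces to the sketch below. It is illustrative rather than Hive source: tasksPerEvent is a hypothetical stand-in for whatever analyzeEventLoad produces per event, and a plain DependencyCollectionWork is used as each barrier where the real code uses a ReplStateLogWork task.

// Sketch: enforce strict event order by threading every event through a barrier.
Task<? extends Serializable> evTaskRoot = TaskFactory.get(new DependencyCollectionWork());
Task<? extends Serializable> taskChainTail = evTaskRoot;
for (List<Task<? extends Serializable>> eventTasks : tasksPerEvent) { // hypothetical per-event task lists
  Task<? extends Serializable> barrier = TaskFactory.get(new DependencyCollectionWork());
  for (Task<? extends Serializable> t : eventTasks) {
    taskChainTail.addDependentTask(t); // the event's tasks wait for the previous barrier
    t.addDependentTask(barrier);       // the new barrier waits for all of this event's tasks
  }
  taskChainTail = barrier;             // the next event hangs off this barrier
}
rootTasks.add(evTaskRoot);             // only the root is scheduled; everything else follows in order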
Use of org.apache.hadoop.hive.ql.plan.DependencyCollectionWork in project hive by apache.
The class ReplicationSemanticAnalyzer, method addUpdateReplStateTasks.
private List<Task<? extends Serializable>> addUpdateReplStateTasks(boolean isDatabaseLoad,
    UpdatedMetaDataTracker updatedMetadata, List<Task<? extends Serializable>> importTasks) {
  String replState = updatedMetadata.getReplicationState();
  String dbName = updatedMetadata.getDatabase();
  String tableName = updatedMetadata.getTable();
  // If the event generated no import tasks, or this is a table-level load with no table updated,
  // then there is no need to update the repl state on any object.
  if (importTasks.isEmpty() || (!isDatabaseLoad && (tableName == null))) {
    LOG.debug("No objects need update of repl state: Either 0 import tasks or table level load");
    return importTasks;
  }
  // Create a barrier task for dependency collection of import tasks
  Task<? extends Serializable> barrierTask = TaskFactory.get(new DependencyCollectionWork());
  // Link import tasks to the barrier task, which will in turn be linked with the repl state update tasks
  for (Task<? extends Serializable> t : importTasks) {
    t.addDependentTask(barrierTask);
    LOG.debug("Added {}:{} as a precursor of barrier task {}:{}",
        t.getClass(), t.getId(), barrierTask.getClass(), barrierTask.getId());
  }
  List<Task<? extends Serializable>> tasks = new ArrayList<>();
  Task<? extends Serializable> updateReplIdTask;
  // If any partition is updated, then update repl state in the partition object
  for (final Map<String, String> partSpec : updatedMetadata.getPartitions()) {
    updateReplIdTask = tableUpdateReplStateTask(dbName, tableName, partSpec, replState, barrierTask);
    tasks.add(updateReplIdTask);
  }
  if (tableName != null) {
    // If any table/partition is updated, then update repl state in the table object
    updateReplIdTask = tableUpdateReplStateTask(dbName, tableName, null, replState, barrierTask);
    tasks.add(updateReplIdTask);
  }
  // For a table-level load, there is no need to update the replication state of the database
  if (isDatabaseLoad) {
    // If any table/partition is updated, then update repl state in the db object
    updateReplIdTask = dbUpdateReplStateTask(dbName, replState, barrierTask);
    tasks.add(updateReplIdTask);
  }
  // At least one task would have been added to update the repl state
  return tasks;
}
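The shape built here is a fan-in followed by a fan-out: every import task feeds one DependencyCollectionWork barrier, and each repl-state update task then waits on that barrier. A minimal sketch, with buildUpdateTasks as a hypothetical stand-in for the tableUpdateReplStateTask/dbUpdateReplStateTask calls above (which wire the barrier dependency themselves):

// Sketch (not Hive source): imports -> barrier -> per-object repl state updates.
Task<? extends Serializable> barrierTask = TaskFactory.get(new DependencyCollectionWork());
importTasks.forEach(t -> t.addDependentTask(barrierTask)); // fan-in: all imports gate the barrier

List<Task<? extends Serializable>> updateTasks = new ArrayList<>();
for (Task<? extends Serializable> update : buildUpdateTasks()) { // hypothetical helper producing update tasks
  barrierTask.addDependentTask(update);                          // fan-out: each update waits on the barrier
  updateTasks.add(update);                                       // these, not the imports, are returned
}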
Use of org.apache.hadoop.hive.ql.plan.DependencyCollectionWork in project hive by apache.
The class GenMapRedUtils, method createCondTask.
/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf
 *          HiveConf
 * @param currTask
 *          current leaf task
 * @param mvWork
 *          MoveWork for the move task
 * @param mergeWork
 *          MapredWork for the merge task.
 * @param condInputPath
 *          the input directory of the merge/move task
 * @param condOutputPath
 *          the output directory of the merge/move task
 * @param moveTaskToLink
 *          a MoveTask that may be linked to the conditional sub-tasks
 * @param dependencyTask
 *          a dependency task that may be linked to the conditional sub-tasks
 * @param lineageState
 *          to track activity
 * @return The conditional task
 */
@SuppressWarnings("unchecked")
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask,
    MoveWork mvWork, Serializable mergeWork, Path condInputPath, Path condOutputPath,
    Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask, LineageState lineageState) {
  if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
    Utilities.FILE_OP_LOGGER.trace("Creating conditional merge task for " + condInputPath);
  }
  // Create a dummy task if no move is needed.
  Serializable moveWork = mvWork != null ? mvWork : new DependencyCollectionWork();
  // Note: this should never happen for mm tables.
  boolean shouldMergeMovePaths = (moveTaskToLink != null && dependencyTask == null
      && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork()));
  Serializable workForMoveOnlyTask = moveWork;
  if (shouldMergeMovePaths) {
    workForMoveOnlyTask = mergeMovePaths(condInputPath, moveTaskToLink.getWork(), lineageState);
  }
  // There are 3 options for this ConditionalTask:
  // 1) Merge the partitions
  // 2) Move the partitions (i.e. don't merge the partitions)
  // 3) Merge some partitions and move other partitions (i.e. merge some partitions and don't
  //    merge others); in this case the merge is done first, followed by the move, to prevent
  //    conflicts.
  // TODO: if we are not dealing with concatenate DDL, we should not create a merge+move path
  //       because it should be impossible to get incompatible outputs.
  Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork);
  Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(workForMoveOnlyTask);
  Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork);
  Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(moveWork);
  // NOTE! It is necessary that the merge task be the parent of the move task, and not
  // the other way around, for the proper execution of the execute method of ConditionalTask.
  mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);
  List<Serializable> listWorks = new ArrayList<Serializable>();
  listWorks.add(workForMoveOnlyTask);
  listWorks.add(mergeWork);
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  listTasks.add(moveOnlyMoveTask);
  listTasks.add(mergeOnlyMergeTask);
  listTasks.add(mergeAndMoveMergeTask);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  // create resolver
  cndTsk.setResolver(new ConditionalResolverMergeFiles());
  ConditionalResolverMergeFilesCtx mrCtx = new ConditionalResolverMergeFilesCtx(listTasks, condInputPath.toString());
  cndTsk.setResolverCtx(mrCtx);
  // make the conditional task the child of the current leaf task
  currTask.addDependentTask(cndTsk);
  if (shouldMergeMovePaths) {
    // If a new MoveWork was created, then we should link all dependent tasks from the MoveWork to link.
    if (moveTaskToLink.getDependentTasks() != null) {
      for (Task dependentTask : moveTaskToLink.getDependentTasks()) {
        moveOnlyMoveTask.addDependentTask(dependentTask);
      }
    }
  } else {
    addDependentMoveTasks(moveTaskToLink, conf, moveOnlyMoveTask, dependencyTask);
  }
  addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyMergeTask, dependencyTask);
  addDependentMoveTasks(moveTaskToLink, conf, mergeAndMoveMoveTask, dependencyTask);
  return cndTsk;
}
Use of org.apache.hadoop.hive.ql.plan.DependencyCollectionWork in project hive by apache.
The class AddDependencyToLeavesTest, method shouldNotSkipIntermediateDependencyCollectionTasks.
@Test
public void shouldNotSkipIntermediateDependencyCollectionTasks() {
  Task<DependencyCollectionWork> collectionWorkTaskOne = TaskFactory.get(new DependencyCollectionWork());
  Task<DependencyCollectionWork> collectionWorkTaskTwo = TaskFactory.get(new DependencyCollectionWork());
  Task<DependencyCollectionWork> collectionWorkTaskThree = TaskFactory.get(new DependencyCollectionWork());
  @SuppressWarnings("unchecked")
  Task<? extends Serializable> rootTask = mock(Task.class);
  when(rootTask.getDependentTasks())
      .thenReturn(Arrays.asList(collectionWorkTaskOne, collectionWorkTaskTwo, collectionWorkTaskThree));
  @SuppressWarnings("unchecked")
  List<Task<? extends Serializable>> tasksPostCurrentGraph = Arrays.asList(mock(Task.class), mock(Task.class));
  DAGTraversal.traverse(Collections.singletonList(rootTask), new AddDependencyToLeaves(tasksPostCurrentGraph));
  List<Task<? extends Serializable>> dependentTasksForOne = collectionWorkTaskOne.getDependentTasks();
  List<Task<? extends Serializable>> dependentTasksForTwo = collectionWorkTaskTwo.getDependentTasks();
  List<Task<? extends Serializable>> dependentTasksForThree = collectionWorkTaskThree.getDependentTasks();
  assertEquals(dependentTasksForOne.size(), 2);
  assertEquals(dependentTasksForTwo.size(), 2);
  assertEquals(dependentTasksForThree.size(), 2);
  assertTrue(tasksPostCurrentGraph.containsAll(dependentTasksForOne));
  assertTrue(tasksPostCurrentGraph.containsAll(dependentTasksForTwo));
  assertTrue(tasksPostCurrentGraph.containsAll(dependentTasksForThree));
  // assertTrue(dependentTasksForOne.iterator().next() instanceof DependencyCollectionTask);
  // assertTrue(dependentTasksForTwo.iterator().next() instanceof DependencyCollectionTask);
  // assertTrue(dependentTasksForThree.iterator().next() instanceof DependencyCollectionTask);
}