use of org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles in project hive by apache.
the class GenMapRedUtils method createCondTask.
/**
* Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
*
* @param conf
* HiveConf
* @param currTask
* current leaf task
* @param dummyMoveWork
* MoveWork for the move task
* @param mergeWork
* MapredWork for the merge task.
* @param condInputPath
* the input directory of the merge/move task
* @param condOutputPath
* the output directory of the merge/move task
* @param moveTaskToLink
* a MoveTask that may be linked to the conditional sub-tasks
* @param dependencyTask
* a dependency task that may be linked to the conditional sub-tasks
* @return The conditional task
*/
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask, MoveWork dummyMoveWork, Serializable mergeWork, Path condInputPath, Path condOutputPath, Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask) {
boolean shouldMergeMovePaths = (moveTaskToLink != null && dependencyTask == null && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork()));
MoveWork workForMoveOnlyTask;
if (shouldMergeMovePaths) {
workForMoveOnlyTask = mergeMovePaths(condInputPath, moveTaskToLink.getWork());
} else {
workForMoveOnlyTask = dummyMoveWork;
}
// There are 3 options for this ConditionalTask:
// 1) Merge the partitions
// 2) Move the partitions (i.e. don't merge the partitions)
// 3) Merge some partitions and move other partitions (i.e. merge some partitions and don't
// merge others) in this case the merge is done first followed by the move to prevent
// conflicts.
Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(workForMoveOnlyTask, conf);
Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(dummyMoveWork, conf);
// NOTE! It is necessary merge task is the parent of the move task, and not
// the other way around, for the proper execution of the execute method of
// ConditionalTask
mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);
List<Serializable> listWorks = new ArrayList<Serializable>();
listWorks.add(workForMoveOnlyTask);
listWorks.add(mergeWork);
ConditionalWork cndWork = new ConditionalWork(listWorks);
List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
listTasks.add(moveOnlyMoveTask);
listTasks.add(mergeOnlyMergeTask);
listTasks.add(mergeAndMoveMergeTask);
ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
cndTsk.setListTasks(listTasks);
// create resolver
cndTsk.setResolver(new ConditionalResolverMergeFiles());
ConditionalResolverMergeFilesCtx mrCtx = new ConditionalResolverMergeFilesCtx(listTasks, condInputPath.toString());
cndTsk.setResolverCtx(mrCtx);
// make the conditional task as the child of the current leaf task
currTask.addDependentTask(cndTsk);
if (shouldMergeMovePaths) {
// If a new MoveWork was created, then we should link all dependent tasks from the MoveWork to link.
if (moveTaskToLink.getDependentTasks() != null) {
for (Task dependentTask : moveTaskToLink.getDependentTasks()) {
moveOnlyMoveTask.addDependentTask(dependentTask);
}
}
} else {
addDependentMoveTasks(moveTaskToLink, conf, moveOnlyMoveTask, dependencyTask);
}
addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyMergeTask, dependencyTask);
addDependentMoveTasks(moveTaskToLink, conf, mergeAndMoveMoveTask, dependencyTask);
return cndTsk;
}
Aggregations