Use of org.apache.hadoop.hive.ql.exec.MoveTask in project hive by apache:
the class GenMapRedUtils, method isMergeRequired.
/**
 * Returns true iff the fsOp requires a merge.
 * @param mvTasks the candidate move tasks that may consume the file sink's output
 * @param hconf the Hive configuration
 * @param fsOp the FileSinkOperator being examined
 * @param currTask the current leaf task
 * @param isInsertTable true if the file sink writes into a table (INSERT)
 * @return true iff the output of fsOp should be merged
 */
public static boolean isMergeRequired(List<Task<MoveWork>> mvTasks, HiveConf hconf,
    FileSinkOperator fsOp, Task<? extends Serializable> currTask, boolean isInsertTable) {
  // Has the user enabled merging of files for map-only jobs or for all jobs?
  if ((mvTasks != null) && (!mvTasks.isEmpty())) {
    // no need to merge if the move is to a local file system
    MoveTask mvTask = (MoveTask) GenMapRedUtils.findMoveTask(mvTasks, fsOp);
    if (mvTask != null && isInsertTable && hconf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER)
        && !fsOp.getConf().isMaterialization()) {
      // mark the MapredWork and FileSinkOperator for gathering stats
      fsOp.getConf().setGatherStats(true);
      fsOp.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
      if (!mvTask.hasFollowingStatsTask()) {
        GenMapRedUtils.addStatsTask(fsOp, mvTask, currTask, hconf);
      }
    }
    if ((mvTask != null) && !mvTask.isLocal() && fsOp.getConf().canBeMerged()) {
      if (currTask.getWork() instanceof TezWork) {
        // Tez blurs the boundary between map and reduce, thus it has its own config
        return hconf.getBoolVar(ConfVars.HIVEMERGETEZFILES);
      } else if (currTask.getWork() instanceof SparkWork) {
        // Spark has its own config for merging
        return hconf.getBoolVar(ConfVars.HIVEMERGESPARKFILES);
      }
      if (fsOp.getConf().isLinkedFileSink()) {
        // A linked file sink can increase the number of output files,
        // possibly by a big margin. So, merge aggressively.
        if (hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES)
            || hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES)) {
          return true;
        }
      } else {
        // There are separate configuration parameters to control whether to
        // merge for a map-only job or for a map-reduce job
        if (currTask.getWork() instanceof MapredWork) {
          ReduceWork reduceWork = ((MapredWork) currTask.getWork()).getReduceWork();
          boolean mergeMapOnly =
              hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) && reduceWork == null;
          boolean mergeMapRed =
              hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES) && reduceWork != null;
          if (mergeMapOnly || mergeMapRed) {
            return true;
          }
        } else {
          return false;
        }
      }
    }
  }
  return false;
}
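For context, here is a minimal sketch of how a compile-time planner might use this check; the helper name maybeAddMergeTask and its wiring are hypothetical, and only the isMergeRequired call reflects the API shown above.

// Illustrative only (not the actual Hive call site): decide at compile time
// whether to chain a file-merge stage after a FileSinkOperator. All
// parameters are assumed to be supplied by the surrounding
// query-compilation context.
private static void maybeAddMergeTask(List<Task<MoveWork>> mvTasks, HiveConf hconf,
    FileSinkOperator fsOp, Task<? extends Serializable> currTask, boolean isInsertTable) {
  if (GenMapRedUtils.isMergeRequired(mvTasks, hconf, fsOp, currTask, isInsertTable)) {
    // In Hive proper this is where the merge work gets created, e.g. via
    // GenMapRedUtils.createMRWorkForMergingFiles(...); its exact signature
    // varies across Hive versions, so it is elided here.
  }
}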
Use of org.apache.hadoop.hive.ql.exec.MoveTask in project hive by apache:
the class GenMapRedUtils, method createCondTask.
/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf
 *          HiveConf
 * @param currTask
 *          current leaf task
 * @param dummyMoveWork
 *          MoveWork for the move task
 * @param mergeWork
 *          MapredWork for the merge task
 * @param condInputPath
 *          the input directory of the merge/move task
 * @param condOutputPath
 *          the output directory of the merge/move task
 * @param moveTaskToLink
 *          a MoveTask that may be linked to the conditional sub-tasks
 * @param dependencyTask
 *          a dependency task that may be linked to the conditional sub-tasks
 * @return the conditional task
 */
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask,
    MoveWork dummyMoveWork, Serializable mergeWork, Path condInputPath, Path condOutputPath,
    Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask) {
  boolean shouldMergeMovePaths = (moveTaskToLink != null && dependencyTask == null
      && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork()));
  MoveWork workForMoveOnlyTask;
  if (shouldMergeMovePaths) {
    workForMoveOnlyTask = mergeMovePaths(condInputPath, moveTaskToLink.getWork());
  } else {
    workForMoveOnlyTask = dummyMoveWork;
  }
  // There are 3 options for this ConditionalTask:
  // 1) Merge the partitions
  // 2) Move the partitions (i.e. don't merge the partitions)
  // 3) Merge some partitions and move other partitions (i.e. merge some partitions and
  //    don't merge others). In this case the merge is done first, followed by the move,
  //    to prevent conflicts.
  Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(workForMoveOnlyTask, conf);
  Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(dummyMoveWork, conf);
  // NOTE! It is necessary that the merge task is the parent of the move task, and not
  // the other way around, for the proper execution of ConditionalTask's execute method.
  mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);
  List<Serializable> listWorks = new ArrayList<Serializable>();
  listWorks.add(workForMoveOnlyTask);
  listWorks.add(mergeWork);
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  listTasks.add(moveOnlyMoveTask);
  listTasks.add(mergeOnlyMergeTask);
  listTasks.add(mergeAndMoveMergeTask);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
  cndTsk.setListTasks(listTasks);
  // create the resolver that picks among the three sub-tasks at runtime
  cndTsk.setResolver(new ConditionalResolverMergeFiles());
  ConditionalResolverMergeFilesCtx mrCtx =
      new ConditionalResolverMergeFilesCtx(listTasks, condInputPath.toString());
  cndTsk.setResolverCtx(mrCtx);
  // make the conditional task a child of the current leaf task
  currTask.addDependentTask(cndTsk);
  if (shouldMergeMovePaths) {
    // If a new MoveWork was created, link all dependent tasks from the original MoveWork to it.
    if (moveTaskToLink.getDependentTasks() != null) {
      for (Task dependentTask : moveTaskToLink.getDependentTasks()) {
        moveOnlyMoveTask.addDependentTask(dependentTask);
      }
    }
  } else {
    addDependentMoveTasks(moveTaskToLink, conf, moveOnlyMoveTask, dependencyTask);
  }
  addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyMergeTask, dependencyTask);
  addDependentMoveTasks(moveTaskToLink, conf, mergeAndMoveMoveTask, dependencyTask);
  return cndTsk;
}
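At runtime, ConditionalResolverMergeFiles chooses among the three listTasks entries based on the average size of the files under condInputPath, driven by the real Hive setting hive.merge.smallfiles.avgsize. The following standalone sketch mirrors that three-way decision; the class, method, and the someButNotAllPartitionsSmall flag are hypothetical simplifications, not the resolver's actual code.

import org.apache.hadoop.conf.Configuration;

// A minimal sketch of the runtime choice among the three conditional sub-tasks.
public final class MergeDecisionSketch {

  enum Choice { MOVE_ONLY, MERGE_ONLY, MERGE_AND_MOVE }

  static Choice decide(Configuration conf, long totalSize, int numFiles,
      boolean someButNotAllPartitionsSmall) {
    // 16000000L is the documented default of hive.merge.smallfiles.avgsize
    long avgSizeThreshold = conf.getLong("hive.merge.smallfiles.avgsize", 16000000L);
    long avgSize = (numFiles == 0) ? 0 : totalSize / numFiles;
    if (avgSize >= avgSizeThreshold) {
      return Choice.MOVE_ONLY;      // 2) files are big enough: just move them
    }
    if (someButNotAllPartitionsSmall) {
      return Choice.MERGE_AND_MOVE; // 3) merge the small partitions first, then move
    }
    return Choice.MERGE_ONLY;       // 1) everything is small: merge it all
  }
}

This also explains the NOTE in createCondTask: in option 3 the merge task must be the parent of the move task so that the merge completes before the move runs.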