Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class GenMapRedUtils, method findMoveTask.
public static Task<MoveWork> findMoveTask(List<Task<MoveWork>> mvTasks, FileSinkOperator fsOp) {
  // find the move task
  for (Task<MoveWork> mvTsk : mvTasks) {
    MoveWork mvWork = mvTsk.getWork();
    Path srcDir = null;
    if (mvWork.getLoadFileWork() != null) {
      srcDir = mvWork.getLoadFileWork().getSourcePath();
    } else if (mvWork.getLoadTableWork() != null) {
      srcDir = mvWork.getLoadTableWork().getSourcePath();
    }
    if ((srcDir != null) && (srcDir.equals(fsOp.getConf().getFinalDirName()))) {
      return mvTsk;
    }
  }
  return null;
}
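A minimal call-site sketch, assuming a populated move-task list from the parse context; mvTasks, fsOp, and currTask are hypothetical placeholders, not names from the snippet above:

// Hypothetical call site: locate the move task that consumes this
// FileSinkOperator's output directory; null means no move task reads that path.
Task<MoveWork> mvTask = GenMapRedUtils.findMoveTask(mvTasks, fsOp);
if (mvTask != null) {
  // e.g. chain it after the task that produces the directory
  currTask.addDependentTask(mvTask);
}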
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class GenMapRedUtils, method mergeMovePaths.
/**
 * Merges the given conditional input path and the linked MoveWork into a single MoveWork.
 * This is an optimization for blob store systems, avoiding two renames or copies where one suffices.
 *
 * @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
 * @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
 * @return A new MoveWork that has the conditional input path as source and the linkedMoveWork as target.
 */
@VisibleForTesting
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork) {
  MoveWork newWork = new MoveWork(linkedMoveWork);
  LoadFileDesc fileDesc = null;
  LoadTableDesc tableDesc = null;
  if (linkedMoveWork.getLoadFileWork() != null) {
    fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
    fileDesc.setSourcePath(condInputPath);
  } else if (linkedMoveWork.getLoadTableWork() != null) {
    tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
    tableDesc.setSourcePath(condInputPath);
  } else {
    throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
  }
  newWork.setLoadFileWork(fileDesc);
  newWork.setLoadTableWork(tableDesc);
  return newWork;
}
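A hedged sketch of what the merge produces; the method is protected and @VisibleForTesting, so this would only compile from the same package or a test. The staging path and linkedMoveTask are illustrative placeholders:

// Illustrative: fold the conditional input path into the linked MoveWork so a
// blob store performs one rename from condInputPath to the final target instead of two.
Path condInputPath = new Path("/tmp/hive-staging/-ext-10000");
MoveWork merged = GenMapRedUtils.mergeMovePaths(condInputPath, linkedMoveTask.getWork());
// Assuming the linked work carried a LoadFileDesc, its source now points at condInputPath.
assert merged.getLoadFileWork().getSourcePath().equals(condInputPath);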
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class GenMapRedUtils, method createCondTask.
/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf
 *          HiveConf
 * @param currTask
 *          current leaf task
 * @param dummyMoveWork
 *          MoveWork for the move task
 * @param mergeWork
 *          MapredWork for the merge task
 * @param condInputPath
 *          the input directory of the merge/move task
 * @param condOutputPath
 *          the output directory of the merge/move task
 * @param moveTaskToLink
 *          a MoveTask that may be linked to the conditional sub-tasks
 * @param dependencyTask
 *          a dependency task that may be linked to the conditional sub-tasks
 * @return The conditional task
 */
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask,
    MoveWork dummyMoveWork, Serializable mergeWork, Path condInputPath, Path condOutputPath,
    Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask) {
  boolean shouldMergeMovePaths = (moveTaskToLink != null && dependencyTask == null
      && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork()));
  MoveWork workForMoveOnlyTask;
  if (shouldMergeMovePaths) {
    workForMoveOnlyTask = mergeMovePaths(condInputPath, moveTaskToLink.getWork());
  } else {
    workForMoveOnlyTask = dummyMoveWork;
  }
  // There are 3 options for this ConditionalTask:
  // 1) Merge the partitions
  // 2) Move the partitions (i.e. don't merge the partitions)
  // 3) Merge some partitions and move the others; in this case the merge is done
  //    first, followed by the move, to prevent conflicts.
  Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(workForMoveOnlyTask, conf);
  Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(dummyMoveWork, conf);
  // NOTE: the merge task must be the parent of the move task, and not the other
  // way around, for ConditionalTask's execute method to run them properly.
  mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);
  List<Serializable> listWorks = new ArrayList<Serializable>();
  listWorks.add(workForMoveOnlyTask);
  listWorks.add(mergeWork);
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  listTasks.add(moveOnlyMoveTask);
  listTasks.add(mergeOnlyMergeTask);
  listTasks.add(mergeAndMoveMergeTask);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
  cndTsk.setListTasks(listTasks);
  // create resolver
  cndTsk.setResolver(new ConditionalResolverMergeFiles());
  ConditionalResolverMergeFilesCtx mrCtx =
      new ConditionalResolverMergeFilesCtx(listTasks, condInputPath.toString());
  cndTsk.setResolverCtx(mrCtx);
  // make the conditional task the child of the current leaf task
  currTask.addDependentTask(cndTsk);
  if (shouldMergeMovePaths) {
    // If a new MoveWork was created, then we should link all dependent tasks from the MoveWork to link.
    if (moveTaskToLink.getDependentTasks() != null) {
      for (Task dependentTask : moveTaskToLink.getDependentTasks()) {
        moveOnlyMoveTask.addDependentTask(dependentTask);
      }
    }
  } else {
    addDependentMoveTasks(moveTaskToLink, conf, moveOnlyMoveTask, dependencyTask);
  }
  addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyMergeTask, dependencyTask);
  addDependentMoveTasks(moveTaskToLink, conf, mergeAndMoveMoveTask, dependencyTask);
  return cndTsk;
}
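A minimal sketch of the parent/child invariant noted in the method; mergeWork, moveWork, and conf are placeholder values, and only calls already used above are assumed:

// Illustrative: in the merge-and-move branch the merge task must be the parent,
// so ConditionalTask launches the merge first and the move only after it finishes.
Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf);
Task<? extends Serializable> moveTask = TaskFactory.get(moveWork, conf);
mergeTask.addDependentTask(moveTask); // correct order: merge, then move
// Reversing the dependency (moveTask.addDependentTask(mergeTask)) would break
// the execution order that ConditionalTask relies on.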
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class GenMapRedUtils, method findMoveTaskForFsopOutput.
public static Task<MoveWork> findMoveTaskForFsopOutput(List<Task<MoveWork>> mvTasks,
    Path fsopFinalDir, boolean isMmFsop) {
  // find the move task
  for (Task<MoveWork> mvTsk : mvTasks) {
    MoveWork mvWork = mvTsk.getWork();
    Path srcDir = null;
    boolean isLfd = false;
    if (mvWork.getLoadFileWork() != null) {
      srcDir = mvWork.getLoadFileWork().getSourcePath();
      isLfd = true;
      if (isMmFsop) {
        srcDir = srcDir.getParent();
      }
    } else if (mvWork.getLoadTableWork() != null) {
      srcDir = mvWork.getLoadTableWork().getSourcePath();
    }
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("Observing MoveWork " + System.identityHashCode(mvWork)
          + " with " + srcDir + "(from " + (isLfd ? "LFD" : "LTD") + ") while looking for "
          + fsopFinalDir + "(mm = " + isMmFsop + ")");
    }
    if ((srcDir != null) && srcDir.equals(fsopFinalDir)) {
      return mvTsk;
    }
  }
  return null;
}
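A hedged call-site sketch; mvTasks and the path are placeholders. The isMmFsop flag matters because for MM (insert-only ACID) tables the LoadFileDesc source sits one directory below the FileSinkOperator output, which is why the method compares srcDir.getParent():

// Illustrative: with isMmFsop=true the lookup strips the trailing subdirectory
// from the LoadFileDesc source before comparing it against fsopFinalDir.
Path fsopFinalDir = new Path("/warehouse/tbl/.hive-staging/-ext-10000");
Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsopFinalDir, true);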
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class LoadPartitions, method movePartitionTask.
/**
 * Creates the task that moves the partition data from the temp path to the actual path.
 */
private Task<?> movePartitionTask(Table table, AddPartitionDesc.OnePartitionDesc partSpec, Path tmpPath) {
  // Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
  // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
  LoadTableDesc loadTableWork = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table),
      partSpec.getPartSpec(),
      event.replicationSpec().isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING,
      SessionState.get().getTxnMgr().getCurrentTxnId());
  loadTableWork.setInheritTableSpecs(false);
  MoveWork work = new MoveWork(new HashSet<>(), new HashSet<>(), loadTableWork, null, false);
  return TaskFactory.get(work, context.hiveConf);
}
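A sketch of how the returned task might be chained during a REPL LOAD; copyTask, table, onePartitionDesc, and tmpPath are placeholders for whatever upstream task stages the partition data:

// Illustrative: stage the data into tmpPath first, then run the move task built
// above to publish it into the partition's final location.
Task<?> moveTask = movePartitionTask(table, onePartitionDesc, tmpPath);
copyTask.addDependentTask(moveTask);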