Use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.
The class SortMergeJoinTaskDispatcher, method processCurrentTask.
@Override
public Task<?> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask,
    Context context) throws SemanticException {
  // whether it contains a sort merge join operator
  MapredWork currWork = currTask.getWork();
  SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork);
  if (!isEligibleForOptimization(originalSMBJoinOp)) {
    return null;
  }
  currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN);
  // Convert the work containing the sort-merge join into a work as if it had a regular join.
  // Note that the operator tree is not changed - it still contains the SMB join, but the
  // plan is changed (aliasToWork etc.) to contain all the paths as if it were a regular join.
  // This is used to convert the plan to a map-join, and the original SMB join plan is kept
  // as a backup task.
  MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp);
  SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
  currWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
  currWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
  currWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
  currJoinWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
  currJoinWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
  currJoinWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
  // create conditional work list and task list
  List<Serializable> listWorks = new ArrayList<Serializable>();
  List<Task<?>> listTasks = new ArrayList<Task<?>>();
  // create task-to-aliases mapping and alias-to-input-file mapping for the resolver
  // Must be a deterministic-order map for consistent q-test output across Java versions
  HashMap<Task<?>, Set<String>> taskToAliases = new LinkedHashMap<Task<?>, Set<String>>();
  // Note that pathToAliases will behave as if the original plan was a join plan
  Map<Path, List<String>> pathToAliases = currJoinWork.getMapWork().getPathToAliases();
  // generate a map join task for the big table
  SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
  Byte[] order = originalSMBJoinDesc.getTagOrder();
  int numAliases = order.length;
  Set<Integer> bigTableCandidates =
      MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
  HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
  Configuration conf = context.getConf();
  try {
    long aliasTotalKnownInputSize =
        getTotalKnownInputSize(context, currJoinWork.getMapWork(), pathToAliases, aliasToSize);
    long ThresholdOfSmallTblSizeSum =
        HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
    for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) {
      // this table cannot be the big table
      if (!bigTableCandidates.contains(bigTablePosition)) {
        continue;
      }
      // create a map join task for the given big table position
      MapRedTask newTask = convertSMBTaskToMapJoinTask(currJoinWork, bigTablePosition, newSMBJoinOp);
      MapWork mapWork = newTask.getWork().getMapWork();
      Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
      Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);
      long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
      if (aliasKnownSize > 0) {
        long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
        if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
          // this table is not a good candidate for the big table
          continue;
        }
      }
      // add to the conditional task
      listWorks.add(newTask.getWork());
      listTasks.add(newTask);
      newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
      newTask.setFetchSource(currTask.isFetchSource());
      // set up the backup task
      newTask.setBackupTask(currTask);
      newTask.setBackupChildrenTasks(currTask.getChildTasks());
      // record the task-to-aliases mapping
      taskToAliases.put(newTask, aliases);
    }
  } catch (Exception e) {
    throw new SemanticException("Generate Map Join Task Error", e);
  }
  // insert the current common join task into the conditional task
  listWorks.add(currTask.getWork());
  listTasks.add(currTask);
  // clear JoinTree and OP Parse Context
  currWork.getMapWork().setLeftInputJoin(false);
  currWork.getMapWork().setBaseSrc(null);
  currWork.getMapWork().setMapAliases(null);
  // create the conditional task and insert it into the task tree
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  // set resolver and resolver context
  cndTsk.setResolver(new ConditionalResolverCommonJoin());
  ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
  resolverCtx.setPathToAliases(pathToAliases);
  resolverCtx.setAliasToKnownSize(aliasToSize);
  resolverCtx.setTaskToAliases(taskToAliases);
  resolverCtx.setCommonJoinTask(currTask);
  resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
  resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
  cndTsk.setResolverCtx(resolverCtx);
  // replace the current task with the newly generated conditional task
  replaceTaskWithConditionalTask(currTask, cndTsk);
  return cndTsk;
}
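The net effect is a single ConditionalTask whose branches are the candidate map-join tasks plus the original join as the final fallback; ConditionalResolverCommonJoin picks one branch at execution time from the known input sizes. A minimal sketch of inspecting the result follows; the getTaskTag() accessor is an assumption mirroring the setTaskTag(...) calls above:
  // Sketch only: walk the branches of the ConditionalTask built above (cndTsk).
  // getTaskTag() is assumed to mirror the setTaskTag(...) calls in the method.
  for (Task<?> branch : cndTsk.getListTasks()) {
    if (branch.getTaskTag() == Task.CONVERTED_MAPJOIN) {
      // A map-join candidate: if it fails at runtime, execution falls back to
      // its backup task (the original sort-merge join set via setBackupTask).
    } else {
      // The original common join task, kept as the last branch.
    }
  }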
Use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.
The class TezCompiler, method setInputFormat.
@Override
protected void setInputFormat(Task<?> task) {
  if (task instanceof TezTask) {
    TezWork work = ((TezTask) task).getWork();
    List<BaseWork> all = work.getAllWork();
    for (BaseWork w : all) {
      if (w instanceof MergeJoinWork) {
        MergeJoinWork mj = (MergeJoinWork) w;
        setInputFormatForMapWork(mj.getMainWork());
        for (BaseWork bw : mj.getBaseWorkList()) {
          setInputFormatForMapWork(bw);
        }
      } else {
        setInputFormatForMapWork(w);
      }
    }
  } else if (task instanceof ConditionalTask) {
    List<Task<?>> listTasks = ((ConditionalTask) task).getListTasks();
    for (Task<?> tsk : listTasks) {
      setInputFormat(tsk);
    }
  }
  if (task.getChildTasks() != null) {
    for (Task<?> childTask : task.getChildTasks()) {
      setInputFormat(childTask);
    }
  }
}
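Note the traversal idiom here: the branches of a ConditionalTask are reachable only through getListTasks(), not through getChildTasks(), so both collections must be walked. A minimal generic visitor built on the same two accessors (the collectTasks name and the visited-set deduplication are our additions):
  // Generic task-tree walk using the same accessors as setInputFormat above.
  // Assumes java.util.Set; collectTasks is a hypothetical helper name.
  private void collectTasks(Task<?> task, Set<Task<?>> visited) {
    if (!visited.add(task)) {
      return; // task DAGs can share children; visit each task once
    }
    if (task instanceof ConditionalTask) {
      // conditional branches are not children - walk them explicitly
      for (Task<?> branch : ((ConditionalTask) task).getListTasks()) {
        collectTasks(branch, visited);
      }
    }
    if (task.getChildTasks() != null) {
      for (Task<?> child : task.getChildTasks()) {
        collectTasks(child, visited);
      }
    }
  }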
Use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.
The class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsNotOptimized.
@Test
public void testConditionalMoveTaskIsNotOptimized() throws SemanticException {
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
  Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
  FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
  Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
  Path tableLocation = new Path("s3a://bucket/warehouse/table");
  Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
  List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
  GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null,
      moveTaskList, hiveConf, dummyMRTask, new LineageState());
  ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
  Task<?> moveOnlyTask = conditionalTask.getListTasks().get(0);
  Task<?> mergeOnlyTask = conditionalTask.getListTasks().get(1);
  Task<?> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
  // Verify moveOnlyTask is NOT optimized
  assertEquals(1, moveOnlyTask.getChildTasks().size());
  verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName);
  verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeOnlyTask is NOT optimized
  assertEquals(1, mergeOnlyTask.getChildTasks().size());
  verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeAndMoveTask is NOT optimized
  assertEquals(1, mergeAndMoveTask.getChildTasks().size());
  assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0),
      finalDirName, tableLocation);
}
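The verifyMoveTask helper is private to this test class and not shown in the excerpt; a plausible sketch, assuming MoveWork exposes its file-move descriptor through getLoadFileWork() with getSourcePath() and getTargetDir() accessors:
  // Plausible sketch of the verifyMoveTask helper (not shown above).
  // getLoadFileWork(), getSourcePath() and getTargetDir() are assumptions.
  private void verifyMoveTask(Task<?> task, Path source, Path target) {
    MoveWork moveWork = (MoveWork) task.getWork();
    assertEquals(source, moveWork.getLoadFileWork().getSourcePath());
    assertEquals(target, moveWork.getLoadFileWork().getTargetDir());
  }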
Use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.
The class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsOptimized.
@Test
public void testConditionalMoveTaskIsOptimized() throws SemanticException {
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
  Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
  FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
  Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
  Path tableLocation = new Path("s3a://bucket/warehouse/table");
  Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
  List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
  GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null,
      moveTaskList, hiveConf, dummyMRTask, new LineageState());
  ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
  Task<?> moveOnlyTask = conditionalTask.getListTasks().get(0);
  Task<?> mergeOnlyTask = conditionalTask.getListTasks().get(1);
  Task<?> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
  /*
   * OPTIMIZATION
   * The ConditionalTask avoids linking two MoveTasks, which are expensive on blob storage
   * systems. Instead of linking them, it creates one MoveTask whose source is the first
   * MoveTask's source and whose target is the second MoveTask's target.
   */
  // Verify moveOnlyTask is optimized
  assertNull(moveOnlyTask.getChildTasks());
  verifyMoveTask(moveOnlyTask, sinkDirName, tableLocation);
  // Verify mergeOnlyTask is NOT optimized (a merge task writes directly to finalDirName,
  // then a MoveTask is executed)
  assertEquals(1, mergeOnlyTask.getChildTasks().size());
  verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeAndMoveTask is NOT optimized
  assertEquals(1, mergeAndMoveTask.getChildTasks().size());
  assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0),
      finalDirName, tableLocation);
}
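Read together, the two tests differ in exactly one input, the blobstore-optimization flag, and only the move-only branch changes shape:
  // The single switch separating the two tests above:
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
  // false: moveOnlyTask chains two moves, sinkDirName -> finalDirName -> tableLocation
  // true:  moveOnlyTask collapses them into one move, sinkDirName -> tableLocation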
Use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.
The class MapReduceCompiler, method setInputFormat.
// loop over all the tasks recursively
@Override
protected void setInputFormat(Task<?> task) {
  if (task instanceof ExecDriver) {
    MapWork work = ((MapredWork) task.getWork()).getMapWork();
    Map<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
    if (!opMap.isEmpty()) {
      for (Operator<? extends OperatorDesc> op : opMap.values()) {
        setInputFormat(work, op);
      }
    }
  } else if (task instanceof ConditionalTask) {
    List<Task<?>> listTasks = ((ConditionalTask) task).getListTasks();
    for (Task<?> tsk : listTasks) {
      setInputFormat(tsk);
    }
  }
  if (task.getChildTasks() != null) {
    for (Task<?> childTask : task.getChildTasks()) {
      setInputFormat(childTask);
    }
  }
}