Use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
From the class CrossProductHandler, method dispatch.
@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
  @SuppressWarnings("unchecked")
  Task<?> currTask = (Task<?>) nd;
  if (currTask instanceof MapRedTask) {
    MapRedTask mrTsk = (MapRedTask) currTask;
    MapredWork mrWrk = mrTsk.getWork();
    checkMapJoins(mrTsk);
    checkMRReducer(currTask.toString(), mrWrk);
  } else if (currTask instanceof ConditionalTask) {
    List<Task<?>> taskListInConditionalTask = ((ConditionalTask) currTask).getListTasks();
    for (Task<?> tsk : taskListInConditionalTask) {
      dispatch(tsk, stack, nodeOutputs);
    }
  } else if (currTask instanceof TezTask) {
    TezTask tezTask = (TezTask) currTask;
    TezWork tezWork = tezTask.getWork();
    checkMapJoins(tezWork);
    checkTezReducer(tezWork);
  }
  return null;
}
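The dispatcher above branches on the concrete task type and recurses into each branch of a ConditionalTask, so nested MapReduce and Tez tasks are all inspected. Below is a minimal, self-contained sketch of that dispatch-and-recurse pattern; the Node, Task, ConditionalTask, and MapRedTask types here are hypothetical stand-ins, not Hive's classes.

import java.util.List;

// Hypothetical stand-ins for illustration; not Hive's actual classes.
interface Node {}
class Task implements Node {}
class MapRedTask extends Task {}
class ConditionalTask extends Task {
  private final List<Task> branches;
  ConditionalTask(List<Task> branches) { this.branches = branches; }
  List<Task> getListTasks() { return branches; }
}

public class DispatchSketch {
  // Walk a task node, recursing through conditional branches, and
  // apply a type-specific check to each concrete task.
  static void dispatch(Node nd) {
    if (nd instanceof MapRedTask) {
      System.out.println("checking MapRedTask");
    } else if (nd instanceof ConditionalTask) {
      for (Task t : ((ConditionalTask) nd).getListTasks()) {
        dispatch(t); // recurse so nested tasks are also checked
      }
    }
  }

  public static void main(String[] args) {
    dispatch(new ConditionalTask(List.<Task>of(new MapRedTask(), new MapRedTask())));
  }
}

Recursing through the branches mirrors the original code's handling of ConditionalTask: each candidate task must be checked at planning time even though only one branch will run.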
Use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
From the class CommonJoinTaskDispatcher, method processCurrentTask.
@Override
public Task<?> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
  // check whether the task contains a common join op; if it does, return that common join op
  JoinOperator joinOp = getJoinOp(currTask);
  if (joinOp == null || joinOp.getConf().isFixedAsSorted()) {
    return null;
  }
  currTask.setTaskTag(Task.COMMON_JOIN);
  MapWork currWork = currTask.getWork().getMapWork();
  // create conditional work list and task list
  List<Serializable> listWorks = new ArrayList<Serializable>();
  List<Task<?>> listTasks = new ArrayList<Task<?>>();
  // create task-to-aliases mapping and alias-to-input-file mapping for the resolver.
  // Must be a deterministic-order map for consistent q-test output across Java versions.
  HashMap<Task<?>, Set<String>> taskToAliases = new LinkedHashMap<Task<?>, Set<String>>();
  Map<Path, List<String>> pathToAliases = currWork.getPathToAliases();
  Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();
  // start to generate multiple map join tasks
  JoinDesc joinDesc = joinOp.getConf();
  if (aliasToSize == null) {
    aliasToSize = new HashMap<String, Long>();
  }
  try {
    long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);
    Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
    // no table could be the big table; there is no need to convert
    if (bigTableCandidates.isEmpty()) {
      return null;
    }
    // if any of the bigTableCandidates is multi-sourced, bigTableCandidates should
    // contain only multi-sourced aliases, because a multi-sourced alias cannot be
    // hashed or read directly
    bigTableCandidates = multiInsertBigTableCheck(joinOp, bigTableCandidates);
    Configuration conf = context.getConf();
    // If the sizes of at least n-1 tables in an n-way join are known and their sum is smaller
    // than the threshold size, convert the join into a map-join and don't create a conditional task
    boolean convertJoinMapJoin = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
    int bigTablePosition = -1;
    if (convertJoinMapJoin) {
      // This is the threshold that the user has specified to fit in mapjoin
      long mapJoinSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
      Long bigTableSize = null;
      Set<String> aliases = aliasToWork.keySet();
      for (int tablePosition : bigTableCandidates) {
        Operator<?> parent = joinOp.getParentOperators().get(tablePosition);
        Set<String> participants = GenMapRedUtils.findAliases(currWork, parent);
        long sumOfOthers = Utilities.sumOfExcept(aliasToSize, aliases, participants);
        if (sumOfOthers < 0 || sumOfOthers > mapJoinSize) {
          // some small alias is not known or too big
          continue;
        }
        if (bigTableSize == null && bigTablePosition >= 0 && tablePosition < bigTablePosition) {
          // prefer right most alias
          continue;
        }
        long aliasSize = Utilities.sumOf(aliasToSize, participants);
        if (bigTableSize == null || bigTableSize < 0 || (aliasSize >= 0 && aliasSize >= bigTableSize)) {
          bigTablePosition = tablePosition;
          bigTableSize = aliasSize;
        }
      }
    }
    currWork.setLeftInputJoin(joinOp.getConf().isLeftInputJoin());
    currWork.setBaseSrc(joinOp.getConf().getBaseSrc());
    currWork.setMapAliases(joinOp.getConf().getMapAliases());
    if (bigTablePosition >= 0) {
      // create a map join task with the big table at bigTablePosition
      MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);
      newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
      newTask.setFetchSource(currTask.isFetchSource());
      replaceTask(currTask, newTask);
      // joined with multiple small tables on different keys
      if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
        mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
      }
      return newTask;
    }
    long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
    for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
      // this table cannot be the big table
      if (!bigTableCandidates.contains(pos)) {
        continue;
      }
      Operator<?> startOp = joinOp.getParentOperators().get(pos);
      Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);
      long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
      if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
        continue;
      }
      MapredWork newWork = SerializationUtilities.clonePlan(currTask.getWork());
      // create a map join task with the big table at position pos
      MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);
      // add into conditional task
      listWorks.add(newTask.getWork());
      listTasks.add(newTask);
      newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
      newTask.setFetchSource(currTask.isFetchSource());
      // set up backup task
      newTask.setBackupTask(currTask);
      newTask.setBackupChildrenTasks(currTask.getChildTasks());
      // record the task-to-aliases mapping for the resolver
      taskToAliases.put(newTask, aliases);
    }
  } catch (Exception e) {
    throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
  }
  if (listTasks.isEmpty()) {
    return currTask;
  }
  // insert the current common join task into the conditional task
  listWorks.add(currTask.getWork());
  listTasks.add(currTask);
  // clear JoinTree and OP Parse Context
  currWork.setLeftInputJoin(false);
  currWork.setBaseSrc(null);
  currWork.setMapAliases(null);
  // create conditional task and insert conditional task into task tree
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  // set resolver and resolver context
  cndTsk.setResolver(new ConditionalResolverCommonJoin());
  ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
  resolverCtx.setPathToAliases(pathToAliases);
  resolverCtx.setAliasToKnownSize(aliasToSize);
  resolverCtx.setTaskToAliases(taskToAliases);
  resolverCtx.setCommonJoinTask(currTask);
  resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
  resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
  cndTsk.setResolverCtx(resolverCtx);
  // replace the current task with the newly generated conditional task
  replaceTaskWithConditionalTask(currTask, cndTsk);
  return cndTsk;
}
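The candidate loop above implements the big-table pick for the no-conditional-task conversion: a join position qualifies only when the summed sizes of all the other aliases are known and fit under the threshold, and among qualifying positions the largest alias wins, with ties going to the rightmost one. Here is a self-contained sketch of that heuristic under simplified assumptions; selectBigTable and its array-based sizes are hypothetical stand-ins for Hive's Utilities.sumOf/sumOfExcept over alias maps.

public class BigTableSelectionSketch {
  // Hypothetical stand-in for Hive's candidate scan: returns the position of
  // the alias to keep streamed ("big"), or -1 if no candidate qualifies.
  // aliasToSize holds each join input's known size; -1 means unknown.
  static int selectBigTable(long[] aliasToSize, long threshold) {
    int bigTablePosition = -1;
    long bigTableSize = -1;
    for (int pos = 0; pos < aliasToSize.length; pos++) {
      // Sum the sizes of every other alias; an unknown size poisons the sum.
      long sumOfOthers = 0;
      for (int other = 0; other < aliasToSize.length; other++) {
        if (other == pos) continue;
        if (aliasToSize[other] < 0) { sumOfOthers = -1; break; }
        sumOfOthers += aliasToSize[other];
      }
      if (sumOfOthers < 0 || sumOfOthers > threshold) {
        continue; // some small alias is unknown, or the rest is too big to hash
      }
      // Prefer the largest candidate; >= keeps the rightmost one on ties.
      if (aliasToSize[pos] >= bigTableSize) {
        bigTablePosition = pos;
        bigTableSize = aliasToSize[pos];
      }
    }
    return bigTablePosition;
  }

  public static void main(String[] args) {
    // Three-way join: sizes 10 MB, 900 MB, 12 MB; threshold 25 MB.
    long[] sizes = {10_000_000L, 900_000_000L, 12_000_000L};
    System.out.println(selectBigTable(sizes, 25_000_000L)); // prints 1
  }
}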
Use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
From the class CommonJoinTaskDispatcher, method mergeMapJoinTaskIntoItsChildMapRedTask.
/*
 * A task and its child task have been converted from join to mapjoin.
 * See if the two tasks can be merged.
 */
private void mergeMapJoinTaskIntoItsChildMapRedTask(MapRedTask mapJoinTask, Configuration conf) throws SemanticException {
  // Step 1: Check if mapJoinTask has a single child.
  // If so, check if we can merge mapJoinTask into that child.
  if (mapJoinTask.getChildTasks() == null || mapJoinTask.getChildTasks().size() > 1) {
    // No child task, or more than one child task, in which case we don't want to do anything.
    return;
  }
  Task<?> childTask = mapJoinTask.getChildTasks().get(0);
  if (!(childTask instanceof MapRedTask)) {
    // Nothing to do if it is not a MapReduce task.
    return;
  }
  MapRedTask childMapRedTask = (MapRedTask) childTask;
  MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
  MapWork childMapWork = childMapRedTask.getWork().getMapWork();
  Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork = mapJoinMapWork.getAliasToWork();
  if (mapJoinAliasToWork.size() > 1) {
    // Do not merge if the MapredWork of MapJoin has multiple input aliases.
    return;
  }
  Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry = mapJoinAliasToWork.entrySet().iterator().next();
  String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
  TableScanOperator mapJoinTaskTableScanOperator = OperatorUtils.findSingleOperator(mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
  if (mapJoinTaskTableScanOperator == null) {
    throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + mapJoinAlias + ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
  }
  Set<FileSinkOperator> mapJoinTaskFileSinkOperators = OperatorUtils.findOperators(mapJoinTaskTableScanOperator, FileSinkOperator.class);
  if (mapJoinTaskFileSinkOperators.isEmpty()) {
    throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() + " operator at the last operator of the MapJoin Task.");
  }
  if (mapJoinTaskFileSinkOperators.size() > 1) {
    LOG.warn("Multiple " + FileSinkOperator.getOperatorName() + " operators found at the last operator of the MapJoin Task.");
    return;
  }
  // The mapJoinTaskFileSinkOperator writes to a different directory
  FileSinkOperator mapJoinTaskFileSinkOperator = mapJoinTaskFileSinkOperators.iterator().next();
  Path childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
  List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
  if (childMRAliases == null || childMRAliases.size() != 1) {
    return;
  }
  String childMRAlias = childMRAliases.get(0);
  // Sanity check to make sure there is no alias conflict after merge.
  for (Entry<Path, List<String>> entry : childMapWork.getPathToAliases().entrySet()) {
    Path path = entry.getKey();
    List<String> aliases = entry.getValue();
    if (path.equals(childMRPath)) {
      continue;
    }
    if (aliases.contains(mapJoinAlias)) {
      // An alias conflict should not happen here.
      return;
    }
  }
  MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
  MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();
  if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) || (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
    // Right now, we do not handle the case where either side is bucketed.
    // We should relax this constraint with a follow-up jira.
    return;
  }
  // Only merge if the total size of the local tables after the merge is under the limit.
  if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)) {
    // Do not merge.
    return;
  }
  TableScanOperator childMRTaskTableScanOperator = OperatorUtils.findSingleOperator(childMapWork.getAliasToWork().get(childMRAlias.toString()), TableScanOperator.class);
  if (childMRTaskTableScanOperator == null) {
    throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + childMRAlias + ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
  }
  List<Operator<? extends OperatorDesc>> parentsInMapJoinTask = mapJoinTaskFileSinkOperator.getParentOperators();
  List<Operator<? extends OperatorDesc>> childrenInChildMRTask = childMRTaskTableScanOperator.getChildOperators();
  if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
    // Do not merge if we do not know how to connect two operator trees.
    return;
  }
  // Step 2: Merge mapJoinTask into the Map-side of its child.
  // Step 2.1: Connect the operator trees of the two MapRedTasks.
  Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
  Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
  parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
  childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);
  // Step 2.2: Replace the corresponding part of childMRWork's MapWork.
  GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias.toString(), mapJoinMapWork, childMapWork);
  // Step 2.3: Fill up stuff in the local work
  if (mapJoinLocalWork != null) {
    if (childLocalWork == null) {
      childMapWork.setMapRedLocalWork(mapJoinLocalWork);
    } else {
      childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
      childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
    }
  }
  // Step 2.4: Remove this MapJoin task
  List<Task<?>> parentTasks = mapJoinTask.getParentTasks();
  mapJoinTask.setParentTasks(null);
  mapJoinTask.setChildTasks(null);
  childMapRedTask.getParentTasks().remove(mapJoinTask);
  if (parentTasks != null) {
    childMapRedTask.getParentTasks().addAll(parentTasks);
    for (Task<?> parentTask : parentTasks) {
      parentTask.getChildTasks().remove(mapJoinTask);
      if (!parentTask.getChildTasks().contains(childMapRedTask)) {
        parentTask.getChildTasks().add(childMapRedTask);
      }
    }
  } else {
    if (physicalContext.getRootTasks().contains(mapJoinTask)) {
      physicalContext.removeFromRootTask(mapJoinTask);
      if (childMapRedTask.getParentTasks() != null && childMapRedTask.getParentTasks().size() == 0 && !physicalContext.getRootTasks().contains(childMapRedTask)) {
        physicalContext.addToRootTask(childMapRedTask);
      }
    }
  }
  if (childMapRedTask.getParentTasks().size() == 0) {
    childMapRedTask.setParentTasks(null);
  }
}
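Step 2.4 above is a generic DAG splice: the merged map-join task is dropped and its parents are wired straight to the child task, with the child promoted to a root task when the merged task was itself a root. A minimal, self-contained sketch of that rewiring follows; TaskNode and splice are hypothetical illustrations, not Hive's Task API.

import java.util.ArrayList;
import java.util.List;

// Hypothetical minimal task node for illustration; not Hive's Task class.
class TaskNode {
  final String name;
  final List<TaskNode> parents = new ArrayList<>();
  final List<TaskNode> children = new ArrayList<>();
  TaskNode(String name) { this.name = name; }
}

public class SpliceSketch {
  // Remove 'merged' from the DAG, connecting each of its parents
  // directly to its single child 'child'.
  static void splice(TaskNode merged, TaskNode child, List<TaskNode> rootTasks) {
    child.parents.remove(merged);
    for (TaskNode parent : merged.parents) {
      parent.children.remove(merged);
      if (!parent.children.contains(child)) {
        parent.children.add(child);
      }
      if (!child.parents.contains(parent)) {
        child.parents.add(parent);
      }
    }
    // If the merged task was a root and the child has no other parents,
    // the child becomes a root.
    if (rootTasks.remove(merged) && child.parents.isEmpty()) {
      rootTasks.add(child);
    }
    merged.parents.clear();
    merged.children.clear();
  }

  public static void main(String[] args) {
    TaskNode a = new TaskNode("a"), b = new TaskNode("b"), c = new TaskNode("c");
    a.children.add(b); b.parents.add(a);
    b.children.add(c); c.parents.add(b);
    List<TaskNode> roots = new ArrayList<>(List.of(a));
    splice(b, c, roots);
    System.out.println(a.children.get(0).name); // prints "c"
  }
}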
Use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
From the class TestExecDriver, method executePlan.
private void executePlan() throws Exception {
  // Pull the calling test's name off the stack trace for log messages.
  String testName = new Exception().getStackTrace()[1].getMethodName();
  MapRedTask mrtask = new MapRedTask();
  TaskQueue taskQueue = new TaskQueue();
  mrtask.setWork(mr);
  mrtask.initialize(queryState, null, taskQueue, null);
  int exitVal = mrtask.execute();
  if (exitVal != 0) {
    LOG.error(testName + " execution failed with exit status: " + exitVal);
    // Force the test to fail.
    assertEquals(true, false);
  }
  LOG.info(testName + " execution completed successfully");
}
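The first line of executePlan reads the calling test's name from the stack trace: element 0 is the frame that created the exception (executePlan itself) and element 1 is its caller, so every test gets labeled logs without passing its own name around. A standalone demonstration of the same trick:

public class CallerNameDemo {
  static void runStep() {
    // Element 0 is runStep itself; element 1 is whoever called it.
    String caller = new Exception().getStackTrace()[1].getMethodName();
    System.out.println("invoked from: " + caller);
  }

  public static void main(String[] args) {
    runStep(); // prints "invoked from: main"
  }
}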
Use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
From the class GenMapRedUtils, method joinPlan.
/**
 * Merge the current task into the old task for the reducer
 *
 * @param currTask
 *          the current task for the current reducer
 * @param oldTask
 *          the old task for the current reducer
 * @param opProcCtx
 *          processing context
 */
public static void joinPlan(Task<?> currTask, Task<?> oldTask, GenMRProcContext opProcCtx) throws SemanticException {
  assert currTask != null && oldTask != null;
  TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
  List<Task<?>> parTasks = null;
  // terminate the old task and make the current task dependent on it
  if (currTask.getParentTasks() != null && !currTask.getParentTasks().isEmpty()) {
    parTasks = new ArrayList<Task<?>>();
    parTasks.addAll(currTask.getParentTasks());
    Object[] parTaskArr = parTasks.toArray();
    for (Object element : parTaskArr) {
      ((Task<?>) element).removeDependentTask(currTask);
    }
  }
  if (currTopOp != null) {
    mergeInput(currTopOp, opProcCtx, oldTask, false);
  }
  if (parTasks != null) {
    for (Task<?> parTask : parTasks) {
      parTask.addDependentTask(oldTask);
    }
  }
  if (oldTask instanceof MapRedTask && currTask instanceof MapRedTask) {
    ((MapRedTask) currTask).getWork().getMapWork().mergingInto(((MapRedTask) oldTask).getWork().getMapWork());
  }
  opProcCtx.setCurrTopOp(null);
  opProcCtx.setCurrTask(oldTask);
}
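joinPlan rewires dependencies in two passes: it first detaches currTask from all of its parents via removeDependentTask, then reattaches those parents to oldTask via addDependentTask, so the old reducer task takes the current task's place in the DAG. A self-contained sketch of that two-pass redirection is below; SimpleTask and its helpers are hypothetical stand-ins modeled on the Task methods used above.

import java.util.ArrayList;
import java.util.List;

// Hypothetical minimal task with Hive-like dependency helpers; not Hive's Task class.
class SimpleTask {
  final String name;
  final List<SimpleTask> parentTasks = new ArrayList<>();
  final List<SimpleTask> childTasks = new ArrayList<>();
  SimpleTask(String name) { this.name = name; }

  void addDependentTask(SimpleTask child) {
    if (!childTasks.contains(child)) {
      childTasks.add(child);
      child.parentTasks.add(this);
    }
  }

  void removeDependentTask(SimpleTask child) {
    childTasks.remove(child);
    child.parentTasks.remove(this);
  }
}

public class JoinPlanSketch {
  // Redirect every parent of currTask to oldTask, mirroring the
  // detach-then-reattach sequence in joinPlan above.
  static void redirectParents(SimpleTask currTask, SimpleTask oldTask) {
    List<SimpleTask> parents = new ArrayList<>(currTask.parentTasks);
    for (SimpleTask parent : parents) {
      parent.removeDependentTask(currTask);
    }
    for (SimpleTask parent : parents) {
      parent.addDependentTask(oldTask);
    }
  }

  public static void main(String[] args) {
    SimpleTask p = new SimpleTask("parent"), cur = new SimpleTask("current"), old = new SimpleTask("old");
    p.addDependentTask(cur);
    redirectParents(cur, old);
    System.out.println(p.childTasks.get(0).name); // prints "old"
  }
}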