use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class CommonJoinTaskDispatcher method processCurrentTask.
@Override
public Task<? extends Serializable> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
// whether it contains common join op; if contains, return this common join op
JoinOperator joinOp = getJoinOp(currTask);
if (joinOp == null || joinOp.getConf().isFixedAsSorted()) {
return null;
}
currTask.setTaskTag(Task.COMMON_JOIN);
MapWork currWork = currTask.getWork().getMapWork();
// create conditional work list and task list
List<Serializable> listWorks = new ArrayList<Serializable>();
List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
// create task to aliases mapping and alias to input file mapping for resolver
// Must be deterministic order map for consistent q-test output across Java versions
HashMap<Task<? extends Serializable>, Set<String>> taskToAliases = new LinkedHashMap<Task<? extends Serializable>, Set<String>>();
HashMap<Path, ArrayList<String>> pathToAliases = currWork.getPathToAliases();
Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();
// get parseCtx for this Join Operator
ParseContext parseCtx = physicalContext.getParseContext();
// start to generate multiple map join tasks
JoinDesc joinDesc = joinOp.getConf();
if (aliasToSize == null) {
aliasToSize = new HashMap<String, Long>();
}
try {
long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);
Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
// no table could be the big table; there is no need to convert
if (bigTableCandidates.isEmpty()) {
return null;
}
// if any of bigTableCandidates is from multi-sourced, bigTableCandidates should
// only contain multi-sourced because multi-sourced cannot be hashed or direct readable
bigTableCandidates = multiInsertBigTableCheck(joinOp, bigTableCandidates);
Configuration conf = context.getConf();
// If sizes of at least n-1 tables in a n-way join is known, and their sum is smaller than
// the threshold size, convert the join into map-join and don't create a conditional task
boolean convertJoinMapJoin = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
int bigTablePosition = -1;
if (convertJoinMapJoin) {
// This is the threshold that the user has specified to fit in mapjoin
long mapJoinSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
Long bigTableSize = null;
Set<String> aliases = aliasToWork.keySet();
for (int tablePosition : bigTableCandidates) {
Operator<?> parent = joinOp.getParentOperators().get(tablePosition);
Set<String> participants = GenMapRedUtils.findAliases(currWork, parent);
long sumOfOthers = Utilities.sumOfExcept(aliasToSize, aliases, participants);
if (sumOfOthers < 0 || sumOfOthers > mapJoinSize) {
// some small alias is not known or too big
continue;
}
if (bigTableSize == null && bigTablePosition >= 0 && tablePosition < bigTablePosition) {
// prefer right most alias
continue;
}
long aliasSize = Utilities.sumOf(aliasToSize, participants);
if (bigTableSize == null || bigTableSize < 0 || (aliasSize >= 0 && aliasSize >= bigTableSize)) {
bigTablePosition = tablePosition;
bigTableSize = aliasSize;
}
}
}
currWork.setLeftInputJoin(joinOp.getConf().isLeftInputJoin());
currWork.setBaseSrc(joinOp.getConf().getBaseSrc());
currWork.setMapAliases(joinOp.getConf().getMapAliases());
if (bigTablePosition >= 0) {
// create map join task and set big table as bigTablePosition
MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);
newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
newTask.setFetchSource(currTask.isFetchSource());
replaceTask(currTask, newTask);
// joined with multiple small tables on different keys
if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
}
return newTask;
}
long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
// this table cannot be big table
if (!bigTableCandidates.contains(pos)) {
continue;
}
Operator<?> startOp = joinOp.getParentOperators().get(pos);
Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);
long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
continue;
}
MapredWork newWork = SerializationUtilities.clonePlan(currTask.getWork());
// create map join task and set big table as i
MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);
// add into conditional task
listWorks.add(newTask.getWork());
listTasks.add(newTask);
newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
newTask.setFetchSource(currTask.isFetchSource());
// set up backup task
newTask.setBackupTask(currTask);
newTask.setBackupChildrenTasks(currTask.getChildTasks());
// put the mapping task to aliases
taskToAliases.put(newTask, aliases);
}
} catch (Exception e) {
throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
}
if (listTasks.isEmpty()) {
return currTask;
}
// insert current common join task to conditional task
listWorks.add(currTask.getWork());
listTasks.add(currTask);
// clear JoinTree and OP Parse Context
currWork.setLeftInputJoin(false);
currWork.setBaseSrc(null);
currWork.setMapAliases(null);
// create conditional task and insert conditional task into task tree
ConditionalWork cndWork = new ConditionalWork(listWorks);
ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
cndTsk.setListTasks(listTasks);
// set resolver and resolver context
cndTsk.setResolver(new ConditionalResolverCommonJoin());
ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
resolverCtx.setPathToAliases(pathToAliases);
resolverCtx.setAliasToKnownSize(aliasToSize);
resolverCtx.setTaskToAliases(taskToAliases);
resolverCtx.setCommonJoinTask(currTask);
resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
cndTsk.setResolverCtx(resolverCtx);
// replace the current task with the new generated conditional task
replaceTaskWithConditionalTask(currTask, cndTsk);
return cndTsk;
}
use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class SamplingOptimizer method resolve.
public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
for (Task<?> task : pctx.getRootTasks()) {
if (!(task instanceof MapRedTask) || !((MapRedTask) task).getWork().isFinalMapRed()) {
// this could be replaced by bucketing on RS + bucketed fetcher for next MR
continue;
}
MapredWork mrWork = ((MapRedTask) task).getWork();
MapWork mapWork = mrWork.getMapWork();
ReduceWork reduceWork = mrWork.getReduceWork();
if (reduceWork == null || reduceWork.getNumReduceTasks() != 1 || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0 || reduceWork.getReducer() == null) {
continue;
}
// GROUPBY operator in reducer may not be processed in parallel. Skip optimizing.
if (OperatorUtils.findSingleOperator(reduceWork.getReducer(), GroupByOperator.class) != null) {
continue;
}
Operator<?> operator = mapWork.getAliasToWork().values().iterator().next();
if (!(operator instanceof TableScanOperator)) {
continue;
}
ReduceSinkOperator child = OperatorUtils.findSingleOperator(operator, ReduceSinkOperator.class);
if (child == null || child.getConf().getNumReducers() != 1 || !child.getConf().getPartitionCols().isEmpty()) {
continue;
}
child.getConf().setNumReducers(-1);
reduceWork.setNumReduceTasks(-1);
mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
}
return pctx;
}
use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.
the class SortMergeJoinTaskDispatcher method convertSMBWorkToJoinWork.
/*
* Convert the work containing to sort-merge join into a work, as if it had a regular join.
* Note that the operator tree is not changed - is still contains the SMB join, but the
* plan is changed (aliasToWork etc.) to contain all the paths as if it was a regular join.
*/
private MapredWork convertSMBWorkToJoinWork(MapredWork currWork, SMBMapJoinOperator oldSMBJoinOp) throws SemanticException {
try {
// deep copy a new mapred work
MapredWork currJoinWork = SerializationUtilities.clonePlan(currWork);
SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
// change the newly created map-red plan as if it was a join operator
genSMBJoinWork(currJoinWork.getMapWork(), newSMBJoinOp);
return currJoinWork;
} catch (Exception e) {
e.printStackTrace();
throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
}
}
Aggregations