Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
Class SortMergeJoinTaskDispatcher, method getSMBMapJoinOp:
private SMBMapJoinOperator getSMBMapJoinOp(Operator<? extends OperatorDesc> currOp, Operator<? extends OperatorDesc> reducer) {
  SMBMapJoinOperator ret = null;
  while (true) {
    if (currOp instanceof SMBMapJoinOperator) {
      if (ret != null) {
        return null;
      }
      ret = (SMBMapJoinOperator) currOp;
    }
    // Does any operator in the tree stop the task from being converted to a conditional task
    if (!currOp.opAllowedBeforeSortMergeJoin()) {
      return null;
    }
    List<Operator<? extends OperatorDesc>> childOps = currOp.getChildOperators();
    if ((childOps == null) || (childOps.isEmpty())) {
      return reducerAllowedSMBJoinOp(reducer) ? ret : null;
    }
    // multi-table inserts not supported
    if (childOps.size() > 1) {
      return null;
    }
    currOp = childOps.get(0);
  }
}
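The walk only converts a strictly linear operator chain that contains exactly one SMBMapJoinOperator: a second SMB join, a disallowed operator, or a branching child list (multi-table insert) makes it return null. A minimal restatement of that pattern over a hypothetical Node type, not Hive's Operator API:

// Hypothetical single-child node; illustrates the "unique match on a linear chain" walk above.
final class Node {
  final boolean isSmbJoin;
  final java.util.List<Node> children;

  Node(boolean isSmbJoin, java.util.List<Node> children) {
    this.isSmbJoin = isSmbJoin;
    this.children = children;
  }

  // Returns the unique SMB node, or null if it is missing, duplicated, or the chain branches.
  static Node findUniqueSmbJoin(Node current) {
    Node found = null;
    while (true) {
      if (current.isSmbJoin) {
        if (found != null) {
          return null;
        }
        found = current;
      }
      java.util.List<Node> children = current.children;
      if (children == null || children.isEmpty()) {
        return found;
      }
      if (children.size() > 1) {
        return null;
      }
      current = children.get(0);
    }
  }
}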
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
Class SortMergeJoinTaskDispatcher, method convertSMBTaskToMapJoinTask:
// create map join task and set big table as bigTablePosition
private MapRedTask convertSMBTaskToMapJoinTask(MapredWork origWork, int bigTablePosition, SMBMapJoinOperator smbJoinOp) throws SemanticException {
  // deep copy a new mapred work
  MapredWork newWork = SerializationUtilities.clonePlan(origWork);
  // create a mapred task for this work
  MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork);
  // generate the map join operator; already checked the map join
  MapJoinOperator newMapJoinOp = getMapJoinOperator(newTask, newWork, smbJoinOp, bigTablePosition);
  // The reducer needs to be restored - Consider a query like:
  // select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
  // The reducer contains a groupby, which needs to be restored.
  ReduceWork rWork = newWork.getReduceWork();
  // create the local work for this plan
  MapJoinProcessor.genLocalWorkForMapJoin(newWork, newMapJoinOp, bigTablePosition);
  // restore the reducer
  newWork.setReduceWork(rWork);
  return newTask;
}
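The deep copy is what lets the original SMB plan survive unmodified as the backup task while the clone is rewritten into a map join. A generic sketch of that idea using plain java.io serialization; the PlanCloner helper below is hypothetical, and Hive's SerializationUtilities.clonePlan uses its own serialization machinery rather than this:

import java.io.*;

// Hypothetical helper: deep-copies a serializable plan so the original can be kept as a backup.
final class PlanCloner {
  @SuppressWarnings("unchecked")
  static <T extends Serializable> T deepCopy(T plan) throws IOException, ClassNotFoundException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(buffer)) {
      out.writeObject(plan);
    }
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray()))) {
      return (T) in.readObject();
    }
  }
}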
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
Class SortMergeJoinTaskDispatcher, method processCurrentTask:
@Override
public Task<?> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
  // whether the work contains a sort-merge join operator
  MapredWork currWork = currTask.getWork();
  SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork);
  if (!isEligibleForOptimization(originalSMBJoinOp)) {
    return null;
  }
  currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN);
  // Convert the work containing the sort-merge join into a work as if it had a regular join.
  // Note that the operator tree is not changed - it still contains the SMB join, but the
  // plan is changed (aliasToWork etc.) to contain all the paths as if it were a regular join.
  // This is used to convert the plan to a map join, and the original SMB join plan is kept
  // as a backup task.
  MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp);
  SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
  currWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
  currWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
  currWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
  currJoinWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
  currJoinWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
  currJoinWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
  // create conditional work list and task list
  List<Serializable> listWorks = new ArrayList<Serializable>();
  List<Task<?>> listTasks = new ArrayList<Task<?>>();
  // create task-to-aliases and alias-to-input-file mappings for the resolver;
  // must be a deterministic-order map for consistent q-test output across Java versions
  HashMap<Task<?>, Set<String>> taskToAliases = new LinkedHashMap<Task<?>, Set<String>>();
  // Note that pathToAliases behaves as if the original plan were a join plan
  Map<Path, List<String>> pathToAliases = currJoinWork.getMapWork().getPathToAliases();
  // generate a map join task for the big table
  SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
  Byte[] order = originalSMBJoinDesc.getTagOrder();
  int numAliases = order.length;
  Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
  HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
  Configuration conf = context.getConf();
  try {
    long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currJoinWork.getMapWork(), pathToAliases, aliasToSize);
    long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
    for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) {
      // this table cannot be the big table
      if (!bigTableCandidates.contains(bigTablePosition)) {
        continue;
      }
      // create a map join task for the given big table position
      MapRedTask newTask = convertSMBTaskToMapJoinTask(currJoinWork, bigTablePosition, newSMBJoinOp);
      MapWork mapWork = newTask.getWork().getMapWork();
      Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
      Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);
      long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
      if (aliasKnownSize > 0) {
        long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
        if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
          // this table is not a good candidate for the big table
          continue;
        }
      }
      // add to the conditional task
      listWorks.add(newTask.getWork());
      listTasks.add(newTask);
      newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
      newTask.setFetchSource(currTask.isFetchSource());
      // set up the backup task
      newTask.setBackupTask(currTask);
      newTask.setBackupChildrenTasks(currTask.getChildTasks());
      // record the task-to-aliases mapping
      taskToAliases.put(newTask, aliases);
    }
  } catch (Exception e) {
    throw new SemanticException("Generate Map Join Task Error", e);
  }
  // insert the current common join task into the conditional task
  listWorks.add(currTask.getWork());
  listTasks.add(currTask);
  // clear the join tree and op parse context
  currWork.getMapWork().setLeftInputJoin(false);
  currWork.getMapWork().setBaseSrc(null);
  currWork.getMapWork().setMapAliases(null);
  // create the conditional task and insert it into the task tree
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  // set resolver and resolver context
  cndTsk.setResolver(new ConditionalResolverCommonJoin());
  ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
  resolverCtx.setPathToAliases(pathToAliases);
  resolverCtx.setAliasToKnownSize(aliasToSize);
  resolverCtx.setTaskToAliases(taskToAliases);
  resolverCtx.setCommonJoinTask(currTask);
  resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
  resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
  cndTsk.setResolverCtx(resolverCtx);
  // replace the current task with the newly generated conditional task
  replaceTaskWithConditionalTask(currTask, cndTsk);
  return cndTsk;
}
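The size check inside the loop is the heart of the conversion: an alias may act as the big table only if the combined known size of all other aliases stays under the HIVESMALLTABLESFILESIZE threshold, and the unmodified SMB task is always appended last so ConditionalResolverCommonJoin can fall back to it at runtime. The same filtering, restated as a self-contained sketch over a plain alias-to-size map (the names here are hypothetical, not Hive code):

import java.util.*;

// Hypothetical restatement of the big-table filter used in the loop above.
final class BigTableFilter {
  // An alias qualifies as the big table only if all *other* aliases fit under the threshold.
  static List<String> bigTableCandidates(Map<String, Long> aliasToKnownSize, long smallTableThreshold) {
    long total = aliasToKnownSize.values().stream().mapToLong(Long::longValue).sum();
    List<String> candidates = new ArrayList<>();
    for (Map.Entry<String, Long> entry : aliasToKnownSize.entrySet()) {
      long smallSideTotal = total - entry.getValue();
      if (smallSideTotal <= smallTableThreshold) {
        candidates.add(entry.getKey());
      }
    }
    return candidates;
  }
}

For example, with known sizes {a=10 GB, b=1 MB} and a 25 MB threshold, only a qualifies, so the conditional task would hold one map-join task (big table a) plus the original SMB task as the fallback.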
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
Class SortMergeJoinTaskDispatcher, method genSMBJoinWork:
// Convert the work in the SMB plan to a regular join
// Note that the operator tree is not fixed, only the path/alias mappings in the
// plan are fixed. The operator tree will still contain the SMBJoinOperator
private void genSMBJoinWork(MapWork currWork, SMBMapJoinOperator smbJoinOp) {
  // Remove the paths which are not part of aliasToPartitionInfo
  Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
  List<Path> removePaths = new ArrayList<>();
  for (Map.Entry<Path, List<String>> entry : currWork.getPathToAliases().entrySet()) {
    boolean keepPath = false;
    for (String alias : entry.getValue()) {
      if (aliasToPartitionInfo.containsKey(alias)) {
        keepPath = true;
        break;
      }
    }
    // Remove the path if none of its aliases are present
    if (!keepPath) {
      removePaths.add(entry.getKey());
    }
  }
  List<String> removeAliases = new ArrayList<String>();
  for (Path removePath : removePaths) {
    removeAliases.addAll(currWork.getPathToAliases().get(removePath));
    currWork.removePathToAlias(removePath);
    currWork.removePathToPartitionInfo(removePath);
  }
  for (String alias : removeAliases) {
    currWork.getAliasToPartnInfo().remove(alias);
    currWork.getAliasToWork().remove(alias);
  }
  // Get the MapredLocalWork
  MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : localWork.getAliasToWork().entrySet()) {
    String alias = entry.getKey();
    Operator<? extends OperatorDesc> op = entry.getValue();
    FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
    // Add the entry to the mapred work
    currWork.getAliasToWork().put(alias, op);
    PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
    if (fetchWork.getTblDir() != null) {
      currWork.mergeAliasedInput(alias, fetchWork.getTblDir(), partitionInfo);
    } else {
      for (Path pathDir : fetchWork.getPartDir()) {
        currWork.mergeAliasedInput(alias, pathDir, partitionInfo);
      }
    }
  }
  // Remove the dummy store operator from the tree
  for (Operator<? extends OperatorDesc> parentOp : smbJoinOp.getParentOperators()) {
    if (parentOp instanceof DummyStoreOperator) {
      Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
      smbJoinOp.replaceParent(parentOp, grandParentOp);
      grandParentOp.setChildOperators(parentOp.getChildOperators());
      parentOp.setParentOperators(null);
      parentOp.setChildOperators(null);
    }
  }
}
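The last loop splices each DummyStoreOperator out of the tree: the SMB join's parent pointer is redirected to the dummy's grandparent, the grandparent adopts the dummy's children, and the dummy is detached on both sides. The same splice, restated over a toy doubly linked operator type (hypothetical; not Hive's Operator API):

import java.util.*;

// Toy operator with parent/child links, used only to illustrate the splice above.
final class ToyOp {
  List<ToyOp> parents = new ArrayList<>();
  List<ToyOp> children = new ArrayList<>();

  // Removes `middle` from a chain grandparent -> middle -> child(ren).
  static void spliceOut(ToyOp middle) {
    ToyOp grandParent = middle.parents.get(0);
    for (ToyOp child : middle.children) {
      child.parents.set(child.parents.indexOf(middle), grandParent); // replaceParent
    }
    grandParent.children = middle.children;                          // grandparent adopts the children
    middle.parents = null;                                           // detach the removed node
    middle.children = null;
  }
}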
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
Class SparkSkewJoinProcFactory, method splitTask:
/**
 * If the join is not in a leaf ReduceWork, the spark task has to be split into 2 tasks.
 */
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork, ParseContext parseContext) throws SemanticException {
  SparkWork currentWork = currentTask.getWork();
  Set<Operator<?>> reduceSinkSet = OperatorUtils.getOp(reduceWork, ReduceSinkOperator.class);
  if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork) && reduceSinkSet.size() == 1) {
    ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
    BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
    SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
    // disconnect the reduce work from its child. this should produce two isolated sub graphs
    currentWork.disconnect(reduceWork, childWork);
    // move works following the current reduce work into a new spark work
    SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
    newWork.add(childWork);
    copyWorkGraph(currentWork, newWork, childWork);
    // remove them from current spark work
    for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
      currentWork.remove(baseWork);
      currentWork.getCloneToWork().remove(baseWork);
    }
    // create TS to read intermediate data
    Context baseCtx = parseContext.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
    TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
    // this will insert FS and TS between the RS and its parent
    TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
    // create new MapWork
    MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
    mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
    newWork.add(mapWork);
    newWork.connect(mapWork, childWork, originEdge);
    // setup the new map work
    String streamDesc = taskTmpDir.toUri().toString();
    if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
      Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
      String id = null;
      if (childReducer instanceof JoinOperator) {
        if (parseContext.getJoinOps().contains(childReducer)) {
          id = ((JoinOperator) childReducer).getConf().getId();
        }
      } else if (childReducer instanceof MapJoinOperator) {
        if (parseContext.getMapJoinOps().contains(childReducer)) {
          id = ((MapJoinOperator) childReducer).getConf().getId();
        }
      } else if (childReducer instanceof SMBMapJoinOperator) {
        if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
          id = ((SMBMapJoinOperator) childReducer).getConf().getId();
        }
      }
      if (id != null) {
        streamDesc = id + ":$INTNAME";
      } else {
        streamDesc = "$INTNAME";
      }
      String origStreamDesc = streamDesc;
      int pos = 0;
      while (mapWork.getAliasToWork().get(streamDesc) != null) {
        streamDesc = origStreamDesc.concat(String.valueOf(++pos));
      }
    }
    GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
    // insert the new task between current task and its child
    @SuppressWarnings("unchecked")
    Task<?> newTask = TaskFactory.get(newWork);
    List<Task<?>> childTasks = currentTask.getChildTasks();
    // must have at most one child
    if (childTasks != null && childTasks.size() > 0) {
      Task<?> childTask = childTasks.get(0);
      currentTask.removeDependentTask(childTask);
      newTask.addDependentTask(childTask);
    }
    currentTask.addDependentTask(newTask);
    newTask.setFetchSource(currentTask.isFetchSource());
  }
}
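The final block re-wires the task graph so the new task runs between the current task and its former child: the current task ends in a FileSink writing the intermediate directory, and the new task's map work starts with a TableScan reading it. That rewiring, restated over a toy task type (hypothetical; the real calls are the removeDependentTask/addDependentTask pair used above):

import java.util.*;

// Toy task type used only to illustrate the dependency rewiring at the end of splitTask.
final class ToyTask {
  final String name;
  final List<ToyTask> children = new ArrayList<>();

  ToyTask(String name) {
    this.name = name;
  }

  // Inserts `inserted` between `parent` and its existing child, if any.
  static void insertBetween(ToyTask parent, ToyTask inserted) {
    if (!parent.children.isEmpty()) {
      ToyTask child = parent.children.get(0);
      parent.children.remove(child);  // currentTask.removeDependentTask(childTask)
      inserted.children.add(child);   // newTask.addDependentTask(childTask)
    }
    parent.children.add(inserted);    // currentTask.addDependentTask(newTask)
  }
}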