use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class TestMapJoinOperator method executeTestImplementation.
private void executeTestImplementation(MapJoinTestImplementation mapJoinImplementation,
    MapJoinTestDescription testDesc, MapJoinTestData testData,
    RowTestObjectsMultiSet expectedTestRowMultiSet) throws Exception {
  System.out.println("*BENCHMARK* Starting " + mapJoinImplementation + " test");
  // UNDONE: Parameterize for implementation variation?
  MapJoinDesc mapJoinDesc = MapJoinTestConfig.createMapJoinDesc(testDesc);
  final boolean isVectorOutput = isVectorOutput(mapJoinImplementation);
  RowTestObjectsMultiSet outputTestRowMultiSet = new RowTestObjectsMultiSet();
  Operator<? extends OperatorDesc> testCollectorOperator =
      (!isVectorOutput
          ? new TestMultiSetCollectorOperator(testDesc.outputObjectInspectors, outputTestRowMultiSet)
          : new TestMultiSetVectorCollectorOperator(testDesc.outputTypeInfos,
              testDesc.outputObjectInspectors, outputTestRowMultiSet));
  MapJoinOperator operator = MapJoinTestConfig.createMapJoinImplementation(
      mapJoinImplementation, testDesc, testCollectorOperator, testData, mapJoinDesc);
  if (!isVectorOutput) {
    MapJoinTestData.driveBigTableData(testDesc, testData, operator);
  } else {
    MapJoinTestData.driveVectorBigTableData(testDesc, testData, operator);
  }
  System.out.println("*BENCHMARK* executeTestImplementation row count " +
      ((CountCollectorTestOperator) testCollectorOperator).getRowCount());
  // Verify the output!
  if (!expectedTestRowMultiSet.verify(outputTestRowMultiSet)) {
    System.out.println("*BENCHMARK* verify failed for " + mapJoinImplementation);
  } else {
    System.out.println("*BENCHMARK* verify succeeded for " + mapJoinImplementation);
  }
}
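The interesting part of this harness is the final check: expected and actual output rows are compared as multisets, so verification is insensitive to row order but still catches missing or duplicated rows. Below is a minimal sketch of that kind of comparison using a plain HashMap-backed counter; the RowMultiSet class is illustrative only and is not Hive's RowTestObjectsMultiSet.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustrative multiset of rows: counts how often each row value appears.
final class RowMultiSet {
  private final Map<List<Object>, Integer> counts = new HashMap<>();

  void add(List<Object> row) {
    counts.merge(row, 1, Integer::sum);
  }

  // Two multisets match when every row occurs the same number of times.
  boolean verify(RowMultiSet other) {
    return counts.equals(other.counts);
  }

  public static void main(String[] args) {
    RowMultiSet expected = new RowMultiSet();
    RowMultiSet actual = new RowMultiSet();
    expected.add(Arrays.asList(1, "a"));
    expected.add(Arrays.asList(1, "a"));
    actual.add(Arrays.asList(1, "a"));
    System.out.println(expected.verify(actual));  // false: duplicate counts differ
  }
}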
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class TezCompiler method findParallelSemiJoinBranch.
private boolean findParallelSemiJoinBranch(Operator<?> mapjoin, TableScanOperator bigTableTS,
    ParseContext parseContext, Map<ReduceSinkOperator, TableScanOperator> semijoins) {
  boolean parallelEdges = false;
  for (Operator<?> op : mapjoin.getParentOperators()) {
    if (!(op instanceof ReduceSinkOperator)) {
      continue;
    }
    op = op.getParentOperators().get(0);
    // Follow the ReduceSink operator upstream on the small table side.
    while (!(op instanceof ReduceSinkOperator) && !(op instanceof TableScanOperator) &&
        !(op.getChildren() != null && op.getChildren().size() > 1)) {
      if (op instanceof MapJoinOperator) {
        // Skip the ReduceSink parents of the map join and keep following
        // the parent that belongs to the current pipeline.
        for (Operator<?> parentOp : op.getParentOperators()) {
          if (parentOp instanceof ReduceSinkOperator) {
            continue;
          }
          // parent in current pipeline
          op = parentOp;
          continue;
        }
      }
      op = op.getParentOperators().get(0);
    }
    // Bail out if an RS or TS is encountered.
    if (op instanceof ReduceSinkOperator || op instanceof TableScanOperator) {
      continue;
    }
    // A branch is hit.
    for (Node nd : op.getChildren()) {
      if (nd instanceof SelectOperator) {
        Operator<?> child = (Operator<?>) nd;
        while (child.getChildOperators().size() > 0) {
          child = child.getChildOperators().get(0);
        }
        // If not a ReduceSink op, skip.
        if (!(child instanceof ReduceSinkOperator)) {
          // This could still be DPP.
          if (child instanceof AppMasterEventOperator &&
              ((AppMasterEventOperator) child).getConf() instanceof DynamicPruningEventDesc) {
            // DPP indeed; set parallel edges to true.
            parallelEdges = true;
          }
          continue;
        }
        ReduceSinkOperator rs = (ReduceSinkOperator) child;
        SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
        if (sjInfo == null) {
          continue;
        }
        TableScanOperator ts = sjInfo.getTsOp();
        if (ts != bigTableTS) {
          // Skip; not the one we are looking for.
          continue;
        }
        parallelEdges = true;
        if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
          // Created by a hint or marked as not removable; skip it.
          continue;
        }
        // Add the semijoin branch to the map.
        semijoins.put(rs, ts);
      }
    }
  }
  return parallelEdges;
}
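The upstream walk in this method is a small pattern of its own: starting from the small table side of the map join, it follows single-parent links until it either reaches a ReduceSink or TableScan (bail out) or an operator with more than one child (a fork worth inspecting). Here is a minimal sketch of that walk on a toy operator type; the Op class and its fields are illustrative stand-ins, not Hive's Operator<?>.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Illustrative operator node with parent and child lists.
final class Op {
  final String kind;              // e.g. "TS", "RS", "SEL", "MAPJOIN"
  final List<Op> parents;
  final List<Op> children;

  Op(String kind, List<Op> parents, List<Op> children) {
    this.kind = kind;
    this.parents = parents;
    this.children = children;
  }
}

final class UpstreamWalk {
  // Walk up a single-parent chain until a terminator kind ("RS"/"TS") or a fork
  // (more than one child) is reached, mirroring the while-loop condition above.
  static Op walkUpToForkOrTerminator(Op start) {
    Op op = start;
    while (!op.kind.equals("RS") && !op.kind.equals("TS") && op.children.size() <= 1) {
      op = op.parents.get(0);
    }
    return op;
  }

  public static void main(String[] args) {
    Op ts = new Op("TS", Collections.emptyList(), Collections.emptyList());
    Op sel = new Op("SEL", Arrays.asList(ts), Collections.emptyList());
    Op fil = new Op("FIL", Arrays.asList(sel), Collections.emptyList());
    System.out.println(walkUpToForkOrTerminator(fil).kind);  // TS
  }
}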
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class TezCompiler method removeSemijoinsParallelToMapJoin.
/*
 * The algorithm looks at all the map joins in the operator pipeline until
 * it hits an RS operator, and for each map join examines whether it has a
 * parallel semijoin edge or dynamic partition pruning.
 */
private void removeSemijoinsParallelToMapJoin(OptimizeTezProcContext procCtx) throws SemanticException {
  if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) ||
      !procCtx.conf.getBoolVar(ConfVars.HIVECONVERTJOIN) ||
      procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_MAPJOIN)) {
    // Not needed without semijoin reduction or map joins, or when semijoins
    // are enabled for parallel mapjoins.
    return;
  }
  // Get all the TS ops.
  List<Operator<?>> topOps = new ArrayList<>();
  topOps.addAll(procCtx.parseContext.getTopOps().values());
  Map<ReduceSinkOperator, TableScanOperator> semijoins = new HashMap<>();
  for (Operator<?> parent : topOps) {
    // A TS can have multiple branches due to DPP or the semijoin optimization.
    // Use DFS to traverse all the branches until an RS is hit.
    Deque<Operator<?>> deque = new LinkedList<>();
    deque.add(parent);
    while (!deque.isEmpty()) {
      Operator<?> op = deque.pollLast();
      if (op instanceof ReduceSinkOperator) {
        // Done with this branch.
        continue;
      }
      if (op instanceof MapJoinOperator) {
        // A candidate.
        if (!findParallelSemiJoinBranch(op, (TableScanOperator) parent, procCtx.parseContext, semijoins)) {
          // No need to go down further; skip this TS operator pipeline.
          break;
        }
      }
      deque.addAll(op.getChildOperators());
    }
  }
  if (semijoins.size() > 0) {
    for (ReduceSinkOperator rs : semijoins.keySet()) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Semijoin optimization with parallel edge to map join. Removing semijoin " +
            OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(semijoins.get(rs)));
      }
      GenTezUtils.removeBranch(rs);
      GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, rs, semijoins.get(rs));
    }
  }
}
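The traversal here is a plain depth-first search over the operator tree driven by a deque: each branch is abandoned as soon as a ReduceSink is reached, and every MapJoinOperator encountered before that point becomes a candidate for the parallel-edge check. The following is a minimal sketch of the same pattern on a toy plan node type; PlanNode and countMapJoinCandidates are illustrative, not Hive classes.

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.List;

// Illustrative plan node standing in for Hive's Operator<?> tree.
final class PlanNode {
  final String kind;            // e.g. "TS", "MAPJOIN", "RS"
  final List<PlanNode> children;

  PlanNode(String kind, List<PlanNode> children) {
    this.kind = kind;
    this.children = children;
  }
}

final class BranchScan {
  // Depth-first walk from a table scan: stop a branch at the first "RS",
  // and count every "MAPJOIN" candidate seen before that point.
  static int countMapJoinCandidates(PlanNode topOp) {
    int candidates = 0;
    Deque<PlanNode> deque = new ArrayDeque<>();
    deque.add(topOp);
    while (!deque.isEmpty()) {
      PlanNode op = deque.pollLast();
      if (op.kind.equals("RS")) {
        continue;                 // done with this branch
      }
      if (op.kind.equals("MAPJOIN")) {
        candidates++;             // a candidate for the parallel-edge check
      }
      deque.addAll(op.children);
    }
    return candidates;
  }

  public static void main(String[] args) {
    PlanNode rs = new PlanNode("RS", Collections.emptyList());
    PlanNode mj = new PlanNode("MAPJOIN", Arrays.asList(rs));
    PlanNode ts = new PlanNode("TS", Arrays.asList(mj, rs));
    System.out.println(countMapJoinCandidates(ts));  // 1
  }
}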
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class SortMergeJoinTaskDispatcher method convertSMBTaskToMapJoinTask.
// create map join task and set big table as bigTablePosition
private MapRedTask convertSMBTaskToMapJoinTask(MapredWork origWork, int bigTablePosition,
    SMBMapJoinOperator smbJoinOp) throws UnsupportedEncodingException, SemanticException {
  // Deep copy a new mapred work.
  MapredWork newWork = SerializationUtilities.clonePlan(origWork);
  // Create a mapred task for this work.
  MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork);
  // Generate the map join operator; already checked the map join.
  MapJoinOperator newMapJoinOp = getMapJoinOperator(newTask, newWork, smbJoinOp, bigTablePosition);
  // The reducer needs to be restored. Consider a query like:
  //   select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
  // The reducer contains a group-by, which needs to be restored.
  ReduceWork rWork = newWork.getReduceWork();
  // Create the local work for this plan.
  MapJoinProcessor.genLocalWorkForMapJoin(newWork, newMapJoinOp, bigTablePosition);
  // Restore the reducer.
  newWork.setReduceWork(rWork);
  return newTask;
}
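The important detail here is that the original MapredWork is never mutated: SerializationUtilities.clonePlan produces a deep copy, and only the copy is rewritten into a map join task, with the reduce-side work saved and restored around the local-work generation. As a rough illustration of the clone-then-mutate idea, here is a deep copy via a plain java.io serialization round trip; Hive itself uses a Kryo-based utility, so this is only an analogy.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

final class DeepCopy {
  // Deep-copy any Serializable object graph by writing it out and reading it back.
  @SuppressWarnings("unchecked")
  static <T extends Serializable> T clone(T original) throws Exception {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeObject(original);
    }
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      return (T) in.readObject();
    }
  }

  public static void main(String[] args) throws Exception {
    ArrayList<String> plan = new ArrayList<>(List.of("TS", "SMBJOIN", "GBY"));
    ArrayList<String> copy = clone(plan);
    copy.set(1, "MAPJOIN");                 // mutate the copy only
    System.out.println(plan + " vs " + copy);
  }
}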
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class SparkMapJoinOptimizer method convertJoinMapJoin.
/*
 * Once we have decided on the map join, the tree would transform from
 *
 *        |                    |
 *       Join               MapJoin
 *       /  \                /   \
 *     RS    RS    --->    RS     TS (big table)
 *    /       \           /
 *  TS         TS       TS (small table)
 *
 * for spark.
 */
public MapJoinOperator convertJoinMapJoin(JoinOperator joinOp, OptimizeSparkProcContext context,
    int bigTablePosition) throws SemanticException {
  // Bail out on MuxOperator because currently the mux operator masks the emit keys
  // of the constituent reduce sinks.
  for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
    if (parentOp instanceof MuxOperator) {
      return null;
    }
  }
  // We can safely convert the join to a map join.
  MapJoinOperator mapJoinOp = MapJoinProcessor.convertJoinOpMapJoinOp(context.getConf(), joinOp,
      joinOp.getConf().isLeftInputJoin(), joinOp.getConf().getBaseSrc(),
      joinOp.getConf().getMapAliases(), bigTablePosition, true);
  Operator<? extends OperatorDesc> parentBigTableOp = mapJoinOp.getParentOperators().get(bigTablePosition);
  if (parentBigTableOp instanceof ReduceSinkOperator) {
    for (Operator<?> parentOp : parentBigTableOp.getParentOperators()) {
      // We might have generated a dynamic partition pruning operator chain. Since
      // we are removing the reduce sink, we need to remove that too.
      Set<SparkPartitionPruningSinkOperator> partitionPruningSinkOps = new HashSet<>();
      for (Operator<?> childOp : parentOp.getChildOperators()) {
        SparkPartitionPruningSinkOperator partitionPruningSinkOp = findPartitionPruningSinkOperator(childOp);
        if (partitionPruningSinkOp != null) {
          partitionPruningSinkOps.add(partitionPruningSinkOp);
        }
      }
      for (SparkPartitionPruningSinkOperator partitionPruningSinkOp : partitionPruningSinkOps) {
        OperatorUtils.removeBranch(partitionPruningSinkOp);
        // At this point we've found the fork in the op pipeline that has the pruning as a child plan.
        LOG.info("Disabling dynamic pruning for: " +
            (partitionPruningSinkOp.getConf()).getTableScanNames() +
            ". Need to be removed together with reduce sink");
      }
    }
    mapJoinOp.getParentOperators().remove(bigTablePosition);
    if (!(mapJoinOp.getParentOperators().contains(parentBigTableOp.getParentOperators().get(0)))) {
      mapJoinOp.getParentOperators().add(bigTablePosition, parentBigTableOp.getParentOperators().get(0));
    }
    parentBigTableOp.getParentOperators().get(0).removeChild(parentBigTableOp);
    for (Operator<? extends OperatorDesc> op : mapJoinOp.getParentOperators()) {
      if (!(op.getChildOperators().contains(mapJoinOp))) {
        op.getChildOperators().add(mapJoinOp);
      }
      op.getChildOperators().remove(joinOp);
    }
  }
  // Data structures
  mapJoinOp.getConf().setQBJoinTreeProps(joinOp.getConf());
  return mapJoinOp;
}
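The rewiring at the end is the part that is easy to get wrong: the big-table ReduceSinkOperator is spliced out so that the map join hangs directly off the RS's parent, and every remaining parent swaps the old JoinOperator for the new MapJoinOperator in its child list. Below is a minimal sketch of that splice on a toy operator graph; the OpNode type and its bypass method are hypothetical helpers, not Hive APIs.

import java.util.ArrayList;
import java.util.List;

// Illustrative operator node with mutable parent/child lists, standing in for
// Hive's Operator<?> only for the purpose of this sketch.
final class OpNode {
  final String name;
  final List<OpNode> parents = new ArrayList<>();
  final List<OpNode> children = new ArrayList<>();

  OpNode(String name) {
    this.name = name;
  }

  static void link(OpNode parent, OpNode child) {
    parent.children.add(child);
    child.parents.add(parent);
  }

  // Bypass `middle` so that `top` becomes a direct parent of `bottom`,
  // mirroring how the big-table ReduceSink is removed above.
  static void bypass(OpNode top, OpNode middle, OpNode bottom) {
    bottom.parents.remove(middle);
    if (!bottom.parents.contains(top)) {
      bottom.parents.add(top);
    }
    top.children.remove(middle);
    if (!top.children.contains(bottom)) {
      top.children.add(bottom);
    }
  }

  public static void main(String[] args) {
    OpNode ts = new OpNode("TS");
    OpNode rs = new OpNode("RS");
    OpNode mapJoin = new OpNode("MAPJOIN");
    link(ts, rs);
    link(rs, mapJoin);
    bypass(ts, rs, mapJoin);
    System.out.println(mapJoin.parents.get(0).name);   // TS
    System.out.println(ts.children.contains(rs));      // false
  }
}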