Search in sources :

Example 6 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class TestMapJoinOperator method executeTestImplementation.

/**
 * Runs one map join implementation over the supplied test data, collects the rows it emits into a
 * multi-set, and prints whether that multi-set matches the expected one.
 *
 * @param mapJoinImplementation the implementation variant to build and drive
 * @param testDesc description used to create the map join descriptor, the collector and the operator
 * @param testData big-table data that is pushed through the operator
 * @param expectedTestRowMultiSet the rows the operator is expected to produce
 * @throws Exception propagated from operator creation or from driving the data
 */
private void executeTestImplementation(MapJoinTestImplementation mapJoinImplementation, MapJoinTestDescription testDesc, MapJoinTestData testData, RowTestObjectsMultiSet expectedTestRowMultiSet) throws Exception {
    System.out.println("*BENCHMARK* Starting " + mapJoinImplementation + " test");
    // UNDONE: Parameterize for implementation variation?
    final MapJoinDesc joinDesc = MapJoinTestConfig.createMapJoinDesc(testDesc);
    final boolean vectorized = isVectorOutput(mapJoinImplementation);
    final RowTestObjectsMultiSet actualRows = new RowTestObjectsMultiSet();

    // Pick the collector that matches the output form (row vs. vector) of the implementation.
    final Operator<? extends OperatorDesc> collector;
    if (vectorized) {
        collector = new TestMultiSetVectorCollectorOperator(testDesc.outputTypeInfos, testDesc.outputObjectInspectors, actualRows);
    } else {
        collector = new TestMultiSetCollectorOperator(testDesc.outputObjectInspectors, actualRows);
    }

    final MapJoinOperator joinOperator = MapJoinTestConfig.createMapJoinImplementation(mapJoinImplementation, testDesc, collector, testData, joinDesc);

    // Feed the big table through the operator using the matching driver.
    if (vectorized) {
        MapJoinTestData.driveVectorBigTableData(testDesc, testData, joinOperator);
    } else {
        MapJoinTestData.driveBigTableData(testDesc, testData, joinOperator);
    }
    System.out.println("*BENCHMARK* executeTestImplementation row count " + ((CountCollectorTestOperator) collector).getRowCount());

    // Compare what was collected against the expectation; the outcome is only reported, not thrown.
    if (expectedTestRowMultiSet.verify(actualRows)) {
        System.out.println("*BENCHMARK* verify succeeded for " + mapJoinImplementation);
    } else {
        System.out.println("*BENCHMARK* verify failed for " + mapJoinImplementation);
    }
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) VectorMapJoinOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) VectorMapJoinDesc(org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc) CountCollectorTestOperator(org.apache.hadoop.hive.ql.exec.util.collectoroperator.CountCollectorTestOperator) RowTestObjectsMultiSet(org.apache.hadoop.hive.ql.exec.util.rowobjects.RowTestObjectsMultiSet)

Example 7 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class TezCompiler method findParallelSemiJoinBranch.

/**
 * Looks, upstream of each ReduceSink parent of {@code mapjoin}, for an operator with more than one
 * child, and inspects that operator's SelectOperator children for a branch that ends either in a
 * dynamic partition pruning event or in a semijoin ReduceSink whose target is {@code bigTableTS}.
 *
 * @param mapjoin the operator whose ReduceSink parents are followed upstream
 * @param bigTableTS the table scan a semijoin branch must target to be counted
 * @param parseContext supplies the ReduceSink to SemiJoinBranchInfo mapping
 * @param semijoins output map; receives (ReduceSink, TableScan) pairs for matching semijoin branches
 *        that are not hint-created and are marked as removable
 * @return true if at least one dynamic pruning branch or one semijoin branch targeting
 *         {@code bigTableTS} was found
 */
private boolean findParallelSemiJoinBranch(Operator<?> mapjoin, TableScanOperator bigTableTS, ParseContext parseContext, Map<ReduceSinkOperator, TableScanOperator> semijoins) {
    boolean parallelEdges = false;
    for (Operator<?> op : mapjoin.getParentOperators()) {
        // Only the ReduceSink inputs of the map join are examined.
        if (!(op instanceof ReduceSinkOperator)) {
            continue;
        }
        op = op.getParentOperators().get(0);
        // Follow the Reducesink operator upstream which is on small table side.
        // The walk stops at a ReduceSink, a TableScan, or an operator with more than one child.
        while (!(op instanceof ReduceSinkOperator) && !(op instanceof TableScanOperator) && !(op.getChildren() != null && op.getChildren().size() > 1)) {
            if (op instanceof MapJoinOperator) {
                // For a map join, pick a parent that is not a ReduceSink, that is what we are looking for.
                for (Operator<?> parentOp : op.getParentOperators()) {
                    if (parentOp instanceof ReduceSinkOperator) {
                        continue;
                    }
                    // parent in current pipeline
                    op = parentOp;
                    // NOTE(review): this continue applies to the enclosing for loop, not the while loop.
                    // Once the for loop ends, the statement after this if-block still runs, so op moves
                    // to the first parent of the last non-ReduceSink parent. Confirm this is intended.
                    continue;
                }
            }
            op = op.getParentOperators().get(0);
        }
        // Bail out if RS or TS is encountered.
        if (op instanceof ReduceSinkOperator || op instanceof TableScanOperator) {
            continue;
        }
        // A branch is hit.
        for (Node nd : op.getChildren()) {
            if (nd instanceof SelectOperator) {
                Operator<?> child = (Operator<?>) nd;
                // Descend along the first child until an operator without children is reached.
                while (child.getChildOperators().size() > 0) {
                    child = child.getChildOperators().get(0);
                }
                // If not ReduceSink Op, skip
                if (!(child instanceof ReduceSinkOperator)) {
                    // This still could be DPP.
                    if (child instanceof AppMasterEventOperator && ((AppMasterEventOperator) child).getConf() instanceof DynamicPruningEventDesc) {
                        // DPP indeed, Set parallel edges true
                        parallelEdges = true;
                    }
                    continue;
                }
                ReduceSinkOperator rs = (ReduceSinkOperator) child;
                SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
                // A ReduceSink without semijoin branch info is not a semijoin branch.
                if (sjInfo == null) {
                    continue;
                }
                TableScanOperator ts = sjInfo.getTsOp();
                if (ts != bigTableTS) {
                    // skip, not the one we are looking for.
                    continue;
                }
                // The edge counts as parallel even if the branch is not recorded for removal below.
                parallelEdges = true;
                if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
                    // Created by hint, or not flagged for removal: skip it
                    continue;
                }
                // Add the semijoin branch to the map
                semijoins.put(rs, ts);
            }
        }
    }
    return parallelEdges;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Node(org.apache.hadoop.hive.ql.lib.Node) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)

Example 8 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class TezCompiler method removeSemijoinsParallelToMapJoin.

/*
   *  Starting from every top operator, the operator tree is walked downwards and
   *  each branch is abandoned once a ReduceSink is reached. Every map join met on
   *  the way is checked for a parallel semijoin edge or dynamic partition pruning;
   *  the semijoin branches collected by that check are removed at the end.
   */
private void removeSemijoinsParallelToMapJoin(OptimizeTezProcContext procCtx) throws SemanticException {
    // Proceed only when semijoin reduction and join conversion are both enabled and
    // semijoin reduction for map joins is not.
    final boolean applicable = procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) && procCtx.conf.getBoolVar(ConfVars.HIVECONVERTJOIN) && !procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_MAPJOIN);
    if (!applicable) {
        return;
    }

    // Snapshot of the top operators of the plan.
    final List<Operator<?>> roots = new ArrayList<Operator<?>>(procCtx.parseContext.getTopOps().values());
    final Map<ReduceSinkOperator, TableScanOperator> semijoins = new HashMap<>();

    for (Operator<?> root : roots) {
        // One root can fan out into several branches (DPP or semijoin optimization),
        // so the pending operators are kept on a stack and visited depth first.
        final Deque<Operator<?>> pending = new LinkedList<>();
        pending.addLast(root);
        while (!pending.isEmpty()) {
            final Operator<?> current = pending.pollLast();
            if (current instanceof ReduceSinkOperator) {
                // This branch ends here.
                continue;
            }
            if (current instanceof MapJoinOperator && !findParallelSemiJoinBranch(current, (TableScanOperator) root, procCtx.parseContext, semijoins)) {
                // No parallel edge for this map join: stop walking this root's pipeline.
                break;
            }
            pending.addAll(current.getChildOperators());
        }
    }

    // Drop every semijoin branch that was collected above.
    for (Map.Entry<ReduceSinkOperator, TableScanOperator> branch : semijoins.entrySet()) {
        final ReduceSinkOperator rs = branch.getKey();
        final TableScanOperator ts = branch.getValue();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Semijoin optimization with parallel edge to map join. Removing semijoin " + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
        }
        GenTezUtils.removeBranch(rs);
        GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, rs, ts);
    }
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList)

Example 9 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class SortMergeJoinTaskDispatcher method convertSMBTaskToMapJoinTask.

// Builds a map join task from a clone of the given work, with bigTablePosition as the big table.
// The reduce work of the clone is captured before local work generation and set again afterwards.
private MapRedTask convertSMBTaskToMapJoinTask(MapredWork origWork, int bigTablePosition, SMBMapJoinOperator smbJoinOp) throws UnsupportedEncodingException, SemanticException {
    // Work on a clone so the original plan is left as it is.
    final MapredWork clonedWork = SerializationUtilities.clonePlan(origWork);
    // The task that will be returned wraps the cloned work.
    final MapRedTask mapJoinTask = (MapRedTask) TaskFactory.get(clonedWork);
    // Obtain the map join operator for the chosen big table position.
    final MapJoinOperator mapJoinOp = getMapJoinOperator(mapJoinTask, clonedWork, smbJoinOp, bigTablePosition);
    // Remember the reducer so it can be put back below. Example from the original author:
    // select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
    // has a reducer containing a groupby, which needs to be restored.
    final ReduceWork savedReduceWork = clonedWork.getReduceWork();
    // Generate the local work for the map join.
    MapJoinProcessor.genLocalWorkForMapJoin(clonedWork, mapJoinOp, bigTablePosition);
    // Put the remembered reducer back.
    clonedWork.setReduceWork(savedReduceWork);
    return mapJoinTask;
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)

Example 10 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class SparkMapJoinOptimizer method convertJoinMapJoin.

/*
   * Once we have decided on the map join, the tree would transform from
   *
   *        |                   |
   *       Join               MapJoin
   *       / \                /   \
   *     RS   RS   --->     RS    TS (big table)
   *    /      \           /
   *   TS       TS        TS (small table)
   *
   * for spark.
   *
   * Returns null, without converting anything, when any parent of the join is a MuxOperator.
   */
public MapJoinOperator convertJoinMapJoin(JoinOperator joinOp, OptimizeSparkProcContext context, int bigTablePosition) throws SemanticException {
    // A join fed by a MuxOperator is not converted.
    for (Operator<? extends OperatorDesc> joinParent : joinOp.getParentOperators()) {
        if (joinParent instanceof MuxOperator) {
            return null;
        }
    }

    // Convert the join into a map join for the requested big table position.
    final MapJoinOperator mapJoinOp = MapJoinProcessor.convertJoinOpMapJoinOp(context.getConf(), joinOp, joinOp.getConf().isLeftInputJoin(), joinOp.getConf().getBaseSrc(), joinOp.getConf().getMapAliases(), bigTablePosition, true);
    final Operator<? extends OperatorDesc> bigTableParent = mapJoinOp.getParentOperators().get(bigTablePosition);

    if (bigTableParent instanceof ReduceSinkOperator) {
        for (Operator<?> upstream : bigTableParent.getParentOperators()) {
            // we might have generated a dynamic partition operator chain. Since
            // we're removing the reduce sink we need do remove that too.
            final Set<SparkPartitionPruningSinkOperator> pruningSinks = new HashSet<>();
            for (Operator<?> sibling : upstream.getChildOperators()) {
                final SparkPartitionPruningSinkOperator sink = findPartitionPruningSinkOperator(sibling);
                if (sink != null) {
                    pruningSinks.add(sink);
                }
            }
            // Each sink found hangs off the fork that carries the pruning plan; drop that branch.
            for (SparkPartitionPruningSinkOperator sink : pruningSinks) {
                OperatorUtils.removeBranch(sink);
                LOG.info("Disabling dynamic pruning for: " + (sink.getConf()).getTableScanNames() + ". Need to be removed together with reduce sink");
            }
        }

        // Take the reduce sink out of the big table slot and wire its first parent in instead,
        // unless that parent is already among the map join's parents.
        mapJoinOp.getParentOperators().remove(bigTablePosition);
        final Operator<? extends OperatorDesc> reduceSinkParent = bigTableParent.getParentOperators().get(0);
        if (!mapJoinOp.getParentOperators().contains(reduceSinkParent)) {
            mapJoinOp.getParentOperators().add(bigTablePosition, reduceSinkParent);
        }
        reduceSinkParent.removeChild(bigTableParent);

        // Every parent must list the map join as a child and no longer list the old join.
        for (Operator<? extends OperatorDesc> mapJoinParent : mapJoinOp.getParentOperators()) {
            if (!mapJoinParent.getChildOperators().contains(mapJoinOp)) {
                mapJoinParent.getChildOperators().add(mapJoinOp);
            }
            mapJoinParent.getChildOperators().remove(joinOp);
        }
    }

    // Carry the join tree properties of the original join over to the map join descriptor.
    mapJoinOp.getConf().setQBJoinTreeProps(joinOp.getConf());
    return mapJoinOp;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MuxOperator(org.apache.hadoop.hive.ql.exec.MuxOperator) SparkPartitionPruningSinkOperator(org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator) HashSet(java.util.HashSet)

Aggregations

MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)41 Operator (org.apache.hadoop.hive.ql.exec.Operator)22 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)22 ArrayList (java.util.ArrayList)19 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)18 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)17 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)15 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)13 MapJoinDesc (org.apache.hadoop.hive.ql.plan.MapJoinDesc)12 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)11 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)9 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)9 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)9 HashMap (java.util.HashMap)8 AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator)8 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)8 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)8 SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)8 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)8 List (java.util.List)7