Search in sources :

Example 1 with TerminalOperator

use of org.apache.hadoop.hive.ql.exec.TerminalOperator in project hive by apache.

From the class SetSparkReducerParallelism, method needSetParallelism.

// tests whether the RS needs automatic setting parallelism
/**
 * Decides whether this ReduceSink needs its parallelism set automatically.
 *
 * Returns true when no reducer count was configured at all, or when a
 * single-reducer order-by is eligible for sampling (sampling enabled, not
 * deduplicated) and no LimitOperator appears anywhere below the RS — a
 * LIMIT keeps the single reducer's input small, so one reducer suffices.
 */
private boolean needSetParallelism(ReduceSinkOperator reduceSink, HiveConf hiveConf) {
    ReduceSinkDesc desc = reduceSink.getConf();
    // No explicit reducer count: parallelism must be derived automatically.
    if (desc.getNumReducers() <= 0) {
        return true;
    }
    boolean samplableOrderBy = desc.getNumReducers() == 1
            && desc.hasOrderBy()
            && hiveConf.getBoolVar(HiveConf.ConfVars.HIVESAMPLINGFORORDERBY)
            && !desc.isDeduplicated();
    if (!samplableOrderBy) {
        return false;
    }
    // Depth-first walk over everything below this RS, stopping the descent
    // at terminal operators.
    Stack<Operator<? extends OperatorDesc>> pending = new Stack<>();
    List<Operator<? extends OperatorDesc>> directChildren = reduceSink.getChildOperators();
    if (directChildren != null) {
        pending.addAll(directChildren);
    }
    while (!pending.isEmpty()) {
        Operator<? extends OperatorDesc> current = pending.pop();
        if (current instanceof LimitOperator) {
            // A LIMIT below the RS bounds the output; keep the single reducer.
            return false;
        }
        if (current instanceof TerminalOperator) {
            // End of this branch; nothing to inspect beneath it.
            continue;
        }
        List<Operator<? extends OperatorDesc>> next = current.getChildOperators();
        if (next != null) {
            pending.addAll(next);
        }
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Stack(java.util.Stack)

Example 2 with TerminalOperator

use of org.apache.hadoop.hive.ql.exec.TerminalOperator in project hive by apache.

From the class TezCompiler, method markSemiJoinForDPP.

/**
 * For every removable semijoin branch whose target TableScan is also fed by
 * dynamic partition pruning (DPP), compares the number of distinct values
 * (nDVs) on the semijoin edge against the (factored) nDVs on the TS side.
 * If the semijoin still provides meaningful reduction, marks the branch so
 * it is NOT removed.
 *
 * @param procCtx optimizer context carrying the parse context and conf
 * @throws SemanticException propagated from plan inspection
 */
private void markSemiJoinForDPP(OptimizeTezProcContext procCtx) throws SemanticException {
    // Stores the Tablescan operators processed to avoid redoing them.
    Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
    for (ReduceSinkOperator rs : map.keySet()) {
        SemiJoinBranchInfo sjInfo = map.get(rs);
        TableScanOperator ts = sjInfo.getTsOp();
        // Hinted branches and branches already marked to be kept need no check.
        if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
            continue;
        }
        // A TS can have multiple branches due to DPP Or Semijoin Opt.
        // Use DFS to traverse all the branches until RS or DPP is hit.
        Deque<Operator<?>> deque = new LinkedList<>();
        deque.add(ts);
        while (!deque.isEmpty()) {
            Operator<?> op = deque.pollLast();
            if (op instanceof AppMasterEventOperator && ((AppMasterEventOperator) op).getConf() instanceof DynamicPruningEventDesc) {
                // DPP. Now look up nDVs on both sides to see the selectivity.
                // <Parent Ops>-SEL-GB1-RS1-GB2-RS2
                SelectOperator selOp = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
                try {
                    // Get nDVs on Semijoin edge side
                    Statistics stats = selOp.getStatistics();
                    if (stats == null) {
                        // No stats found on semijoin edge, do nothing
                        break;
                    }
                    String selCol = ExprNodeDescUtils.extractColName(selOp.getConf().getColList().get(0));
                    ColStatistics colStatisticsSJ = stats.getColumnStatisticsFromColName(selCol);
                    if (colStatisticsSJ == null) {
                        // No column stats found for semijoin edge
                        break;
                    }
                    long nDVs = colStatisticsSJ.getCountDistint();
                    if (nDVs > 0) {
                        // Lookup nDVs on TS side.
                        RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
                        // TODO Handle multi column semi-joins as part of HIVE-23934
                        ExprNodeDesc tsExpr = rti.getTargetColumns().get(0);
                        FilterOperator fil = (FilterOperator) (ts.getChildOperators().get(0));
                        Statistics filStats = fil.getStatistics();
                        if (filStats == null) {
                            // No stats found on target, do nothing
                            break;
                        }
                        String colName = ExprNodeDescUtils.extractColName(tsExpr);
                        ColStatistics colStatisticsTarget = filStats.getColumnStatisticsFromColName(colName);
                        if (colStatisticsTarget == null) {
                            // No column stats found on target
                            break;
                        }
                        long nDVsOfTS = colStatisticsTarget.getCountDistint();
                        // Scale the TS-side nDVs by the configured factor before comparing.
                        double nDVsOfTSFactored = nDVsOfTS * procCtx.conf.getFloatVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR);
                        if ((long) nDVsOfTSFactored > nDVs) {
                            if (LOG.isDebugEnabled()) {
                                // Fixed: previous message ran the factored value and the
                                // sentence together ("...Factored = 3.5Adding semijoin...").
                                LOG.debug("nDVs = " + nDVs + ", nDVsOfTS = " + nDVsOfTS + " and nDVsOfTSFactored = " + nDVsOfTSFactored + ". Adding semijoin branch from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
                            }
                            // Semijoin is still selective relative to DPP: keep it.
                            sjInfo.setShouldRemove(false);
                        }
                    }
                } catch (NullPointerException e) {
                    // Do nothing — stats lookups above may legitimately be absent;
                    // treating NPE as "no stats" is a deliberate best-effort choice.
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Caught NPE in markSemiJoinForDPP from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
                    }
                }
                break;
            }
            if (op instanceof TerminalOperator) {
                // Done with this branch
                continue;
            }
            deque.addAll(op.getChildOperators());
        }
    }
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TopNKeyOperator(org.apache.hadoop.hive.ql.exec.TopNKeyOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) AnnotateWithStatistics(org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) LinkedList(java.util.LinkedList) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)

Example 3 with TerminalOperator

use of org.apache.hadoop.hive.ql.exec.TerminalOperator in project hive by apache.

From the class TezCompiler, method connect.

// Tarjan's strongly-connected-components algorithm over the operator graph,
// extended with "virtual" edges: AppMasterEventOperator -> pruned TableScan,
// terminal op -> semijoin ReduceSinks, and semijoin RS -> target TableScan.
// Each discovered SCC is added to 'components'. 'index' is the DFS visit
// counter; 'nodes' is the Tarjan stack; 'indexes'/'lowLinks' are the
// per-operator discovery index and low-link values.
private void connect(Operator<?> o, AtomicInteger index, Stack<Operator<?>> nodes, Map<Operator<?>, Integer> indexes, Map<Operator<?>, Integer> lowLinks, Set<Set<Operator<?>>> components, ParseContext parseContext) {
    indexes.put(o, index.get());
    lowLinks.put(o, index.get());
    index.incrementAndGet();
    nodes.push(o);
    List<Operator<?>> children;
    if (o instanceof AppMasterEventOperator) {
        // DPP event: add a virtual edge to the TableScan it prunes.
        children = new ArrayList<>((o.getChildOperators()));
        TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan();
        LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
        children.add(ts);
    } else if (o instanceof TerminalOperator) {
        // Terminal op: add virtual edges to the semijoin RSs of its work.
        children = new ArrayList<>((o.getChildOperators()));
        for (ReduceSinkOperator rs : parseContext.getTerminalOpToRSMap().get((TerminalOperator<?>) o)) {
            // add an edge
            LOG.debug("Adding special edge: From terminal op to semijoin edge " + o.getName() + " --> " + rs.toString());
            children.add(rs);
        }
        if (o instanceof ReduceSinkOperator) {
            // semijoin case
            SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
            if (sjInfo != null) {
                TableScanOperator ts = sjInfo.getTsOp();
                LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
                children.add(ts);
            }
        }
    } else {
        children = o.getChildOperators();
    }
    for (Operator<?> child : children) {
        if (!indexes.containsKey(child)) {
            // Unvisited: recurse, then propagate the child's low-link up.
            connect(child, index, nodes, indexes, lowLinks, components, parseContext);
            lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child)));
        } else if (nodes.contains(child)) {
            // Child is on the stack: back edge within the current SCC.
            lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child)));
        }
    }
    if (lowLinks.get(o).equals(indexes.get(o))) {
        // 'o' is an SCC root: pop the stack down to 'o' to form the component.
        Set<Operator<?>> component = new LinkedHashSet<Operator<?>>();
        components.add(component);
        Operator<?> current;
        do {
            current = nodes.pop();
            component.add(current);
        } while (current != o);
    }
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TopNKeyOperator(org.apache.hadoop.hive.ql.exec.TopNKeyOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) LinkedHashSet(java.util.LinkedHashSet) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ArrayList(java.util.ArrayList) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)

Example 4 with TerminalOperator

use of org.apache.hadoop.hive.ql.exec.TerminalOperator in project hive by apache.

From the class TezCompiler, method connectTerminalOps.

/**
 * Builds the virtual-edge map from non-semijoin terminal operators to the
 * semijoin ReduceSinks of their work, and stores it on the ParseContext.
 * Each work is traversed at most once, keyed by its semijoin RSs.
 */
private void connectTerminalOps(ParseContext pCtx) {
    // Virtual edges: non-semijoin terminal op -> semijoin RS.
    Multimap<TerminalOperator<?>, ReduceSinkOperator> terminalToRs = ArrayListMultimap.create();
    // Semijoin RS -> info about its work's terminal ops; doubles as a
    // visited set so no work is examined more than once.
    Map<ReduceSinkOperator, TerminalOpsInfo> seen = new HashMap<>();
    for (ReduceSinkOperator semijoinRs : pCtx.getRsToSemiJoinBranchInfo().keySet()) {
        if (seen.get(semijoinRs) != null) {
            // This RS's work was already covered by a previous traversal.
            continue;
        }
        Set<ReduceSinkOperator> rsInWork = new HashSet<>();
        Set<TerminalOperator<?>> terminalsInWork = new HashSet<>();
        // The SEL at the root of the semijoin branch: SEL->GBY1->RS1->GBY2->RS2.
        SelectOperator branchSel = OperatorUtils.ancestor(semijoinRs, SelectOperator.class, 0, 0, 0, 0);
        OperatorUtils.findWorkOperatorsAndSemiJoinEdges(branchSel, pCtx.getRsToSemiJoinBranchInfo(), rsInWork, terminalsInWork);
        TerminalOpsInfo info = new TerminalOpsInfo(terminalsInWork);
        // A work may carry several semijoin edges; record the info for each
        // RS and wire every terminal op of the work to every such RS.
        for (ReduceSinkOperator rsOp : rsInWork) {
            seen.put(rsOp, info);
            for (TerminalOperator<?> terminal : info.terminalOps) {
                terminalToRs.put(terminal, rsOp);
            }
        }
    }
    pCtx.setTerminalOpToRSMap(terminalToRs);
}
Also used : SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) LinkedHashMap(java.util.LinkedHashMap) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) LinkedHashSet(java.util.LinkedHashSet) HashSet(java.util.HashSet)

Aggregations

ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)4 TerminalOperator (org.apache.hadoop.hive.ql.exec.TerminalOperator)4 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)3 Operator (org.apache.hadoop.hive.ql.exec.Operator)3 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)3 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)3 LinkedHashSet (java.util.LinkedHashSet)2 AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator)2 CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator)2 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)2 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)2 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)2 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)2 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)2 TezDummyStoreOperator (org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator)2 TopNKeyOperator (org.apache.hadoop.hive.ql.exec.TopNKeyOperator)2 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)2 DynamicPruningEventDesc (org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)2 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1