Example 6 with DynamicPruningEventDesc

use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.

the class SharedWorkOptimizer method sharedWorkOptimization.

/**
 * Class wrapping shared work optimizer.
 * This implementation enables merging of TS with different schemas by taking the union of the
 * {@link TableScanDesc#getNeededColumns()} and {@link TableScanDesc#getNeededColumnIDs()}
 * from both {@link TableScanOperator}s.
 */
public boolean sharedWorkOptimization(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, List<TableScanOperator> tableScans, Mode mode, boolean schemaMerge) throws SemanticException {
    // Boolean to keep track of whether this method actually merged any TS operators
    boolean mergedExecuted = false;
    Set<TableScanOperator> retainedScans = new LinkedHashSet<>();
    Set<Operator<?>> removedOps = new HashSet<>();
    for (TableScanOperator discardableTsOp : tableScans) {
        TableName tableName1 = discardableTsOp.getTableName();
        if (discardableTsOp.getNumChild() == 0) {
            removedOps.add(discardableTsOp);
        }
        if (removedOps.contains(discardableTsOp)) {
            LOG.debug("Skip {} as it has already been removed", discardableTsOp);
            continue;
        }
        for (TableScanOperator retainableTsOp : retainedScans) {
            if (optimizerCache.getWorkGroup(discardableTsOp).contains(retainableTsOp)) {
                LOG.trace("No need check further {} and {} are in the same group", discardableTsOp, retainableTsOp);
                continue;
            }
            if (removedOps.contains(retainableTsOp)) {
                LOG.debug("Skip {} as it has already been removed", retainableTsOp);
                continue;
            }
            LOG.debug("Can we merge {} into {} to remove a scan on {}?", discardableTsOp, retainableTsOp, tableName1);
            SharedResult sr;
            // If Iceberg metadata tables are in the query, disable this optimization.
            String metaTable1 = retainableTsOp.getConf().getTableMetadata().getMetaTable();
            String metaTable2 = discardableTsOp.getConf().getTableMetadata().getMetaTable();
            if (metaTable1 != null || metaTable2 != null) {
                LOG.info("Skip the schema merging as the query contains Iceberg metadata table.");
                continue;
            }
            if (!schemaMerge && !compatibleSchema(retainableTsOp, discardableTsOp)) {
                LOG.debug("incompatible schemas: {} {} for {} (and merge disabled)", discardableTsOp, retainableTsOp, tableName1);
                continue;
            }
            if (mode == Mode.RemoveSemijoin) {
                // We check if the two table scan operators can actually be merged modulo SJs.
                // Hence, two conditions should be met:
                // (i) the TS ops should be mergeable excluding any kind of DPP, and
                // (ii) the DPP branches (excluding SJs) should be the same
                boolean mergeable = areMergeable(pctx, retainableTsOp, discardableTsOp);
                if (!mergeable) {
                    // Skip
                    LOG.debug("{} and {} cannot be merged", retainableTsOp, discardableTsOp);
                    continue;
                }
                boolean validMerge = areMergeableExcludeSemijoinsExtendedCheck(pctx, optimizerCache, retainableTsOp, discardableTsOp);
                if (!validMerge) {
                    // Skip
                    LOG.debug("{} and {} do not meet preconditions", retainableTsOp, discardableTsOp);
                    continue;
                }
                // If tests pass, we create the shared work optimizer additional information
                // about the part of the tree that can be merged. We need to regenerate the
                // cache because semijoin operators have been removed
                sr = extractSharedOptimizationInfoForRoot(pctx, optimizerCache, retainableTsOp, discardableTsOp, true, true);
            } else if (mode == Mode.DPPUnion) {
                boolean mergeable = areMergeable(pctx, retainableTsOp, discardableTsOp);
                if (!mergeable) {
                    LOG.debug("{} and {} cannot be merged", retainableTsOp, discardableTsOp);
                    continue;
                }
                boolean validMerge = areMergeableDppUnion(pctx, optimizerCache, retainableTsOp, discardableTsOp);
                if (!validMerge) {
                    // Skip
                    LOG.debug("{} and {} do not meet preconditions", retainableTsOp, discardableTsOp);
                    continue;
                }
                // If tests pass, we create the shared work optimizer additional information
                // about the part of the tree that can be merged. We need to regenerate the
                // cache because semijoin operators have been removed
                sr = extractSharedOptimizationInfoForRoot(pctx, optimizerCache, retainableTsOp, discardableTsOp, false, false);
                if (!validPreConditions(pctx, optimizerCache, sr)) {
                    continue;
                }
            } else if (mode == Mode.SubtreeMerge) {
                // First we quickly check if the two table scan operators can actually be merged
                if (!areMergeable(pctx, retainableTsOp, discardableTsOp) || !areMergeableExtendedCheck(pctx, optimizerCache, retainableTsOp, discardableTsOp)) {
                    // Skip
                    LOG.debug("{} and {} cannot be merged", retainableTsOp, discardableTsOp);
                    continue;
                }
                // Secondly, we extract information about the part of the tree that can be merged
                // as well as some structural information (memory consumption) that needs to be
                // used to determine whether the merge can happen
                sr = extractSharedOptimizationInfoForRoot(pctx, optimizerCache, retainableTsOp, discardableTsOp, true, true);
                // It seems these two operators can be merged.
                // Check that the plan meets some preconditions before doing it.
                // In particular, in the presence of map joins in the upstream plan:
                // - we cannot exceed the noconditional task size, and
                // - if we already merged the big table, we cannot merge the broadcast
                // tables.
                if (!validPreConditions(pctx, optimizerCache, sr)) {
                    // Skip
                    LOG.debug("{} and {} do not meet preconditions", retainableTsOp, discardableTsOp);
                    continue;
                }
            } else {
                throw new RuntimeException("unhandled mode: " + mode);
            }
            // We can merge
            mergedExecuted = true;
            if (mode != Mode.DPPUnion && sr.retainableOps.size() > 1) {
                // More than just the TS operator in the retained subtree
                Operator<?> lastRetainableOp = sr.retainableOps.get(sr.retainableOps.size() - 1);
                Operator<?> lastDiscardableOp = sr.discardableOps.get(sr.discardableOps.size() - 1);
                if (lastDiscardableOp.getNumChild() != 0) {
                    List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(lastDiscardableOp.getChildOperators());
                    for (Operator<? extends OperatorDesc> op : allChildren) {
                        lastDiscardableOp.getChildOperators().remove(op);
                        op.replaceParent(lastDiscardableOp, lastRetainableOp);
                        lastRetainableOp.getChildOperators().add(op);
                    }
                }
                LOG.debug("Merging subtree starting at {} into subtree starting at {}", discardableTsOp, retainableTsOp);
            } else {
                if (sr.discardableOps.size() > 1) {
                    throw new RuntimeException("we can't discard more in this path");
                }
                DecomposedTs modelR = new DecomposedTs(retainableTsOp);
                DecomposedTs modelD = new DecomposedTs(discardableTsOp);
                // Push filter on top of children for retainable
                pushFilterToTopOfTableScan(optimizerCache, modelR);
                if (mode == Mode.RemoveSemijoin || mode == Mode.SubtreeMerge) {
                    // For RemoveSemijoin: this will clear the discardable TS's semijoin filters
                    replaceSemijoinExpressions(discardableTsOp, modelR.getSemiJoinFilter());
                }
                modelD.replaceTabAlias(discardableTsOp.getConf().getAlias(), retainableTsOp.getConf().getAlias());
                // Push filter on top of children for discardable
                pushFilterToTopOfTableScan(optimizerCache, modelD);
                // Obtain filter for shared TS operator
                ExprNodeDesc exprNode = null;
                if (modelR.normalFilterExpr != null && modelD.normalFilterExpr != null) {
                    exprNode = disjunction(modelR.normalFilterExpr, modelD.normalFilterExpr);
                }
                List<ExprNodeDesc> semiJoinExpr = null;
                if (mode == Mode.DPPUnion) {
                    assert modelR.semijoinExprNodes != null;
                    assert modelD.semijoinExprNodes != null;
                    ExprNodeDesc disjunction = disjunction(conjunction(modelR.semijoinExprNodes), conjunction(modelD.semijoinExprNodes));
                    semiJoinExpr = disjunction == null ? null : Lists.newArrayList(disjunction);
                } else {
                    semiJoinExpr = modelR.semijoinExprNodes;
                }
                // Create expression node that will be used for the retainable table scan
                exprNode = conjunction(semiJoinExpr, exprNode);
                // Replace filter
                retainableTsOp.getConf().setFilterExpr((ExprNodeGenericFuncDesc) exprNode);
                // Replace table scan operator
                adoptChildren(retainableTsOp, discardableTsOp);
                LOG.debug("Merging {} into {}", discardableTsOp, retainableTsOp);
            }
            // First we remove the input operators of the expression that
            // we are going to eliminate
            if (mode != Mode.DPPUnion) {
                for (Operator<?> op : sr.discardableInputOps) {
                    OperatorUtils.removeOperator(op);
                    optimizerCache.removeOp(op);
                    removedOps.add(op);
                    // Remove DPP predicates
                    if (op instanceof ReduceSinkOperator) {
                        SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
                        if (sjbi != null && !sr.discardableOps.contains(sjbi.getTsOp()) && !sr.discardableInputOps.contains(sjbi.getTsOp())) {
                            GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) op, sjbi.getTsOp());
                            optimizerCache.tableScanToDPPSource.remove(sjbi.getTsOp(), op);
                        }
                    } else if (op instanceof AppMasterEventOperator) {
                        DynamicPruningEventDesc dped = (DynamicPruningEventDesc) op.getConf();
                        if (!sr.discardableOps.contains(dped.getTableScan()) && !sr.discardableInputOps.contains(dped.getTableScan())) {
                            GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) op, dped.getTableScan());
                            optimizerCache.tableScanToDPPSource.remove(dped.getTableScan(), op);
                        }
                    }
                    LOG.debug("Input operator removed: {}", op);
                }
            }
            // Filtered-out rows from one branch might be needed by another branch sharing a TSop
            if (retainableTsOp.getProbeDecodeContext() != null) {
                LOG.debug("Removing probeDecodeCntx for merged TS op {}", retainableTsOp);
                retainableTsOp.setProbeDecodeContext(null);
                retainableTsOp.getConf().setProbeDecodeContext(null);
            }
            // Then we merge the operators of the works we are going to merge
            mergeSchema(discardableTsOp, retainableTsOp);
            if (mode == Mode.DPPUnion) {
                // reparent all
                Collection<Operator<?>> discardableDPP = optimizerCache.tableScanToDPPSource.get(discardableTsOp);
                for (Operator<?> op : discardableDPP) {
                    if (op instanceof ReduceSinkOperator) {
                        SemiJoinBranchInfo sjInfo = pctx.getRsToSemiJoinBranchInfo().get(op);
                        sjInfo.setTableScan(retainableTsOp);
                    } else if (op.getConf() instanceof DynamicPruningEventDesc) {
                        DynamicPruningEventDesc dynamicPruningEventDesc = (DynamicPruningEventDesc) op.getConf();
                        dynamicPruningEventDesc.setTableScan(retainableTsOp);
                    }
                }
                optimizerCache.tableScanToDPPSource.get(retainableTsOp).addAll(discardableDPP);
                discardableDPP.clear();
            }
            optimizerCache.removeOpAndCombineWork(discardableTsOp, retainableTsOp);
            removedOps.add(discardableTsOp);
            // Finally we remove the expression from the tree
            for (Operator<?> op : sr.discardableOps) {
                OperatorUtils.removeOperator(op);
                optimizerCache.removeOp(op);
                removedOps.add(op);
                LOG.debug("Operator removed: {}", op);
            }
            if (pctx.getConf().getBoolVar(ConfVars.HIVE_SHARED_WORK_DOWNSTREAM_MERGE)) {
                if (sr.discardableOps.size() == 1) {
                    downStreamMerge(retainableTsOp, optimizerCache, pctx);
                }
            }
            break;
        }
        if (removedOps.contains(discardableTsOp)) {
            // This operator has been removed, remove it from the list of existing operators
            // FIXME: there is no point of this
            retainedScans.remove(discardableTsOp);
        } else {
            // This operator has not been removed, include it in the list of existing operators
            retainedScans.add(discardableTsOp);
        }
    }
    // Remove unused table scan operators
    pctx.getTopOps().entrySet().removeIf((Entry<String, TableScanOperator> e) -> e.getValue().getNumChild() == 0);
    tableScans.removeAll(removedOps);
    return mergedExecuted;
}
Also used: LinkedHashSet(java.util.LinkedHashSet) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) TableName(org.apache.hadoop.hive.common.TableName) Entry(java.util.Map.Entry) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) HashSet(java.util.HashSet)
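
The schema union described in the method's javadoc is the heart of the merge: the retained TableScanOperator must project every column that either scan needed. A minimal sketch of that idea, assuming the standard TableScanDesc accessors named in the javadoc plus the matching setters (the real mergeSchema in SharedWorkOptimizer also reconciles virtual and referenced columns), could look like this:

private static void unionNeededColumns(TableScanOperator retainableTsOp, TableScanOperator discardableTsOp) {
    TableScanDesc retained = retainableTsOp.getConf();
    TableScanDesc discarded = discardableTsOp.getConf();
    // Union of projected column ids, preserving encounter order without duplicates
    Set<Integer> ids = new LinkedHashSet<>(retained.getNeededColumnIDs());
    ids.addAll(discarded.getNeededColumnIDs());
    retained.setNeededColumnIDs(new ArrayList<>(ids));
    // Union of projected column names, mirroring the id union above
    Set<String> cols = new LinkedHashSet<>(retained.getNeededColumns());
    cols.addAll(discarded.getNeededColumns());
    retained.setNeededColumns(new ArrayList<>(cols));
}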

Example 7 with DynamicPruningEventDesc

use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.

the class SharedWorkOptimizer method gatherDPPTableScanOps.

/**
 * This method gathers the TS operators with DPP from the context and
 * stores them into the input optimization cache.
 */
private static void gatherDPPTableScanOps(ParseContext pctx, SharedWorkOptimizerCache optimizerCache) throws SemanticException {
    // Find TS operators with partition pruning enabled in plan
    // because these TS may potentially read different data for
    // different pipelines.
    // These can be:
    // 1) TS with DPP.
    // 2) TS with semijoin DPP.
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    Collection<Operator<?>> tableScanOps = Lists.<Operator<?>>newArrayList(topOps.values());
    Set<AppMasterEventOperator> s = OperatorUtils.findOperators(tableScanOps, AppMasterEventOperator.class);
    for (AppMasterEventOperator a : s) {
        if (a.getConf() instanceof DynamicPruningEventDesc) {
            DynamicPruningEventDesc dped = (DynamicPruningEventDesc) a.getConf();
            optimizerCache.tableScanToDPPSource.put(dped.getTableScan(), a);
        }
    }
    for (Entry<ReduceSinkOperator, SemiJoinBranchInfo> e : pctx.getRsToSemiJoinBranchInfo().entrySet()) {
        optimizerCache.tableScanToDPPSource.put(e.getValue().getTsOp(), e.getKey());
    }
    LOG.debug("DPP information stored in the cache: {}", optimizerCache.tableScanToDPPSource);
}
Also used: ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)
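
Once populated, tableScanToDPPSource is the lookup the optimizer consults whenever it needs every pruning source feeding a scan. A hypothetical fragment (assuming an optimizerCache and a TableScanOperator ts in scope, as in the method above) that distinguishes the two kinds of sources the cache can hold:

for (Operator<?> source : optimizerCache.tableScanToDPPSource.get(ts)) {
    if (source instanceof AppMasterEventOperator) {
        // Event-based DPP: the AppMasterEventOperator broadcasts pruning values
        LOG.debug("Event DPP source for {}: {}", ts, source);
    } else if (source instanceof ReduceSinkOperator) {
        // Semijoin DPP: the ReduceSink feeds a min/max/bloom-filter branch
        LOG.debug("Semijoin DPP source for {}: {}", ts, source);
    }
}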

Example 8 with DynamicPruningEventDesc

use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.

the class TezCompiler method removeCycleOperator.

private void removeCycleOperator(Set<Operator<?>> component, OptimizeTezProcContext context) throws SemanticException {
    AppMasterEventOperator victimAM = null;
    TableScanOperator victimTS = null;
    ReduceSinkOperator victimRS = null;
    // If there is a hint and no operator is removed then throw error
    boolean hasHint = false;
    boolean removed = false;
    for (Operator<?> o : component) {
        // Look for AppMasterEventOperator or ReduceSinkOperator
        if (o instanceof AppMasterEventOperator) {
            if (victimAM == null || o.getStatistics().getDataSize() < victimAM.getStatistics().getDataSize()) {
                victimAM = (AppMasterEventOperator) o;
                removed = true;
            }
        } else if (o instanceof ReduceSinkOperator) {
            SemiJoinBranchInfo sjInfo = context.parseContext.getRsToSemiJoinBranchInfo().get(o);
            if (sjInfo == null) {
                continue;
            }
            if (sjInfo.getIsHint()) {
                // Skipping because of hint. Mark this info.
                hasHint = true;
                continue;
            }
            TableScanOperator ts = sjInfo.getTsOp();
            // Sanity check
            assert component.contains(ts);
            if (victimRS == null || ts.getStatistics().getDataSize() < victimTS.getStatistics().getDataSize()) {
                victimRS = (ReduceSinkOperator) o;
                victimTS = ts;
                removed = true;
            }
        }
    }
    // Always set the semijoin optimization as victim.
    Operator<?> victim = victimRS;
    if (victimRS == null && victimAM != null) {
        victim = victimAM;
    } else if (victimAM == null) {
        // do nothing
    } else {
        // Cycle consists of at least one dynamic partition pruning (DPP)
        // optimization and at least one min/max optimization.
        // DPP is a better optimization unless it ends up scanning the
        // bigger table for keys instead of the smaller table.
        // Get the parent TS of victimRS.
        Operator<?> op = victimRS;
        while (!(op instanceof TableScanOperator)) {
            op = op.getParentOperators().get(0);
        }
        if ((2 * op.getStatistics().getDataSize()) < victimAM.getStatistics().getDataSize()) {
            victim = victimAM;
        }
    }
    if (hasHint && !removed) {
        // There is hint but none of the operators removed. Throw error
        throw new SemanticException("The user hint is causing an operator cycle. Please fix it and retry");
    }
    if (victim == null || (!context.pruningOpsRemovedByPriorOpt.isEmpty() && context.pruningOpsRemovedByPriorOpt.contains(victim))) {
        return;
    }
    GenTezUtils.removeBranch(victim);
    if (victim == victimRS) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Cycle found. Removing semijoin " + OperatorUtils.getOpNamePretty(victimRS) + " - " + OperatorUtils.getOpNamePretty(victimTS));
        }
        GenTezUtils.removeSemiJoinOperator(context.parseContext, victimRS, victimTS);
    } else {
        // at this point we've found the fork in the op pipeline that has the pruning as a child plan.
        LOG.info("Disabling dynamic pruning for: " + ((DynamicPruningEventDesc) victim.getConf()).getTableScan().toString() + ". Needed to break cyclic dependency");
    }
}
Also used: CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TopNKeyOperator(org.apache.hadoop.hive.ql.exec.TopNKeyOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)
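
The victim choice above compresses to one comparison: the semijoin branch is sacrificed by default, and the DPP branch only becomes the victim when it would scan the bigger side for keys. An isolated restatement of that heuristic (the method name and long parameter are illustrative, not Hive API):

static Operator<?> chooseVictim(ReduceSinkOperator victimRS, AppMasterEventOperator victimAM,
        long semijoinSourceScanSize) {
    if (victimRS == null) {
        return victimAM; // the cycle only contains DPP branches
    }
    if (victimAM == null) {
        return victimRS; // the cycle only contains semijoin branches
    }
    // Keep DPP (drop the semijoin) unless DPP reads more than twice the data
    // that the semijoin's source table scan reads
    return (2 * semijoinSourceScanSize) < victimAM.getStatistics().getDataSize()
            ? victimAM : victimRS;
}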

Example 9 with DynamicPruningEventDesc

use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.

the class TezCompiler method markSemiJoinForDPP.

private void markSemiJoinForDPP(OptimizeTezProcContext procCtx) throws SemanticException {
    // Map from each semijoin ReduceSink to its branch info (target TS, hint flag, removal flag).
    Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
    for (ReduceSinkOperator rs : map.keySet()) {
        SemiJoinBranchInfo sjInfo = map.get(rs);
        TableScanOperator ts = sjInfo.getTsOp();
        if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
            continue;
        }
        // A TS can have multiple branches due to DPP or semijoin optimization.
        // Use DFS to traverse all the branches until RS or DPP is hit.
        Deque<Operator<?>> deque = new LinkedList<>();
        deque.add(ts);
        while (!deque.isEmpty()) {
            Operator<?> op = deque.pollLast();
            if (op instanceof AppMasterEventOperator && ((AppMasterEventOperator) op).getConf() instanceof DynamicPruningEventDesc) {
                // DPP. Now look up nDVs on both sides to see the selectivity.
                // <Parent Ops>-SEL-GB1-RS1-GB2-RS2
                SelectOperator selOp = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
                try {
                    // Get nDVs on Semijoin edge side
                    Statistics stats = selOp.getStatistics();
                    if (stats == null) {
                        // No stats found on semijoin edge, do nothing
                        break;
                    }
                    String selCol = ExprNodeDescUtils.extractColName(selOp.getConf().getColList().get(0));
                    ColStatistics colStatisticsSJ = stats.getColumnStatisticsFromColName(selCol);
                    if (colStatisticsSJ == null) {
                        // No column stats found for semijoin edge
                        break;
                    }
                    long nDVs = colStatisticsSJ.getCountDistint();
                    if (nDVs > 0) {
                        // Lookup nDVs on TS side.
                        RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
                        // TODO Handle multi column semi-joins as part of HIVE-23934
                        ExprNodeDesc tsExpr = rti.getTargetColumns().get(0);
                        FilterOperator fil = (FilterOperator) (ts.getChildOperators().get(0));
                        Statistics filStats = fil.getStatistics();
                        if (filStats == null) {
                            // No stats found on target, do nothing
                            break;
                        }
                        String colName = ExprNodeDescUtils.extractColName(tsExpr);
                        ColStatistics colStatisticsTarget = filStats.getColumnStatisticsFromColName(colName);
                        if (colStatisticsTarget == null) {
                            // No column stats found on target
                            break;
                        }
                        long nDVsOfTS = colStatisticsTarget.getCountDistint();
                        double nDVsOfTSFactored = nDVsOfTS * procCtx.conf.getFloatVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR);
                        if ((long) nDVsOfTSFactored > nDVs) {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("nDVs = " + nDVs + ", nDVsOfTS = " + nDVsOfTS + " and nDVsOfTSFactored = " + nDVsOfTSFactored + "Adding semijoin branch from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
                            }
                            sjInfo.setShouldRemove(false);
                        }
                    }
                } catch (NullPointerException e) {
                    // Do nothing
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Caught NPE in markSemiJoinForDPP from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
                    }
                }
                break;
            }
            if (op instanceof TerminalOperator) {
                // Done with this branch
                continue;
            }
            deque.addAll(op.getChildOperators());
        }
    }
}
Also used: CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TopNKeyOperator(org.apache.hadoop.hive.ql.exec.TopNKeyOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) AnnotateWithStatistics(org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) LinkedList(java.util.LinkedList) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
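
The keep-or-remove decision above boils down to comparing distinct-value counts on the two sides of the semijoin edge. A stand-alone restatement with hypothetical numbers plugged in (the factor corresponds to ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR; sjInfo is the branch info from the loop above):

long nDVsSemijoinEdge = 100_000L;   // distinct keys produced on the semijoin edge
long nDVsTargetColumn = 1_000_000L; // distinct keys in the filtered target column
float dppFactor = 0.5f;             // configured reduction factor for DPP
double nDVsTargetFactored = nDVsTargetColumn * dppFactor; // 500,000
if ((long) nDVsTargetFactored > nDVsSemijoinEdge) {
    // 500,000 > 100,000: the semijoin edge carries far fewer distinct keys than
    // the target column, so the filter is selective and the branch is kept
    sjInfo.setShouldRemove(false);
}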

Example 10 with DynamicPruningEventDesc

use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.

the class ConvertJoinMapJoin method convertJoinMapJoin.

/*
   * Once we have decided on the map join, the tree would transform from
   *
   *        |                   |
   *       Join               MapJoin
   *       / \                /   \
   *     RS   RS   --->     RS    TS (big table)
   *    /      \           /
   *   TS       TS        TS (small table)
   *
   * for tez.
   */
public MapJoinOperator convertJoinMapJoin(JoinOperator joinOp, OptimizeTezProcContext context, int bigTablePosition, boolean removeReduceSink) throws SemanticException {
    // Bail on MuxOperator because currently the mux operator masks the emit keys
    // of the constituent reduce sinks.
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
        if (parentOp instanceof MuxOperator) {
            return null;
        }
    }
    // We can safely convert the join to a map join.
    MapJoinOperator mapJoinOp = MapJoinProcessor.convertJoinOpMapJoinOp(context.conf, joinOp, joinOp.getConf().isLeftInputJoin(), joinOp.getConf().getBaseSrc(), joinOp.getConf().getMapAliases(), bigTablePosition, true, removeReduceSink);
    mapJoinOp.getConf().setHybridHashJoin(HiveConf.getBoolVar(context.conf, HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN));
    List<ExprNodeDesc> joinExprs = mapJoinOp.getConf().getKeys().values().iterator().next();
    if (joinExprs.size() == 0) {
        // In case of cross join, we disable hybrid grace hash join
        mapJoinOp.getConf().setHybridHashJoin(false);
    }
    Operator<? extends OperatorDesc> parentBigTableOp = mapJoinOp.getParentOperators().get(bigTablePosition);
    if (parentBigTableOp instanceof ReduceSinkOperator) {
        Operator<?> parentSelectOpOfBigTableOp = parentBigTableOp.getParentOperators().get(0);
        if (removeReduceSink) {
            for (Operator<?> p : parentBigTableOp.getParentOperators()) {
                // we might have generated a dynamic partition pruning operator chain. Since
                // we're removing the reduce sink, we need to remove that too.
                Set<Operator<?>> dynamicPartitionOperators = new HashSet<Operator<?>>();
                Map<Operator<?>, AppMasterEventOperator> opEventPairs = new HashMap<>();
                for (Operator<?> c : p.getChildOperators()) {
                    AppMasterEventOperator event = findDynamicPartitionBroadcast(c);
                    if (event != null) {
                        dynamicPartitionOperators.add(c);
                        opEventPairs.put(c, event);
                    }
                }
                for (Operator<?> c : dynamicPartitionOperators) {
                    if (context.pruningOpsRemovedByPriorOpt.isEmpty() || !context.pruningOpsRemovedByPriorOpt.contains(opEventPairs.get(c))) {
                        p.removeChild(c);
                        // at this point we've found the fork in the op pipeline that has the pruning as a child plan.
                        LOG.info("Disabling dynamic pruning for: " + ((DynamicPruningEventDesc) opEventPairs.get(c).getConf()).getTableScan().getName() + ". Need to be removed together with reduce sink");
                    }
                }
                for (Operator<?> op : dynamicPartitionOperators) {
                    context.pruningOpsRemovedByPriorOpt.add(opEventPairs.get(op));
                }
            }
            mapJoinOp.getParentOperators().remove(bigTablePosition);
            if (!(mapJoinOp.getParentOperators().contains(parentBigTableOp.getParentOperators().get(0)))) {
                mapJoinOp.getParentOperators().add(bigTablePosition, parentBigTableOp.getParentOperators().get(0));
            }
            parentBigTableOp.getParentOperators().get(0).removeChild(parentBigTableOp);
        }
        for (Operator<? extends OperatorDesc> op : mapJoinOp.getParentOperators()) {
            if (!(op.getChildOperators().contains(mapJoinOp))) {
                op.getChildOperators().add(mapJoinOp);
            }
            op.getChildOperators().remove(joinOp);
        }
        // Remove any semijoin branch if present. The semijoin branch can
        // potentially create a task level cycle with the hash join, except
        // when it is a dynamically partitioned hash join which takes place
        // in a separate task.
        if (context.parseContext.getRsToSemiJoinBranchInfo().size() > 0 && removeReduceSink) {
            removeCycleCreatingSemiJoinOps(mapJoinOp, parentSelectOpOfBigTableOp, context.parseContext);
        }
    }
    return mapJoinOp;
}
Also used: MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) MuxOperator(org.apache.hadoop.hive.ql.exec.MuxOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) HashMap(java.util.HashMap) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) HashSet(java.util.HashSet)
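
A hypothetical call site shows how the arguments interact (both values are illustrative): a null return means the conversion was vetoed, for example by the MuxOperator guard at the top of the method, and the caller falls back to a shuffle join.

MapJoinOperator mapJoin = convertJoinMapJoin(joinOp, context,
        /* bigTablePosition */ 1, /* removeReduceSink */ true);
if (mapJoin == null) {
    // Conversion refused; the join stays as a reduce-side (shuffle) join
    LOG.debug("Map join conversion not possible for {}", joinOp);
}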

Aggregations

DynamicPruningEventDesc (org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc): 16 uses
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 14 uses
DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator): 13 uses
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 13 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 13 uses
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 13 uses
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 13 uses
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 12 uses
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 11 uses
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 11 uses
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 9 uses
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 9 uses
HashSet (java.util.HashSet): 7 uses
CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator): 7 uses
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 7 uses
TezDummyStoreOperator (org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator): 7 uses
LinkedHashSet (java.util.LinkedHashSet): 6 uses
SemiJoinBranchInfo (org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo): 6 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 6 uses
HashMap (java.util.HashMap): 4 uses