Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class StatsRulesProcFactory, method applyRuntimeStats:
private static Statistics applyRuntimeStats(Context context, Statistics stats, Operator<?> op) {
  if (!((HiveConf) context.getConf()).getBoolVar(ConfVars.HIVE_QUERY_REEXECUTION_ENABLED)) {
    return stats;
  }
  PlanMapper pm = context.getPlanMapper();
  OpTreeSignature treeSig = pm.getSignatureOf(op);
  pm.link(op, treeSig);
  StatsSource statsSource = context.getStatsSource();
  if (!statsSource.canProvideStatsFor(op.getClass())) {
    return stats;
  }
  Optional<OperatorStats> os = statsSource.lookup(treeSig);
  if (!os.isPresent()) {
    return stats;
  }
  LOG.debug("using runtime stats for {}; {}", op, os.get());
  Statistics outStats = stats.clone();
  outStats = outStats.scaleToRowCount(os.get().getOutputRecords(), false);
  outStats.setRuntimeStats(true);
  return outStats;
}
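Note that the first guard makes the whole method a no-op unless query re-execution is enabled on the session configuration. A minimal, hypothetical sketch (not part of the Hive source; the class and method names here are illustrative) of building a HiveConf that passes that guard:

import org.apache.hadoop.hive.conf.HiveConf;

// Illustrative helper only: applyRuntimeStats above returns the compile-time
// estimate unchanged unless this flag is set.
public class RuntimeStatsConfSketch {
  public static HiveConf confWithRuntimeStatsEnabled() {
    HiveConf conf = new HiveConf();
    // The gate checked by the first if-statement in applyRuntimeStats
    conf.setBoolVar(HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_ENABLED, true);
    return conf;
  }
}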
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method removeSemijoinOptimizationByBenefit:
private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx) throws SemanticException {
  Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
  if (map.isEmpty()) {
    // Nothing to do
    return;
  }
  // Scale down stats for tables with DPP
  Map<FilterOperator, Statistics> adjustedStatsMap = new HashMap<>();
  List<ReduceSinkOperator> semijoinRsToRemove = new ArrayList<>();
  double semijoinReductionThreshold =
      procCtx.conf.getFloatVar(HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD);
  // Using SortedSet to make iteration order deterministic
  final Comparator<ReduceSinkOperator> rsOpComp =
      (ReduceSinkOperator o1, ReduceSinkOperator o2) -> (o1.toString().compareTo(o2.toString()));
  SortedSet<ReduceSinkOperator> semiJoinRsOps = new TreeSet<>(rsOpComp);
  semiJoinRsOps.addAll(map.keySet());
  ListMultimap<FilterOperator, SemijoinOperatorInfo> globalReductionFactorMap = ArrayListMultimap.create();
  while (!semiJoinRsOps.isEmpty()) {
    // We will gather the SJs to keep in the plan in the following map
    Map<FilterOperator, SemijoinOperatorInfo> reductionFactorMap = new HashMap<>();
    SortedSet<ReduceSinkOperator> semiJoinRsOpsNewIter = new TreeSet<>(rsOpComp);
    for (ReduceSinkOperator rs : semiJoinRsOps) {
      SemiJoinBranchInfo sjInfo = map.get(rs);
      if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
        // Semijoin created using a hint or marked useful, skip it
        continue;
      }
      // rs is a semijoin optimization branch, which should look like <Parent>-SEL-GB1-RS1-GB2-RS2
      SelectOperator sel = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
      // Check the ndv/rows from the SEL vs the destination tablescan the semijoin opt is going to.
      TableScanOperator ts = sjInfo.getTsOp();
      RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
      List<ExprNodeDesc> targetColumns = rti.getTargetColumns();
      // In semijoin branches the SEL operator has the following forms:
      //   SEL[c1] - single-column semijoin reduction
      //   SEL[c1, c2, ..., ck, hash(hash(hash(c1, c2), ...), ck)] - multi-column semijoin reduction
      // The source columns in the above cases are c1, c2, ..., ck.
      // We need to exclude the hash(...) expression, if it is present.
      List<ExprNodeDesc> sourceColumns = sel.getConf().getColList().subList(0, targetColumns.size());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Computing BloomFilter cost/benefit for " + OperatorUtils.getOpNamePretty(rs)
            + " - " + OperatorUtils.getOpNamePretty(ts) + " " + targetColumns);
      }
      FilterOperator filterOperator = (FilterOperator) ts.getChildOperators().get(0);
      Statistics filterStats = adjustedStatsMap.get(filterOperator);
      if (filterStats == null && filterOperator.getStatistics() != null) {
        filterStats = filterOperator.getStatistics().clone();
        adjustedStatsMap.put(filterOperator, filterStats);
      }
      double reductionFactor = computeBloomFilterNetBenefit(sel, sourceColumns, filterStats, targetColumns);
      if (reductionFactor < semijoinReductionThreshold) {
        // This semijoin optimization should be removed. Do it after we're done iterating
        semijoinRsToRemove.add(rs);
      } else {
        // This semijoin qualifies, add it to the result set
        if (filterStats != null) {
          ImmutableSet.Builder<String> colNames = ImmutableSet.builder();
          for (ExprNodeDesc tsExpr : targetColumns) {
            Set<ExprNodeColumnDesc> allReferencedColumns = ExprNodeDescUtils.findAllColumnDescs(tsExpr);
            for (ExprNodeColumnDesc col : allReferencedColumns) {
              colNames.add(col.getColumn());
            }
          }
          // Check whether another SJ over this TS was already selected in a previous iteration
          SemijoinOperatorInfo prevResult = reductionFactorMap.get(filterOperator);
          if (prevResult != null) {
            if (prevResult.reductionFactor < reductionFactor) {
              // Pick the new SJ since its reduction factor is greater than the previous one we
              // found; re-queue the RS of the previously selected SJ for the next iteration
              reductionFactorMap.put(filterOperator,
                  new SemijoinOperatorInfo(rs, filterOperator, filterStats, colNames.build(), reductionFactor));
              semiJoinRsOpsNewIter.add(prevResult.rsOperator);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Adding " + OperatorUtils.getOpNamePretty(prevResult.rsOperator) + " for re-iteration");
              }
            } else {
              // Keep the old SJ; just re-queue the new RS for the next iteration
              semiJoinRsOpsNewIter.add(rs);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Adding " + OperatorUtils.getOpNamePretty(rs) + " for re-iteration");
              }
            }
          } else {
            // No other SJ existed for this TS, hence just add it to the SJs to keep
            reductionFactorMap.put(filterOperator,
                new SemijoinOperatorInfo(rs, filterOperator, filterStats, colNames.build(), reductionFactor));
          }
        }
      }
    }
    for (SemijoinOperatorInfo roi : reductionFactorMap.values()) {
      // This semijoin will be kept, so adjust the filter statistics.
      // The cast must wrap the whole product; casting only (1.0 - reductionFactor) truncates it to 0.
      long newNumRows = (long) ((1.0 - roi.reductionFactor) * roi.filterStats.getNumRows());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats);
        LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows());
      }
      StatsUtils.updateStats(roi.filterStats, newNumRows, true, roi.filterOperator, roi.colNames);
      if (LOG.isDebugEnabled()) {
        LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats);
      }
      adjustedStatsMap.put(roi.filterOperator, roi.filterStats);
      globalReductionFactorMap.put(roi.filterOperator, roi);
    }
    semiJoinRsOps = semiJoinRsOpsNewIter;
  }
  for (ReduceSinkOperator rs : semijoinRsToRemove) {
    TableScanOperator ts = map.get(rs).getTsOp();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Reduction factor not satisfied for " + OperatorUtils.getOpNamePretty(rs)
          + "-" + OperatorUtils.getOpNamePretty(ts) + ". Removing semijoin optimization.");
    }
    GenTezUtils.removeBranch(rs);
    GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, rs, ts);
  }
  if (!globalReductionFactorMap.isEmpty()) {
    sortSemijoinFilters(procCtx, globalReductionFactorMap);
  }
}
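The per-semijoin stats adjustment is a proportional scale-down of the filter's row count. A short illustrative sketch of the arithmetic (the method name and values here are hypothetical, not from the Hive source):

// Illustrative sketch: how the kept-semijoin adjustment scales rows.
static long adjustedRowCount(long oldNumRows, double reductionFactor) {
  // A reductionFactor of 0.8 means ~80% of rows are expected to be filtered out,
  // so only 20% survive: 1_000_000 rows with factor 0.8 -> 200_000.
  return (long) ((1.0 - reductionFactor) * oldNumRows);
}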
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method getBloomFilterCost:
private static double getBloomFilterCost(SelectOperator sel) {
  double cost = -1;
  Statistics selStats = sel.getStatistics();
  if (selStats != null) {
    cost = selStats.getNumRows();
    // Some other things that could be added here to model cost:
    // Cost of computing/sending partial BloomFilter results? BloomFilterSize * # mappers
    // For reduce-side join, add the cost of the semijoin table scan/dependent tablescans?
  }
  return cost;
}
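computeBloomFilterNetBenefit, referenced in the previous method but not reproduced here, weighs this cost against an estimated benefit. A hedged sketch of the general shape such a ratio can take; this is an illustration under stated assumptions, not the actual Hive implementation:

// Sketch only, not Hive's computeBloomFilterNetBenefit: model net benefit as
// the estimated rows eliminated on the probe side per unit of bloom-filter
// build cost (the SEL row count returned by getBloomFilterCost above).
static double netBenefitSketch(double estimatedRowsEliminated, double bloomFilterCost) {
  if (bloomFilterCost <= 0) {
    return 0; // no usable cost estimate, treat as no benefit
  }
  return estimatedRowsEliminated / bloomFilterCost;
}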
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method markSemiJoinForDPP:
private void markSemiJoinForDPP(OptimizeTezProcContext procCtx) throws SemanticException {
  // Walk every semijoin branch registered in the parse context.
  Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
  for (ReduceSinkOperator rs : map.keySet()) {
    SemiJoinBranchInfo sjInfo = map.get(rs);
    TableScanOperator ts = sjInfo.getTsOp();
    if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
      continue;
    }
    // A TS can have multiple branches due to DPP or Semijoin Opt.
    // Use DFS to traverse all the branches until an RS or DPP is hit.
    Deque<Operator<?>> deque = new LinkedList<>();
    deque.add(ts);
    while (!deque.isEmpty()) {
      Operator<?> op = deque.pollLast();
      if (op instanceof AppMasterEventOperator
          && ((AppMasterEventOperator) op).getConf() instanceof DynamicPruningEventDesc) {
        // DPP. Now look up nDVs on both sides to see the selectivity.
        // <Parent Ops>-SEL-GB1-RS1-GB2-RS2
        SelectOperator selOp = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
        try {
          // Get nDVs on the semijoin edge side
          Statistics stats = selOp.getStatistics();
          if (stats == null) {
            // No stats found on semijoin edge, do nothing
            break;
          }
          String selCol = ExprNodeDescUtils.extractColName(selOp.getConf().getColList().get(0));
          ColStatistics colStatisticsSJ = stats.getColumnStatisticsFromColName(selCol);
          if (colStatisticsSJ == null) {
            // No column stats found for semijoin edge
            break;
          }
          long nDVs = colStatisticsSJ.getCountDistint();
          if (nDVs > 0) {
            // Look up nDVs on the TS side.
            RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
            // TODO Handle multi column semi-joins as part of HIVE-23934
            ExprNodeDesc tsExpr = rti.getTargetColumns().get(0);
            FilterOperator fil = (FilterOperator) (ts.getChildOperators().get(0));
            Statistics filStats = fil.getStatistics();
            if (filStats == null) {
              // No stats found on target, do nothing
              break;
            }
            String colName = ExprNodeDescUtils.extractColName(tsExpr);
            ColStatistics colStatisticsTarget = filStats.getColumnStatisticsFromColName(colName);
            if (colStatisticsTarget == null) {
              // No column stats found on target
              break;
            }
            long nDVsOfTS = colStatisticsTarget.getCountDistint();
            double nDVsOfTSFactored =
                nDVsOfTS * procCtx.conf.getFloatVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR);
            if ((long) nDVsOfTSFactored > nDVs) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("nDVs = " + nDVs + ", nDVsOfTS = " + nDVsOfTS + " and nDVsOfTSFactored = "
                    + nDVsOfTSFactored + ". Adding semijoin branch from ReduceSink " + rs
                    + " to TS " + sjInfo.getTsOp());
              }
              sjInfo.setShouldRemove(false);
            }
          }
        } catch (NullPointerException e) {
          // Do nothing
          if (LOG.isDebugEnabled()) {
            LOG.debug("Caught NPE in markSemiJoinForDPP from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
          }
        }
        break;
      }
      if (op instanceof TerminalOperator) {
        // Done with this branch
        continue;
      }
      deque.addAll(op.getChildOperators());
    }
  }
}
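The keep/remove decision reduces to comparing distinct-value counts on the two sides after applying the configured factor. A short illustrative sketch with hypothetical numbers (the method name here is an assumption, not Hive API):

// Illustrative only: the semijoin branch is kept for DPP when the factored
// nDV of the table-scan column still exceeds the nDV on the semijoin edge,
// i.e. the semijoin is selective enough to be worth running alongside DPP.
static boolean keepSemijoinForDpp(long nDVsSemijoinEdge, long nDVsTableScan, float dppFactor) {
  // e.g. nDVsTableScan = 50_000, dppFactor = 0.5f, nDVsSemijoinEdge = 1_000
  // -> 25_000 > 1_000 -> keep (corresponds to sjInfo.setShouldRemove(false))
  return (long) (nDVsTableScan * dppFactor) > nDVsSemijoinEdge;
}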
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TestReOptimization, method checkUsageOfRuntimeStats:
@SuppressWarnings("rawtypes")
private void checkUsageOfRuntimeStats(IDriver driver, boolean expected) throws CommandProcessorException {
  String query = "select sum(u) from tu join tv on (tu.id_uv=tv.id_uv) where u<10 and v>1";
  PlanMapper pm = getMapperForQuery(driver, query);
  assertEquals(1, driver.getContext().getExecutionIndex());
  List<CommonJoinOperator> allJoin = pm.getAll(CommonJoinOperator.class);
  CommonJoinOperator join = allJoin.iterator().next();
  Statistics joinStat = join.getStatistics();
  assertEquals("expectation of the usage of runtime stats doesn't match", expected, joinStat.isRuntimeStats());
}
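The isRuntimeStats flag asserted here is the same one set by setRuntimeStats(true) in applyRuntimeStats above. A hypothetical sketch of how such a check might be driven from a test (the driver variable names are assumptions, not from the Hive test suite):

// Illustrative usage only: a first-pass plan should carry no runtime stats,
// while a re-executed query should have them applied.
checkUsageOfRuntimeStats(driverWithoutReexecution, false);
checkUsageOfRuntimeStats(driverWithReexecution, true);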