Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class StatsRulesProcFactory, method applyRuntimeStats:
private static Statistics applyRuntimeStats(Context context, Statistics stats, Operator<?> op) {
  if (!((HiveConf) context.getConf()).getBoolVar(ConfVars.HIVE_QUERY_REEXECUTION_ENABLED)) {
    return stats;
  }
  PlanMapper pm = context.getPlanMapper();
  OpTreeSignature treeSig = pm.getSignatureOf(op);
  pm.link(op, treeSig);
  StatsSource statsSource = context.getStatsSource();
  if (!statsSource.canProvideStatsFor(op.getClass())) {
    return stats;
  }
  Optional<OperatorStats> os = statsSource.lookup(treeSig);
  if (!os.isPresent()) {
    return stats;
  }
  LOG.debug("using runtime stats for {}; {}", op, os.get());
  Statistics outStats = stats.clone();
  outStats = outStats.scaleToRowCount(os.get().getOutputRecords(), false);
  outStats.setRuntimeStats(true);
  return outStats;
}
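Note that the first guard makes the whole method a no-op unless query re-execution is enabled on the session configuration. A minimal, hypothetical sketch (not part of the Hive source; the class and method names here are illustrative) of building a HiveConf that passes that guard:

import org.apache.hadoop.hive.conf.HiveConf;

// Illustrative helper only: applyRuntimeStats above returns the compile-time
// estimate unchanged unless this flag is set.
public class RuntimeStatsConfSketch {
  public static HiveConf confWithRuntimeStatsEnabled() {
    HiveConf conf = new HiveConf();
    // The gate checked by the first if-statement in applyRuntimeStats
    conf.setBoolVar(HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_ENABLED, true);
    return conf;
  }
}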
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method removeSemijoinOptimizationByBenefit:
private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx) throws SemanticException {
  Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
  if (map.isEmpty()) {
    // Nothing to do
    return;
  }
  // Scale down stats for tables with DPP
  Map<FilterOperator, Statistics> adjustedStatsMap = new HashMap<>();
  List<ReduceSinkOperator> semijoinRsToRemove = new ArrayList<>();
  double semijoinReductionThreshold =
      procCtx.conf.getFloatVar(HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD);
  // Using SortedSet to make iteration order deterministic
  final Comparator<ReduceSinkOperator> rsOpComp =
      (ReduceSinkOperator o1, ReduceSinkOperator o2) -> (o1.toString().compareTo(o2.toString()));
  SortedSet<ReduceSinkOperator> semiJoinRsOps = new TreeSet<>(rsOpComp);
  semiJoinRsOps.addAll(map.keySet());
  ListMultimap<FilterOperator, SemijoinOperatorInfo> globalReductionFactorMap = ArrayListMultimap.create();
  while (!semiJoinRsOps.isEmpty()) {
    // We will gather the SJs to keep in the plan in the following map
    Map<FilterOperator, SemijoinOperatorInfo> reductionFactorMap = new HashMap<>();
    SortedSet<ReduceSinkOperator> semiJoinRsOpsNewIter = new TreeSet<>(rsOpComp);
    for (ReduceSinkOperator rs : semiJoinRsOps) {
      SemiJoinBranchInfo sjInfo = map.get(rs);
      if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
        // Semijoin created using a hint or marked useful, skip it
        continue;
      }
      // rs is a semijoin optimization branch, which should look like <Parent>-SEL-GB1-RS1-GB2-RS2
      SelectOperator sel = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
      // Check the ndv/rows from the SEL vs the destination tablescan the semijoin opt is going to.
      TableScanOperator ts = sjInfo.getTsOp();
      RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
      List<ExprNodeDesc> targetColumns = rti.getTargetColumns();
      // In semijoin branches the SEL operator has the following forms:
      //   SEL[c1] - single-column semijoin reduction
      //   SEL[c1, c2, ..., ck, hash(hash(hash(c1, c2), ...), ck)] - multi-column semijoin reduction
      // The source columns in the above cases are c1, c2, ..., ck.
      // We need to exclude the hash(...) expression, if it is present.
      List<ExprNodeDesc> sourceColumns = sel.getConf().getColList().subList(0, targetColumns.size());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Computing BloomFilter cost/benefit for " + OperatorUtils.getOpNamePretty(rs)
            + " - " + OperatorUtils.getOpNamePretty(ts) + " " + targetColumns);
      }
      FilterOperator filterOperator = (FilterOperator) ts.getChildOperators().get(0);
      Statistics filterStats = adjustedStatsMap.get(filterOperator);
      if (filterStats == null && filterOperator.getStatistics() != null) {
        filterStats = filterOperator.getStatistics().clone();
        adjustedStatsMap.put(filterOperator, filterStats);
      }
      double reductionFactor = computeBloomFilterNetBenefit(sel, sourceColumns, filterStats, targetColumns);
      if (reductionFactor < semijoinReductionThreshold) {
        // This semijoin optimization should be removed. Do it after we're done iterating
        semijoinRsToRemove.add(rs);
      } else {
        // This semijoin qualifies, add it to the result set
        if (filterStats != null) {
          ImmutableSet.Builder<String> colNames = ImmutableSet.builder();
          for (ExprNodeDesc tsExpr : targetColumns) {
            Set<ExprNodeColumnDesc> allReferencedColumns = ExprNodeDescUtils.findAllColumnDescs(tsExpr);
            for (ExprNodeColumnDesc col : allReferencedColumns) {
              colNames.add(col.getColumn());
            }
          }
          // Check whether another SJ over this TS was already selected in a previous iteration
          SemijoinOperatorInfo prevResult = reductionFactorMap.get(filterOperator);
          if (prevResult != null) {
            if (prevResult.reductionFactor < reductionFactor) {
              // Pick the new SJ since its reduction factor is greater than the previous one we
              // found; re-queue the RS of the previously selected SJ for the next iteration
              reductionFactorMap.put(filterOperator,
                  new SemijoinOperatorInfo(rs, filterOperator, filterStats, colNames.build(), reductionFactor));
              semiJoinRsOpsNewIter.add(prevResult.rsOperator);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Adding " + OperatorUtils.getOpNamePretty(prevResult.rsOperator) + " for re-iteration");
              }
            } else {
              // Keep the old SJ; just re-queue the new RS for the next iteration
              semiJoinRsOpsNewIter.add(rs);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Adding " + OperatorUtils.getOpNamePretty(rs) + " for re-iteration");
              }
            }
          } else {
            // No other SJ existed for this TS, hence just add it to the SJs to keep
            reductionFactorMap.put(filterOperator,
                new SemijoinOperatorInfo(rs, filterOperator, filterStats, colNames.build(), reductionFactor));
          }
        }
      }
    }
    for (SemijoinOperatorInfo roi : reductionFactorMap.values()) {
      // This semijoin will be kept, so adjust the filter statistics.
      // The cast must wrap the whole product; casting only (1.0 - reductionFactor) truncates it to 0.
      long newNumRows = (long) ((1.0 - roi.reductionFactor) * roi.filterStats.getNumRows());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats);
        LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows());
      }
      StatsUtils.updateStats(roi.filterStats, newNumRows, true, roi.filterOperator, roi.colNames);
      if (LOG.isDebugEnabled()) {
        LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats);
      }
      adjustedStatsMap.put(roi.filterOperator, roi.filterStats);
      globalReductionFactorMap.put(roi.filterOperator, roi);
    }
    semiJoinRsOps = semiJoinRsOpsNewIter;
  }
  for (ReduceSinkOperator rs : semijoinRsToRemove) {
    TableScanOperator ts = map.get(rs).getTsOp();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Reduction factor not satisfied for " + OperatorUtils.getOpNamePretty(rs)
          + "-" + OperatorUtils.getOpNamePretty(ts) + ". Removing semijoin optimization.");
    }
    GenTezUtils.removeBranch(rs);
    GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, rs, ts);
  }
  if (!globalReductionFactorMap.isEmpty()) {
    sortSemijoinFilters(procCtx, globalReductionFactorMap);
  }
}
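The per-semijoin stats adjustment is a proportional scale-down of the filter's row count. A short illustrative sketch of the arithmetic (the method name and values here are hypothetical, not from the Hive source):

// Illustrative sketch: how the kept-semijoin adjustment scales rows.
static long adjustedRowCount(long oldNumRows, double reductionFactor) {
  // A reductionFactor of 0.8 means ~80% of rows are expected to be filtered out,
  // so only 20% survive: 1_000_000 rows with factor 0.8 -> 200_000.
  return (long) ((1.0 - reductionFactor) * oldNumRows);
}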
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method getBloomFilterCost:
private static double getBloomFilterCost(SelectOperator sel) {
  double cost = -1;
  Statistics selStats = sel.getStatistics();
  if (selStats != null) {
    cost = selStats.getNumRows();
    // Some other things that could be added here to model cost:
    // Cost of computing/sending partial BloomFilter results? BloomFilterSize * # mappers
    // For reduce-side join, add the cost of the semijoin table scan/dependent tablescans?
  }
  return cost;
}
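computeBloomFilterNetBenefit, referenced in the previous method but not reproduced here, weighs this cost against an estimated benefit. A hedged sketch of the general shape such a ratio can take; this is an illustration under stated assumptions, not the actual Hive implementation:

// Sketch only, not Hive's computeBloomFilterNetBenefit: model net benefit as
// the estimated rows eliminated on the probe side per unit of bloom-filter
// build cost (the SEL row count returned by getBloomFilterCost above).
static double netBenefitSketch(double estimatedRowsEliminated, double bloomFilterCost) {
  if (bloomFilterCost <= 0) {
    return 0; // no usable cost estimate, treat as no benefit
  }
  return estimatedRowsEliminated / bloomFilterCost;
}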
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method markSemiJoinForDPP:
private void markSemiJoinForDPP(OptimizeTezProcContext procCtx) throws SemanticException {
  // Walk every semijoin branch registered in the parse context.
  Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
  for (ReduceSinkOperator rs : map.keySet()) {
    SemiJoinBranchInfo sjInfo = map.get(rs);
    TableScanOperator ts = sjInfo.getTsOp();
    if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
      continue;
    }
    // A TS can have multiple branches due to DPP or Semijoin Opt.
    // Use DFS to traverse all the branches until an RS or DPP is hit.
    Deque<Operator<?>> deque = new LinkedList<>();
    deque.add(ts);
    while (!deque.isEmpty()) {
      Operator<?> op = deque.pollLast();
      if (op instanceof AppMasterEventOperator
          && ((AppMasterEventOperator) op).getConf() instanceof DynamicPruningEventDesc) {
        // DPP. Now look up nDVs on both sides to see the selectivity.
        // <Parent Ops>-SEL-GB1-RS1-GB2-RS2
        SelectOperator selOp = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
        try {
          // Get nDVs on the semijoin edge side
          Statistics stats = selOp.getStatistics();
          if (stats == null) {
            // No stats found on semijoin edge, do nothing
            break;
          }
          String selCol = ExprNodeDescUtils.extractColName(selOp.getConf().getColList().get(0));
          ColStatistics colStatisticsSJ = stats.getColumnStatisticsFromColName(selCol);
          if (colStatisticsSJ == null) {
            // No column stats found for semijoin edge
            break;
          }
          long nDVs = colStatisticsSJ.getCountDistint();
          if (nDVs > 0) {
            // Look up nDVs on the TS side.
            RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
            // TODO Handle multi column semi-joins as part of HIVE-23934
            ExprNodeDesc tsExpr = rti.getTargetColumns().get(0);
            FilterOperator fil = (FilterOperator) (ts.getChildOperators().get(0));
            Statistics filStats = fil.getStatistics();
            if (filStats == null) {
              // No stats found on target, do nothing
              break;
            }
            String colName = ExprNodeDescUtils.extractColName(tsExpr);
            ColStatistics colStatisticsTarget = filStats.getColumnStatisticsFromColName(colName);
            if (colStatisticsTarget == null) {
              // No column stats found on target
              break;
            }
            long nDVsOfTS = colStatisticsTarget.getCountDistint();
            double nDVsOfTSFactored =
                nDVsOfTS * procCtx.conf.getFloatVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR);
            if ((long) nDVsOfTSFactored > nDVs) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("nDVs = " + nDVs + ", nDVsOfTS = " + nDVsOfTS + " and nDVsOfTSFactored = "
                    + nDVsOfTSFactored + ". Adding semijoin branch from ReduceSink " + rs
                    + " to TS " + sjInfo.getTsOp());
              }
              sjInfo.setShouldRemove(false);
            }
          }
        } catch (NullPointerException e) {
          // Do nothing
          if (LOG.isDebugEnabled()) {
            LOG.debug("Caught NPE in markSemiJoinForDPP from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
          }
        }
        break;
      }
      if (op instanceof TerminalOperator) {
        // Done with this branch
        continue;
      }
      deque.addAll(op.getChildOperators());
    }
  }
}
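The keep/remove decision reduces to comparing distinct-value counts on the two sides after applying the configured factor. A short illustrative sketch with hypothetical numbers (the method name here is an assumption, not Hive API):

// Illustrative only: the semijoin branch is kept for DPP when the factored
// nDV of the table-scan column still exceeds the nDV on the semijoin edge,
// i.e. the semijoin is selective enough to be worth running alongside DPP.
static boolean keepSemijoinForDpp(long nDVsSemijoinEdge, long nDVsTableScan, float dppFactor) {
  // e.g. nDVsTableScan = 50_000, dppFactor = 0.5f, nDVsSemijoinEdge = 1_000
  // -> 25_000 > 1_000 -> keep (corresponds to sjInfo.setShouldRemove(false))
  return (long) (nDVsTableScan * dppFactor) > nDVsSemijoinEdge;
}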
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TestReOptimization, method checkUsageOfRuntimeStats:
@SuppressWarnings("rawtypes")
private void checkUsageOfRuntimeStats(IDriver driver, boolean expected) throws CommandProcessorException {
  String query = "select sum(u) from tu join tv on (tu.id_uv=tv.id_uv) where u<10 and v>1";
  PlanMapper pm = getMapperForQuery(driver, query);
  assertEquals(1, driver.getContext().getExecutionIndex());
  List<CommonJoinOperator> allJoin = pm.getAll(CommonJoinOperator.class);
  CommonJoinOperator join = allJoin.iterator().next();
  Statistics joinStat = join.getStatistics();
  assertEquals("expectation of the usage of runtime stats doesn't match", expected, joinStat.isRuntimeStats());
}
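The isRuntimeStats flag asserted here is the same one set by setRuntimeStats(true) in applyRuntimeStats above. A hypothetical sketch of how such a check might be driven from a test (the driver variable names are assumptions, not from the Hive test suite):

// Illustrative usage only: a first-pass plan should carry no runtime stats,
// while a re-executed query should have them applied.
checkUsageOfRuntimeStats(driverWithoutReexecution, false);
checkUsageOfRuntimeStats(driverWithReexecution, true);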