Usage of org.apache.hadoop.hive.ql.exec.TerminalOperator in the Apache Hive project.
From class SetSparkReducerParallelism, method needSetParallelism:
/**
 * Decides whether this ReduceSink should have its parallelism set automatically.
 * Returns true when no reducer count is configured, or when a single-reducer
 * order-by could be sampled (sampling enabled, not deduplicated) and no
 * LimitOperator appears anywhere below the ReduceSink.
 */
private boolean needSetParallelism(ReduceSinkOperator reduceSink, HiveConf hiveConf) {
  ReduceSinkDesc rsDesc = reduceSink.getConf();
  // No reducer count configured yet: always auto-set parallelism.
  if (rsDesc.getNumReducers() <= 0) {
    return true;
  }
  boolean samplableOrderBy = rsDesc.getNumReducers() == 1
      && rsDesc.hasOrderBy()
      && hiveConf.getBoolVar(HiveConf.ConfVars.HIVESAMPLINGFORORDERBY)
      && !rsDesc.isDeduplicated();
  if (!samplableOrderBy) {
    return false;
  }
  // Depth-first walk over everything below the RS: a LimitOperator anywhere
  // in the subtree means sampling is not worthwhile, so report false.
  Stack<Operator<? extends OperatorDesc>> pending = new Stack<Operator<? extends OperatorDesc>>();
  List<Operator<? extends OperatorDesc>> directChildren = reduceSink.getChildOperators();
  if (directChildren != null) {
    for (Operator<? extends OperatorDesc> child : directChildren) {
      pending.push(child);
    }
  }
  while (!pending.isEmpty()) {
    Operator<? extends OperatorDesc> current = pending.pop();
    if (current instanceof LimitOperator) {
      // A LIMIT below the RS keeps the output small; leave parallelism alone.
      return false;
    }
    // Stop descending at terminal operators; otherwise queue the children.
    if (!(current instanceof TerminalOperator)) {
      List<Operator<? extends OperatorDesc>> grandChildren = current.getChildOperators();
      if (grandChildren != null) {
        for (Operator<? extends OperatorDesc> grandChild : grandChildren) {
          pending.push(grandChild);
        }
      }
    }
  }
  return true;
}
Usage of org.apache.hadoop.hive.ql.exec.TerminalOperator in the Apache Hive project.
From class TezCompiler, method markSemiJoinForDPP:
/**
 * Re-examines semijoin branches that were marked for removal: when the branch
 * also feeds dynamic partition pruning (DPP) and the semijoin edge's nDV is
 * sufficiently selective versus the target TableScan side, the branch is kept
 * (shouldRemove is reset to false).
 *
 * @param procCtx optimizer context carrying the parse context and conf
 * @throws SemanticException declared for parity with sibling compiler passes
 */
private void markSemiJoinForDPP(OptimizeTezProcContext procCtx) throws SemanticException {
  // RS -> semijoin branch info; values are mutated in place, keys are not touched.
  Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
  // Iterate entries directly rather than keySet()+get() to avoid a second
  // hash lookup per key.
  for (Map.Entry<ReduceSinkOperator, SemiJoinBranchInfo> entry : map.entrySet()) {
    ReduceSinkOperator rs = entry.getKey();
    SemiJoinBranchInfo sjInfo = entry.getValue();
    TableScanOperator ts = sjInfo.getTsOp();
    // Hinted branches and branches already slated to stay are left alone.
    if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
      continue;
    }
    // A TS can have multiple branches due to DPP Or Semijoin Opt.
    // Use DFS to traverse all the branches until RS or DPP is hit.
    Deque<Operator<?>> deque = new LinkedList<>();
    deque.add(ts);
    while (!deque.isEmpty()) {
      Operator<?> op = deque.pollLast();
      if (op instanceof AppMasterEventOperator && ((AppMasterEventOperator) op).getConf() instanceof DynamicPruningEventDesc) {
        // DPP. Now look up nDVs on both sides to see the selectivity.
        // Semijoin branch shape: <Parent Ops>-SEL-GB1-RS1-GB2-RS2
        SelectOperator selOp = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
        try {
          // Get nDVs on the semijoin edge side.
          Statistics stats = selOp.getStatistics();
          if (stats == null) {
            // No stats found on semijoin edge, do nothing for this RS.
            break;
          }
          String selCol = ExprNodeDescUtils.extractColName(selOp.getConf().getColList().get(0));
          ColStatistics colStatisticsSJ = stats.getColumnStatisticsFromColName(selCol);
          if (colStatisticsSJ == null) {
            // No column stats found for semijoin edge.
            break;
          }
          long nDVs = colStatisticsSJ.getCountDistint();
          if (nDVs > 0) {
            // Lookup nDVs on the TableScan side.
            RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
            // TODO Handle multi column semi-joins as part of HIVE-23934
            ExprNodeDesc tsExpr = rti.getTargetColumns().get(0);
            FilterOperator fil = (FilterOperator) (ts.getChildOperators().get(0));
            Statistics filStats = fil.getStatistics();
            if (filStats == null) {
              // No stats found on target, do nothing.
              break;
            }
            String colName = ExprNodeDescUtils.extractColName(tsExpr);
            ColStatistics colStatisticsTarget = filStats.getColumnStatisticsFromColName(colName);
            if (colStatisticsTarget == null) {
              // No column stats found on target.
              break;
            }
            long nDVsOfTS = colStatisticsTarget.getCountDistint();
            // Scale the target-side nDV by the configured DPP factor before comparing.
            double nDVsOfTSFactored = nDVsOfTS * procCtx.conf.getFloatVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR);
            if ((long) nDVsOfTSFactored > nDVs) {
              if (LOG.isDebugEnabled()) {
                // Fixed: separator inserted so the factored value no longer runs
                // straight into the following sentence in the log output.
                LOG.debug("nDVs = " + nDVs + ", nDVsOfTS = " + nDVsOfTS + " and nDVsOfTSFactored = " + nDVsOfTSFactored + ". Adding semijoin branch from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
              }
              // Selective enough: keep the semijoin branch.
              sjInfo.setShouldRemove(false);
            }
          }
        } catch (NullPointerException e) {
          // Deliberate best-effort: missing stats objects anywhere above are
          // treated as "cannot decide", and the branch is left as-is.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Caught NPE in markSemiJoinForDPP from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
          }
        }
        // One DPP decision per RS is enough; stop traversing this TS.
        break;
      }
      if (op instanceof TerminalOperator) {
        // Done with this branch.
        continue;
      }
      deque.addAll(op.getChildOperators());
    }
  }
}
Usage of org.apache.hadoop.hive.ql.exec.TerminalOperator in the Apache Hive project.
From class TezCompiler, method connect:
/**
 * One recursive step of a Tarjan-style strongly-connected-components search
 * over the operator graph, used to detect cycles introduced by "virtual"
 * edges. Besides real parent->child edges, two kinds of virtual edges are
 * followed: DPP event operator -> pruned TableScan, and terminal op ->
 * semijoin ReduceSink (plus semijoin RS -> target TableScan).
 *
 * @param o          operator being visited
 * @param index      global visitation counter (incremented per node)
 * @param nodes      Tarjan stack of nodes in the current search path
 * @param indexes    discovery index per visited operator
 * @param lowLinks   smallest index reachable from each operator's subtree
 * @param components output set; each element is one strongly connected component
 * @param parseContext provides the terminal-op -> RS map for virtual edges
 */
private void connect(Operator<?> o, AtomicInteger index, Stack<Operator<?>> nodes, Map<Operator<?>, Integer> indexes, Map<Operator<?>, Integer> lowLinks, Set<Set<Operator<?>>> components, ParseContext parseContext) {
  // Assign discovery index and provisional low-link, then push onto the stack.
  indexes.put(o, index.get());
  lowLinks.put(o, index.get());
  index.incrementAndGet();
  nodes.push(o);
  List<Operator<?>> children;
  if (o instanceof AppMasterEventOperator) {
    // Copy real children, then add the virtual edge to the TS pruned by this DPP event.
    children = new ArrayList<>((o.getChildOperators()));
    TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan();
    LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
    children.add(ts);
  } else if (o instanceof TerminalOperator) {
    // Copy real children, then add virtual edges from this terminal op to
    // every semijoin RS recorded for it in the parse context.
    children = new ArrayList<>((o.getChildOperators()));
    for (ReduceSinkOperator rs : parseContext.getTerminalOpToRSMap().get((TerminalOperator<?>) o)) {
      // add an edge
      LOG.debug("Adding special edge: From terminal op to semijoin edge " + o.getName() + " --> " + rs.toString());
      children.add(rs);
    }
    if (o instanceof ReduceSinkOperator) {
      // semijoin case: also add the edge from the semijoin RS to its target TS.
      SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
      if (sjInfo != null) {
        TableScanOperator ts = sjInfo.getTsOp();
        LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
        children.add(ts);
      }
    }
  } else {
    children = o.getChildOperators();
  }
  for (Operator<?> child : children) {
    if (!indexes.containsKey(child)) {
      // Unvisited child: recurse, then fold its low-link into ours.
      connect(child, index, nodes, indexes, lowLinks, components, parseContext);
      lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child)));
    } else if (nodes.contains(child)) {
      // Child is on the current search stack: back-edge, use its index.
      // NOTE(review): Stack.contains is O(n); classic Tarjan tracks an
      // on-stack flag instead — acceptable here for graph sizes involved.
      lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child)));
    }
  }
  if (lowLinks.get(o).equals(indexes.get(o))) {
    // o is the root of an SCC: pop the stack down to o to collect it.
    Set<Operator<?>> component = new LinkedHashSet<Operator<?>>();
    components.add(component);
    Operator<?> current;
    do {
      current = nodes.pop();
      component.add(current);
    } while (current != o);
  }
}
Usage of org.apache.hadoop.hive.ql.exec.TerminalOperator in the Apache Hive project.
From class TezCompiler, method connectTerminalOps:
/**
 * Builds the virtual-edge map from non-semijoin terminal operators to the
 * semijoin ReduceSinks of their work, and stores it on the parse context.
 * Each work is examined at most once by caching its TerminalOpsInfo per RS.
 */
private void connectTerminalOps(ParseContext pCtx) {
  // Virtual edges: non-semijoin terminal op -> semijoin RS.
  Multimap<TerminalOperator<?>, ReduceSinkOperator> terminalToRs = ArrayListMultimap.create();
  // Cache of semijoin RS -> its work's terminal-ops info, so each work is walked once.
  Map<ReduceSinkOperator, TerminalOpsInfo> infoByRs = new HashMap<>();
  for (ReduceSinkOperator semiJoinRs : pCtx.getRsToSemiJoinBranchInfo().keySet()) {
    if (infoByRs.containsKey(semiJoinRs)) {
      // This RS's work was already processed via a sibling RS.
      continue;
    }
    Set<ReduceSinkOperator> rsOpsInWork = new HashSet<>();
    Set<TerminalOperator<?>> terminalOpsInWork = new HashSet<>();
    // Locate the SEL at the top of the semijoin branch: SEL->GBY1->RS1->GBY2->RS2.
    SelectOperator branchSel = OperatorUtils.ancestor(semiJoinRs, SelectOperator.class, 0, 0, 0, 0);
    OperatorUtils.findWorkOperatorsAndSemiJoinEdges(branchSel, pCtx.getRsToSemiJoinBranchInfo(), rsOpsInWork, terminalOpsInWork);
    TerminalOpsInfo workInfo = new TerminalOpsInfo(terminalOpsInWork);
    // A single work may carry several semijoin edges; record the shared info
    // for each RS and wire every terminal op of the work to that RS.
    for (ReduceSinkOperator rsInWork : rsOpsInWork) {
      infoByRs.put(rsInWork, workInfo);
      for (TerminalOperator<?> terminalOp : workInfo.terminalOps) {
        terminalToRs.put(terminalOp, rsInWork);
      }
    }
  }
  pCtx.setTerminalOpToRSMap(terminalToRs);
}
Aggregations