Use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.
The class DynamicPartitionPruningOptimization, method generateEventOperatorPlan.
private void generateEventOperatorPlan(DynamicListContext ctx, ParseContext parseContext,
    TableScanOperator ts, String column, String columnType) {
  // we will put a fork in the plan at the source of the reduce sink
  Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
  // we need the expr that generated the key of the reduce sink
  ExprNodeDesc key = ctx.generator.getConf().getKeyCols().get(ctx.desc.getKeyIndex());
  // we also need the expr for the partitioned table
  ExprNodeDesc partKey = ctx.parent.getChildren().get(0);
  if (LOG.isDebugEnabled()) {
    LOG.debug("key expr: " + key);
    LOG.debug("partition key expr: " + partKey);
  }
  List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
  keyExprs.add(key);
  // group by requires "ArrayList", don't ask.
  ArrayList<String> outputNames = new ArrayList<String>();
  outputNames.add(HiveConf.getColumnInternalName(0));
  // project the relevant key column
  SelectDesc select = new SelectDesc(keyExprs, outputNames);
  SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(select, parentOfRS);
  // do a group by on the list to dedup
  float groupByMemoryUsage =
      HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
  float memoryThreshold =
      HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
  ArrayList<ExprNodeDesc> groupByExprs = new ArrayList<ExprNodeDesc>();
  ExprNodeDesc groupByExpr = new ExprNodeColumnDesc(key.getTypeInfo(), outputNames.get(0), null, false);
  groupByExprs.add(groupByExpr);
  GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, outputNames, groupByExprs,
      new ArrayList<AggregationDesc>(), false, groupByMemoryUsage, memoryThreshold,
      null, false, -1, true);
  GroupByOperator groupByOp = (GroupByOperator) OperatorFactory.getAndMakeChild(groupBy, selectOp);
  Map<String, ExprNodeDesc> colMap = new HashMap<String, ExprNodeDesc>();
  colMap.put(outputNames.get(0), groupByExpr);
  groupByOp.setColumnExprMap(colMap);
  // finally add the event broadcast operator
  if (HiveConf.getVar(parseContext.getConf(), ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
    DynamicPruningEventDesc eventDesc = new DynamicPruningEventDesc();
    eventDesc.setTableScan(ts);
    eventDesc.setGenerator(ctx.generator);
    eventDesc.setTable(PlanUtils.getReduceValueTableDesc(
        PlanUtils.getFieldSchemasFromColumnList(keyExprs, "key")));
    eventDesc.setTargetColumnName(column);
    eventDesc.setTargetColumnType(columnType);
    eventDesc.setPartKey(partKey);
    OperatorFactory.getAndMakeChild(eventDesc, groupByOp);
  } else {
    // Must be spark branch
    SparkPartitionPruningSinkDesc desc = new SparkPartitionPruningSinkDesc();
    desc.setTable(PlanUtils.getReduceValueTableDesc(
        PlanUtils.getFieldSchemasFromColumnList(keyExprs, "key")));
    desc.addTarget(column, columnType, partKey, null, ts);
    SparkPartitionPruningSinkOperator dppSink =
        (SparkPartitionPruningSinkOperator) OperatorFactory.getAndMakeChild(desc, groupByOp);
    if (HiveConf.getBoolVar(parseContext.getConf(), ConfVars.HIVE_COMBINE_EQUIVALENT_WORK_OPTIMIZATION)) {
      mayReuseExistingDPPSink(parentOfRS, Arrays.asList(selectOp, groupByOp, dppSink));
    }
  }
}
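For orientation: on the Tez branch, this method forks the small-table pipeline into SEL (project the join key) → GBY (hash dedup) → an AppMasterEvent sink carrying the DynamicPruningEventDesc, and at runtime the application master uses the broadcast key values to drop partitions of the target scan. The following dependency-free sketch mimics that computation with plain collections; the class name DppSketch, the table rows, the ds column, and the partition list are all invented for illustration and none of this is Hive API.

import java.util.*;
import java.util.stream.*;

/** Hypothetical analogue of the generated plan: project the key, dedup it,
 *  and prune the target table's partitions with the resulting set. */
public final class DppSketch {
  public static void main(String[] args) {
    // Small-table rows; only the partition key column "ds" matters here.
    List<Map<String, String>> smallTable = List.of(
        Map.of("ds", "2024-01-01"), Map.of("ds", "2024-01-02"), Map.of("ds", "2024-01-01"));
    // SEL + GBY: project the key column and deduplicate it.
    Set<String> keys = smallTable.stream().map(r -> r.get("ds")).collect(Collectors.toSet());
    // Event sink: the key set reaches the AM, which keeps only matching partitions.
    List<String> partitions = List.of("2024-01-01", "2024-01-02", "2024-01-03");
    List<String> pruned = partitions.stream().filter(keys::contains).collect(Collectors.toList());
    System.out.println(pruned); // [2024-01-01, 2024-01-02]
  }
}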
Use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.
The class SharedWorkOptimizer, method findChildWorkOperators.
private static Set<Operator<?>> findChildWorkOperators(ParseContext pctx,
    SharedWorkOptimizerCache optimizerCache, Operator<?> start) {
  // Find operators in work
  Set<Operator<?>> workOps = findWorkOperators(optimizerCache, start);
  // Gather output works operators
  Set<Operator<?>> set = new HashSet<Operator<?>>();
  for (Operator<?> op : workOps) {
    if (op instanceof ReduceSinkOperator) {
      if (op.getChildOperators() != null) {
        // All children of RS are descendants
        for (Operator<?> child : op.getChildOperators()) {
          set.addAll(findWorkOperators(optimizerCache, child));
        }
      }
      // Semijoin DPP work is considered a child because work needs
      // to finish for it to execute
      SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
      if (sjbi != null) {
        set.addAll(findWorkOperators(optimizerCache, sjbi.getTsOp()));
      }
    } else if (op.getConf() instanceof DynamicPruningEventDesc) {
      // DPP work is considered a child because work needs
      // to finish for it to execute
      set.addAll(findWorkOperators(optimizerCache, ((DynamicPruningEventDesc) op.getConf()).getTableScan()));
    }
  }
  return set;
}
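The helper findWorkOperators (not shown in this listing) expands an operator to everything in the same work, i.e. everything reachable without crossing a shuffle boundary. As a rough, hypothetical model of that boundary rule, assuming a toy Op type with parent/child links (none of these names are Hive's):

import java.util.*;

/** Hypothetical sketch: operators connected without crossing a ReduceSink
 *  belong to the same work, so a work-local walk stops at RS boundaries. */
final class WorkBoundarySketch {
  static final class Op {
    final String name;
    final boolean isReduceSink;
    final List<Op> parents = new ArrayList<>();
    final List<Op> children = new ArrayList<>();
    Op(String name, boolean isReduceSink) { this.name = name; this.isReduceSink = isReduceSink; }
    Op feed(Op child) { children.add(child); child.parents.add(this); return child; }
    @Override public String toString() { return name; }
  }

  /** Collect all operators in the same work as start: the terminating RS is
   *  included, but nothing past it, and a parent RS belongs to upstream work. */
  static Set<Op> findWorkOperators(Op start) {
    Set<Op> work = new LinkedHashSet<>();
    Deque<Op> pending = new ArrayDeque<>(List.of(start));
    while (!pending.isEmpty()) {
      Op op = pending.pop();
      if (!work.add(op)) {
        continue;
      }
      if (!op.isReduceSink) {
        pending.addAll(op.children); // walking past an RS would enter the next work
      }
      for (Op parent : op.parents) {
        if (!parent.isReduceSink) {
          pending.add(parent);
        }
      }
    }
    return work;
  }

  public static void main(String[] args) {
    Op ts = new Op("TS", false);
    Op fil = new Op("FIL", false);
    Op rs = new Op("RS", true);
    ts.feed(fil).feed(rs).feed(new Op("JOIN", false));
    System.out.println(findWorkOperators(fil)); // [FIL, RS, TS] -- stops before JOIN
  }
}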
Use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.
The class TezCompiler, method findParallelSemiJoinBranch.
private boolean findParallelSemiJoinBranch(Operator<?> mapjoin, TableScanOperator bigTableTS,
    ParseContext parseContext, Map<ReduceSinkOperator, TableScanOperator> semijoins,
    Map<TableScanOperator, List<MapJoinOperator>> probeDecodeMJoins) {
  boolean parallelEdges = false;
  for (Operator<?> op : mapjoin.getParentOperators()) {
    if (!(op instanceof ReduceSinkOperator)) {
      continue;
    }
    op = op.getParentOperators().get(0);
    // Follow the Reducesink operator upstream which is on small table side.
    while (!(op instanceof ReduceSinkOperator) && !(op instanceof TableScanOperator)
        && !(op.getChildren() != null && op.getChildren().size() > 1)) {
      if (op instanceof MapJoinOperator) {
        // ReduceSink, that is what we are looking for.
        for (Operator<?> parentOp : op.getParentOperators()) {
          if (parentOp instanceof ReduceSinkOperator) {
            continue;
          }
          // parent in current pipeline
          op = parentOp;
          continue;
        }
      }
      op = op.getParentOperators().get(0);
    }
    // Bail out if RS or TS is encountered.
    if (op instanceof ReduceSinkOperator || op instanceof TableScanOperator) {
      continue;
    }
    // A branch is hit.
    for (Node nd : op.getChildren()) {
      if (nd instanceof SelectOperator) {
        Operator<?> child = (Operator<?>) nd;
        while (child.getChildOperators().size() > 0) {
          child = child.getChildOperators().get(0);
        }
        // If not a ReduceSink op, skip
        if (!(child instanceof ReduceSinkOperator)) {
          // This still could be DPP.
          if (child instanceof AppMasterEventOperator
              && ((AppMasterEventOperator) child).getConf() instanceof DynamicPruningEventDesc) {
            // DPP indeed, set parallel edges true
            parallelEdges = true;
          }
          continue;
        }
        ReduceSinkOperator rs = (ReduceSinkOperator) child;
        SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
        if (sjInfo == null) {
          continue;
        }
        TableScanOperator ts = sjInfo.getTsOp();
        if (ts != bigTableTS) {
          // Skip, not the one we are looking for.
          continue;
        }
        parallelEdges = true;
        // Keep track of MJ to probeDecode TS
        if (!probeDecodeMJoins.containsKey(ts)) {
          probeDecodeMJoins.put(ts, new ArrayList<>());
        }
        probeDecodeMJoins.get(ts).add((MapJoinOperator) mapjoin);
        // Skip adding to SJ removal map when created by hint
        if (!sjInfo.getIsHint() && sjInfo.getShouldRemove()) {
          semijoins.put(rs, ts);
        }
      }
    }
  }
  return parallelEdges;
}
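The upstream walk above follows single-child operators toward the source until it hits a ReduceSink, a TableScan, or a fork; only a fork is interesting, because one of its branches may end in a semijoin RS or a DPP event operator aimed at the big table. A minimal, hypothetical rendering of just that walk, with a toy Op type standing in for Hive's operators:

import java.util.*;

/** Hypothetical sketch of the upstream walk: follow a linear pipeline toward
 *  its source, stopping at a shuffle (RS), a scan (TS), or a fork. */
final class BranchWalkSketch {
  enum Kind { TS, RS, OTHER }
  static final class Op {
    final Kind kind;
    final List<Op> parents = new ArrayList<>();
    final List<Op> children = new ArrayList<>();
    Op(Kind kind) { this.kind = kind; }
  }

  static Op walkUp(Op op) {
    while (op.kind != Kind.RS && op.kind != Kind.TS && op.children.size() <= 1
        && !op.parents.isEmpty()) {
      op = op.parents.get(0);
    }
    return op; // caller bails out on RS/TS and inspects a fork's branches
  }

  public static void main(String[] args) {
    Op ts = new Op(Kind.TS), sel = new Op(Kind.OTHER), gby = new Op(Kind.OTHER);
    ts.children.add(sel); sel.parents.add(ts);
    sel.children.add(gby); gby.parents.add(sel);
    System.out.println(walkUp(gby).kind); // TS -- the walk bailed out at the scan
  }
}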
Use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.
The class TezCompiler, method connect.
private void connect(Operator<?> o, AtomicInteger index, Stack<Operator<?>> nodes,
    Map<Operator<?>, Integer> indexes, Map<Operator<?>, Integer> lowLinks,
    Set<Set<Operator<?>>> components, ParseContext parseContext) {
  indexes.put(o, index.get());
  lowLinks.put(o, index.get());
  index.incrementAndGet();
  nodes.push(o);
  List<Operator<?>> children;
  if (o instanceof AppMasterEventOperator) {
    children = new ArrayList<>(o.getChildOperators());
    TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan();
    LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
    children.add(ts);
  } else if (o instanceof TerminalOperator) {
    children = new ArrayList<>(o.getChildOperators());
    for (ReduceSinkOperator rs : parseContext.getTerminalOpToRSMap().get((TerminalOperator<?>) o)) {
      // add an edge
      LOG.debug("Adding special edge: From terminal op to semijoin edge "
          + o.getName() + " --> " + rs.toString());
      children.add(rs);
    }
    if (o instanceof ReduceSinkOperator) {
      // semijoin case
      SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
      if (sjInfo != null) {
        TableScanOperator ts = sjInfo.getTsOp();
        LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
        children.add(ts);
      }
    }
  } else {
    children = o.getChildOperators();
  }
  for (Operator<?> child : children) {
    if (!indexes.containsKey(child)) {
      connect(child, index, nodes, indexes, lowLinks, components, parseContext);
      lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child)));
    } else if (nodes.contains(child)) {
      lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child)));
    }
  }
  if (lowLinks.get(o).equals(indexes.get(o))) {
    Set<Operator<?>> component = new LinkedHashSet<Operator<?>>();
    components.add(component);
    Operator<?> current;
    do {
      current = nodes.pop();
      component.add(current);
    } while (current != o);
  }
}
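connect is the recursive step of Tarjan's strongly-connected-components algorithm: indexes records discovery order, lowLinks tracks the smallest index reachable from a node, and a node whose low-link equals its own index pops one complete component off the stack. The DPP and semijoin "special edges" exist only so that a pruning source and its target scan land in the same component. Below is a minimal, dependency-free Tarjan in the same shape, over a plain adjacency map; the class and variable names are invented for illustration.

import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/** Minimal Tarjan SCC mirroring the shape of TezCompiler.connect, but over a
 *  plain adjacency map. Hypothetical example; not Hive code. */
final class TarjanSketch {
  static void connect(String v, AtomicInteger index, Deque<String> stack,
      Map<String, Integer> indexes, Map<String, Integer> lowLinks,
      Set<Set<String>> components, Map<String, List<String>> adj) {
    indexes.put(v, index.get());
    lowLinks.put(v, index.getAndIncrement()); // discovery index is the initial low-link
    stack.push(v);
    for (String w : adj.getOrDefault(v, List.of())) {
      if (!indexes.containsKey(w)) {
        connect(w, index, stack, indexes, lowLinks, components, adj); // tree edge: recurse
        lowLinks.put(v, Math.min(lowLinks.get(v), lowLinks.get(w)));
      } else if (stack.contains(w)) { // back edge into the current component
        lowLinks.put(v, Math.min(lowLinks.get(v), indexes.get(w)));
      }
    }
    if (lowLinks.get(v).equals(indexes.get(v))) { // v is the root of a component
      Set<String> component = new LinkedHashSet<>();
      String current;
      do {
        component.add(current = stack.pop());
      } while (!current.equals(v));
      components.add(component);
    }
  }

  public static void main(String[] args) {
    // a -> b -> c -> a is a cycle (one component); d hangs off c.
    Map<String, List<String>> adj = Map.of(
        "a", List.of("b"), "b", List.of("c"), "c", List.of("a", "d"), "d", List.of());
    Set<Set<String>> components = new LinkedHashSet<>();
    connect("a", new AtomicInteger(0), new ArrayDeque<>(), new HashMap<>(),
        new HashMap<>(), components, adj);
    System.out.println(components); // [[d], [c, b, a]]
  }
}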
Use of org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc in project hive by apache.
The class SharedWorkOptimizer, method findChildWorkOperators.
private static Set<Operator<?>> findChildWorkOperators(ParseContext pctx,
    SharedWorkOptimizerCache optimizerCache, Operator<?> start, boolean traverseEventOperators) {
  // Find operators in work
  Set<Operator<?>> workOps = findWorkOperators(optimizerCache, start);
  // Gather output works operators
  Set<Operator<?>> set = new HashSet<Operator<?>>();
  for (Operator<?> op : workOps) {
    if (op instanceof ReduceSinkOperator) {
      if (op.getChildOperators() != null) {
        // All children of RS are descendants
        for (Operator<?> child : op.getChildOperators()) {
          set.addAll(findWorkOperators(optimizerCache, child));
        }
      }
      // Semijoin DPP work is considered a child because work needs
      // to finish for it to execute
      SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
      if (sjbi != null) {
        set.addAll(findWorkOperators(optimizerCache, sjbi.getTsOp()));
      }
    } else if (op.getConf() instanceof DynamicPruningEventDesc) {
      // DPP work is considered a child because work needs
      // to finish for it to execute
      if (traverseEventOperators) {
        set.addAll(findWorkOperators(optimizerCache, ((DynamicPruningEventDesc) op.getConf()).getTableScan()));
      }
    }
  }
  return set;
}
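Compared with the earlier overload, the only difference is the traverseEventOperators flag, which decides whether the DPP edge to the target scan is followed at all. A hypothetical toy model of the flag's effect (the work names and maps below are invented):

import java.util.*;

/** Hypothetical model of the traverseEventOperators flag: the DPP edge to the
 *  target scan is only followed when the caller opts in. Not Hive code. */
final class ChildWorkSketch {
  public static void main(String[] args) {
    Map<String, List<String>> rsChildren = Map.of("work-1", List.of("work-2"));
    Map<String, List<String>> dppTargets = Map.of("work-1", List.of("work-3"));
    for (boolean traverseEventOperators : new boolean[] { true, false }) {
      Set<String> children = new LinkedHashSet<>(rsChildren.get("work-1"));
      if (traverseEventOperators) {
        children.addAll(dppTargets.get("work-1")); // DPP target counts as child work
      }
      System.out.println(traverseEventOperators + " -> " + children);
    }
    // prints: true -> [work-2, work-3]
    //         false -> [work-2]
  }
}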