use of org.apache.hadoop.hive.ql.lib.RuleRegExp in project hive by apache.
the class BucketingSortingInferenceOptimizer method inferBucketingSorting.
/**
* For each map reduce task, if it has a reducer, infer whether or not the final output of the
* reducer is bucketed and/or sorted
*
* @param mapRedTasks
* @throws SemanticException
*/
private void inferBucketingSorting(List<ExecDriver> mapRedTasks) throws SemanticException {
for (ExecDriver mapRedTask : mapRedTasks) {
// of the outputs of intermediate map reduce jobs.
if (!mapRedTask.getWork().isFinalMapRed()) {
continue;
}
if (mapRedTask.getWork().getReduceWork() == null) {
continue;
}
Operator<? extends OperatorDesc> reducer = mapRedTask.getWork().getReduceWork().getReducer();
// uses sampling, which means it's not bucketed
boolean disableBucketing = mapRedTask.getWork().getMapWork().getSamplingType() > 0;
BucketingSortingCtx bCtx = new BucketingSortingCtx(disableBucketing);
// RuleRegExp rules are used to match operators anywhere in the tree
// RuleExactMatch rules are used to specify exactly what the tree should look like
// In particular, this guarantees that the first operator is the reducer
// (and its parent(s) are ReduceSinkOperators) since it begins walking the tree from
// the reducer.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("R1", SelectOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getSelProc());
// Matches only GroupByOperators which are reducers, rather than map group by operators,
// or multi group by optimization specific operators
opRules.put(new RuleExactMatch("R2", new String[] { GroupByOperator.getOperatorName() }), BucketingSortingOpProcFactory.getGroupByProc());
// Matches only JoinOperators which are reducers, rather than map joins, SMB map joins, etc.
opRules.put(new RuleExactMatch("R3", new String[] { JoinOperator.getOperatorName() }), BucketingSortingOpProcFactory.getJoinProc());
opRules.put(new RuleRegExp("R5", FileSinkOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getFileSinkProc());
opRules.put(new RuleRegExp("R7", FilterOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getFilterProc());
opRules.put(new RuleRegExp("R8", LimitOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getLimitProc());
opRules.put(new RuleRegExp("R9", LateralViewForwardOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getLateralViewForwardProc());
opRules.put(new RuleRegExp("R10", LateralViewJoinOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getLateralViewJoinProc());
// Matches only ForwardOperators which are preceded by some other operator in the tree,
// in particular it can't be a reducer (and hence cannot be one of the ForwardOperators
// added by the multi group by optimization)
opRules.put(new RuleRegExp("R11", ".+" + ForwardOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getForwardProc());
// Matches only ForwardOperators which are reducers and are followed by GroupByOperators
// (specific to the multi group by optimization)
opRules.put(new RuleExactMatch("R12", new String[] { ForwardOperator.getOperatorName(), GroupByOperator.getOperatorName() }), BucketingSortingOpProcFactory.getMultiGroupByProc());
// The dispatcher fires the processor corresponding to the closest matching rule and passes
// the context along
Dispatcher disp = new DefaultRuleDispatcher(BucketingSortingOpProcFactory.getDefaultProc(), opRules, bCtx);
GraphWalker ogw = new PreOrderWalker(disp);
// Create a list of topop nodes
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.add(reducer);
ogw.startWalking(topNodes, null);
mapRedTask.getWork().getMapWork().getBucketedColsByDirectory().putAll(bCtx.getBucketedColsByDirectory());
mapRedTask.getWork().getMapWork().getSortedColsByDirectory().putAll(bCtx.getSortedColsByDirectory());
}
}
use of org.apache.hadoop.hive.ql.lib.RuleRegExp in project hive by apache.
the class NullScanOptimizer method resolve.
@Override
public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%.*" + FilterOperator.getOperatorName() + "%"), new WhereFalseProcessor());
Dispatcher disp = new NullScanTaskDispatcher(pctx, opRules);
GraphWalker ogw = new DefaultGraphWalker(disp);
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(pctx.getRootTasks());
ogw.startWalking(topNodes, null);
opRules.clear();
opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%"), new TSMarker());
opRules.put(new RuleRegExp("R2", LimitOperator.getOperatorName() + "%"), new Limit0Processor());
disp = new NullScanTaskDispatcher(pctx, opRules);
ogw = new DefaultGraphWalker(disp);
topNodes = new ArrayList<Node>();
topNodes.addAll(pctx.getRootTasks());
ogw.startWalking(topNodes, null);
return pctx;
}
use of org.apache.hadoop.hive.ql.lib.RuleRegExp in project hive by apache.
the class TezCompiler method removeSemiJoinCyclesDueToMapsideJoins.
private static void removeSemiJoinCyclesDueToMapsideJoins(OptimizeTezProcContext procCtx) throws SemanticException {
if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) || procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
return;
}
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("R1", MapJoinOperator.getOperatorName() + "%" + MapJoinOperator.getOperatorName() + "%"), new SemiJoinCycleRemovalDueToMapsideJoins());
opRules.put(new RuleRegExp("R2", MapJoinOperator.getOperatorName() + "%" + CommonMergeJoinOperator.getOperatorName() + "%"), new SemiJoinCycleRemovalDueToMapsideJoins());
opRules.put(new RuleRegExp("R3", CommonMergeJoinOperator.getOperatorName() + "%" + MapJoinOperator.getOperatorName() + "%"), new SemiJoinCycleRemovalDueToMapsideJoins());
opRules.put(new RuleRegExp("R4", CommonMergeJoinOperator.getOperatorName() + "%" + CommonMergeJoinOperator.getOperatorName() + "%"), new SemiJoinCycleRemovalDueToMapsideJoins());
SemiJoinCycleRemovalDueTOMapsideJoinContext ctx = new SemiJoinCycleRemovalDueTOMapsideJoinContext();
Dispatcher disp = new DefaultRuleDispatcher(null, opRules, ctx);
List<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(procCtx.parseContext.getTopOps().values());
GraphWalker ogw = new PreOrderOnceWalker(disp);
ogw.startWalking(topNodes, null);
// process the list
ParseContext pCtx = procCtx.parseContext;
for (Operator<?> parentJoin : ctx.childParentMap.keySet()) {
Operator<?> childJoin = ctx.childParentMap.get(parentJoin);
if (parentJoin.getChildOperators().size() == 1) {
continue;
}
for (Operator<?> child : parentJoin.getChildOperators()) {
if (!(child instanceof SelectOperator)) {
continue;
}
while (child.getChildOperators().size() > 0) {
child = child.getChildOperators().get(0);
}
if (!(child instanceof ReduceSinkOperator)) {
continue;
}
ReduceSinkOperator rs = ((ReduceSinkOperator) child);
SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
if (sjInfo == null) {
continue;
}
TableScanOperator ts = sjInfo.getTsOp();
// cycle with childJoin.
for (Operator<?> parent : childJoin.getParentOperators()) {
if (parent == parentJoin) {
continue;
}
assert parent instanceof ReduceSinkOperator;
while (parent.getParentOperators().size() > 0) {
parent = parent.getParentOperators().get(0);
}
if (parent == ts) {
// We have a cycle!
if (sjInfo.getIsHint()) {
throw new SemanticException("Removing hinted semijoin as it is creating cycles with mapside joins " + rs + " : " + ts);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Semijoin cycle due to mapjoin. Removing semijoin " + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
}
GenTezUtils.removeBranch(rs);
GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
}
}
}
}
}
use of org.apache.hadoop.hive.ql.lib.RuleRegExp in project hive by apache.
the class TezCompiler method removeSemiJoinIfNoStats.
private void removeSemiJoinIfNoStats(OptimizeTezProcContext procCtx) throws SemanticException {
if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)) {
// Not needed without semi-join reduction
return;
}
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%" + ReduceSinkOperator.getOperatorName() + "%" + GroupByOperator.getOperatorName() + "%" + ReduceSinkOperator.getOperatorName() + "%"), new SemiJoinRemovalIfNoStatsProc());
Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
List<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(procCtx.parseContext.getTopOps().values());
GraphWalker ogw = new PreOrderOnceWalker(disp);
ogw.startWalking(topNodes, null);
}
use of org.apache.hadoop.hive.ql.lib.RuleRegExp in project hive by apache.
the class TezCompiler method runStatsDependentOptimizations.
private void runStatsDependentOptimizations(OptimizeTezProcContext procCtx, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException {
// Sequence of TableScan operators to be walked
Deque<Operator<?>> deque = new LinkedList<Operator<?>>();
deque.addAll(procCtx.parseContext.getTopOps().values());
// create a walker which walks the tree in a DFS manner while maintaining
// the operator stack.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("Set parallelism - ReduceSink", ReduceSinkOperator.getOperatorName() + "%"), new SetReducerParallelism());
opRules.put(new RuleRegExp("Convert Join to Map-join", JoinOperator.getOperatorName() + "%"), new ConvertJoinMapJoin());
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
List<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(procCtx.parseContext.getTopOps().values());
GraphWalker ogw = new ForwardWalker(disp);
ogw.startWalking(topNodes, null);
}
Aggregations