Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class SamplingOptimizer, method resolve.
public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
  for (Task<?> task : pctx.getRootTasks()) {
    if (!(task instanceof MapRedTask) || !((MapRedTask) task).getWork().isFinalMapRed()) {
      // this could be replaced by bucketing on RS + bucketed fetcher for next MR
      continue;
    }
    MapredWork mrWork = ((MapRedTask) task).getWork();
    MapWork mapWork = mrWork.getMapWork();
    ReduceWork reduceWork = mrWork.getReduceWork();
    if (reduceWork == null || reduceWork.getNumReduceTasks() != 1
        || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0
        || reduceWork.getReducer() == null) {
      continue;
    }
    // GROUPBY operator in reducer may not be processed in parallel. Skip optimizing.
    if (OperatorUtils.findSingleOperator(reduceWork.getReducer(), GroupByOperator.class) != null) {
      continue;
    }
    Operator<?> operator = mapWork.getAliasToWork().values().iterator().next();
    if (!(operator instanceof TableScanOperator)) {
      continue;
    }
    TableScanOperator tsop = (TableScanOperator) operator;
    Table tbl = tsop.getConf().getTableMetadata();
    if (tbl == null) {
      continue;
    }
    if (AcidUtils.isInsertOnlyTable(tbl.getParameters())) {
      // skip insert-only (MM) tables; the sampler would limit the input to the
      // correct directories, but we don't care about MR.
      continue;
    }
    ReduceSinkOperator child = OperatorUtils.findSingleOperator(operator, ReduceSinkOperator.class);
    if (child == null || child.getConf().getNumReducers() != 1
        || !child.getConf().getPartitionCols().isEmpty()) {
      continue;
    }
    // all guards passed: let input sampling pick the reducer count at runtime
    // instead of forcing a single reducer.
    child.getConf().setNumReducers(-1);
    reduceWork.setNumReduceTasks(-1);
    mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
  }
  return pctx;
}
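The guard chain above leans on OperatorUtils.findSingleOperator, which only yields an operator when exactly one match exists under the given root. A minimal sketch of that kind of lookup, written against a hypothetical Node interface rather than Hive's real Operator class:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;

// Hypothetical stand-in for an operator tree; not Hive's API.
interface Node {
    List<Node> getChildren();
}

final class TreeSearch {
    // Returns the only instance of clazz reachable from root, or null when
    // there are zero or several - mirroring the "single operator" contract
    // the optimizer above depends on.
    static <T extends Node> T findSingle(Node root, Class<T> clazz) {
        T found = null;
        Deque<Node> queue = new ArrayDeque<>();
        queue.add(root);
        while (!queue.isEmpty()) {
            Node current = queue.remove();
            if (clazz.isInstance(current)) {
                if (found != null) {
                    return null; // not unique
                }
                found = clazz.cast(current);
            }
            queue.addAll(current.getChildren());
        }
        return found;
    }
}

With that contract, a null result covers both "no such operator" and "ambiguous match", which is why each guard can simply continue.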
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class TezCompiler, method removeSemiJoinCyclesDueToMapsideJoins.
private static void removeSemiJoinCyclesDueToMapsideJoins(OptimizeTezProcContext procCtx) throws SemanticException {
  if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)
      || procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
    return;
  }
  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  opRules.put(new RuleRegExp("R1", MapJoinOperator.getOperatorName() + "%" + MapJoinOperator.getOperatorName() + "%"),
      new SemiJoinCycleRemovalDueToMapsideJoins());
  opRules.put(new RuleRegExp("R2", MapJoinOperator.getOperatorName() + "%" + CommonMergeJoinOperator.getOperatorName() + "%"),
      new SemiJoinCycleRemovalDueToMapsideJoins());
  opRules.put(new RuleRegExp("R3", CommonMergeJoinOperator.getOperatorName() + "%" + MapJoinOperator.getOperatorName() + "%"),
      new SemiJoinCycleRemovalDueToMapsideJoins());
  opRules.put(new RuleRegExp("R4", CommonMergeJoinOperator.getOperatorName() + "%" + CommonMergeJoinOperator.getOperatorName() + "%"),
      new SemiJoinCycleRemovalDueToMapsideJoins());
  SemiJoinCycleRemovalDueTOMapsideJoinContext ctx = new SemiJoinCycleRemovalDueTOMapsideJoinContext();
  Dispatcher disp = new DefaultRuleDispatcher(null, opRules, ctx);
  List<Node> topNodes = new ArrayList<Node>();
  topNodes.addAll(procCtx.parseContext.getTopOps().values());
  GraphWalker ogw = new PreOrderOnceWalker(disp);
  ogw.startWalking(topNodes, null);
  // process the parent/child join pairs collected by the walker
  ParseContext pCtx = procCtx.parseContext;
  for (Operator<?> parentJoin : ctx.childParentMap.keySet()) {
    Operator<?> childJoin = ctx.childParentMap.get(parentJoin);
    if (parentJoin.getChildOperators().size() == 1) {
      continue;
    }
    for (Operator<?> child : parentJoin.getChildOperators()) {
      if (!(child instanceof SelectOperator)) {
        continue;
      }
      while (child.getChildOperators().size() > 0) {
        child = child.getChildOperators().get(0);
      }
      if (!(child instanceof ReduceSinkOperator)) {
        continue;
      }
      ReduceSinkOperator rs = (ReduceSinkOperator) child;
      SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
      if (sjInfo == null) {
        continue;
      }
      TableScanOperator ts = sjInfo.getTsOp();
      // check whether the semijoin branch closes a cycle with childJoin.
      for (Operator<?> parent : childJoin.getParentOperators()) {
        if (parent == parentJoin) {
          continue;
        }
        assert parent instanceof ReduceSinkOperator;
        while (parent.getParentOperators().size() > 0) {
          parent = parent.getParentOperators().get(0);
        }
        if (parent == ts) {
          // We have a cycle!
          if (sjInfo.getIsHint()) {
            throw new SemanticException("Removing hinted semijoin as it is creating cycles with mapside joins " + rs + " : " + ts);
          }
          if (LOG.isDebugEnabled()) {
            LOG.debug("Semijoin cycle due to mapjoin. Removing semijoin "
                + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
          }
          GenTezUtils.removeBranch(rs);
          GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
        }
      }
    }
  }
}
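Rules R1 through R4 enumerate every ordered pairing of the two join kinds, so a processor fires whenever one such join's output feeds another on the walked path. For reference, a minimal processor under the same walker contract could be sketched as below; the body is a placeholder for illustration, not Hive's SemiJoinCycleRemovalDueToMapsideJoins, and it assumes the usual org.apache.hadoop.hive.ql.lib imports.

import java.util.Stack;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.SemanticException;

// Illustrative only: reports each join-over-join match found by the walker.
public class JoinPairLogger implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
            Object... nodeOutputs) throws SemanticException {
        // nd is the join that matched the tail of the rule; the walked path
        // on the stack holds its parent join one slot below.
        Operator<?> childJoin = (Operator<?>) nd;
        Operator<?> parentJoin = (Operator<?>) stack.get(stack.size() - 2);
        System.out.println("Matched " + parentJoin.getName() + " -> " + childJoin.getName());
        return null;
    }
}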
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class TezCompiler, method connect.
private void connect(Operator<?> o, AtomicInteger index, Stack<Operator<?>> nodes,
    Map<Operator<?>, Integer> indexes, Map<Operator<?>, Integer> lowLinks,
    Set<Set<Operator<?>>> components, ParseContext parseContext) {
  // Tarjan's strongly-connected-components step: assign a discovery index and
  // low-link to o, recurse into children (including the logical edges added
  // below), and pop a finished component off the node stack.
  indexes.put(o, index.get());
  lowLinks.put(o, index.get());
  index.incrementAndGet();
  nodes.push(o);
  List<Operator<?>> children;
  if (o instanceof AppMasterEventOperator) {
    children = new ArrayList<Operator<?>>();
    children.addAll(o.getChildOperators());
    TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan();
    LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
    children.add(ts);
  } else if (o instanceof ReduceSinkOperator) {
    // semijoin case
    children = new ArrayList<Operator<?>>();
    children.addAll(o.getChildOperators());
    SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
    if (sjInfo != null) {
      TableScanOperator ts = sjInfo.getTsOp();
      LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
      children.add(ts);
    }
  } else {
    children = o.getChildOperators();
  }
  for (Operator<?> child : children) {
    if (!indexes.containsKey(child)) {
      connect(child, index, nodes, indexes, lowLinks, components, parseContext);
      lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child)));
    } else if (nodes.contains(child)) {
      lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child)));
    }
  }
  if (lowLinks.get(o).equals(indexes.get(o))) {
    Set<Operator<?>> component = new LinkedHashSet<Operator<?>>();
    components.add(component);
    Operator<?> current;
    do {
      current = nodes.pop();
      component.add(current);
    } while (current != o);
  }
}
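connect is the recursive step of Tarjan's strongly-connected-components algorithm, run over the operator DAG plus the two kinds of "special" edges that exist only logically in the plan; any component with more than one member indicates a cycle. For reference, a self-contained version of plain Tarjan over an integer adjacency list (illustrative, not Hive code):

import java.util.*;

final class Tarjan {
    private final List<List<Integer>> adj;   // adjacency list of the graph
    private final int[] index, lowLink;      // discovery index and low-link per node
    private final Deque<Integer> stack = new ArrayDeque<>();
    private final boolean[] onStack;
    private final List<Set<Integer>> components = new ArrayList<>();
    private int counter = 0;

    Tarjan(List<List<Integer>> adj) {
        this.adj = adj;
        int n = adj.size();
        index = new int[n];
        lowLink = new int[n];
        onStack = new boolean[n];
        Arrays.fill(index, -1);
    }

    List<Set<Integer>> run() {
        for (int v = 0; v < adj.size(); v++) {
            if (index[v] == -1) {
                connect(v);
            }
        }
        return components;
    }

    private void connect(int v) {
        index[v] = lowLink[v] = counter++;
        stack.push(v);
        onStack[v] = true;
        for (int w : adj.get(v)) {
            if (index[w] == -1) {
                connect(w);
                lowLink[v] = Math.min(lowLink[v], lowLink[w]);
            } else if (onStack[w]) {
                lowLink[v] = Math.min(lowLink[v], index[w]);
            }
        }
        if (lowLink[v] == index[v]) { // v is the root of a component
            Set<Integer> component = new LinkedHashSet<>();
            int w;
            do {
                w = stack.pop();
                onStack[w] = false;
                component.add(w);
            } while (w != v);
            components.add(component);
        }
    }
}

The Hive variant checks nodes.contains(child) instead of keeping an on-stack flag, which is linear in the stack size per edge but performs the same test.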
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class GenTezUtils, method createReduceWork.
public static ReduceWork createReduceWork(GenTezProcContext context, Operator<?> root, TezWork tezWork) {
  assert !root.getParentOperators().isEmpty();
  boolean isAutoReduceParallelism = context.conf.getBoolVar(HiveConf.ConfVars.TEZ_AUTO_REDUCER_PARALLELISM);
  float maxPartitionFactor = context.conf.getFloatVar(HiveConf.ConfVars.TEZ_MAX_PARTITION_FACTOR);
  float minPartitionFactor = context.conf.getFloatVar(HiveConf.ConfVars.TEZ_MIN_PARTITION_FACTOR);
  long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
  ReduceWork reduceWork = new ReduceWork(Utilities.REDUCENAME + context.nextSequenceNumber());
  LOG.debug("Adding reduce work (" + reduceWork.getName() + ") for " + root);
  reduceWork.setReducer(root);
  reduceWork.setNeedsTagging(GenMapRedUtils.needsTagging(reduceWork));
  // the root of a reduce work has exactly one parent: the reduce sink that feeds it.
  assert context.parentOfRoot instanceof ReduceSinkOperator;
  ReduceSinkOperator reduceSink = (ReduceSinkOperator) context.parentOfRoot;
  reduceWork.setNumReduceTasks(reduceSink.getConf().getNumReducers());
  reduceWork.setSlowStart(reduceSink.getConf().isSlowStart());
  reduceWork.setUniformDistribution(reduceSink.getConf().getReducerTraits().contains(UNIFORM));
  if (isAutoReduceParallelism && reduceSink.getConf().getReducerTraits().contains(AUTOPARALLEL)) {
    // configured limit for reducers
    final int maxReducers = context.conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    // estimated number of reducers
    final int nReducers = reduceSink.getConf().getNumReducers();
    // min we allow tez to pick
    int minPartition = Math.max(1, (int) (nReducers * minPartitionFactor));
    minPartition = (minPartition > maxReducers) ? maxReducers : minPartition;
    // max we allow tez to pick
    int maxPartition = Math.max(1, (int) (nReducers * maxPartitionFactor));
    maxPartition = (maxPartition > maxReducers) ? maxReducers : maxPartition;
    // reduce only if the parameters are significant
    if (minPartition < maxPartition && nReducers * minPartitionFactor >= 1.0) {
      reduceWork.setAutoReduceParallelism(true);
      reduceWork.setMinReduceTasks(minPartition);
      reduceWork.setMaxReduceTasks(maxPartition);
    } else if (nReducers < maxPartition) {
      // the max is good, the min is too low
      reduceWork.setNumReduceTasks(maxPartition);
    }
  }
  setupReduceSink(context, reduceWork, reduceSink);
  tezWork.add(reduceWork);
  TezEdgeProperty edgeProp;
  EdgeType edgeType = determineEdgeType(context.preceedingWork, reduceWork, reduceSink);
  if (reduceWork.isAutoReduceParallelism()) {
    edgeProp = new TezEdgeProperty(context.conf, edgeType, true, reduceWork.isSlowStart(),
        reduceWork.getMinReduceTasks(), reduceWork.getMaxReduceTasks(), bytesPerReducer);
  } else {
    edgeProp = new TezEdgeProperty(edgeType);
    edgeProp.setSlowStart(reduceWork.isSlowStart());
  }
  reduceWork.setEdgePropRef(edgeProp);
  tezWork.connect(context.preceedingWork, reduceWork, edgeProp);
  context.connectedReduceSinks.add(reduceSink);
  return reduceWork;
}
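The clamping in the auto-parallelism branch is easiest to see with concrete numbers. A standalone sketch with assumed inputs (1009 matches the usual hive.exec.reducers.max default; 0.25 and 2.0 mirror the common defaults for hive.tez.min.partition.factor and hive.tez.max.partition.factor; the reducer estimate is invented):

public class AutoParallelismExample {
    public static void main(String[] args) {
        int nReducers = 100;              // reduce sink's estimated parallelism (assumed)
        float minPartitionFactor = 0.25f; // hive.tez.min.partition.factor
        float maxPartitionFactor = 2.0f;  // hive.tez.max.partition.factor
        int maxReducers = 1009;           // hive.exec.reducers.max

        // same arithmetic as createReduceWork above
        int minPartition = Math.max(1, (int) (nReducers * minPartitionFactor));
        minPartition = Math.min(minPartition, maxReducers);
        int maxPartition = Math.max(1, (int) (nReducers * maxPartitionFactor));
        maxPartition = Math.min(maxPartition, maxReducers);

        // prints "Tez may pick between 25 and 200 reducers"
        System.out.println("Tez may pick between " + minPartition
            + " and " + maxPartition + " reducers");
    }
}

Since 25 < 200 and 100 * 0.25 >= 1.0, the "parameters are significant" test passes and Tez is allowed to tune the reducer count anywhere in that range at runtime.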
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class GenTezUtils, method removeUnionOperators.
// removes any union operator and clones the plan
public static void removeUnionOperators(GenTezProcContext context, BaseWork work, int indexForTezUnion) throws SemanticException {
  List<Operator<?>> roots = new ArrayList<Operator<?>>();
  roots.addAll(work.getAllRootOperators());
  if (work.getDummyOps() != null) {
    roots.addAll(work.getDummyOps());
  }
  roots.addAll(context.eventOperatorSet);
  // need to clone the plan.
  List<Operator<?>> newRoots = SerializationUtilities.cloneOperatorTree(roots, indexForTezUnion);
  // we're cloning the operator plan but we're retaining the original work. That means
  // that root operators have to be replaced with the cloned ops. The replacement map
  // tells you what that mapping is.
  BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create();
  // there's some special handling for dummyOps required. Mapjoins won't be properly
  // initialized if their dummy parents aren't initialized. Since we cloned the plan
  // we need to replace the dummy operators in the work with the cloned ones.
  List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();
  Iterator<Operator<?>> it = newRoots.iterator();
  for (Operator<?> orig : roots) {
    Set<FileSinkOperator> fsOpSet = OperatorUtils.findOperators(orig, FileSinkOperator.class);
    for (FileSinkOperator fsOp : fsOpSet) {
      context.fileSinkSet.remove(fsOp);
    }
    Operator<?> newRoot = it.next();
    replacementMap.put(orig, newRoot);
    if (newRoot instanceof HashTableDummyOperator) {
      // dummy ops need to be updated to the cloned ones.
      dummyOps.add((HashTableDummyOperator) newRoot);
      it.remove();
    } else if (newRoot instanceof AppMasterEventOperator) {
      // need to restore the original scan.
      if (newRoot.getConf() instanceof DynamicPruningEventDesc) {
        TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan();
        if (ts == null) {
          throw new AssertionError("No table scan associated with dynamic event pruning. " + orig);
        }
        ((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts);
      }
      it.remove();
    } else {
      if (newRoot instanceof TableScanOperator) {
        if (context.tsToEventMap.containsKey(orig)) {
          // we need to update event operators with the cloned table scan
          for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) {
            ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot);
          }
        }
        // This TableScanOperator could be part of semijoin optimization.
        Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo = context.parseContext.getRsToSemiJoinBranchInfo();
        for (ReduceSinkOperator rs : rsToSemiJoinBranchInfo.keySet()) {
          SemiJoinBranchInfo sjInfo = rsToSemiJoinBranchInfo.get(rs);
          if (sjInfo.getTsOp() == orig) {
            SemiJoinBranchInfo newSJInfo = new SemiJoinBranchInfo((TableScanOperator) newRoot, sjInfo.getIsHint());
            rsToSemiJoinBranchInfo.put(rs, newSJInfo);
          }
        }
      }
      context.rootToWorkMap.remove(orig);
      context.rootToWorkMap.put(newRoot, work);
    }
  }
  // now we remove all the unions. we throw away any branch that's not reachable from
  // the current set of roots. The reason is that those branches will be handled in
  // different tasks.
  Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
  operators.addAll(newRoots);
  Set<Operator<?>> seen = new HashSet<Operator<?>>();
  while (!operators.isEmpty()) {
    Operator<?> current = operators.pop();
    seen.add(current);
    if (current instanceof FileSinkOperator) {
      FileSinkOperator fileSink = (FileSinkOperator) current;
      // remember it for additional processing later
      context.fileSinkSet.add(fileSink);
      FileSinkDesc desc = fileSink.getConf();
      Path path = desc.getDirName();
      List<FileSinkDesc> linked;
      if (!context.linkedFileSinks.containsKey(path)) {
        linked = new ArrayList<FileSinkDesc>();
        context.linkedFileSinks.put(path, linked);
      }
      linked = context.linkedFileSinks.get(path);
      linked.add(desc);
      desc.setDirName(new Path(path, AbstractFileMergeOperator.UNION_SUDBIR_PREFIX + linked.size()));
      Utilities.FILE_OP_LOGGER.debug("removing union - new desc with " + desc.getDirName() + "; parent " + path);
      desc.setLinkedFileSink(true);
      desc.setLinkedFileSinkDesc(linked);
    }
    if (current instanceof AppMasterEventOperator) {
      // remember for additional processing later
      context.eventOperatorSet.add((AppMasterEventOperator) current);
      // mark the original as abandoned. Don't need it anymore.
      context.abandonedEventOperatorSet.add((AppMasterEventOperator) replacementMap.inverse().get(current));
    }
    if (current instanceof UnionOperator) {
      Operator<?> parent = null;
      int count = 0;
      for (Operator<?> op : current.getParentOperators()) {
        if (seen.contains(op)) {
          ++count;
          parent = op;
        }
      }
      // we should have been able to reach the union from only one side.
      assert count <= 1;
      if (parent == null) {
        // root operator is union (can happen in reducers)
        replacementMap.put(current, current.getChildOperators().get(0));
      } else {
        parent.removeChildAndAdoptItsChildren(current);
      }
    }
    if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) {
      current.setChildOperators(null);
    } else {
      operators.addAll(current.getChildOperators());
    }
  }
  LOG.debug("Setting dummy ops for work " + work.getName() + ": " + dummyOps);
  work.setDummyOps(dummyOps);
  work.replaceRoots(replacementMap);
}
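The actual union removal happens in removeChildAndAdoptItsChildren: the union is spliced out and its children reattach to the one parent that reached it. A standalone sketch of that splice on a hypothetical doubly-linked node type (not Hive's Operator; Hive keeps parent and child lists in a similar shape):

import java.util.ArrayList;
import java.util.List;

// Hypothetical plan node with bidirectional parent/child links.
final class PlanNode {
    final String name;
    final List<PlanNode> parents = new ArrayList<>();
    final List<PlanNode> children = new ArrayList<>();

    PlanNode(String name) { this.name = name; }

    // Splice 'child' out of the tree: this node adopts the child's children,
    // and each grandchild's parent pointer is rewired to this node.
    void removeChildAndAdoptItsChildren(PlanNode child) {
        if (!children.remove(child)) {
            throw new IllegalArgumentException(child.name + " is not a child of " + name);
        }
        for (PlanNode grandChild : child.children) {
            children.add(grandChild);
            grandChild.parents.remove(child);
            grandChild.parents.add(this);
        }
        child.parents.remove(this);
    }
}

Note also that the traversal above cuts the child lists of file sinks and reduce sinks (setChildOperators(null)), so it never walks past a work boundary and only unions inside the current work get spliced.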