use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.
the class GenMapRedUtils method initUnionPlan.
/**
 * Initialize the current union plan.
 *
 * <p>The first child of the reduce sink becomes the reducer of {@code unionTask}: it is
 * registered in the context's operator-to-task map, a fresh {@link ReduceWork} is installed on
 * the task's {@link MapredWork}, the reducer count is copied from the reduce sink's descriptor,
 * and tagging is enabled when {@code needsTagging} says so. The remaining initialization is
 * delegated to the {@code initUnionPlan(GenMRProcContext, UnionOperator, Task, boolean)}
 * overload with {@code false} as the last argument.
 *
 * @param op
 *          the reduce sink operator encountered; its first child is used as the reducer
 * @param currUnionOp
 *          the union operator currently being processed
 * @param opProcCtx
 *          processing context
 * @param unionTask
 *          the task for the union; its work is cast to {@link MapredWork}
 * @throws SemanticException
 *           if initializing the union plan fails
 */
public static void initUnionPlan(ReduceSinkOperator op, UnionOperator currUnionOp, GenMRProcContext opProcCtx, Task<? extends Serializable> unionTask) throws SemanticException {
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  MapredWork plan = (MapredWork) unionTask.getWork();

  HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap();
  opTaskMap.put(reducer, unionTask);

  // Install a fresh reduce work and configure it through a single reference.
  plan.setReduceWork(new ReduceWork());
  ReduceWork reduceWork = plan.getReduceWork();
  reduceWork.setReducer(reducer);

  ReduceSinkDesc desc = op.getConf();
  reduceWork.setNumReduceTasks(desc.getNumReducers());
  if (needsTagging(reduceWork)) {
    reduceWork.setNeedsTagging(true);
  }

  initUnionPlan(opProcCtx, currUnionOp, unionTask, false);
}
use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.
the class SparkCompiler method generateTaskTreeHelper.
/**
 * Builds the rule table used to turn the operator tree into Spark work and walks
 * {@code topNodes} with it.
 *
 * <p>The tree is walked depth-first while an operator stack is maintained; for each node the
 * dispatcher fires the processor of the closest matching rule and hands it {@code procCtx}.
 *
 * @param procCtx the Spark plan-generation context shared by all processors
 * @param topNodes the nodes from which the walk starts
 * @throws SemanticException if a processor or the walker fails
 */
private void generateTaskTreeHelper(GenSparkProcContext procCtx, List<Node> topNodes) throws SemanticException {
  // Insertion-ordered rule table: rule -> processor to fire when the rule matches.
  Map<Rule, NodeProcessor> rules = new LinkedHashMap<>();
  GenSparkWork sparkWorkGenerator = new GenSparkWork(GenSparkUtils.getUtils());

  rules.put(
      new RuleRegExp("Split Work - ReduceSink", ReduceSinkOperator.getOperatorName() + "%"),
      sparkWorkGenerator);
  rules.put(
      new RuleRegExp("Split Work - SparkPartitionPruningSink", SparkPartitionPruningSinkOperator.getOperatorName() + "%"),
      sparkWorkGenerator);
  rules.put(new TypeRule(MapJoinOperator.class), new SparkReduceSinkMapJoinProc());
  rules.put(
      new RuleRegExp("Split Work + Move/Merge - FileSink", FileSinkOperator.getOperatorName() + "%"),
      new CompositeProcessor(new SparkFileSinkProcessor(), sparkWorkGenerator));
  rules.put(
      new RuleRegExp("Handle Analyze Command", TableScanOperator.getOperatorName() + "%"),
      new SparkProcessAnalyzeTable(GenSparkUtils.getUtils()));

  rules.put(
      new RuleRegExp("Remember union", UnionOperator.getOperatorName() + "%"),
      new NodeProcessor() {
        @Override
        public Object process(Node node, Stack<Node> ancestors, NodeProcessorCtx ctx, Object... outputs) throws SemanticException {
          // Nothing to generate here: just record that this union was encountered.
          GenSparkProcContext sparkCtx = (GenSparkProcContext) ctx;
          sparkCtx.currentUnionOperators.add((UnionOperator) node);
          return null;
        }
      });

  /*
   * SMB join case:   (Big)   (Small)  (Small)
   *                    TS       TS       TS
   *                     \       |       /
   *                      \      DS     DS
   *                       \     |     /
   *                         SMBJoinOP
   *
   * Other processors expect exactly one traversal beyond the SMBJoinOp, so the walk continues
   * only when the join is reached via the big-table path and stops when it is reached via a
   * small-table path. Either way the current root operator is recorded in the per-join
   * SparkSMBMapJoinInfo so the MapWork can be annotated later on.
   */
  rules.put(new TypeRule(SMBMapJoinOperator.class), new NodeProcessor() {
    @Override
    public Object process(Node node, Stack<Node> ancestors, NodeProcessorCtx ctx, Object... outputs) throws SemanticException {
      GenSparkProcContext sparkCtx = (GenSparkProcContext) ctx;
      SMBMapJoinOperator smbJoin = (SMBMapJoinOperator) node;

      // Fetch the bookkeeping record for this join, creating it on first visit.
      SparkSMBMapJoinInfo joinInfo = sparkCtx.smbMapJoinCtxMap.get(smbJoin);
      if (joinInfo == null) {
        joinInfo = new SparkSMBMapJoinInfo();
        sparkCtx.smbMapJoinCtxMap.put(smbJoin, joinInfo);
      }

      // A DummyStoreOperator anywhere on the stack means we arrived via a small table.
      boolean viaSmallTable = false;
      for (Node ancestor : ancestors) {
        if (ancestor instanceof DummyStoreOperator) {
          viaSmallTable = true;
          break;
        }
      }

      if (viaSmallTable) {
        // Small-table side: remember the root and skip further traversal.
        joinInfo.smallTableRootOps.add(sparkCtx.currentRootOperator);
        return true;
      }

      // Big-table side: remember the root and keep traversing.
      joinInfo.bigTableRootOp = sparkCtx.currentRootOperator;
      return false;
    }
  });

  Dispatcher dispatcher = new DefaultRuleDispatcher(null, rules, procCtx);
  GraphWalker walker = new GenSparkWorkWalker(dispatcher, procCtx);
  walker.startWalking(topNodes, null);
}
use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.
the class GenTezUtils method removeUnionOperators.
/**
 * Removes any union operator from the plan of {@code work}, operating on a clone of the plan.
 *
 * <p>The operator trees rooted at the work's root operators, its dummy operators (if any) and
 * the context's event operators are cloned. The original {@code work} object is kept, so its
 * roots and dummy operators are swapped for their clones, and the context's bookkeeping
 * (fileSinkSet, tsToEventMap descriptors, rsOpToTsOpMap, rootToWorkMap, linkedFileSinks,
 * eventOperatorSet, abandonedEventOperatorSet) is updated to point at the cloned operators.
 * The cloned tree is then traversed and every UnionOperator reached is spliced out.
 *
 * @param context the Tez processing context whose bookkeeping collections are updated in place
 * @param work the work whose roots and dummy operators are replaced by their clones
 * @param indexForTezUnion passed through to SerializationUtilities.cloneOperatorTree
 *        (NOTE(review): its exact meaning is not visible here - confirm against that method)
 * @throws SemanticException declared by the signature; presumably raised by the operator
 *         graph manipulation calls - verify
 */
public static void removeUnionOperators(GenTezProcContext context, BaseWork work, int indexForTezUnion) throws SemanticException {
List<Operator<?>> roots = new ArrayList<Operator<?>>();
roots.addAll(work.getAllRootOperators());
if (work.getDummyOps() != null) {
roots.addAll(work.getDummyOps());
}
roots.addAll(context.eventOperatorSet);
// need to clone the plan.
List<Operator<?>> newRoots = SerializationUtilities.cloneOperatorTree(roots, indexForTezUnion);
// we're cloning the operator plan but we're retaining the original work. That means
// that root operators have to be replaced with the cloned ops. The replacement map
// tells you what that mapping is.
BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create();
// there's some special handling for dummyOps required. Mapjoins won't be properly
// initialized if their dummy parents aren't initialized. Since we cloned the plan
// we need to replace the dummy operators in the work with the cloned ones.
List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();
// newRoots is consumed in lockstep with roots: the i-th clone is paired with the i-th
// original. NOTE(review): this assumes cloneOperatorTree returns the clones in input order.
Iterator<Operator<?>> it = newRoots.iterator();
for (Operator<?> orig : roots) {
// file sinks under the original root are dropped from the context; the cloned file sinks
// are registered again during the traversal further down.
Set<FileSinkOperator> fsOpSet = OperatorUtils.findOperators(orig, FileSinkOperator.class);
for (FileSinkOperator fsOp : fsOpSet) {
context.fileSinkSet.remove(fsOp);
}
Operator<?> newRoot = it.next();
replacementMap.put(orig, newRoot);
if (newRoot instanceof HashTableDummyOperator) {
// dummy ops need to be updated to the cloned ones.
dummyOps.add((HashTableDummyOperator) newRoot);
// drop the clone from newRoots so the traversal below does not start from it.
it.remove();
} else if (newRoot instanceof AppMasterEventOperator) {
// need to restore the original scan.
if (newRoot.getConf() instanceof DynamicPruningEventDesc) {
// the table scan is read from the ORIGINAL event's descriptor and set on the clone's.
TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan();
if (ts == null) {
throw new AssertionError("No table scan associated with dynamic event pruning. " + orig);
}
((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts);
}
// event operator clones are not traversal starting points either.
it.remove();
} else {
if (newRoot instanceof TableScanOperator) {
if (context.tsToEventMap.containsKey(orig)) {
// we need to update event operators with the cloned table scan
for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) {
((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot);
}
}
// This TableScanOperator could be part of semijoin optimization.
Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap = context.parseContext.getRsOpToTsOpMap();
// only values of existing keys are overwritten while iterating the key set; no key is
// added or removed.
for (ReduceSinkOperator rs : rsOpToTsOpMap.keySet()) {
if (rsOpToTsOpMap.get(rs) == orig) {
rsOpToTsOpMap.put(rs, (TableScanOperator) newRoot);
}
}
}
// re-key the root -> work mapping from the original root to its clone.
context.rootToWorkMap.remove(orig);
context.rootToWorkMap.put(newRoot, work);
}
}
// now we remove all the unions. we throw away any branch that's not reachable from
// the current set of roots. The reason is that those branches will be handled in
// different tasks.
// pop() takes from the head and addAll() appends to the tail, so operators are visited in
// breadth-first order starting from the remaining cloned roots.
Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
operators.addAll(newRoots);
// 'seen' is only consulted when counting a union's already-visited parents; it is not used
// to stop an operator from being queued more than once.
Set<Operator<?>> seen = new HashSet<Operator<?>>();
while (!operators.isEmpty()) {
Operator<?> current = operators.pop();
seen.add(current);
if (current instanceof FileSinkOperator) {
FileSinkOperator fileSink = (FileSinkOperator) current;
// remember it for additional processing later
context.fileSinkSet.add(fileSink);
FileSinkDesc desc = fileSink.getConf();
Path path = desc.getDirName();
// all file sink descriptors that originally wrote to the same directory share one list,
// keyed by that directory in context.linkedFileSinks.
List<FileSinkDesc> linked;
if (!context.linkedFileSinks.containsKey(path)) {
linked = new ArrayList<FileSinkDesc>();
context.linkedFileSinks.put(path, linked);
}
linked = context.linkedFileSinks.get(path);
linked.add(desc);
// each linked sink is redirected to a child directory named after its 1-based position in
// the list; the original directory is kept as the parent dir.
desc.setDirName(new Path(path, "" + linked.size()));
desc.setLinkedFileSink(true);
desc.setParentDir(path);
desc.setLinkedFileSinkDesc(linked);
}
if (current instanceof AppMasterEventOperator) {
// remember for additional processing later
context.eventOperatorSet.add((AppMasterEventOperator) current);
// mark the original as abandoned. Don't need it anymore.
// inverse() maps a clone back to the original registered for it in replacementMap.
// NOTE(review): only cloned roots (and replaced unions) are registered there - confirm that
// every event operator reached here has an entry, otherwise null is added.
context.abandonedEventOperatorSet.add((AppMasterEventOperator) replacementMap.inverse().get(current));
}
if (current instanceof UnionOperator) {
// find the (single) parent through which this traversal reached the union.
Operator<?> parent = null;
int count = 0;
for (Operator<?> op : current.getParentOperators()) {
if (seen.contains(op)) {
++count;
parent = op;
}
}
// we should have been able to reach the union from only one side.
// (checked only when JVM assertions are enabled)
assert count <= 1;
if (parent == null) {
// root operator is union (can happen in reducers)
// the union's first child takes over as the replacement root.
replacementMap.put(current, current.getChildOperators().get(0));
} else {
// bypass the union; per its name this call hands the union's children over to 'parent'.
parent.removeChildAndAdoptItsChildren(current);
}
}
// file sinks and reduce sinks end a branch: their children are cleared and not traversed.
if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) {
current.setChildOperators(null);
} else {
operators.addAll(current.getChildOperators());
}
}
LOG.debug("Setting dummy ops for work " + work.getName() + ": " + dummyOps);
work.setDummyOps(dummyOps);
work.replaceRoots(replacementMap);
}
use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.
the class GenTezWorkWalker method walk.
/**
 * Walk the given operator.
 *
 * <p>The node is pushed on the operator stack and dispatched; unless the dispatch result is
 * {@code true}, each child is then walked recursively. Before every child the context's
 * positional state (current root, parent of root, preceding work, current union operators) is
 * reset to what it was right after this node was dispatched, so sibling subtrees do not see
 * each other's changes.
 *
 * @param nd operator being walked
 * @throws SemanticException if dispatching this node or walking a child fails
 */
@Override
protected void walk(Node nd) throws SemanticException {
  // The child list is fetched before the node is dispatched.
  List<? extends Node> childNodes = nd.getChildren();

  // Keep the stack of operators on the current path up to date.
  opStack.push(nd);
  Boolean skipSubtree = dispatchAndReturn(nd, opStack);

  // Snapshot of the positional state as left by the dispatch of this node.
  Operator<? extends OperatorDesc> savedRoot = ctx.currentRootOperator;
  Operator<? extends OperatorDesc> savedParentOfRoot = ctx.parentOfRoot;
  List<UnionOperator> savedUnions = ctx.currentUnionOperators;
  BaseWork savedPreceedingWork = ctx.preceedingWork;

  // A null or false dispatch result means "descend into the children".
  if (!Boolean.TRUE.equals(skipSubtree)) {
    for (Node child : childNodes) {
      // Every child starts from the same snapshot; the union list is a fresh copy so that
      // additions made while walking one child are not visible to the next.
      ctx.currentRootOperator = savedRoot;
      ctx.parentOfRoot = savedParentOfRoot;
      ctx.preceedingWork = savedPreceedingWork;
      ctx.currentUnionOperators = new ArrayList<>();
      ctx.currentUnionOperators.addAll(savedUnions);
      walk(child);
    }
  }

  // This operator is finished; take it off the path.
  opStack.pop();
}
use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.
the class SemanticAnalyzer method genUnionPlan.
/**
 * Generates the plan for the union of two inputs.
 *
 * <p>The two sides must expose the same number of columns, and each pair of columns must have
 * a common type according to {@code FunctionRegistry.getCommonClassForUnionAll}. The output
 * columns take their names from the left side and the common type of both sides. If either
 * input is already a {@link UnionOperator} (or an identity select sitting directly on top of
 * one), the other input is attached to that existing union as an additional parent; otherwise
 * a new union operator with both inputs as parents is created.
 *
 * @param unionalias alias under which the union's output columns are registered
 * @param leftalias alias used to look up the left input's columns
 * @param leftOp operator producing the left input
 * @param rightalias alias used to look up the right input's columns
 * @param rightOp operator producing the right input
 * @return the union operator (existing or newly created) as returned by {@code putOpInsertMap}
 * @throws SemanticException if the column counts differ or a column pair has no common type
 */
@SuppressWarnings("nls")
private Operator genUnionPlan(String unionalias, String leftalias, Operator leftOp, String rightalias, Operator rightOp) throws SemanticException {
  RowResolver leftRR = opParseCtx.get(leftOp).getRowResolver();
  RowResolver rightRR = opParseCtx.get(rightOp).getRowResolver();
  LinkedHashMap<String, ColumnInfo> leftmap = leftRR.getFieldMap(leftalias);
  LinkedHashMap<String, ColumnInfo> rightmap = rightRR.getFieldMap(rightalias);

  // make sure the schemas of both sides are the same
  ASTNode tabref = qb.getAliases().isEmpty() ? null : qb.getParseInfo().getSrcForAlias(qb.getAliases().get(0));
  if (leftmap.size() != rightmap.size()) {
    throw new SemanticException("Schema of both sides of union should match.");
  }

  // Build the output row resolver column by column; both maps have the same size, so the two
  // iterators advance in lockstep.
  RowResolver unionoutRR = new RowResolver();
  Iterator<Map.Entry<String, ColumnInfo>> lIter = leftmap.entrySet().iterator();
  Iterator<Map.Entry<String, ColumnInfo>> rIter = rightmap.entrySet().iterator();
  while (lIter.hasNext()) {
    Map.Entry<String, ColumnInfo> lEntry = lIter.next();
    Map.Entry<String, ColumnInfo> rEntry = rIter.next();
    ColumnInfo lInfo = lEntry.getValue();
    ColumnInfo rInfo = rEntry.getValue();
    // use left alias (~mysql, postgresql)
    String field = lEntry.getKey();
    // try widening conversion, otherwise fail union
    TypeInfo commonTypeInfo = FunctionRegistry.getCommonClassForUnionAll(lInfo.getType(), rInfo.getType());
    if (commonTypeInfo == null) {
      throw new SemanticException(generateErrorMessage(tabref, "Schema of both sides of union should match: Column " + field + " is of type " + lInfo.getType().getTypeName() + " on first table and type " + rInfo.getType().getTypeName() + " on second table"));
    }
    ColumnInfo unionColInfo = new ColumnInfo(lInfo);
    unionColInfo.setType(commonTypeInfo);
    unionoutRR.put(unionalias, field, unionColInfo);
  }

  // For Spark,TEZ we rely on the generated SelectOperator to do the type casting.
  // Consider:
  //   SEL_1 (int)   SEL_2 (int)    SEL_3 (double)
  // If we first merge SEL_1 and SEL_2 into a UNION_1, and then merge UNION_1
  // with SEL_3 to get UNION_2, then no SelectOperator will be inserted. Hence error
  // will happen afterwards. The solution here is to insert one after UNION_1, which
  // cast int to double.
  boolean isMR = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr");
  if (!isMR || !(leftOp instanceof UnionOperator)) {
    leftOp = genInputSelectForUnion(leftOp, leftmap, leftalias, unionoutRR, unionalias);
  }
  if (!isMR || !(rightOp instanceof UnionOperator)) {
    rightOp = genInputSelectForUnion(rightOp, rightmap, rightalias, unionoutRR, unionalias);
  }

  // If one of the inputs already is (or trivially wraps) a union, merge into it. The left
  // side is checked first.
  if (isUnionOrIdentitySelectOverUnion(leftOp)) {
    return attachToExistingUnion(leftOp, rightOp, unionoutRR);
  }
  if (isUnionOrIdentitySelectOverUnion(rightOp)) {
    return attachToExistingUnion(rightOp, leftOp, unionoutRR);
  }

  // Otherwise create a new union operator
  Operator<? extends OperatorDesc> unionforward = OperatorFactory.getAndMakeChild(getOpContext(), new UnionDesc(), new RowSchema(unionoutRR.getColumnInfos()));

  // set union operator as the only child of each of rightOp and leftOp
  List<Operator<? extends OperatorDesc>> child = new ArrayList<Operator<? extends OperatorDesc>>();
  child.add(unionforward);
  rightOp.setChildOperators(child);
  child = new ArrayList<Operator<? extends OperatorDesc>>();
  child.add(unionforward);
  leftOp.setChildOperators(child);

  List<Operator<? extends OperatorDesc>> parent = new ArrayList<Operator<? extends OperatorDesc>>();
  parent.add(leftOp);
  parent.add(rightOp);
  unionforward.setParentOperators(parent);

  // create operator info list to return
  return putOpInsertMap(unionforward, unionoutRR);
}

/**
 * Tells whether {@code op} is a union operator, or an identity select whose first parent is a
 * union operator.
 */
private static boolean isUnionOrIdentitySelectOverUnion(Operator op) {
  if (op instanceof UnionOperator) {
    return true;
  }
  return op instanceof SelectOperator
      && op.getParentOperators() != null
      && !op.getParentOperators().isEmpty()
      && op.getParentOperators().get(0) instanceof UnionOperator
      && ((SelectOperator) op).isIdentitySelect();
}

/**
 * Attaches {@code otherSide} as one more parent of the union found at {@code unionSide} and
 * registers that union with the output row resolver.
 *
 * <p>{@code unionSide} must satisfy {@link #isUnionOrIdentitySelectOverUnion(Operator)}. When
 * it is an identity select over a union, the select is first removed from under the union.
 */
private Operator attachToExistingUnion(Operator unionSide, Operator otherSide, RowResolver unionoutRR) throws SemanticException {
  Operator union = unionSide;
  if (!(union instanceof UnionOperator)) {
    // unionSide is an identity select over the union: drop the select, keep the union.
    Operator identitySelect = union;
    union = (Operator) identitySelect.getParentOperators().get(0);
    union.removeChildAndAdoptItsChildren(identitySelect);
  }

  // the union becomes the only child of the other side ...
  List<Operator<? extends OperatorDesc>> child = new ArrayList<Operator<? extends OperatorDesc>>();
  child.add(union);
  otherSide.setChildOperators(child);

  // ... and the other side is appended to the union's parents.
  List<Operator<? extends OperatorDesc>> parent = union.getParentOperators();
  parent.add(otherSide);

  UnionDesc uDesc = ((UnionOperator) union).getConf();
  uDesc.setNumInputs(uDesc.getNumInputs() + 1);
  return putOpInsertMap(union, unionoutRR);
}
Aggregations