use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class SemanticAnalyzer method genReduceSinkPlan.
private Operator genReduceSinkPlan(String dest, QB qb, Operator<?> input, int numReducers, boolean hasOrderBy) throws SemanticException {
  RowResolver inputRR = opParseCtx.get(input).getRowResolver();
  // First generate the expressions for the partition and sort keys.
  // The cluster by / distribute by clause provides the aliases for the
  // partition function.
  ASTNode partitionExprs = qb.getParseInfo().getClusterByForClause(dest);
  if (partitionExprs == null) {
    partitionExprs = qb.getParseInfo().getDistributeByForClause(dest);
  }
  ArrayList<ExprNodeDesc> partCols = new ArrayList<ExprNodeDesc>();
  if (partitionExprs != null) {
    int ccount = partitionExprs.getChildCount();
    for (int i = 0; i < ccount; ++i) {
      ASTNode cl = (ASTNode) partitionExprs.getChild(i);
      partCols.add(genExprNodeDesc(cl, inputRR));
    }
  }
  ASTNode sortExprs = qb.getParseInfo().getClusterByForClause(dest);
  if (sortExprs == null) {
    sortExprs = qb.getParseInfo().getSortByForClause(dest);
  }
  if (sortExprs == null) {
    sortExprs = qb.getParseInfo().getOrderByForClause(dest);
    if (sortExprs != null) {
      assert numReducers == 1;
      // In strict mode, in the presence of order by, limit must be specified.
      if (qb.getParseInfo().getDestLimit(dest) == null) {
        String error = StrictChecks.checkNoLimit(conf);
        if (error != null) {
          throw new SemanticException(generateErrorMessage(sortExprs, error));
        }
      }
    }
  }
  ArrayList<ExprNodeDesc> sortCols = new ArrayList<ExprNodeDesc>();
  StringBuilder order = new StringBuilder();
  StringBuilder nullOrder = new StringBuilder();
  if (sortExprs != null) {
    int ccount = sortExprs.getChildCount();
    for (int i = 0; i < ccount; ++i) {
      ASTNode cl = (ASTNode) sortExprs.getChild(i);
      if (cl.getType() == HiveParser.TOK_TABSORTCOLNAMEASC) {
        // Sort by ASC
        order.append("+");
        cl = (ASTNode) cl.getChild(0);
        if (cl.getType() == HiveParser.TOK_NULLS_FIRST) {
          nullOrder.append("a");
        } else if (cl.getType() == HiveParser.TOK_NULLS_LAST) {
          nullOrder.append("z");
        } else {
          throw new SemanticException("Unexpected null ordering option: " + cl.getType());
        }
        cl = (ASTNode) cl.getChild(0);
      } else if (cl.getType() == HiveParser.TOK_TABSORTCOLNAMEDESC) {
        // Sort by DESC
        order.append("-");
        cl = (ASTNode) cl.getChild(0);
        if (cl.getType() == HiveParser.TOK_NULLS_FIRST) {
          nullOrder.append("a");
        } else if (cl.getType() == HiveParser.TOK_NULLS_LAST) {
          nullOrder.append("z");
        } else {
          throw new SemanticException("Unexpected null ordering option: " + cl.getType());
        }
        cl = (ASTNode) cl.getChild(0);
      } else {
        // Cluster by
        order.append("+");
        nullOrder.append("a");
      }
      ExprNodeDesc exprNode = genExprNodeDesc(cl, inputRR);
      sortCols.add(exprNode);
    }
  }
  Operator result = genReduceSinkPlan(input, partCols, sortCols, order.toString(), nullOrder.toString(), numReducers, Operation.NOT_ACID, true);
  if (result.getParentOperators().size() == 1 && result.getParentOperators().get(0) instanceof ReduceSinkOperator) {
    ((ReduceSinkOperator) result.getParentOperators().get(0)).getConf().setHasOrderBy(hasOrderBy);
  }
  return result;
}
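The order and nullOrder strings built above use a compact per-key encoding that is carried on the reduce sink descriptor: '+' / '-' for ascending / descending, and 'a' / 'z' for NULLS FIRST / NULLS LAST. A minimal standalone sketch of decoding that convention (the SortSpecDecoder class and its method names are illustrative, not part of Hive):

// Illustrative decoder for the sort-key encoding built in genReduceSinkPlan.
// Hypothetical helper; not part of the Hive code base.
public class SortSpecDecoder {
  public static String describe(String order, String nullOrder) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < order.length(); i++) {
      String dir = order.charAt(i) == '+' ? "ASC" : "DESC";                    // '+' ascending, '-' descending
      String nulls = nullOrder.charAt(i) == 'a' ? "NULLS FIRST" : "NULLS LAST"; // 'a' first, 'z' last
      sb.append("key").append(i).append(' ').append(dir).append(' ').append(nulls).append('\n');
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    // e.g. SORT BY c0 ASC NULLS FIRST, c1 DESC NULLS LAST  ->  order "+-", nullOrder "az"
    System.out.print(describe("+-", "az"));
  }
}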
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class GenMRRedSink1 method process.
/**
 * Reduce Sink encountered.
 * a) If we are seeing this RS for the first time, we initialize the plan corresponding to this RS.
 * b) If we are seeing this RS for the second or a later time, then either the query had a join, in
 *    which case we merge this plan with the earlier plan involving this RS, or the plan for this RS
 *    needs to be split into two branches.
 *
 * @param nd
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          context
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  ReduceSinkOperator op = (ReduceSinkOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(stack.get(stack.size() - 2));
  Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
  MapredWork currPlan = (MapredWork) currTask.getWork();
  String currAliasId = mapredCtx.getCurrAliasId();
  if (op.getNumChild() != 1) {
    throw new IllegalStateException("Expecting operator " + op + " to have one child. " + "But found multiple children : " + op.getChildOperators());
  }
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  Task<? extends Serializable> oldTask = ctx.getOpTaskMap().get(reducer);
  ctx.setCurrAliasId(currAliasId);
  ctx.setCurrTask(currTask);
  // If the plan for this reducer does not exist, initialize the plan
  if (oldTask == null) {
    if (currPlan.getReduceWork() == null) {
      GenMapRedUtils.initPlan(op, ctx);
    } else {
      GenMapRedUtils.splitPlan(op, ctx);
    }
  } else {
    // This will happen in case of joins. The current plan can be thrown away
    // after being merged with the original plan
    GenMapRedUtils.joinPlan(currTask, oldTask, ctx);
    currTask = oldTask;
    ctx.setCurrTask(currTask);
  }
  mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
  if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
    ctx.addRootIfPossible(currTask);
    return false;
  }
  return true;
}
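For context, these node processors are invoked while the MapReduce task compiler walks the operator tree and dispatches operators whose path matches a rule pattern. A rough sketch of that wiring, loosely modeled on MapReduceCompiler.generateTaskTree; the rule names, patterns, and exact class signatures are from memory and may differ across Hive versions, and procCtx / topNodes are assumed to be the GenMRProcContext and root operators prepared by the compiler:

// Map each rule pattern to its processor; insertion order matters for dispatch.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
// TS% ... RS%   : first reduce sink reached under a table scan
opRules.put(new RuleRegExp("R2",
    TableScanOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"),
    new GenMRRedSink1());
// RS% ... RS%   : a reduce sink that follows another reduce sink (a second MR job)
opRules.put(new RuleRegExp("R3",
    ReduceSinkOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"),
    new GenMRRedSink2());
// UNION% ... RS% : a reduce sink that follows a union
opRules.put(new RuleRegExp("R6",
    UnionOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"),
    new GenMRRedSink3());
// Operators that match no rule fall through to the default processor.
Dispatcher disp = new DefaultRuleDispatcher(new GenMROperator(), opRules, procCtx);
GraphWalker ogw = new GenMapRedWalker(disp);
ogw.startWalking(topNodes, null);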
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class GenMRRedSink2 method process.
/**
 * Reduce Scan encountered.
 *
 * @param nd
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          context
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  ReduceSinkOperator op = (ReduceSinkOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
  Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
  String currAliasId = mapredCtx.getCurrAliasId();
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  Map<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
  Task<? extends Serializable> oldTask = opTaskMap.get(reducer);
  ctx.setCurrAliasId(currAliasId);
  ctx.setCurrTask(currTask);
  if (oldTask == null) {
    GenMapRedUtils.splitPlan(op, ctx);
  } else {
    GenMapRedUtils.splitPlan(op, currTask, oldTask, ctx);
    currTask = oldTask;
    ctx.setCurrTask(currTask);
  }
  mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
  if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
    ctx.addRootIfPossible(currTask);
    return false;
  }
  return true;
}
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class GenMRRedSink3 method process.
/**
 * Reduce Scan encountered.
 *
 * @param nd
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          context
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  ReduceSinkOperator op = (ReduceSinkOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  // The union consists of a bunch of map-reduce jobs, and the plan has been
  // split at the union.
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  UnionOperator union = Utils.findNode(stack, UnionOperator.class);
  assert union != null;
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(union);
  Task<? extends Serializable> unionTask = null;
  if (mapredCtx != null) {
    unionTask = mapredCtx.getCurrTask();
  } else {
    unionTask = ctx.getCurrTask();
  }
  MapredWork plan = (MapredWork) unionTask.getWork();
  HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
  Task<? extends Serializable> reducerTask = opTaskMap.get(reducer);
  ctx.setCurrTask(unionTask);
  // If the plan for this reducer does not exist, initialize the plan
  if (reducerTask == null) {
    if (plan.getReduceWork() == null) {
      // The reducer is encountered for the first time
      GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
    } else {
      // The union is followed by a multi-table insert
      GenMapRedUtils.splitPlan(op, ctx);
    }
  } else if (plan.getReduceWork() != null && plan.getReduceWork().getReducer() == reducer) {
    // The union is already initialized. However, the union is walked from
    // another input; initUnionPlan is idempotent
    GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
  } else {
    GenMapRedUtils.joinUnionPlan(ctx, union, unionTask, reducerTask, false);
    ctx.setCurrTask(reducerTask);
  }
  mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
  // The union operator has been processed
  ctx.setCurrUnionOp(null);
  return true;
}
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class ConvertJoinMapJoin method checkConvertJoinSMBJoin.
/*
 * This method tries to convert a join to an SMB join. This is done based on
 * traits. If the sort columns are the same as the join columns, we can
 * convert the join to an SMB join. Otherwise, retain the bucket map join, as
 * it is still more efficient than a regular join.
 */
private boolean checkConvertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context, int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
  ReduceSinkOperator bigTableRS = (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
  int numBuckets = bigTableRS.getParentOperators().get(0).getOpTraits().getNumBuckets();
  int size = -1;
  for (Operator<?> parentOp : joinOp.getParentOperators()) {
    // Each side had better have zero or more RSs; if either side is unbalanced, we cannot convert.
    // This is a workaround for now. The right fix would be to refactor the code in
    // MapRecordProcessor and ReduceRecordProcessor with respect to the sources.
    @SuppressWarnings({ "rawtypes", "unchecked" })
    Set<ReduceSinkOperator> set = OperatorUtils.findOperatorsUpstream(parentOp.getParentOperators(), ReduceSinkOperator.class);
    if (size < 0) {
      size = set.size();
      continue;
    }
    if (((size > 0) && (set.size() > 0)) || ((size == 0) && (set.size() == 0))) {
      continue;
    } else {
      return false;
    }
  }
  // transformation of the join operation
  for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
    if (!(parentOp instanceof ReduceSinkOperator)) {
      // Could be mux/demux operators. Currently not supported
      LOG.info("Found correlation optimizer operators. Cannot convert to SMB at this time.");
      return false;
    }
    ReduceSinkOperator rsOp = (ReduceSinkOperator) parentOp;
    if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getSortCols(), rsOp.getOpTraits().getSortCols(), rsOp.getColumnExprMap(), tezBucketJoinProcCtx, false)) {
      LOG.info("We cannot convert to SMB because the sort column names do not match.");
      return false;
    }
    if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getBucketColNames(), rsOp.getOpTraits().getBucketColNames(), rsOp.getColumnExprMap(), tezBucketJoinProcCtx, true)) {
      LOG.info("We cannot convert to SMB because the bucket column names do not match.");
      return false;
    }
  }
  if (numBuckets < 0) {
    numBuckets = bigTableRS.getConf().getNumReducers();
  }
  tezBucketJoinProcCtx.setNumBuckets(numBuckets);
  LOG.info("We can convert the join to an SMB join.");
  return true;
}
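The two checkColEquality calls enforce the precondition stated in the comment: for every parent ReduceSinkOperator, the sort columns and the bucket columns of its upstream operator must line up, position by position, with the RS's own columns once they are mapped through the RS's column-expression map. A simplified, self-contained sketch of that per-position comparison (the helper name and the flat List<String> shape are illustrative only; the real checkColEquality works on grouped List<List<String>> traits plus the Tez bucket-join context):

// Simplified illustration of the column-equality precondition for SMB conversion.
// Hypothetical helper; not the actual ConvertJoinMapJoin.checkColEquality signature.
static boolean sameColsInOrder(java.util.List<String> parentCols, java.util.List<String> rsCols,
    java.util.Map<String, String> rsToParent) {
  if (parentCols == null || rsCols == null || parentCols.size() != rsCols.size()) {
    return false; // missing traits or different arity -> cannot prove equality
  }
  for (int i = 0; i < parentCols.size(); i++) {
    // Map the RS-side column back to the parent column it was derived from.
    String mapped = rsToParent.getOrDefault(rsCols.get(i), rsCols.get(i));
    if (!mapped.equals(parentCols.get(i))) {
      return false; // a sort/bucket column differs -> fall back, no SMB conversion
    }
  }
  return true;
}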