use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.
the class CorrelationUtilities method removeReduceSinkForGroupBy.
protected static void removeReduceSinkForGroupBy(ReduceSinkOperator cRS, GroupByOperator cGBYr, ParseContext context, AbstractCorrelationProcCtx procCtx) throws SemanticException {
Operator<?> parent = getSingleParent(cRS);
if ((parent instanceof GroupByOperator) && procCtx.isMapAggr()) {
// pRS-cGBYm-cRS-cGBYr (map aggregation) --> pRS-cGBYr(COMPLETE)
// copy the desc of cGBYm to cGBYr, then remove cGBYm and cRS
GroupByOperator cGBYm = (GroupByOperator) parent;
cGBYr.getConf().setKeys(ExprNodeDescUtils.backtrack(ExprNodeDescUtils.backtrack(cGBYr.getConf().getKeys(), cGBYr, cRS), cRS, cGBYm));
cGBYr.getConf().setAggregators(cGBYm.getConf().getAggregators());
for (AggregationDesc aggr : cGBYm.getConf().getAggregators()) {
aggr.setMode(GenericUDAFEvaluator.Mode.COMPLETE);
}
cGBYr.setColumnExprMap(cGBYm.getColumnExprMap());
cGBYr.setSchema(cGBYm.getSchema());
} else {
// pRS-cRS-cGBYr (no map aggregation) --> pRS-cGBYr(COMPLETE)
// revert expressions of cGBYr to those of cRS
cGBYr.getConf().setKeys(ExprNodeDescUtils.backtrack(cGBYr.getConf().getKeys(), cGBYr, cRS));
for (AggregationDesc aggr : cGBYr.getConf().getAggregators()) {
aggr.setParameters(ExprNodeDescUtils.backtrack(aggr.getParameters(), cGBYr, cRS));
}
Map<String, ExprNodeDesc> oldMap = cGBYr.getColumnExprMap();
RowSchema oldRS = cGBYr.getSchema();
Map<String, ExprNodeDesc> newMap = new HashMap<String, ExprNodeDesc>();
ArrayList<ColumnInfo> newRS = new ArrayList<ColumnInfo>();
List<String> outputCols = cGBYr.getConf().getOutputColumnNames();
for (int i = 0; i < outputCols.size(); i++) {
String colName = outputCols.get(i);
ColumnInfo colInfo = oldRS.getColumnInfo(colName);
newRS.add(colInfo);
ExprNodeDesc colExpr = ExprNodeDescUtils.backtrack(oldMap.get(colName), cGBYr, cRS);
if (colExpr != null) {
newMap.put(colInfo.getInternalName(), colExpr);
}
}
cGBYr.setColumnExprMap(newMap);
cGBYr.setSchema(new RowSchema(newRS));
}
cGBYr.getConf().setMode(GroupByDesc.Mode.COMPLETE);
removeOperator(cRS, cGBYr, parent, context);
procCtx.addRemovedOperator(cRS);
if ((parent instanceof GroupByOperator) && procCtx.isMapAggr()) {
removeOperator(parent, cGBYr, getSingleParent(parent), context);
procCtx.addRemovedOperator(cGBYr);
}
}
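The essence of the rewrite above is pointer surgery on the operator DAG: drop cRS (and cGBYm in the map-aggregation case) and splice the surviving parent directly onto cGBYr, which is then switched to COMPLETE mode. Below is a minimal, self-contained sketch of that splicing using a toy Op class; it is not the Hive Operator API, and all names are illustrative.
import java.util.ArrayList;
import java.util.List;

public class OperatorSpliceSketch {

    // Toy operator node: just a name plus parent/child lists.
    static class Op {
        final String name;
        final List<Op> parents = new ArrayList<>();
        final List<Op> children = new ArrayList<>();
        Op(String name) { this.name = name; }
    }

    static void connect(Op parent, Op child) {
        parent.children.add(child);
        child.parents.add(parent);
    }

    // Remove 'victim' from the chain and rewire its single parent to 'keep'.
    static void remove(Op victim, Op keep) {
        Op parent = victim.parents.get(0); // sketch assumes exactly one parent
        parent.children.remove(victim);
        parent.children.add(keep);
        keep.parents.remove(victim);
        keep.parents.add(parent);
        victim.parents.clear();
        victim.children.clear();
    }

    public static void main(String[] args) {
        Op pRS = new Op("pRS"), cGBYm = new Op("cGBYm"), cRS = new Op("cRS"), cGBYr = new Op("cGBYr");
        connect(pRS, cGBYm);
        connect(cGBYm, cRS);
        connect(cRS, cGBYr);
        // Collapse the map-aggregation chain pRS-cGBYm-cRS-cGBYr to pRS-cGBYr.
        remove(cRS, cGBYr);
        remove(cGBYm, cGBYr);
        System.out.println(pRS.children.get(0).name);  // cGBYr
        System.out.println(cGBYr.parents.get(0).name); // pRS
    }
}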
use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.
the class QueryPlanTreeTransformation method applyCorrelation.
/**
* Based on the correlation, we transform the query plan tree (operator tree).
* Here, we first create a DemuxOperator, and all bottom ReduceSinkOperators
* (bottom means near the TableScanOperator) in the correlation become
* the parents of the DemuxOperator. We also reassign tags to those
* ReduceSinkOperators. Then, we use MuxOperators to replace the ReduceSinkOperators
* which are not bottom ones in this correlation.
* Example: The original operator tree is ...
*        JOIN2
*       /     \
*     RS4     RS5
*     /         \
*   GBY1       JOIN1
*    |         /   \
*   RS1      RS2   RS3
* If GBY1, JOIN1, and JOIN2 can be executed in the same reducer
* (as determined by the Correlation Optimizer),
* the new operator tree will be ...
*        JOIN2
*          |
*         MUX
*        /   \
*     GBY1   JOIN1
*        \   /
*        DEMUX
*       /  |  \
*      /   |   \
*     /    |    \
*   RS1   RS2   RS3
* @param pCtx
* @param corrCtx
* @param correlation
* @throws SemanticException
*/
protected static void applyCorrelation(ParseContext pCtx, CorrelationNodeProcCtx corrCtx, IntraQueryCorrelation correlation) throws SemanticException {
final List<ReduceSinkOperator> bottomReduceSinkOperators = correlation.getBottomReduceSinkOperators();
final int numReducers = correlation.getNumReducers();
List<Operator<? extends OperatorDesc>> childrenOfDemux = new ArrayList<Operator<? extends OperatorDesc>>();
List<Operator<? extends OperatorDesc>> parentRSsOfDemux = new ArrayList<Operator<? extends OperatorDesc>>();
Map<Integer, Integer> childIndexToOriginalNumParents = new HashMap<Integer, Integer>();
List<TableDesc> keysSerializeInfos = new ArrayList<TableDesc>();
List<TableDesc> valuessSerializeInfos = new ArrayList<TableDesc>();
Map<ReduceSinkOperator, Integer> bottomRSToNewTag = new HashMap<ReduceSinkOperator, Integer>();
int newTag = 0;
CompilationOpContext opCtx = null;
for (ReduceSinkOperator rsop : bottomReduceSinkOperators) {
if (opCtx == null) {
opCtx = rsop.getCompilationOpContext();
}
rsop.getConf().setNumReducers(numReducers);
bottomRSToNewTag.put(rsop, newTag);
parentRSsOfDemux.add(rsop);
keysSerializeInfos.add(rsop.getConf().getKeySerializeInfo());
valuessSerializeInfos.add(rsop.getConf().getValueSerializeInfo());
Operator<? extends OperatorDesc> child = CorrelationUtilities.getSingleChild(rsop, true);
if (!childrenOfDemux.contains(child)) {
childrenOfDemux.add(child);
int childIndex = childrenOfDemux.size() - 1;
childIndexToOriginalNumParents.put(childIndex, child.getNumParent());
}
newTag++;
}
for (ReduceSinkOperator rsop : bottomReduceSinkOperators) {
setNewTag(correlation, childrenOfDemux, rsop, bottomRSToNewTag);
}
// Create the DemuxOperator
DemuxDesc demuxDesc = new DemuxDesc(correlation.getNewTagToOldTag(), correlation.getNewTagToChildIndex(), childIndexToOriginalNumParents, keysSerializeInfos, valuessSerializeInfos);
Operator<? extends OperatorDesc> demuxOp = OperatorFactory.get(opCtx, demuxDesc);
demuxOp.setChildOperators(childrenOfDemux);
demuxOp.setParentOperators(parentRSsOfDemux);
for (Operator<? extends OperatorDesc> child : childrenOfDemux) {
List<Operator<? extends OperatorDesc>> parentsWithMultipleDemux = new ArrayList<Operator<? extends OperatorDesc>>();
boolean hasBottomReduceSinkOperators = false;
boolean hasNonBottomReduceSinkOperators = false;
for (int i = 0; i < child.getParentOperators().size(); i++) {
Operator<? extends OperatorDesc> p = child.getParentOperators().get(i);
assert p instanceof ReduceSinkOperator;
ReduceSinkOperator rsop = (ReduceSinkOperator) p;
if (bottomReduceSinkOperators.contains(rsop)) {
hasBottomReduceSinkOperators = true;
parentsWithMultipleDemux.add(demuxOp);
} else {
hasNonBottomReduceSinkOperators = true;
parentsWithMultipleDemux.add(rsop);
}
}
if (hasBottomReduceSinkOperators && hasNonBottomReduceSinkOperators) {
child.setParentOperators(parentsWithMultipleDemux);
} else {
child.setParentOperators(Utilities.makeList(demuxOp));
}
}
for (Operator<? extends OperatorDesc> parent : parentRSsOfDemux) {
parent.setChildOperators(Utilities.makeList(demuxOp));
}
// Replace all ReduceSinkOperators which are not at the bottom of
// this correlation with MuxOperators
Set<ReduceSinkOperator> handledRSs = new HashSet<ReduceSinkOperator>();
for (ReduceSinkOperator rsop : correlation.getAllReduceSinkOperators()) {
if (!bottomReduceSinkOperators.contains(rsop)) {
if (handledRSs.contains(rsop)) {
continue;
}
Operator<? extends OperatorDesc> childOP = CorrelationUtilities.getSingleChild(rsop, true);
if (childOP instanceof GroupByOperator) {
CorrelationUtilities.removeReduceSinkForGroupBy(rsop, (GroupByOperator) childOP, pCtx, corrCtx);
List<Operator<? extends OperatorDesc>> parentsOfMux = new ArrayList<Operator<? extends OperatorDesc>>();
Operator<? extends OperatorDesc> parentOp = CorrelationUtilities.getSingleParent(childOP, true);
parentsOfMux.add(parentOp);
Operator<? extends OperatorDesc> mux = OperatorFactory.get(childOP.getCompilationOpContext(), new MuxDesc(parentsOfMux));
mux.setChildOperators(Utilities.makeList(childOP));
mux.setParentOperators(parentsOfMux);
childOP.setParentOperators(Utilities.makeList(mux));
parentOp.setChildOperators(Utilities.makeList(mux));
} else {
List<Operator<? extends OperatorDesc>> parentsOfMux = new ArrayList<Operator<? extends OperatorDesc>>();
List<Operator<? extends OperatorDesc>> siblingOPs = CorrelationUtilities.findSiblingOperators(rsop);
for (Operator<? extends OperatorDesc> op : siblingOPs) {
if (op instanceof DemuxOperator) {
parentsOfMux.add(op);
} else if (op instanceof ReduceSinkOperator) {
GroupByOperator pGBYm = CorrelationUtilities.getSingleParent(op, GroupByOperator.class);
if (pGBYm != null && pGBYm.getConf().getMode() == GroupByDesc.Mode.HASH) {
// We get a semi join here.
// This map-side GroupByOperator needs to be removed
CorrelationUtilities.removeOperator(pGBYm, op, CorrelationUtilities.getSingleParent(pGBYm, true), pCtx);
}
handledRSs.add((ReduceSinkOperator) op);
parentsOfMux.add(CorrelationUtilities.getSingleParent(op, true));
} else {
throw new SemanticException("A sibling of ReduceSinkOperator is neither a " + "DemuxOperator nor a ReduceSinkOperator");
}
}
MuxDesc muxDesc = new MuxDesc(siblingOPs);
Operator<? extends OperatorDesc> mux = OperatorFactory.get(rsop.getCompilationOpContext(), muxDesc);
mux.setChildOperators(Utilities.makeList(childOP));
mux.setParentOperators(parentsOfMux);
for (Operator<? extends OperatorDesc> op : parentsOfMux) {
if (op instanceof DemuxOperator) {
// op is a DemuxOperator and a parent of childOP; insert the MuxOperator between op and childOP.
if (op.getChildOperators().contains(childOP)) {
op.replaceChild(childOP, mux);
}
} else {
// op is not a DemuxOperator, so it should have
// a single child.
op.setChildOperators(Utilities.makeList(mux));
}
}
childOP.setParentOperators(Utilities.makeList(mux));
}
}
}
for (ReduceSinkOperator rsop : handledRSs) {
rsop.setChildOperators(null);
rsop.setParentOperators(null);
}
}
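To make the rewiring in applyCorrelation easier to follow, here is a small standalone sketch of the DemuxOperator insertion step using plain toy nodes instead of Hive operators: every bottom ReduceSinkOperator gets a new tag and is made a parent of a single demux node, and the former children of those sinks become the demux node's children. The classes and names below are illustrative only, not the Hive API.
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class DemuxInsertionSketch {

    // Toy node with mutable parent/child lists and a printable name.
    static class Node {
        final String name;
        List<Node> parents = new ArrayList<>();
        List<Node> children = new ArrayList<>();
        Node(String name) { this.name = name; }
        @Override public String toString() { return name; }
    }

    static void connect(Node parent, Node child) {
        parent.children.add(child);
        child.parents.add(parent);
    }

    public static void main(String[] args) {
        Node rs1 = new Node("RS1"), rs2 = new Node("RS2"), rs3 = new Node("RS3");
        Node gby1 = new Node("GBY1"), join1 = new Node("JOIN1");
        connect(rs1, gby1);
        connect(rs2, join1);
        connect(rs3, join1);

        List<Node> bottomSinks = List.of(rs1, rs2, rs3);
        Node demux = new Node("DEMUX");
        Map<Node, Integer> newTag = new LinkedHashMap<>();
        List<Node> childrenOfDemux = new ArrayList<>();

        int tag = 0;
        for (Node rs : bottomSinks) {
            newTag.put(rs, tag++);                 // reassign tags to the bottom sinks
            for (Node child : rs.children) {       // remember their former children
                if (!childrenOfDemux.contains(child)) {
                    childrenOfDemux.add(child);
                }
            }
            rs.children = new ArrayList<>(List.of(demux)); // each sink now feeds the demux
            demux.parents.add(rs);
        }
        demux.children = childrenOfDemux;          // demux fans out to GBY1 and JOIN1
        for (Node child : childrenOfDemux) {
            child.parents = new ArrayList<>(List.of(demux));
        }

        System.out.println(newTag);        // {RS1=0, RS2=1, RS3=2}
        System.out.println(demux.parents); // [RS1, RS2, RS3]
        System.out.println(gby1.parents);  // [DEMUX]
        System.out.println(join1.parents); // [DEMUX]
    }
}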
use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.
the class RewriteQueryUsingAggregateIndexCtx method replaceGroupByOperatorProcess.
/**
* We need to replace the count(indexed_column_key) GenericUDAF aggregation
* function of the group-by construct with the "sum" GenericUDAF. This method creates a
* new operator tree for a sample query that creates a GroupByOperator with the
* sum aggregation function and uses that GroupByOperator's information to
* replace the aggregation information of the original GroupByOperator. It replaces
* the AggregationDesc (aggregation descriptor) of the old GroupByOperator
* with the new AggregationDesc of the new GroupByOperator.
*/
private void replaceGroupByOperatorProcess(GroupByOperator operator, int index) throws SemanticException {
RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx = this;
// We need to replace the GroupByOperator which is before RS
if (index == 0) {
// the query contains the sum aggregation GenericUDAF
String selReplacementCommand = "select sum(`" + rewriteQueryCtx.getAggregateFunction() + "`)" + " from `" + rewriteQueryCtx.getIndexName() + "` group by " + rewriteQueryCtx.getIndexKey() + " ";
// retrieve the operator tree for the query, and the required GroupByOperator from it
Operator<?> newOperatorTree = RewriteParseContextGenerator.generateOperatorTree(rewriteQueryCtx.getParseContext().getQueryState(), selReplacementCommand);
// we get our new GroupByOperator here
GroupByOperator newGbyOperator = OperatorUtils.findLastOperatorUpstream(newOperatorTree, GroupByOperator.class);
if (newGbyOperator == null) {
throw new SemanticException("Error replacing GroupBy operator.");
}
// we need this information to set the correct colList, outputColumnNames
// in SelectOperator
ExprNodeColumnDesc aggrExprNode = null;
// Construct the new AggregationDesc to get rid of the current
// internal names and replace them with new internal names
// as required by the operator tree
GroupByDesc newConf = newGbyOperator.getConf();
List<AggregationDesc> newAggrList = newConf.getAggregators();
if (newAggrList != null && newAggrList.size() > 0) {
for (AggregationDesc aggregationDesc : newAggrList) {
rewriteQueryCtx.setEval(aggregationDesc.getGenericUDAFEvaluator());
aggrExprNode = (ExprNodeColumnDesc) aggregationDesc.getParameters().get(0);
rewriteQueryCtx.setAggrExprNode(aggrExprNode);
}
}
// Now the GroupByOperator has the new AggregationList;
// sum(`_count_of_indexed_key`)
// instead of count(indexed_key)
GroupByDesc oldConf = operator.getConf();
oldConf.setAggregators((ArrayList<AggregationDesc>) newAggrList);
operator.setConf(oldConf);
} else {
// we just need to reset the GenericUDAFEvaluator and its name for this
// GroupByOperator whose parent is the ReduceSinkOperator
GroupByDesc childConf = operator.getConf();
List<AggregationDesc> childAggrList = childConf.getAggregators();
if (childAggrList != null && childAggrList.size() > 0) {
for (AggregationDesc aggregationDesc : childAggrList) {
List<ExprNodeDesc> paraList = aggregationDesc.getParameters();
List<ObjectInspector> parametersOIList = new ArrayList<ObjectInspector>();
for (ExprNodeDesc expr : paraList) {
parametersOIList.add(expr.getWritableObjectInspector());
}
GenericUDAFEvaluator evaluator = FunctionRegistry.getGenericUDAFEvaluator("sum", parametersOIList, false, false);
aggregationDesc.setGenericUDAFEvaluator(evaluator);
aggregationDesc.setGenericUDAFName("sum");
}
}
}
}
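The key idea of the index rewrite is visible in the selReplacementCommand string above: a count over the indexed key on the base table is answered by summing the pre-aggregated count column stored in the aggregate index table. The sketch below just reconstructs that query string with hypothetical index metadata; the table and column names are made up for illustration.
public class IndexRewriteSketch {
    public static void main(String[] args) {
        // Hypothetical index metadata; in the real rewrite these values come from
        // RewriteQueryUsingAggregateIndexCtx.
        String aggregateFunction = "_count_of_dept_id"; // pre-aggregated count column in the index table
        String indexName = "dept_id_idx";               // aggregate index table
        String indexKey = "dept_id";                    // indexed group-by key

        // Same shape as selReplacementCommand above.
        String selReplacementCommand = "select sum(`" + aggregateFunction + "`)"
            + " from `" + indexName + "`" + " group by " + indexKey + " ";
        System.out.println(selReplacementCommand);
        // select sum(`_count_of_dept_id`) from `dept_id_idx` group by dept_id
    }
}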
use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.
the class HiveGBOpConvUtil method genReduceGBRS.
private static OpAttr genReduceGBRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ArrayList<String> outputColumnNames = new ArrayList<String>();
ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
GroupByOperator reduceSideGB1 = (GroupByOperator) inputOpAf.inputs.get(0);
List<ColumnInfo> gb1ColInfoLst = reduceSideGB1.getSchema().getSignature();
ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForRS(reduceSideGB1, 0, gbInfo.gbKeys.size() - 1, outputColumnNames, false, colInfoLst, colExprMap, true, true);
if (inclGrpSetInReduceSide(gbInfo)) {
addGrpSetCol(false, gb1ColInfoLst.get(reduceKeys.size()).getInternalName(), true, reduceKeys, outputColumnNames, colInfoLst, colExprMap);
}
ArrayList<ExprNodeDesc> reduceValues = getValueKeysForRS(reduceSideGB1, reduceSideGB1.getConf().getKeys().size(), outputColumnNames, colInfoLst, colExprMap, true, true);
ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, true, -1, getNumPartFieldsForReduceSideRS(gbInfo), getParallelismForReduceSideRS(gbInfo), AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), reduceSideGB1);
rsOp.setColumnExprMap(colExprMap);
return new OpAttr("", new HashSet<Integer>(), rsOp);
}
use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.
the class SemanticAnalyzer method genGroupByPlanGroupByOperator2MR.
/**
* Generate the second GroupByOperator for the Group By Plan
* (parseInfo.getXXX(dest)). The new GroupByOperator will do the second
* aggregation based on the partial aggregation results.
*
* @param mode
* the mode of aggregation (FINAL)
* @param genericUDAFEvaluators
* The mapping from Aggregation StringTree to the
* genericUDAFEvaluator.
* @return the new GroupByOperator
* @throws SemanticException
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator2MR(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo2, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, boolean groupingSetsPresent) throws SemanticException {
RowResolver groupByInputRowResolver2 = opParseCtx.get(reduceSinkOperatorInfo2).getRowResolver();
RowResolver groupByOutputRowResolver2 = new RowResolver();
groupByOutputRowResolver2.setIsExprResolver(true);
ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
ArrayList<String> outputColumnNames = new ArrayList<String>();
for (int i = 0; i < grpByExprs.size(); ++i) {
ASTNode grpbyExpr = grpByExprs.get(i);
ColumnInfo exprInfo = groupByInputRowResolver2.getExpression(grpbyExpr);
if (exprInfo == null) {
throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(grpbyExpr));
}
String expression = exprInfo.getInternalName();
groupByKeys.add(new ExprNodeColumnDesc(exprInfo.getType(), expression, exprInfo.getTabAlias(), exprInfo.getIsVirtualCol()));
String field = getColumnInternalName(i);
outputColumnNames.add(field);
ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
groupByOutputRowResolver2.putExpression(grpbyExpr, oColInfo);
addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo2, groupByOutputRowResolver2);
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
int groupingSetsPosition = -1;
// For grouping sets, add a dummy grouping key
if (groupingSetsPresent) {
groupingSetsPosition = groupByKeys.size();
addGroupingSetKey(groupByKeys, groupByInputRowResolver2, groupByOutputRowResolver2, outputColumnNames, colExprMap);
}
HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
boolean containsDistinctAggr = false;
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
ASTNode value = entry.getValue();
ColumnInfo paraExprInfo = groupByInputRowResolver2.getExpression(value);
if (paraExprInfo == null) {
throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(value));
}
String paraExpression = paraExprInfo.getInternalName();
assert (paraExpression != null);
aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
String aggName = unescapeIdentifier(value.getChild(0).getText());
boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
containsDistinctAggr = containsDistinctAggr || isDistinct;
boolean isStar = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
GenericUDAFEvaluator genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
assert (genericUDAFEvaluator != null);
GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && value.getToken().getType() == HiveParser.TOK_FUNCTIONDI), amode));
String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
outputColumnNames.add(field);
groupByOutputRowResolver2.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
}
float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, false, groupByMemoryUsage, memoryThreshold, null, false, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver2.getColumnInfos()), reduceSinkOperatorInfo2), groupByOutputRowResolver2);
op.setColumnExprMap(colExprMap);
return op;
}
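The second GroupByOperator generated here merges partial aggregation results into final ones. The following standalone sketch shows that two-stage idea with plain Java collections standing in for the operators: per-task partial counts are merged per group key into the final counts. Names and data are illustrative only, not the Hive plan objects.
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class TwoStageGroupBySketch {
    public static void main(String[] args) {
        // Partial aggregation results, as they might arrive from two upstream tasks.
        Map<String, Long> partialA = Map.of("us", 3L, "de", 1L);
        Map<String, Long> partialB = Map.of("us", 2L, "fr", 4L);

        // The second, FINAL-mode aggregation merges the partials per group key.
        Map<String, Long> finalCounts = new TreeMap<>();
        for (Map<String, Long> partial : List.of(partialA, partialB)) {
            for (Map.Entry<String, Long> e : partial.entrySet()) {
                finalCounts.merge(e.getKey(), e.getValue(), Long::sum);
            }
        }
        System.out.println(finalCounts); // {de=1, fr=4, us=5}
    }
}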