use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.
the class TestVectorGroupByOperator method buildGroupByDescType.
private static Pair<GroupByDesc, VectorGroupByDesc> buildGroupByDescType(VectorizationContext ctx,
    String aggregate, GenericUDAFEvaluator.Mode mode, String column, TypeInfo dataType) {
  AggregationDesc agg = buildAggregationDesc(ctx, aggregate, mode, column, dataType);
  ArrayList<AggregationDesc> aggs = new ArrayList<AggregationDesc>();
  aggs.add(agg);
  ArrayList<String> outputColumnNames = new ArrayList<String>();
  outputColumnNames.add("_col0");
  GroupByDesc desc = new GroupByDesc();
  VectorGroupByDesc vectorDesc = new VectorGroupByDesc();
  desc.setOutputColumnNames(outputColumnNames);
  desc.setAggregators(aggs);
  vectorDesc.setProcessingMode(ProcessingMode.GLOBAL);
  return new Pair<GroupByDesc, VectorGroupByDesc>(desc, vectorDesc);
}
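The helper buildAggregationDesc called above is not part of this listing. As a hedged sketch only (the helper name and the setter-based construction are assumptions, not necessarily the test's actual code), such a helper would essentially wrap one input column into an AggregationDesc:

// Hypothetical sketch: wraps a single column reference into an AggregationDesc for the
// named UDAF. A GenericUDAFEvaluator would still have to be resolved and set (for
// example via FunctionRegistry) before the descriptor is usable at runtime.
private static AggregationDesc buildAggregationDescSketch(String aggregate,
    GenericUDAFEvaluator.Mode mode, String column, TypeInfo dataType) {
  // empty table alias, not a partition or virtual column
  ExprNodeDesc inputColumn = new ExprNodeColumnDesc(dataType, column, "", false);
  ArrayList<ExprNodeDesc> params = new ArrayList<ExprNodeDesc>();
  params.add(inputColumn);
  AggregationDesc agg = new AggregationDesc();
  agg.setGenericUDAFName(aggregate);
  agg.setMode(mode);
  agg.setParameters(params);
  return agg;
}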
use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.
the class SemanticAnalyzer method genMapGroupByForSemijoin.
private Operator genMapGroupByForSemijoin(List<ASTNode> fields, Operator<?> input)
    throws SemanticException {
  RowResolver groupByInputRowResolver = opParseCtx.get(input).getRowResolver();
  RowResolver groupByOutputRowResolver = new RowResolver();
  List<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
  List<String> outputColumnNames = new ArrayList<String>();
  List<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  for (int i = 0; i < fields.size(); ++i) {
    // get the group by keys to ColumnInfo
    ASTNode colName = fields.get(i);
    String[] nm;
    String[] nm2;
    ExprNodeDesc grpByExprNode = genExprNodeDesc(colName, groupByInputRowResolver);
    if (grpByExprNode instanceof ExprNodeColumnDesc) {
      // In most of the cases, this is a column reference
      ExprNodeColumnDesc columnExpr = (ExprNodeColumnDesc) grpByExprNode;
      nm = groupByInputRowResolver.reverseLookup(columnExpr.getColumn());
      nm2 = groupByInputRowResolver.getAlternateMappings(columnExpr.getColumn());
    } else if (grpByExprNode instanceof ExprNodeConstantDesc) {
      // However, it can be a constant too. In that case, we need to track
      // the column that it originated from in the input operator so we can
      // propagate the aliases.
      ExprNodeConstantDesc constantExpr = (ExprNodeConstantDesc) grpByExprNode;
      String inputCol = constantExpr.getFoldedFromCol();
      nm = groupByInputRowResolver.reverseLookup(inputCol);
      nm2 = groupByInputRowResolver.getAlternateMappings(inputCol);
    } else {
      // The expression is neither a column reference nor a folded constant, so we cannot
      // generate the map-side group by of the left semijoin; keep the input as is.
      return input;
    }
    groupByKeys.add(grpByExprNode);
    // generate output column names
    String field = getColumnInternalName(i);
    outputColumnNames.add(field);
    ColumnInfo colInfo2 = new ColumnInfo(field, grpByExprNode.getTypeInfo(), "", false);
    groupByOutputRowResolver.put(nm[0], nm[1], colInfo2);
    if (nm2 != null) {
      groupByOutputRowResolver.addMappingOnly(nm2[0], nm2[1], colInfo2);
    }
    groupByOutputRowResolver.putExpression(colName, colInfo2);
    // establish mapping from the output column to the input column
    colExprMap.put(field, grpByExprNode);
  }
  // Generate group-by operator
  float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
  float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
  float minReductionHashAggr = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
  float minReductionHashAggrLowerBound = HiveConf.getFloatVar(conf, ConfVars.HIVEMAPAGGRHASHMINREDUCTIONLOWERBOUND);
  Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(
      new GroupByDesc(GroupByDesc.Mode.HASH, outputColumnNames, groupByKeys, aggregations, false,
          groupByMemoryUsage, memoryThreshold, minReductionHashAggr, minReductionHashAggrLowerBound,
          null, false, -1, false),
      new RowSchema(groupByOutputRowResolver.getColumnInfos()), input), groupByOutputRowResolver);
  op.setColumnExprMap(colExprMap);
  return op;
}
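As a hedged illustration of what this produces (the query below is hypothetical, not taken from the listing): for a left semijoin such as SELECT a.* FROM a LEFT SEMI JOIN b ON a.key = b.key, the method is handed the right-side join key and builds a HASH-mode group by over it with an empty aggregation list, which deduplicates the semijoin keys on the map side. The getter names below are assumed counterparts of the constructor arguments and setters seen elsewhere in this listing:

// Hypothetical inspection of the operator returned above, for a single semijoin key.
GroupByDesc semiJoinGby = (GroupByDesc) op.getConf();
assert semiJoinGby.getMode() == GroupByDesc.Mode.HASH;   // map-side hash aggregation
assert semiJoinGby.getAggregators().isEmpty();           // no aggregations, keys only
assert semiJoinGby.getKeys().size() == 1;                // the right-side join key (b.key)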
use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.
the class SemanticAnalyzer method genGroupByPlanGroupByOperator1.
/**
* Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
* The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
*
* @param parseInfo
* @param dest
* @param reduceSinkOperatorInfo
* @param mode
* The mode of the aggregation (MERGEPARTIAL, PARTIAL2)
* @param genericUDAFEvaluators
* The mapping from Aggregation StringTree to the
* genericUDAFEvaluator.
* @param groupingSets
* list of grouping sets
* @param groupingSetsPresent
* whether grouping sets are present in this query
* @param groupingSetsNeedAdditionalMRJob
* whether grouping sets are consumed by this group by
* @return the new GroupByOperator
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, String dest,
    Operator reduceSinkOperatorInfo, GroupByDesc.Mode mode,
    Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Long> groupingSets,
    boolean groupingSetsPresent, boolean groupingSetsNeedAdditionalMRJob) throws SemanticException {
  List<String> outputColumnNames = new ArrayList<String>();
  RowResolver groupByInputRowResolver = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
  RowResolver groupByOutputRowResolver = new RowResolver();
  groupByOutputRowResolver.setIsExprResolver(true);
  List<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
  List<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
  List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  for (int i = 0; i < grpByExprs.size(); ++i) {
    ASTNode grpbyExpr = grpByExprs.get(i);
    ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
    if (exprInfo == null) {
      throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), grpbyExpr));
    }
    groupByKeys.add(new ExprNodeColumnDesc(exprInfo));
    String field = getColumnInternalName(i);
    outputColumnNames.add(field);
    ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
    groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
    addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo, groupByOutputRowResolver);
    colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
  }
  // This is only needed if a new grouping set key is being created
  int groupingSetsPosition = -1;
  // For grouping sets, add a dummy grouping key
  if (groupingSetsPresent) {
    groupingSetsPosition = groupByKeys.size();
    // This function is called for GroupBy2 to add grouping id as part of the groupby keys
    if (!groupingSetsNeedAdditionalMRJob) {
      addGroupingSetKey(groupByKeys, groupByInputRowResolver, groupByOutputRowResolver, outputColumnNames, colExprMap);
    } else {
      // The grouping set has not yet been processed. Create a new grouping key
      // Consider the query: select a,b, count(1) from T group by a,b with cube;
      // where it is being executed in 2 map-reduce jobs
      // The plan for 1st MR is TableScan -> GroupBy1 -> ReduceSink -> GroupBy2 -> FileSink
      // GroupBy1/ReduceSink worked as if grouping sets were not present
      // This function is called for GroupBy2 to create new rows for grouping sets
      // For each input row (a,b), 4 rows are created for the example above:
      // (a,b), (a,null), (null, b), (null, null)
      createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
    }
  }
  Map<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
  // get the last colName for the reduce KEY
  // it represents the column name corresponding to distinct aggr, if any
  String lastKeyColName = null;
  List<ExprNodeDesc> reduceValues = null;
  if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) {
    List<String> inputKeyCols = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames();
    if (inputKeyCols.size() > 0) {
      lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
    }
    reduceValues = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getValueCols();
  }
  int numDistinctUDFs = 0;
  boolean containsDistinctAggr = false;
  for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
    ASTNode value = entry.getValue();
    String aggName = unescapeIdentifier(value.getChild(0).getText());
    List<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
    boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI);
    containsDistinctAggr = containsDistinctAggr || isDistinct;
    // For a distinct aggregation, partial aggregation has not been done on the map
    // side, so always look for the parameters in the reduce key (e.g. d+e for
    // count(distinct d+e))
    if (isDistinct) {
      // 0 is the function name
      for (int i = 1; i < value.getChildCount(); i++) {
        ASTNode paraExpr = (ASTNode) value.getChild(i);
        ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
        if (paraExprInfo == null) {
          throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), paraExpr));
        }
        String paraExpression = paraExprInfo.getInternalName();
        assert (paraExpression != null);
        if (lastKeyColName != null) {
          // if the aggregation is distinct, the parameter name is constructed as
          // KEY.lastKeyColName:<tag>._colx
          paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":"
              + numDistinctUDFs + "." + getColumnInternalName(i - 1);
        }
        ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression,
            paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
        ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(
            paraExprInfo.getInternalName(), reduceValues);
        if (reduceValue != null) {
          // this parameter is a constant
          expr = reduceValue;
        }
        aggParameters.add(expr);
      }
    } else {
      ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(value);
      if (paraExprInfo == null) {
        throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), value));
      }
      String paraExpression = paraExprInfo.getInternalName();
      assert (paraExpression != null);
      aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression,
          paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
    }
    if (isDistinct) {
      numDistinctUDFs++;
    }
    Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
    GenericUDAFEvaluator genericUDAFEvaluator = null;
    genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
    assert (genericUDAFEvaluator != null);
    GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
    aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator,
        udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && isDistinct), amode));
    String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
    outputColumnNames.add(field);
    groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
  }
  float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
  float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
  float minReductionHashAggr = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
  float minReductionHashAggrLowerBound = HiveConf.getFloatVar(conf, ConfVars.HIVEMAPAGGRHASHMINREDUCTIONLOWERBOUND);
  // Nothing special needs to be done for grouping sets if
  // this is the final group by operator, and multiple rows corresponding to the
  // grouping sets have been generated upstream.
  // However, if an additional MR job has been created to handle grouping sets,
  // additional rows corresponding to grouping sets need to be created here.
  Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(
      new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, groupByMemoryUsage,
          memoryThreshold, minReductionHashAggr, minReductionHashAggrLowerBound, groupingSets,
          groupingSetsPresent && groupingSetsNeedAdditionalMRJob, groupingSetsPosition,
          containsDistinctAggr),
      new RowSchema(groupByOutputRowResolver.getColumnInfos()), reduceSinkOperatorInfo),
      groupByOutputRowResolver);
  op.setColumnExprMap(colExprMap);
  return op;
}
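The distinct-parameter renaming above can be hard to read in isolation. As a hedged illustration (the query and column names are hypothetical, not taken from the listing): for a query like SELECT key, count(DISTINCT value) FROM src GROUP BY key, the distinct column travels as the last ReduceSink key column, so the rewritten parameter name would come out roughly like this:

// Hypothetical values only: lastKeyColName = "_col1" (the distinct column in the
// ReduceSink key), numDistinctUDFs = 0 (first distinct UDAF), i = 1 (first parameter).
String rewritten = Utilities.ReduceField.KEY.name() + "." + "_col1" + ":" + 0 + "." + "_col0";
// rewritten == "KEY._col1:0._col0"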
use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.
the class SemiJoinReductionMerge method createGroupBy.
/**
* Creates a group by operator with min, max, and bloomFilter aggregations for every column of the parent.
* <p>
* The method generates two kinds of group by operators, for intermediate and final aggregations respectively.
* Intermediate aggregations require the parent operator to be a select operator while final aggregations assume that
* the parent is a reduce sink operator.
* </p>
* <p>
* Intermediate group by example.
* <pre>
* Input: SEL[fname, lname, age, hash(fname,lname,age)]
* Output: GBY[min(fname),max(fname),min(lname),max(lname),min(age),max(age),bloom(hash)]
* </pre>
* </p>
* <p>
* Final group by example.
* <pre>
* Input: RS[fname_min,fname_max,lname_min,lname_max,age_min,age_max, hash_bloom]
* Output: GBY[min(fname_min),max(fname_max),min(lname_min),max(lname_max),min(age_min),max(age_max),bloom(hash_bloom)]
* </pre>
* </p>
*/
private static GroupByOperator createGroupBy(SelectOperator selectOp, Operator<?> parentOp,
    GroupByDesc.Mode gbMode, long bloomEntriesHint, HiveConf hiveConf) {
  final List<ExprNodeDesc> params;
  final GenericUDAFEvaluator.Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, false);
  switch (gbMode) {
    case FINAL:
      params = createGroupByAggregationParameters((ReduceSinkOperator) parentOp);
      break;
    case HASH:
      params = createGroupByAggregationParameters(selectOp);
      break;
    default:
      throw new AssertionError(gbMode.toString() + " is not supported");
  }
  List<AggregationDesc> gbAggs = new ArrayList<>();
  Deque<ExprNodeDesc> paramsCopy = new ArrayDeque<>(params);
  while (paramsCopy.size() > 1) {
    gbAggs.add(minAggregation(udafMode, paramsCopy.poll()));
    gbAggs.add(maxAggregation(udafMode, paramsCopy.poll()));
  }
  gbAggs.add(bloomFilterAggregation(udafMode, paramsCopy.poll(), selectOp, bloomEntriesHint, hiveConf));
  assert paramsCopy.size() == 0;
  List<String> gbOutputNames = new ArrayList<>(gbAggs.size());
  List<ColumnInfo> gbColInfos = new ArrayList<>(gbAggs.size());
  for (int i = 0; i < params.size(); i++) {
    String colName = HiveConf.getColumnInternalName(i);
    gbOutputNames.add(colName);
    final TypeInfo colType;
    if (i == params.size() - 1) {
      // Bloom type
      colType = TypeInfoFactory.binaryTypeInfo;
    } else {
      // Min/Max type
      colType = params.get(i).getTypeInfo();
    }
    gbColInfos.add(new ColumnInfo(colName, colType, "", false));
  }
  float groupByMemoryUsage = HiveConf.getFloatVar(hiveConf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
  float memoryThreshold = HiveConf.getFloatVar(hiveConf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
  float minReductionHashAggr = HiveConf.getFloatVar(hiveConf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
  float minReductionHashAggrLowerBound = HiveConf.getFloatVar(hiveConf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTIONLOWERBOUND);
  GroupByDesc groupBy = new GroupByDesc(gbMode, gbOutputNames, Collections.emptyList(), gbAggs, false,
      groupByMemoryUsage, memoryThreshold, minReductionHashAggr, minReductionHashAggrLowerBound,
      null, false, -1, false);
  groupBy.setColumnExprMap(Collections.emptyMap());
  return (GroupByOperator) OperatorFactory.getAndMakeChild(groupBy, new RowSchema(gbColInfos), parentOp);
}
use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.
the class SemiJoinReductionMerge method bloomFilterAggregation.
private static AggregationDesc bloomFilterAggregation(GenericUDAFEvaluator.Mode mode, ExprNodeDesc col,
    SelectOperator source, long numEntriesHint, HiveConf conf) {
  GenericUDAFBloomFilterEvaluator bloomFilterEval = new GenericUDAFBloomFilterEvaluator();
  bloomFilterEval.setSourceOperator(source);
  bloomFilterEval.setMaxEntries(conf.getLongVar(HiveConf.ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
  bloomFilterEval.setMinEntries(conf.getLongVar(HiveConf.ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
  bloomFilterEval.setFactor(conf.getFloatVar(HiveConf.ConfVars.TEZ_BLOOM_FILTER_FACTOR));
  bloomFilterEval.setHintEntries(numEntriesHint);
  List<ExprNodeDesc> p = Collections.singletonList(col);
  AggregationDesc bloom = new AggregationDesc("bloom_filter", bloomFilterEval, p, false, mode);
  // It is necessary to set the bloom filter evaluator, otherwise there are runtime failures; see HIVE-24018
  bloom.setGenericUDAFWritableEvaluator(bloomFilterEval);
  return bloom;
}
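The minAggregation and maxAggregation helpers used by createGroupBy above are not included in this listing. As a rough sketch only (the FunctionRegistry/TypeInfoUtils lookup is an assumption about how the evaluator could be resolved, not the project's actual helper), a "min" counterpart to bloomFilterAggregation might look like this:

// Hypothetical sketch: build an AggregationDesc for the built-in "min" UDAF over a
// single column, resolving the evaluator from the function registry.
private static AggregationDesc minAggregationSketch(GenericUDAFEvaluator.Mode mode, ExprNodeDesc col)
    throws SemanticException {
  List<ExprNodeDesc> params = Collections.singletonList(col);
  List<ObjectInspector> paramOIs = Collections.singletonList(
      TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(col.getTypeInfo()));
  GenericUDAFEvaluator minEval = FunctionRegistry.getGenericUDAFEvaluator("min", paramOIs, false, false);
  return new AggregationDesc("min", minEval, params, false, mode);
}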