Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator in project hive by apache.
The class SemanticAnalyzer, method genGroupByPlanMapAggrNoSkew.
/**
 * Generate a Group-By plan using a single map-reduce job. First perform a
 * map-side partial aggregation (to reduce the amount of data); map-side
 * partial aggregation may be turned off at runtime if it is not reducing the
 * data enough. Then spray by the group by key, sort by the distinct key (if
 * any), and compute the actual aggregates from the partial aggregates on the
 * reducer.
 *
 * The aggregation evaluation functions are as follows:
 *
 * No grouping sets:
 *   Group By Operator:
 *     grouping keys: group by expressions if no DISTINCT
 *     grouping keys: group by expressions + distinct keys if DISTINCT
 *   Mapper: iterate/terminatePartial (mode = HASH)
 *   Partitioning Key: grouping key
 *   Sorting Key: grouping key if no DISTINCT
 *                grouping + distinct key if DISTINCT
 *   Reducer: iterate/terminate if DISTINCT
 *            merge/terminate if NO DISTINCT (mode = MERGEPARTIAL)
 *
 * Grouping Sets:
 *   Group By Operator:
 *     grouping keys: group by expressions + grouping id. if no DISTINCT
 *     grouping keys: group by expressions + grouping id. + distinct keys if DISTINCT
 *   Mapper: iterate/terminatePartial (mode = HASH)
 *   Partitioning Key: grouping key + grouping id.
 *   Sorting Key: grouping key + grouping id. if no DISTINCT
 *                grouping + grouping id. + distinct key if DISTINCT
 *   Reducer: iterate/terminate if DISTINCT
 *            merge/terminate if NO DISTINCT (mode = MERGEPARTIAL)
 *
 * Grouping Sets with an additional MR job introduced (distincts are not allowed):
 *   STAGE 1
 *   Group By Operator:
 *     grouping keys: group by expressions
 *   Mapper: iterate/terminatePartial (mode = HASH)
 *   Partitioning Key: grouping key
 *   Sorting Key: grouping key
 *   Reducer: merge/terminatePartial (mode = PARTIALS)
 *   Group By Operator:
 *     grouping keys: group by expressions + a newly added grouping id. key
 *
 *   STAGE 2
 *   Partitioning Key: grouping key + grouping id.
 *   Sorting Key: grouping key + grouping id.
 *   Reducer: merge/terminate (mode = FINAL)
 *   Group By Operator:
 *     grouping keys: group by expressions + grouping id.
 */
@SuppressWarnings("nls")
private Operator genGroupByPlanMapAggrNoSkew(String dest, QB qb, Operator inputOperatorInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ObjectPair<List<ASTNode>, List<Integer>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
    List<ASTNode> grpByExprs = grpByExprsGroupingSets.getFirst();
    List<Integer> groupingSets = grpByExprsGroupingSets.getSecond();
    boolean groupingSetsPresent = !groupingSets.isEmpty();
    int newMRJobGroupingSetsThreshold = conf.getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);
    if (groupingSetsPresent) {
        checkExpressionsForGroupingSet(grpByExprs, parseInfo.getDistinctFuncExprsForClause(dest),
            parseInfo.getAggregationExprsForClause(dest), opParseCtx.get(inputOperatorInfo).getRowResolver());
    }
    // ////// Generate GroupByOperator for a map-side partial aggregation
    Map<String, GenericUDAFEvaluator> genericUDAFEvaluators = new LinkedHashMap<String, GenericUDAFEvaluator>();
    // Is the grouping sets data consumed in the current MR job, or does it
    // need an additional MR job?
    boolean groupingSetsNeedAdditionalMRJob = groupingSetsPresent && groupingSets.size() > newMRJobGroupingSetsThreshold;
    GroupByOperator groupByOperatorInfo = (GroupByOperator) genGroupByPlanMapGroupByOperator(qb, dest, grpByExprs,
        inputOperatorInfo, GroupByDesc.Mode.HASH, genericUDAFEvaluators, groupingSets,
        groupingSetsPresent && !groupingSetsNeedAdditionalMRJob);
    groupOpToInputTables.put(groupByOperatorInfo, opParseCtx.get(inputOperatorInfo).getRowResolver().getTableNames());
    // Optimize the scenario when there are no grouping keys - only 1 reducer is needed
    int numReducers = -1;
    if (grpByExprs.isEmpty()) {
        numReducers = 1;
    }
    // ////// Generate ReduceSink Operator
    boolean isDistinct = !qb.getParseInfo().getDistinctFuncExprsForClause(dest).isEmpty();
    // Distincts are not allowed with an additional MR job
    if (groupingSetsNeedAdditionalMRJob && isDistinct) {
        String errorMsg = "The number of rows per input row due to grouping sets is " + groupingSets.size();
        throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_DISTINCTS.getMsg(errorMsg));
    }
    Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs,
        grpByExprs.size(), true, numReducers, true, groupingSetsPresent && !groupingSetsNeedAdditionalMRJob);
    // Does it require a new MR job for grouping sets?
    if (!groupingSetsPresent || !groupingSetsNeedAdditionalMRJob) {
        // This is a 1-stage map-reduce processing of the group by: the map-side
        // partial aggregates were only used to reduce output data, so the actual
        // aggregates are computed on the reducer.
        return genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.MERGEPARTIAL,
            genericUDAFEvaluators, groupingSets, groupingSetsPresent, groupingSetsNeedAdditionalMRJob);
    } else {
        // For each input row, create one row per grouping set key. Since map-side
        // aggregation has already been performed, the number of rows has already
        // been reduced. Moreover, rows with the same grouping keys come together,
        // so there is a higher chance of finding them in the hash table.
        Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo,
            GroupByDesc.Mode.PARTIALS, genericUDAFEvaluators, groupingSets, groupingSetsPresent,
            groupingSetsNeedAdditionalMRJob);
        // ////// Generate ReduceSinkOperator2
        Operator reduceSinkOperatorInfo2 = genGroupByPlanReduceSinkOperator2MR(parseInfo, dest, groupByOperatorInfo2,
            grpByExprs.size() + 1, numReducers, groupingSetsPresent);
        // ////// Generate GroupByOperator3
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo2, GroupByDesc.Mode.FINAL,
            genericUDAFEvaluators, groupingSetsPresent);
    }
}
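The planner above only wires operators together; the per-function aggregation work is delegated to each GenericUDAFEvaluator. Below is a minimal, self-contained sketch (not Hive's planner code) of the evaluator life cycle the HASH/MERGEPARTIAL plan implies: roughly, the plan's HASH stage corresponds to evaluator mode PARTIAL1 (iterate/terminatePartial on the mapper) and MERGEPARTIAL with no DISTINCT to mode FINAL (merge/terminate on the reducer). The choice of count and the literal inputs are illustrative assumptions.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class EvaluatorLifeCycleSketch {
    public static void main(String[] args) throws Exception {
        ObjectInspector longOI = PrimitiveObjectInspectorFactory.javaLongObjectInspector;
        List<ObjectInspector> argOIs = Arrays.asList(longOI);

        // Map side (plan mode HASH): PARTIAL1 = iterate + terminatePartial.
        GenericUDAFEvaluator mapEval = FunctionRegistry.getGenericUDAFEvaluator("count", argOIs, false, false);
        ObjectInspector partialOI = mapEval.init(Mode.PARTIAL1, new ObjectInspector[] { longOI });
        AggregationBuffer mapBuf = mapEval.getNewAggregationBuffer();
        for (long v : new long[] { 1L, 2L, 3L }) {
            mapEval.iterate(mapBuf, new Object[] { v });   // one call per input row
        }
        Object partial = mapEval.terminatePartial(mapBuf); // shipped through the ReduceSink

        // Reduce side (plan mode MERGEPARTIAL, no DISTINCT): FINAL = merge + terminate.
        GenericUDAFEvaluator redEval = FunctionRegistry.getGenericUDAFEvaluator("count", argOIs, false, false);
        redEval.init(Mode.FINAL, new ObjectInspector[] { partialOI });
        AggregationBuffer redBuf = redEval.getNewAggregationBuffer();
        redEval.merge(redBuf, partial);
        System.out.println(redEval.terminate(redBuf));     // prints 3
    }
}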
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator in project hive by apache.
The class SemanticAnalyzer, method genGroupByPlanMapAggr2MR.
/**
 * Generate a Group-By plan using 2 map-reduce jobs. However, only 1 group-by
 * plan is generated if the query involves no grouping key and no distincts.
 * In that case, the plan is the same as that generated by
 * genGroupByPlanMapAggr1MR. Otherwise, the following plan is generated: first
 * perform a map-side partial aggregation (to reduce the amount of data), then
 * spray by the grouping key and distinct key (or a random number, if no
 * distinct is present) in the hope of getting a uniform distribution, and
 * compute partial aggregates grouped by the reduction key (grouping key +
 * distinct key). Evaluate partial aggregates first, and spray by the grouping
 * key to compute actual aggregates in the second phase.
 *
 * The aggregation evaluation functions are as follows:
 *
 * No grouping sets:
 * STAGE 1
 *   Group By Operator:
 *     grouping keys: group by expressions if no DISTINCT
 *     grouping keys: group by expressions + distinct keys if DISTINCT
 *   Mapper: iterate/terminatePartial (mode = HASH)
 *   Partitioning Key: random() if no DISTINCT
 *                     grouping + distinct key if DISTINCT
 *   Sorting Key: grouping key if no DISTINCT
 *                grouping + distinct key if DISTINCT
 *   Reducer: iterate/terminatePartial if DISTINCT
 *            merge/terminatePartial if NO DISTINCT (mode = PARTIALS)
 *   Group By Operator:
 *     grouping keys: group by expressions
 *
 * STAGE 2
 *   Partitioning Key: grouping key
 *   Sorting Key: grouping key
 *   Reducer: merge/terminate (mode = FINAL)
 *
 * In the presence of grouping sets, the aggregation evaluation functions are as follows:
 * STAGE 1
 *   Group By Operator:
 *     grouping keys: group by expressions + grouping id. if no DISTINCT
 *     grouping keys: group by expressions + grouping id. + distinct keys if DISTINCT
 *   Mapper: iterate/terminatePartial (mode = HASH)
 *   Partitioning Key: random() if no DISTINCT
 *                     grouping + grouping id. + distinct key if DISTINCT
 *   Sorting Key: grouping key + grouping id. if no DISTINCT
 *                grouping + grouping id. + distinct key if DISTINCT
 *   Reducer: iterate/terminatePartial if DISTINCT
 *            merge/terminatePartial if NO DISTINCT (mode = PARTIALS)
 *   Group By Operator:
 *     grouping keys: group by expressions + grouping id.
 *
 * STAGE 2
 *   Partitioning Key: grouping key
 *   Sorting Key: grouping key + grouping id.
 *   Reducer: merge/terminate (mode = FINAL)
 */
@SuppressWarnings("nls")
private Operator genGroupByPlanMapAggr2MR(String dest, QB qb, Operator inputOperatorInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ObjectPair<List<ASTNode>, List<Integer>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
    List<ASTNode> grpByExprs = grpByExprsGroupingSets.getFirst();
    List<Integer> groupingSets = grpByExprsGroupingSets.getSecond();
    boolean groupingSetsPresent = !groupingSets.isEmpty();
    if (groupingSetsPresent) {
        checkExpressionsForGroupingSet(grpByExprs, parseInfo.getDistinctFuncExprsForClause(dest),
            parseInfo.getAggregationExprsForClause(dest), opParseCtx.get(inputOperatorInfo).getRowResolver());
        int newMRJobGroupingSetsThreshold = conf.getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);
        // An additional MR job for grouping sets cannot be combined with skew
        // handling, so fail if the grouping set cardinality exceeds the threshold.
        if (groupingSets.size() > newMRJobGroupingSetsThreshold) {
            String errorMsg = "The number of rows per input row due to grouping sets is " + groupingSets.size();
            throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg));
        }
    }
    // ////// Generate GroupByOperator for a map-side partial aggregation
    Map<String, GenericUDAFEvaluator> genericUDAFEvaluators = new LinkedHashMap<String, GenericUDAFEvaluator>();
    GroupByOperator groupByOperatorInfo = (GroupByOperator) genGroupByPlanMapGroupByOperator(qb, dest, grpByExprs,
        inputOperatorInfo, GroupByDesc.Mode.HASH, genericUDAFEvaluators, groupingSets, groupingSetsPresent);
    groupOpToInputTables.put(groupByOperatorInfo, opParseCtx.get(inputOperatorInfo).getRowResolver().getTableNames());
    // Optimize the scenario when there are no grouping keys and no distincts -
    // 2 map-reduce jobs are not needed.
    // For eg: select count(1) from T where t.ds = ....
    if (!optimizeMapAggrGroupBy(dest, qb)) {
        List<ASTNode> distinctFuncExprs = parseInfo.getDistinctFuncExprsForClause(dest);
        // ////// Generate ReduceSink Operator
        Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs,
            distinctFuncExprs.isEmpty() ? -1 : Integer.MAX_VALUE, false, -1, true, groupingSetsPresent);
        // ////// Generate GroupByOperator for a partial aggregation
        Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo,
            GroupByDesc.Mode.PARTIALS, genericUDAFEvaluators, groupingSets, groupingSetsPresent, false);
        int numReducers = -1;
        if (grpByExprs.isEmpty()) {
            numReducers = 1;
        }
        // ////// Generate ReduceSinkOperator2
        Operator reduceSinkOperatorInfo2 = genGroupByPlanReduceSinkOperator2MR(parseInfo, dest, groupByOperatorInfo2,
            grpByExprs.size(), numReducers, groupingSetsPresent);
        // ////// Generate GroupByOperator3
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo2, GroupByDesc.Mode.FINAL,
            genericUDAFEvaluators, groupingSetsPresent);
    } else {
        // If there are no grouping keys, grouping sets cannot be present
        assert !groupingSetsPresent;
        // ////// Generate ReduceSink Operator
        Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs,
            grpByExprs.size(), false, 1, true, groupingSetsPresent);
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.FINAL,
            genericUDAFEvaluators, false);
    }
}
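Which of the two methods the planner chooses is configuration-driven: both require map-side aggregation, and hive.groupby.skewindata selects the skew (2-MR-job) variant. Below is a hedged sketch of the relevant HiveConf flags; the threshold value is an illustrative assumption, not a recommended setting.

import org.apache.hadoop.hive.conf.HiveConf;

public class GroupByPlanConfSketch {
    public static HiveConf sketch() {
        HiveConf conf = new HiveConf();
        // hive.map.aggr: enable map-side partial aggregation, a precondition
        // for both methods shown above.
        conf.setBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE, true);
        // hive.groupby.skewindata: true steers planning toward
        // genGroupByPlanMapAggr2MR, false toward genGroupByPlanMapAggrNoSkew.
        conf.setBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW, true);
        // hive.new.job.grouping.set.cardinality: grouping-set count above which
        // an extra MR job is introduced (or, with skew data, the query is
        // rejected, as seen above). 30 is an illustrative value.
        conf.setIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY, 30);
        return conf;
    }
}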
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator in project SQLWindowing by hbutani.
The class WindowingTableFunction, method execute.
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public void execute(PartitionIterator<Object> pItr, Partition outP) throws WindowingException {
    ArrayList<List<?>> oColumns = new ArrayList<List<?>>();
    Partition iPart = pItr.getPartition();
    StructObjectInspector inputOI;
    try {
        inputOI = (StructObjectInspector) iPart.getSerDe().getObjectInspector();
    } catch (SerDeException se) {
        throw new WindowingException(se);
    }
    try {
        // Compute one output column per window function.
        for (WindowFunctionDef wFn : wFnDefs) {
            boolean processWindow = wFn.getWindow() != null;
            pItr.reset();
            if (!processWindow) {
                // No window clause: aggregate over the whole partition.
                GenericUDAFEvaluator fEval = wFn.getEvaluator();
                Object[] args = new Object[wFn.getArgs().size()];
                AggregationBuffer aggBuffer = fEval.getNewAggregationBuffer();
                while (pItr.hasNext()) {
                    Object row = pItr.next();
                    int i = 0;
                    for (ArgDef arg : wFn.getArgs()) {
                        args[i++] = arg.getExprEvaluator().evaluate(row);
                    }
                    fEval.aggregate(aggBuffer, args);
                }
                Object out = fEval.evaluate(aggBuffer);
                WindowFunctionInfo wFnInfo = FunctionRegistry.getWindowFunctionInfo(wFn.getSpec().getName());
                if (!wFnInfo.isPivotResult()) {
                    // Scalar result: broadcast the same value to every row of the partition.
                    out = new SameList(iPart.size(), out);
                }
                oColumns.add((List<?>) out);
            } else {
                oColumns.add(executeFnwithWindow(getQueryDef(), wFn, iPart));
            }
        }
        // Emit one output row per input row: the input fields followed by one
        // value from each computed window-function column.
        for (int i = 0; i < iPart.size(); i++) {
            ArrayList oRow = new ArrayList();
            Object iRow = iPart.getAt(i);
            for (StructField f : inputOI.getAllStructFieldRefs()) {
                oRow.add(inputOI.getStructFieldData(iRow, f));
            }
            for (int j = 0; j < oColumns.size(); j++) {
                oRow.add(oColumns.get(j).get(i));
            }
            outP.append(oRow);
        }
    } catch (HiveException he) {
        throw new WindowingException(he);
    }
}
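The final loop above stitches the computed columns back onto the input rows. Here is a simplified, self-contained sketch of that row assembly, with plain Java lists standing in for Hive's ObjectInspector-driven structs; all values are illustrative.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RowAssemblySketch {
    public static void main(String[] args) {
        List<List<Object>> inputRows = Arrays.asList(
            Arrays.asList((Object) "a", 1), Arrays.asList((Object) "b", 2));
        // One computed column per window function. A SameList-style broadcast
        // for a scalar aggregate repeats the same value in every position.
        List<List<?>> oColumns = Arrays.asList(
            Arrays.asList(10, 20),          // e.g. a pivot result, one value per row
            Arrays.asList(3, 3));           // e.g. count(*) broadcast to all rows

        // One output row per input row: input fields, then one value per column.
        for (int i = 0; i < inputRows.size(); i++) {
            List<Object> oRow = new ArrayList<>(inputRows.get(i));
            for (List<?> col : oColumns) {
                oRow.add(col.get(i));
            }
            System.out.println(oRow);       // [a, 1, 10, 3] then [b, 2, 20, 3]
        }
    }
}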