use of org.apache.hadoop.hive.ql.plan.GroupByDesc in project hive by apache.
the class HiveGBOpConvUtil method genReduceSideGB2.
private static OpAttr genReduceSideGB2(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
ArrayList<String> outputColNames = new ArrayList<String>();
ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
String colOutputName = null;
ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
ColumnInfo ci;
// 1. Build GB Keys, grouping set starting position
// 1.1 First Add original GB Keys
ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, gbInfo.gbKeys.size() - 1, false, false);
for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
ci = rsColInfoLst.get(i);
colOutputName = gbInfo.outputColNames.get(i);
outputColNames.add(colOutputName);
colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
colExprMap.put(colOutputName, gbKeys.get(i));
}
// 1.2 Add GrpSet Col
int groupingSetsPosition = -1;
if (inclGrpSetInReduceSide(gbInfo) && gbInfo.grpIdFunctionNeeded) {
groupingSetsPosition = gbKeys.size();
ExprNodeDesc grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, rsColInfoLst.get(groupingSetsPosition).getInternalName(), null, false);
gbKeys.add(grpSetColExpr);
colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
;
outputColNames.add(colOutputName);
colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
colExprMap.put(colOutputName, grpSetColExpr);
}
// 2. Add UDAF
UDAFAttrs udafAttr;
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
int udafStartPosInGBInfOutputColNames = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size() : gbInfo.gbKeys.size() * 2;
int udafStartPosInInputRS = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size() : gbInfo.gbKeys.size() + 1;
for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
udafAttr = gbInfo.udafAttrs.get(i);
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafStartPosInInputRS + i)));
colOutputName = gbInfo.outputColNames.get(udafStartPosInGBInfOutputColNames + i);
outputColNames.add(colOutputName);
Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.FINAL, udafAttr.isDistinctUDAF);
GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, aggParameters);
aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, false, udafMode));
colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
}
Operator rsGBOp2 = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.FINAL, outputColNames, gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, null, false, groupingSetsPosition, gbInfo.containsDistinctAggr), new RowSchema(colInfoLst), rs);
rsGBOp2.setColumnExprMap(colExprMap);
// TODO: Shouldn't we propgate vc? is it vc col from tab or all vc
return new OpAttr("", new HashSet<Integer>(), rsGBOp2);
}
use of org.apache.hadoop.hive.ql.plan.GroupByDesc in project hive by apache.
the class HiveGBOpConvUtil method genReduceSideGB1.
private static OpAttr genReduceSideGB1(OpAttr inputOpAf, GBInfo gbInfo, boolean computeGrpSet, boolean propagateConstInDistinctUDAF, GroupByDesc.Mode gbMode) throws SemanticException {
ArrayList<String> outputColNames = new ArrayList<String>();
ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
String colOutputName = null;
ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
ColumnInfo ci;
boolean finalGB = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB);
// 1. Build GB Keys, grouping set starting position
// 1.1 First Add original GB Keys
ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, gbInfo.gbKeys.size() - 1, false, false);
for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
ci = rsColInfoLst.get(i);
if (finalGB) {
colOutputName = gbInfo.outputColNames.get(i);
} else {
colOutputName = SemanticAnalyzer.getColumnInternalName(i);
}
outputColNames.add(colOutputName);
colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
colExprMap.put(colOutputName, gbKeys.get(i));
}
// 1.2 Add GrpSet Col
int groupingSetsColPosition = -1;
if ((!finalGB && gbInfo.grpSets.size() > 0) || (finalGB && gbInfo.grpIdFunctionNeeded)) {
groupingSetsColPosition = gbInfo.gbKeys.size();
if (computeGrpSet) {
// GrpSet Col needs to be constructed
gbKeys.add(new ExprNodeConstantDesc("0"));
} else {
// GrpSet Col already part of input RS
// TODO: Can't we just copy the ExprNodeDEsc from input (Do we need to
// explicitly set table alias to null & VC to false
gbKeys.addAll(ExprNodeDescUtils.genExprNodeDesc(rs, groupingSetsColPosition, groupingSetsColPosition, false, true));
}
colOutputName = SemanticAnalyzer.getColumnInternalName(groupingSetsColPosition);
if (finalGB) {
colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
}
outputColNames.add(colOutputName);
colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
colExprMap.put(colOutputName, gbKeys.get(groupingSetsColPosition));
}
// 2. Walk through UDAF and add them to GB
String lastReduceKeyColName = null;
if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames().get(rs.getConf().getOutputKeyColumnNames().size() - 1);
}
int numDistinctUDFs = 0;
int distinctStartPosInReduceKeys = gbKeys.size();
List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
int udafColStartPosInOriginalGB = (gbInfo.grpSets.size() > 0) ? gbInfo.gbKeys.size() * 2 : gbInfo.gbKeys.size();
int udafColStartPosInRS = rs.getConf().getKeyCols().size();
for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
if (udafAttr.isDistinctUDAF) {
ColumnInfo rsDistUDAFParamColInfo;
ExprNodeDesc distinctUDAFParam;
ExprNodeDesc constantPropDistinctUDAFParam;
for (int j = 0; j < udafAttr.udafParamsIndxInGBInfoDistExprs.size(); j++) {
rsDistUDAFParamColInfo = rsColInfoLst.get(distinctStartPosInReduceKeys + j);
String rsDistUDAFParamName = rsDistUDAFParamColInfo.getInternalName();
// TODO: verify if this is needed
if (lastReduceKeyColName != null) {
rsDistUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
}
distinctUDAFParam = new ExprNodeColumnDesc(rsDistUDAFParamColInfo.getType(), rsDistUDAFParamName, rsDistUDAFParamColInfo.getTabAlias(), rsDistUDAFParamColInfo.getIsVirtualCol());
if (propagateConstInDistinctUDAF) {
// TODO: Implement propConstDistUDAFParams
constantPropDistinctUDAFParam = SemanticAnalyzer.isConstantParameterInAggregationParameters(rsDistUDAFParamColInfo.getInternalName(), reduceValues);
if (constantPropDistinctUDAFParam != null) {
distinctUDAFParam = constantPropDistinctUDAFParam;
}
}
aggParameters.add(distinctUDAFParam);
}
numDistinctUDFs++;
} else {
aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafColStartPosInRS + i)));
}
Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, aggParameters);
aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (gbMode != GroupByDesc.Mode.FINAL && udafAttr.isDistinctUDAF), udafMode));
if (finalGB) {
colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + i);
} else {
colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() - 1);
}
colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
outputColNames.add(colOutputName);
}
// Nothing special needs to be done for grouping sets if
// this is the final group by operator, and multiple rows corresponding to
// the
// grouping sets have been generated upstream.
// However, if an addition MR job has been created to handle grouping sets,
// additional rows corresponding to grouping sets need to be created here.
//TODO: Clean up/refactor assumptions
boolean includeGrpSetInGBDesc = (gbInfo.grpSets.size() > 0) && !finalGB && !(gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT);
Operator rsGBOp = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames, gbKeys, aggregations, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, gbInfo.grpSets, includeGrpSetInGBDesc, groupingSetsColPosition, gbInfo.containsDistinctAggr), new RowSchema(colInfoLst), rs);
rsGBOp.setColumnExprMap(colExprMap);
return new OpAttr("", new HashSet<Integer>(), rsGBOp);
}
use of org.apache.hadoop.hive.ql.plan.GroupByDesc in project hive by apache.
the class HiveGBOpConvUtil method genReduceSideGB1NoMapGB.
/**
* RS-GB0
*
* @param inputOpAf
* @param gbInfo
* @param gbMode
* @return
* @throws SemanticException
*/
private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo, GroupByDesc.Mode gbMode) throws SemanticException {
ArrayList<String> outputColNames = new ArrayList<String>();
ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
String colOutputName = null;
ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
ColumnInfo ci;
boolean useOriginalGBNames = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW);
// 1. Build GB Keys, grouping set starting position
// 1.1 First Add original GB Keys
ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, gbInfo.gbKeys.size() - 1, true, false);
for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
ci = rsColInfoLst.get(i);
if (useOriginalGBNames) {
colOutputName = gbInfo.outputColNames.get(i);
} else {
colOutputName = SemanticAnalyzer.getColumnInternalName(i);
}
outputColNames.add(colOutputName);
colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), null, false));
colExprMap.put(colOutputName, gbKeys.get(i));
}
// 2. Walk through UDAF and add them to GB
String lastReduceKeyColName = null;
if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames().get(rs.getConf().getOutputKeyColumnNames().size() - 1);
}
int numDistinctUDFs = 0;
List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
// the positions in rsColInfoLst are as follows
// --grpkey--,--distkey--,--values--
// but distUDAF may be before/after some non-distUDAF,
// i.e., their positions can be mixed.
// so for all UDAF we first check to see if it is groupby key, if not is it distinct key
// if not it should be value
List<Integer> distinctPositions = new ArrayList<>();
Map<Integer, ArrayList<ExprNodeDesc>> indexToParameter = new TreeMap<>();
for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
ColumnInfo rsUDAFParamColInfo;
ExprNodeDesc udafParam;
ExprNodeDesc constantPropDistinctUDAFParam;
for (int j = 0; j < udafAttr.udafParams.size(); j++) {
int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
rsUDAFParamColInfo = rsColInfoLst.get(argPos);
String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
}
udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName, rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol());
constantPropDistinctUDAFParam = SemanticAnalyzer.isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(), reduceValues);
if (constantPropDistinctUDAFParam != null) {
udafParam = constantPropDistinctUDAFParam;
}
aggParameters.add(udafParam);
}
indexToParameter.put(i, aggParameters);
if (udafAttr.isDistinctUDAF) {
numDistinctUDFs++;
}
}
for (int index : indexToParameter.keySet()) {
UDAFAttrs udafAttr = gbInfo.udafAttrs.get(index);
Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, indexToParameter.get(index));
aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, udafAttr.isDistinctUDAF, udafMode));
if (useOriginalGBNames) {
colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + index);
} else {
colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() - 1);
}
colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
outputColNames.add(colOutputName);
}
Operator rsGB1 = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames, gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, null, false, -1, numDistinctUDFs > 0), new RowSchema(colInfoLst), rs);
rsGB1.setColumnExprMap(colExprMap);
return new OpAttr("", new HashSet<Integer>(), rsGB1);
}
use of org.apache.hadoop.hive.ql.plan.GroupByDesc in project hive by apache.
the class TestVectorGroupByOperator method buildKeyGroupByDesc.
private static GroupByDesc buildKeyGroupByDesc(VectorizationContext ctx, String aggregate, String column, TypeInfo dataTypeInfo, String key, TypeInfo keyTypeInfo) {
GroupByDesc desc = buildGroupByDescType(ctx, aggregate, GenericUDAFEvaluator.Mode.PARTIAL1, column, dataTypeInfo);
((VectorGroupByDesc) desc.getVectorDesc()).setProcessingMode(ProcessingMode.HASH);
ExprNodeDesc keyExp = buildColumnDesc(ctx, key, keyTypeInfo);
ArrayList<ExprNodeDesc> keys = new ArrayList<ExprNodeDesc>();
keys.add(keyExp);
desc.setKeys(keys);
desc.getOutputColumnNames().add("_col1");
return desc;
}
use of org.apache.hadoop.hive.ql.plan.GroupByDesc in project hive by apache.
the class TestVectorGroupByOperator method testAggregateStringIterable.
public void testAggregateStringIterable(String aggregateName, Iterable<VectorizedRowBatch> data, Object expected) throws HiveException {
List<String> mapColumnNames = new ArrayList<String>();
mapColumnNames.add("A");
VectorizationContext ctx = new VectorizationContext("name", mapColumnNames);
GroupByDesc desc = buildGroupByDescType(ctx, aggregateName, GenericUDAFEvaluator.Mode.PARTIAL1, "A", TypeInfoFactory.stringTypeInfo);
CompilationOpContext cCtx = new CompilationOpContext();
Operator<? extends OperatorDesc> groupByOp = OperatorFactory.get(cCtx, desc);
VectorGroupByOperator vgo = (VectorGroupByOperator) Vectorizer.vectorizeGroupByOperator(groupByOp, ctx);
FakeCaptureOutputOperator out = FakeCaptureOutputOperator.addCaptureOutputChild(cCtx, vgo);
vgo.initialize(hconf, null);
for (VectorizedRowBatch unit : data) {
vgo.process(unit, 0);
}
vgo.close(false);
List<Object> outBatchList = out.getCapturedRows();
assertNotNull(outBatchList);
assertEquals(1, outBatchList.size());
Object result = outBatchList.get(0);
Validator validator = getValidator(aggregateName);
validator.validate("_total", expected, result);
}
Aggregations