Search in sources :

Example 1 with AggregationDesc

use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanGroupByOperator1.

/**
   * Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
   * The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
   *
   * @param parseInfo
   * @param dest
   * @param reduceSinkOperatorInfo
   * @param mode
   *          The mode of the aggregation (MERGEPARTIAL, PARTIAL2)
   * @param genericUDAFEvaluators
   *          The mapping from Aggregation StringTree to the
   *          genericUDAFEvaluator.
   * @param groupingSets
   *          list of grouping sets
   * @param groupingSetsPresent
   *          whether grouping sets are present in this query
   * @param groupingSetsNeedAdditionalMRJob
   *          whether grouping sets are consumed by this group by
   * @return the new GroupByOperator
   */
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Integer> groupingSets, boolean groupingSetsPresent, boolean groupingSetsNeedAdditionalMRJob) throws SemanticException {
    ArrayList<String> outputColumnNames = new ArrayList<String>();
    RowResolver groupByInputRowResolver = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
    RowResolver groupByOutputRowResolver = new RowResolver();
    groupByOutputRowResolver.setIsExprResolver(true);
    ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    for (int i = 0; i < grpByExprs.size(); ++i) {
        ASTNode grpbyExpr = grpByExprs.get(i);
        ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
        if (exprInfo == null) {
            throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(grpbyExpr));
        }
        groupByKeys.add(new ExprNodeColumnDesc(exprInfo));
        String field = getColumnInternalName(i);
        outputColumnNames.add(field);
        ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
        groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
        addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo, groupByOutputRowResolver);
        colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
    }
    // This is only needed if a new grouping set key is being created
    int groupingSetsPosition = -1;
    // For grouping sets, add a dummy grouping key
    if (groupingSetsPresent) {
        groupingSetsPosition = groupByKeys.size();
        // This function is called for GroupBy2 to add grouping id as part of the groupby keys
        if (!groupingSetsNeedAdditionalMRJob) {
            addGroupingSetKey(groupByKeys, groupByInputRowResolver, groupByOutputRowResolver, outputColumnNames, colExprMap);
        } else {
            // The grouping set has not yet been processed. Create a new grouping key
            // Consider the query: select a,b, count(1) from T group by a,b with cube;
            // where it is being executed in 2 map-reduce jobs
            // The plan for 1st MR is TableScan -> GroupBy1 -> ReduceSink -> GroupBy2 -> FileSink
            // GroupBy1/ReduceSink worked as if grouping sets were not present
            // This function is called for GroupBy2 to create new rows for grouping sets
            // For each input row (a,b), 4 rows are created for the example above:
            // (a,b), (a,null), (null, b), (null, null)
            createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
        }
    }
    HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
    // get the last colName for the reduce KEY
    // it represents the column name corresponding to distinct aggr, if any
    String lastKeyColName = null;
    List<ExprNodeDesc> reduceValues = null;
    if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) {
        List<String> inputKeyCols = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames();
        if (inputKeyCols.size() > 0) {
            lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
        }
        reduceValues = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getValueCols();
    }
    int numDistinctUDFs = 0;
    boolean containsDistinctAggr = false;
    for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
        ASTNode value = entry.getValue();
        String aggName = unescapeIdentifier(value.getChild(0).getText());
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI);
        containsDistinctAggr = containsDistinctAggr || isDistinct;
        // side, so always look for the parameters: d+e
        if (isDistinct) {
            // 0 is the function name
            for (int i = 1; i < value.getChildCount(); i++) {
                ASTNode paraExpr = (ASTNode) value.getChild(i);
                ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
                if (paraExprInfo == null) {
                    throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(paraExpr));
                }
                String paraExpression = paraExprInfo.getInternalName();
                assert (paraExpression != null);
                if (isDistinct && lastKeyColName != null) {
                    // if aggr is distinct, the parameter is name is constructed as
                    // KEY.lastKeyColName:<tag>._colx
                    paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1);
                }
                ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
                ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(paraExprInfo.getInternalName(), reduceValues);
                if (reduceValue != null) {
                    // this parameter is a constant
                    expr = reduceValue;
                }
                aggParameters.add(expr);
            }
        } else {
            ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(value);
            if (paraExprInfo == null) {
                throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(value));
            }
            String paraExpression = paraExprInfo.getInternalName();
            assert (paraExpression != null);
            aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
        }
        if (isDistinct) {
            numDistinctUDFs++;
        }
        Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
        GenericUDAFEvaluator genericUDAFEvaluator = null;
        genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
        assert (genericUDAFEvaluator != null);
        GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
        aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && isDistinct), amode));
        String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
        outputColumnNames.add(field);
        groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
    }
    float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
    // Nothing special needs to be done for grouping sets if
    // this is the final group by operator, and multiple rows corresponding to the
    // grouping sets have been generated upstream.
    // However, if an addition MR job has been created to handle grouping sets,
    // additional rows corresponding to grouping sets need to be created here.
    Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, groupByMemoryUsage, memoryThreshold, groupingSets, groupingSetsPresent && groupingSetsNeedAdditionalMRJob, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), reduceSinkOperatorInfo), groupByOutputRowResolver);
    op.setColumnExprMap(colExprMap);
    return op;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap)

Example 2 with AggregationDesc

use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanMapGroupByOperator.

/**
   * Generate the map-side GroupByOperator for the Query Block
   * (qb.getParseInfo().getXXX(dest)). The new GroupByOperator will be a child
   * of the inputOperatorInfo.
   *
   * @param mode
   *          The mode of the aggregation (HASH)
   * @param genericUDAFEvaluators
   *          If not null, this function will store the mapping from Aggregation
   *          StringTree to the genericUDAFEvaluator in this parameter, so it
   *          can be used in the next-stage GroupBy aggregations.
   * @return the new GroupByOperator
   */
@SuppressWarnings("nls")
private Operator genGroupByPlanMapGroupByOperator(QB qb, String dest, List<ASTNode> grpByExprs, Operator inputOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Integer> groupingSetKeys, boolean groupingSetsPresent) throws SemanticException {
    RowResolver groupByInputRowResolver = opParseCtx.get(inputOperatorInfo).getRowResolver();
    QBParseInfo parseInfo = qb.getParseInfo();
    RowResolver groupByOutputRowResolver = new RowResolver();
    groupByOutputRowResolver.setIsExprResolver(true);
    ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
    ArrayList<String> outputColumnNames = new ArrayList<String>();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    for (int i = 0; i < grpByExprs.size(); ++i) {
        ASTNode grpbyExpr = grpByExprs.get(i);
        ExprNodeDesc grpByExprNode = genExprNodeDesc(grpbyExpr, groupByInputRowResolver);
        if ((grpByExprNode instanceof ExprNodeColumnDesc) && ExprNodeDescUtils.indexOf(grpByExprNode, groupByKeys) >= 0) {
            // Skip duplicated grouping keys, it happens when define column alias.
            grpByExprs.remove(i--);
            continue;
        }
        groupByKeys.add(grpByExprNode);
        String field = getColumnInternalName(i);
        outputColumnNames.add(field);
        groupByOutputRowResolver.putExpression(grpbyExpr, new ColumnInfo(field, grpByExprNode.getTypeInfo(), "", false));
        colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
    }
    // The grouping set key is present after the grouping keys, before the distinct keys
    int groupingSetsPosition = -1;
    // for the grouping set (corresponding to the rollup).
    if (groupingSetsPresent) {
        groupingSetsPosition = groupByKeys.size();
        createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
    }
    // If there is a distinctFuncExp, add all parameters to the reduceKeys.
    if (!parseInfo.getDistinctFuncExprsForClause(dest).isEmpty()) {
        List<ASTNode> list = parseInfo.getDistinctFuncExprsForClause(dest);
        for (ASTNode value : list) {
            // 0 is function name
            for (int i = 1; i < value.getChildCount(); i++) {
                ASTNode parameter = (ASTNode) value.getChild(i);
                if (groupByOutputRowResolver.getExpression(parameter) == null) {
                    ExprNodeDesc distExprNode = genExprNodeDesc(parameter, groupByInputRowResolver);
                    groupByKeys.add(distExprNode);
                    String field = getColumnInternalName(groupByKeys.size() - 1);
                    outputColumnNames.add(field);
                    groupByOutputRowResolver.putExpression(parameter, new ColumnInfo(field, distExprNode.getTypeInfo(), "", false));
                    colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
                }
            }
        }
    }
    // For each aggregation
    HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
    assert (aggregationTrees != null);
    boolean containsDistinctAggr = false;
    for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
        ASTNode value = entry.getValue();
        String aggName = unescapeIdentifier(value.getChild(0).getText());
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        // 0 is the function name
        for (int i = 1; i < value.getChildCount(); i++) {
            ASTNode paraExpr = (ASTNode) value.getChild(i);
            ExprNodeDesc paraExprNode = genExprNodeDesc(paraExpr, groupByInputRowResolver);
            aggParameters.add(paraExprNode);
        }
        boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
        containsDistinctAggr = containsDistinctAggr || isDistinct;
        boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
        Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
        GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters, value, isDistinct, isAllColumns);
        assert (genericUDAFEvaluator != null);
        GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
        aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, isDistinct, amode));
        String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
        outputColumnNames.add(field);
        if (groupByOutputRowResolver.getExpression(value) == null) {
            groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
        }
        // GroupByOperators
        if (genericUDAFEvaluators != null) {
            genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator);
        }
    }
    float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
    Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, false, groupByMemoryUsage, memoryThreshold, groupingSetKeys, groupingSetsPresent, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), inputOperatorInfo), groupByOutputRowResolver);
    op.setColumnExprMap(colExprMap);
    return op;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap)

Example 3 with AggregationDesc

use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.

the class RewriteQueryUsingAggregateIndexCtx method replaceGroupByOperatorProcess.

/**
   * We need to replace the count(indexed_column_key) GenericUDAF aggregation
   * function for group-by construct to "sum" GenericUDAF. This method creates a
   * new operator tree for a sample query that creates a GroupByOperator with
   * sum aggregation function and uses that GroupByOperator information to
   * replace the original GroupByOperator aggregation information. It replaces
   * the AggregationDesc (aggregation descriptor) of the old GroupByOperator
   * with the new Aggregation Desc of the new GroupByOperator.
   * @return
   */
private void replaceGroupByOperatorProcess(GroupByOperator operator, int index) throws SemanticException {
    RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx = this;
    // We need to replace the GroupByOperator which is before RS
    if (index == 0) {
        // the query contains the sum aggregation GenericUDAF
        String selReplacementCommand = "select sum(`" + rewriteQueryCtx.getAggregateFunction() + "`)" + " from `" + rewriteQueryCtx.getIndexName() + "` group by " + rewriteQueryCtx.getIndexKey() + " ";
        // retrieve the operator tree for the query, and the required GroupByOperator from it
        Operator<?> newOperatorTree = RewriteParseContextGenerator.generateOperatorTree(rewriteQueryCtx.getParseContext().getQueryState(), selReplacementCommand);
        // we get our new GroupByOperator here
        GroupByOperator newGbyOperator = OperatorUtils.findLastOperatorUpstream(newOperatorTree, GroupByOperator.class);
        if (newGbyOperator == null) {
            throw new SemanticException("Error replacing GroupBy operator.");
        }
        // we need this information to set the correct colList, outputColumnNames
        // in SelectOperator
        ExprNodeColumnDesc aggrExprNode = null;
        // Construct the new AggregationDesc to get rid of the current
        // internal names and replace them with new internal names
        // as required by the operator tree
        GroupByDesc newConf = newGbyOperator.getConf();
        List<AggregationDesc> newAggrList = newConf.getAggregators();
        if (newAggrList != null && newAggrList.size() > 0) {
            for (AggregationDesc aggregationDesc : newAggrList) {
                rewriteQueryCtx.setEval(aggregationDesc.getGenericUDAFEvaluator());
                aggrExprNode = (ExprNodeColumnDesc) aggregationDesc.getParameters().get(0);
                rewriteQueryCtx.setAggrExprNode(aggrExprNode);
            }
        }
        // Now the GroupByOperator has the new AggregationList;
        // sum(`_count_of_indexed_key`)
        // instead of count(indexed_key)
        GroupByDesc oldConf = operator.getConf();
        oldConf.setAggregators((ArrayList<AggregationDesc>) newAggrList);
        operator.setConf(oldConf);
    } else {
        // we just need to reset the GenericUDAFEvaluator and its name for this
        // GroupByOperator whose parent is the ReduceSinkOperator
        GroupByDesc childConf = operator.getConf();
        List<AggregationDesc> childAggrList = childConf.getAggregators();
        if (childAggrList != null && childAggrList.size() > 0) {
            for (AggregationDesc aggregationDesc : childAggrList) {
                List<ExprNodeDesc> paraList = aggregationDesc.getParameters();
                List<ObjectInspector> parametersOIList = new ArrayList<ObjectInspector>();
                for (ExprNodeDesc expr : paraList) {
                    parametersOIList.add(expr.getWritableObjectInspector());
                }
                GenericUDAFEvaluator evaluator = FunctionRegistry.getGenericUDAFEvaluator("sum", parametersOIList, false, false);
                aggregationDesc.setGenericUDAFEvaluator(evaluator);
                aggregationDesc.setGenericUDAFName("sum");
            }
        }
    }
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) ArrayList(java.util.ArrayList) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Example 4 with AggregationDesc

use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.

the class HiveGBOpConvUtil method genMapSideGB.

@SuppressWarnings("unchecked")
private static OpAttr genMapSideGB(OpAttr inputOpAf, GBInfo gbAttrs) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    Set<String> gbKeyColsAsNamesFrmIn = new HashSet<String>();
    String colOutputName = null;
    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbAttrs.gbKeys.size(); i++) {
        gbKeys.add(gbAttrs.gbKeys.get(i));
        colOutputName = SemanticAnalyzer.getColumnInternalName(i);
        colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.gbKeyTypes.get(i), "", false));
        outputColNames.add(colOutputName);
        gbKeyColsAsNamesFrmIn.add(gbAttrs.gbKeyColNamesInInput.get(i));
        colExprMap.put(colOutputName, gbKeys.get(i));
    }
    // 1.2. Adjust GroupingSet Position, GBKeys for GroupingSet Position if
    // needed. NOTE: GroupingID is added to map side GB only if we don't GrpSet
    // doesn't require additional MR Jobs
    int groupingSetsPosition = -1;
    boolean inclGrpID = inclGrpSetInMapSide(gbAttrs);
    if (inclGrpID) {
        groupingSetsPosition = gbKeys.size();
        addGrpSetCol(true, null, false, gbKeys, outputColNames, colInfoLst, colExprMap);
    }
    // gen would have prevented it)
    for (int i = 0; i < gbAttrs.distExprNodes.size(); i++) {
        if (!gbKeyColsAsNamesFrmIn.contains(gbAttrs.distExprNames.get(i))) {
            gbKeys.add(gbAttrs.distExprNodes.get(i));
            colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() - 1);
            colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.distExprTypes.get(i), "", false));
            outputColNames.add(colOutputName);
            gbKeyColsAsNamesFrmIn.add(gbAttrs.distExprNames.get(i));
            colExprMap.put(colOutputName, gbKeys.get(gbKeys.size() - 1));
        }
    }
    // 2. Build Aggregations
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    for (UDAFAttrs udafAttr : gbAttrs.udafAttrs) {
        Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH, udafAttr.isDistinctUDAF);
        aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udafAttr.udafEvaluator, udafAttr.udafParams, udafAttr.isDistinctUDAF, amode));
        GenericUDAFInfo udafInfo;
        try {
            udafInfo = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, amode, udafAttr.udafParams);
        } catch (SemanticException e) {
            throw new RuntimeException(e);
        }
        colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() - 1);
        colInfoLst.add(new ColumnInfo(colOutputName, udafInfo.returnType, "", false));
        outputColNames.add(colOutputName);
    }
    // 3. Create GB
    @SuppressWarnings("rawtypes") Operator gbOp = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.HASH, outputColNames, gbKeys, aggregations, false, gbAttrs.groupByMemoryUsage, gbAttrs.memoryThreshold, gbAttrs.grpSets, inclGrpID, groupingSetsPosition, gbAttrs.containsDistinctAggr), new RowSchema(colInfoLst), inputOpAf.inputs.get(0));
    // 5. Setup Expr Col Map
    // NOTE: UDAF is not included in ExprColMap
    gbOp.setColumnExprMap(colExprMap);
    return new OpAttr("", new HashSet<Integer>(), gbOp);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashMap(java.util.HashMap) GenericUDAFInfo(org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) OpAttr(org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) HashSet(java.util.HashSet) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc)

Example 5 with AggregationDesc

use of org.apache.hadoop.hive.ql.plan.AggregationDesc in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanGroupByOperator2MR.

/**
 * Generate the second GroupByOperator for the Group By Plan
 * (parseInfo.getXXX(dest)). The new GroupByOperator will do the second
 * aggregation based on the partial aggregation results.
 *
 * @param mode
 *          the mode of aggregation (FINAL)
 * @param genericUDAFEvaluators
 *          The mapping from Aggregation StringTree to the
 *          genericUDAFEvaluator.
 * @return the new GroupByOperator
 * @throws SemanticException
 */
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator2MR(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo2, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, boolean groupingSetsPresent) throws SemanticException {
    RowResolver groupByInputRowResolver2 = opParseCtx.get(reduceSinkOperatorInfo2).getRowResolver();
    RowResolver groupByOutputRowResolver2 = new RowResolver();
    groupByOutputRowResolver2.setIsExprResolver(true);
    ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
    ArrayList<String> outputColumnNames = new ArrayList<String>();
    for (int i = 0; i < grpByExprs.size(); ++i) {
        ASTNode grpbyExpr = grpByExprs.get(i);
        ColumnInfo exprInfo = groupByInputRowResolver2.getExpression(grpbyExpr);
        if (exprInfo == null) {
            throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(grpbyExpr));
        }
        String expression = exprInfo.getInternalName();
        groupByKeys.add(new ExprNodeColumnDesc(exprInfo.getType(), expression, exprInfo.getTabAlias(), exprInfo.getIsVirtualCol()));
        String field = getColumnInternalName(i);
        outputColumnNames.add(field);
        ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
        groupByOutputRowResolver2.putExpression(grpbyExpr, oColInfo);
        addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo2, groupByOutputRowResolver2);
        colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
    }
    int groupingSetsPosition = -1;
    // For grouping sets, add a dummy grouping key
    if (groupingSetsPresent) {
        groupingSetsPosition = groupByKeys.size();
        addGroupingSetKey(groupByKeys, groupByInputRowResolver2, groupByOutputRowResolver2, outputColumnNames, colExprMap);
    }
    HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
    boolean containsDistinctAggr = false;
    for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        ASTNode value = entry.getValue();
        ColumnInfo paraExprInfo = groupByInputRowResolver2.getExpression(value);
        if (paraExprInfo == null) {
            throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(value));
        }
        String paraExpression = paraExprInfo.getInternalName();
        assert (paraExpression != null);
        aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
        String aggName = unescapeIdentifier(value.getChild(0).getText());
        boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
        containsDistinctAggr = containsDistinctAggr || isDistinct;
        boolean isStar = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
        Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
        GenericUDAFEvaluator genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
        assert (genericUDAFEvaluator != null);
        GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
        aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && value.getToken().getType() == HiveParser.TOK_FUNCTIONDI), amode));
        String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
        outputColumnNames.add(field);
        groupByOutputRowResolver2.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
    }
    float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
    Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, false, groupByMemoryUsage, memoryThreshold, null, false, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver2.getColumnInfos()), reduceSinkOperatorInfo2), groupByOutputRowResolver2);
    op.setColumnExprMap(colExprMap);
    return op;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap)

Aggregations

AggregationDesc (org.apache.hadoop.hive.ql.plan.AggregationDesc)40 ArrayList (java.util.ArrayList)36 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)36 GroupByDesc (org.apache.hadoop.hive.ql.plan.GroupByDesc)33 HashMap (java.util.HashMap)26 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)25 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)23 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)23 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)23 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)21 Operator (org.apache.hadoop.hive.ql.exec.Operator)19 Mode (org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode)16 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)14 GenericUDAFEvaluator (org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator)14 Map (java.util.Map)12 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)12 LinkedHashMap (java.util.LinkedHashMap)11 AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator)11 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)11 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)11