Example 16 with GroupByOperator

use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanMapAggr2MR.

/**
 * Generate a Group-By plan using 2 map-reduce jobs. However, only 1
 * group-by plan is generated if the query involves no grouping key and no
 * distincts. In that case, the plan is the same as that generated by
 * genGroupByPlanMapAggr1MR. Otherwise, the following plan is generated: First
 * perform a map side partial aggregation (to reduce the amount of data). Then
 * spray by the grouping key and distinct key (or a random number, if no
 * distinct is present) in the hope of getting a uniform distribution, and compute
 * partial aggregates grouped by the reduction key (grouping key + distinct
 * key). Evaluate partial aggregates first, and spray by the grouping key to
 * compute actual aggregates in the second phase.
 *
 * The aggregation evaluation functions are as follows:
 *
 * No grouping sets:
 * STAGE 1
 * Group by Operator:
 * grouping keys: group by expressions if no DISTINCT
 * grouping keys: group by expressions + distinct keys if DISTINCT
 * Mapper: iterate/terminatePartial (mode = HASH)
 * Partitioning Key: random() if no DISTINCT
 * grouping + distinct key if DISTINCT
 * Sorting Key: grouping key if no DISTINCT
 * grouping + distinct key if DISTINCT
 * Reducer: iterate/terminatePartial if DISTINCT
 * merge/terminatePartial if NO DISTINCT (mode = MERGEPARTIAL)
 * Group by Operator:
 * grouping keys: group by expressions
 *
 * STAGE 2
 * Partitioning Key: grouping key
 * Sorting Key: grouping key
 * Reducer: merge/terminate (mode = FINAL)
 *
 * In the presence of grouping sets, the aggregation evaluation functions are as follows:
 * STAGE 1
 * Group by Operator:
 * grouping keys: group by expressions + grouping id. if no DISTINCT
 * grouping keys: group by expressions + grouping id. + distinct keys if DISTINCT
 * Mapper: iterate/terminatePartial (mode = HASH)
 * Partitioning Key: random() if no DISTINCT
 * grouping + grouping id. + distinct key if DISTINCT
 * Sorting Key: grouping key + grouping id. if no DISTINCT
 * grouping + grouping id. + distinct key if DISTINCT
 * Reducer: iterate/terminatePartial if DISTINCT
 * merge/terminatePartial if NO DISTINCT (mode = MERGEPARTIAL)
 * Group by Operator:
 * grouping keys: group by expressions + grouping id.
 *
 * STAGE 2
 * Partitioning Key: grouping key
 * Sorting Key: grouping key + grouping id.
 * Reducer: merge/terminate (mode = FINAL)
 */
@SuppressWarnings("nls")
private Operator genGroupByPlanMapAggr2MR(String dest, QB qb, Operator inputOperatorInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ObjectPair<List<ASTNode>, List<Long>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
    List<ASTNode> grpByExprs = grpByExprsGroupingSets.getFirst();
    List<Long> groupingSets = grpByExprsGroupingSets.getSecond();
    boolean groupingSetsPresent = !groupingSets.isEmpty();
    if (groupingSetsPresent) {
        checkExpressionsForGroupingSet(grpByExprs, parseInfo.getDistinctFuncExprsForClause(dest), parseInfo.getAggregationExprsForClause(dest), opParseCtx.get(inputOperatorInfo).getRowResolver());
        int newMRJobGroupingSetsThreshold = conf.getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);
        // Turn off skew if an additional MR job is required anyway for grouping sets.
        if (groupingSets.size() > newMRJobGroupingSetsThreshold) {
            String errorMsg = "The number of rows per input row due to grouping sets is " + groupingSets.size();
            throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg));
        }
    }
    // ////// Generate GroupbyOperator for a map-side partial aggregation
    Map<String, GenericUDAFEvaluator> genericUDAFEvaluators = new LinkedHashMap<String, GenericUDAFEvaluator>();
    GroupByOperator groupByOperatorInfo = (GroupByOperator) genGroupByPlanMapGroupByOperator(qb, dest, grpByExprs, inputOperatorInfo, GroupByDesc.Mode.HASH, genericUDAFEvaluators, groupingSets, groupingSetsPresent);
    groupOpToInputTables.put(groupByOperatorInfo, opParseCtx.get(inputOperatorInfo).getRowResolver().getTableNames());
    // For eg: select count(1) from T where t.ds = ....
    if (!optimizeMapAggrGroupBy(dest, qb)) {
        List<ASTNode> distinctFuncExprs = parseInfo.getDistinctFuncExprsForClause(dest);
        // ////// Generate ReduceSink Operator
        Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs, distinctFuncExprs.isEmpty() ? -1 : Integer.MAX_VALUE, false, -1, true, groupingSetsPresent);
        // ////// Generate GroupbyOperator for a partial aggregation
        Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.PARTIALS, genericUDAFEvaluators, groupingSets, groupingSetsPresent, false);
        int numReducers = -1;
        if (grpByExprs.isEmpty()) {
            numReducers = 1;
        }
        // ////// Generate ReduceSinkOperator2
        Operator reduceSinkOperatorInfo2 = genGroupByPlanReduceSinkOperator2MR(parseInfo, dest, groupByOperatorInfo2, grpByExprs.size(), numReducers, groupingSetsPresent);
        // ////// Generate GroupbyOperator3
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo2, GroupByDesc.Mode.FINAL, genericUDAFEvaluators, groupingSetsPresent);
    } else {
        // If there are no grouping keys, grouping sets cannot be present
        assert !groupingSetsPresent;
        // ////// Generate ReduceSink Operator
        Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs, grpByExprs.size(), false, 1, true, groupingSetsPresent);
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.FINAL, genericUDAFEvaluators, false);
    }
}
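
For orientation, here is a simplified, self-contained sketch of how this generator relates to its siblings. In Hive's SemanticAnalyzer, two configuration flags, hive.map.aggr (HIVEMAPSIDEAGGREGATE) and hive.groupby.skewindata (HIVEGROUPBYSKEW), pick among the group-by plan generators. The real dispatch in genBodyPlan also weighs distincts and multi-destination queries, so the standalone method below models only the two-flag decision and returns the chosen generator's name.

public class GroupByPlanDispatchSketch {

    // Simplified model of SemanticAnalyzer's plan-generator dispatch.
    // The flag names are real HiveConf variables; everything else here is illustrative.
    static String chooseGroupByPlan(boolean mapSideAggr, boolean skewInData) {
        if (mapSideAggr && skewInData) {
            return "genGroupByPlanMapAggr2MR";    // this example: map aggr + 2 MR jobs
        } else if (mapSideAggr) {
            return "genGroupByPlanMapAggrNoSkew"; // Example 18: map aggr, 1 MR job
        } else if (skewInData) {
            return "genGroupByPlan2MR";           // no map aggr, 2 MR jobs
        } else {
            return "genGroupByPlan1MR";           // no map aggr, 1 MR job
        }
    }

    public static void main(String[] args) {
        // hive.map.aggr=true, hive.groupby.skewindata=true
        System.out.println(chooseGroupByPlan(true, true)); // genGroupByPlanMapAggr2MR
    }
}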

Example 17 with GroupByOperator

use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.

the class SemanticAnalyzer method genGroupByPlan1ReduceMultiGBY.

@SuppressWarnings({ "nls" })
private Operator genGroupByPlan1ReduceMultiGBY(List<String> dests, QB qb, Operator input, Map<String, Operator> aliasToOpInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ExprNodeDesc previous = null;
    Operator selectInput = input;
    // In order to facilitate partition pruning, OR the where clauses together and put them at the
    // top of the operator tree; this can also reduce the amount of data going to the reducer
    List<ExprNodeDesc.ExprNodeDescEqualityWrapper> whereExpressions = new ArrayList<ExprNodeDesc.ExprNodeDescEqualityWrapper>();
    for (String dest : dests) {
        ObjectPair<List<ASTNode>, List<Long>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
        List<Long> groupingSets = grpByExprsGroupingSets.getSecond();
        if (!groupingSets.isEmpty()) {
            throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_AGGR_NOMAPAGGR_MULTIGBY.getMsg());
        }
        ASTNode whereExpr = parseInfo.getWhrForClause(dest);
        if (whereExpr != null) {
            OpParseContext inputCtx = opParseCtx.get(input);
            RowResolver inputRR = inputCtx.getRowResolver();
            ExprNodeDesc current = genExprNodeDesc((ASTNode) whereExpr.getChild(0), inputRR);
            // Check the list of where expressions already added so they aren't duplicated
            ExprNodeDesc.ExprNodeDescEqualityWrapper currentWrapped = new ExprNodeDesc.ExprNodeDescEqualityWrapper(current);
            if (!whereExpressions.contains(currentWrapped)) {
                whereExpressions.add(currentWrapped);
            } else {
                continue;
            }
            if (previous == null) {
                // If this is the first expression
                previous = current;
                continue;
            }
            GenericUDFOPOr or = new GenericUDFOPOr();
            List<ExprNodeDesc> expressions = new ArrayList<ExprNodeDesc>(2);
            expressions.add(current);
            expressions.add(previous);
            ExprNodeDesc orExpr = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, or, expressions);
            previous = orExpr;
        } else {
            // If a destination has no where clause, there can be no common filter
            previous = null;
            break;
        }
    }
    if (previous != null) {
        OpParseContext inputCtx = opParseCtx.get(input);
        RowResolver inputRR = inputCtx.getRowResolver();
        FilterDesc orFilterDesc = new FilterDesc(previous, false);
        orFilterDesc.setGenerated(true);
        selectInput = putOpInsertMap(OperatorFactory.getAndMakeChild(orFilterDesc, new RowSchema(inputRR.getColumnInfos()), input), inputRR);
    }
    // insert a select operator here used by the ColumnPruner to reduce
    // the data to shuffle
    Operator select = genSelectAllDesc(selectInput);
    // Generate ReduceSinkOperator
    ReduceSinkOperator reduceSinkOperatorInfo = genCommonGroupByPlanReduceSinkOperator(qb, dests, select);
    // It is assumed throughout the code that a reducer has a single child; add a
    // ForwardOperator so that we can add multiple filter/group by operators as children
    RowResolver reduceSinkOperatorInfoRR = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
    Operator forwardOp = putOpInsertMap(OperatorFactory.getAndMakeChild(new ForwardDesc(), new RowSchema(reduceSinkOperatorInfoRR.getColumnInfos()), reduceSinkOperatorInfo), reduceSinkOperatorInfoRR);
    Operator curr = forwardOp;
    for (String dest : dests) {
        curr = forwardOp;
        if (parseInfo.getWhrForClause(dest) != null) {
            ASTNode whereExpr = qb.getParseInfo().getWhrForClause(dest);
            curr = genFilterPlan((ASTNode) whereExpr.getChild(0), qb, forwardOp, aliasToOpInfo, false, true);
        }
        // Generate GroupbyOperator
        Operator groupByOperatorInfo = genGroupByPlanGroupByOperator(parseInfo, dest, curr, reduceSinkOperatorInfo, GroupByDesc.Mode.COMPLETE, null);
        // TODO: should we pass curr instead of null?
        curr = genPostGroupByBodyPlan(groupByOperatorInfo, dest, qb, aliasToOpInfo, null);
    }
    return curr;
}
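
The common-filter extraction above is a dedupe-then-OR fold over the per-destination WHERE predicates of a multi-insert query, e.g. FROM src INSERT ... WHERE key < 10 GROUP BY key INSERT ... WHERE key > 100 GROUP BY key. Below is a minimal, self-contained sketch of that pattern using plain strings in place of ExprNodeDesc and ExprNodeDescEqualityWrapper; the class and method names are hypothetical.

import java.util.ArrayList;
import java.util.List;

public class OrFoldSketch {

    // Deduplicate per-destination WHERE predicates and fold the survivors into
    // one OR, which can then be pushed above the ReduceSink to cut the data
    // shuffled. Returns null if any destination lacks a WHERE clause, since
    // then no common filter exists.
    static String foldWhereClauses(List<String> destWheres) {
        List<String> seen = new ArrayList<>();
        String previous = null;
        for (String where : destWheres) {
            if (where == null) {
                return null;  // a destination with no filter: no common filter
            }
            if (seen.contains(where)) {
                continue;     // duplicate predicate, mirrors ExprNodeDescEqualityWrapper
            }
            seen.add(where);
            previous = (previous == null) ? where : "(" + where + ") or (" + previous + ")";
        }
        return previous;
    }

    public static void main(String[] args) {
        System.out.println(foldWhereClauses(List.of("key < 10", "key > 100")));
        // prints: (key > 100) or (key < 10)
    }
}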

Example 18 with GroupByOperator

use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanMapAggrNoSkew.

/**
 * Generate a Group-By plan using 1 map-reduce job. First perform a map-side
 * partial aggregation (to reduce the amount of data); at this point we may
 * turn off map-side partial aggregation based on its observed performance.
 * Then spray by the group by key, and sort by the distinct key (if any), and
 * compute the actual aggregates from the partial aggregates.
 *
 * The aggregation evaluation functions are as follows:
 *
 * No grouping sets:
 * Group By Operator:
 * grouping keys: group by expressions if no DISTINCT
 * grouping keys: group by expressions + distinct keys if DISTINCT
 * Mapper: iterate/terminatePartial (mode = HASH)
 * Partitioning Key: grouping key
 * Sorting Key: grouping key if no DISTINCT
 * grouping + distinct key if DISTINCT
 * Reducer: iterate/terminate if DISTINCT
 * merge/terminate if NO DISTINCT (mode MERGEPARTIAL)
 *
 * Grouping Sets:
 * Group By Operator:
 * grouping keys: group by expressions + grouping id. if no DISTINCT
 * grouping keys: group by expressions + grouping id. + distinct keys if DISTINCT
 * Mapper: iterate/terminatePartial (mode = HASH)
 * Partitioning Key: grouping key + grouping id.
 * Sorting Key: grouping key + grouping id. if no DISTINCT
 * grouping + grouping id. + distinct key if DISTINCT
 * Reducer: iterate/terminate if DISTINCT
 * merge/terminate if NO DISTINCT (mode MERGEPARTIAL)
 *
 * Grouping Sets with an additional MR job introduced (distincts are not allowed):
 * Group By Operator:
 * grouping keys: group by expressions
 * Mapper: iterate/terminatePartial (mode = HASH)
 * Partitioning Key: grouping key
 * Sorting Key: grouping key
 * Reducer: merge/terminate (mode MERGEPARTIAL)
 * Group by Operator:
 * grouping keys: group by expressions + a newly created grouping id. key
 *
 * STAGE 2
 * Partitioning Key: grouping key + grouping id.
 * Sorting Key: grouping key + grouping id.
 * Reducer: merge/terminate (mode = FINAL)
 * Group by Operator:
 * grouping keys: group by expressions + grouping id.
 */
@SuppressWarnings("nls")
private Operator genGroupByPlanMapAggrNoSkew(String dest, QB qb, Operator inputOperatorInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ObjectPair<List<ASTNode>, List<Long>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
    List<ASTNode> grpByExprs = grpByExprsGroupingSets.getFirst();
    List<Long> groupingSets = grpByExprsGroupingSets.getSecond();
    boolean groupingSetsPresent = !groupingSets.isEmpty();
    int newMRJobGroupingSetsThreshold = conf.getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);
    if (groupingSetsPresent) {
        checkExpressionsForGroupingSet(grpByExprs, parseInfo.getDistinctFuncExprsForClause(dest), parseInfo.getAggregationExprsForClause(dest), opParseCtx.get(inputOperatorInfo).getRowResolver());
    }
    // ////// Generate GroupbyOperator for a map-side partial aggregation
    Map<String, GenericUDAFEvaluator> genericUDAFEvaluators = new LinkedHashMap<String, GenericUDAFEvaluator>();
    // Is the grouping sets data consumed in the current MR job, or
    // does it need an additional MR job?
    boolean groupingSetsNeedAdditionalMRJob = groupingSetsPresent && groupingSets.size() > newMRJobGroupingSetsThreshold;
    GroupByOperator groupByOperatorInfo = (GroupByOperator) genGroupByPlanMapGroupByOperator(qb, dest, grpByExprs, inputOperatorInfo, GroupByDesc.Mode.HASH, genericUDAFEvaluators, groupingSets, groupingSetsPresent && !groupingSetsNeedAdditionalMRJob);
    groupOpToInputTables.put(groupByOperatorInfo, opParseCtx.get(inputOperatorInfo).getRowResolver().getTableNames());
    int numReducers = -1;
    // Optimize the scenario when there are no grouping keys - only 1 reducer is needed
    if (grpByExprs.isEmpty()) {
        numReducers = 1;
    }
    // ////// Generate ReduceSink Operator
    boolean isDistinct = !qb.getParseInfo().getDistinctFuncExprsForClause(dest).isEmpty();
    // Distincts are not allowed with an additional mr job
    if (groupingSetsNeedAdditionalMRJob && isDistinct) {
        String errorMsg = "The number of rows per input row due to grouping sets is " + groupingSets.size();
        throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_DISTINCTS.getMsg(errorMsg));
    }
    Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs, grpByExprs.size(), true, numReducers, true, groupingSetsPresent && !groupingSetsNeedAdditionalMRJob);
    // Does it require a new MR job for grouping sets
    if (!groupingSetsPresent || !groupingSetsNeedAdditionalMRJob) {
        // This is a 1-stage map-reduce processing of the group by: the map-side
        // aggregation was only partial, so merge the partial aggregates on the reducer.
        return genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.MERGEPARTIAL, genericUDAFEvaluators, groupingSets, groupingSetsPresent, groupingSetsNeedAdditionalMRJob);
    } else {
        // Add 'n' rows corresponding to the grouping sets. For each row, create 'n' rows,
        // one for each grouping set key. Since map-side aggregation has already been performed,
        // the number of rows would have been reduced. Moreover, the rows corresponding to the
        // grouping keys come together, so there is a higher chance of finding the rows in the hash
        // table.
        Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.PARTIALS, genericUDAFEvaluators, groupingSets, groupingSetsPresent, groupingSetsNeedAdditionalMRJob);
        // ////// Generate ReduceSinkOperator2
        Operator reduceSinkOperatorInfo2 = genGroupByPlanReduceSinkOperator2MR(parseInfo, dest, groupByOperatorInfo2, grpByExprs.size() + 1, numReducers, groupingSetsPresent);
        // ////// Generate GroupbyOperator3
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo2, GroupByDesc.Mode.FINAL, genericUDAFEvaluators, groupingSetsPresent);
    }
}
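
The threshold test above compares the number of grouping sets, i.e. the number of rows emitted per input row, against hive.new.job.grouping.set.cardinality (HIVE_NEW_JOB_GROUPING_SET_CARDINALITY). The standalone sketch below works through that arithmetic for a CUBE, which produces 2^n grouping sets for n group-by columns; the default threshold of 30 is an assumption to verify against your HiveConf version.

public class GroupingSetCardinalitySketch {

    public static void main(String[] args) {
        // CUBE over n group-by columns expands each input row into 2^n rows,
        // one per grouping set: for n = 2, (a,b), (a,null), (null,b), (null,null).
        int groupByColumns = 5;
        long groupingSets = 1L << groupByColumns; // 2^5 = 32

        // Assumed default of hive.new.job.grouping.set.cardinality; check HiveConf.
        int newMRJobGroupingSetsThreshold = 30;

        boolean needsAdditionalMRJob = groupingSets > newMRJobGroupingSetsThreshold;
        System.out.println(groupingSets + " rows per input row -> extra MR job: "
            + needsAdditionalMRJob);
        // With 32 > 30, the planner defers grouping-set expansion to a second
        // MR job, and (per the code above) rejects the query if a DISTINCT
        // aggregate is also present.
    }
}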

Example 19 with GroupByOperator

use of org.apache.hadoop.hive.ql.exec.GroupByOperator in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanGroupByOperator1.

/**
 * Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
 * The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
 *
 * @param parseInfo
 * @param dest
 * @param reduceSinkOperatorInfo
 * @param mode
 *          The mode of the aggregation (MERGEPARTIAL, PARTIAL2)
 * @param genericUDAFEvaluators
 *          The mapping from Aggregation StringTree to the
 *          genericUDAFEvaluator.
 * @param groupingSets
 *          list of grouping sets
 * @param groupingSetsPresent
 *          whether grouping sets are present in this query
 * @param groupingSetsNeedAdditionalMRJob
 *          whether an additional MR job is needed to process the grouping sets
 * @return the new GroupByOperator
 */
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Long> groupingSets, boolean groupingSetsPresent, boolean groupingSetsNeedAdditionalMRJob) throws SemanticException {
    ArrayList<String> outputColumnNames = new ArrayList<String>();
    RowResolver groupByInputRowResolver = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
    RowResolver groupByOutputRowResolver = new RowResolver();
    groupByOutputRowResolver.setIsExprResolver(true);
    ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    for (int i = 0; i < grpByExprs.size(); ++i) {
        ASTNode grpbyExpr = grpByExprs.get(i);
        ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
        if (exprInfo == null) {
            throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(grpbyExpr));
        }
        groupByKeys.add(new ExprNodeColumnDesc(exprInfo));
        String field = getColumnInternalName(i);
        outputColumnNames.add(field);
        ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
        groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
        addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo, groupByOutputRowResolver);
        colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
    }
    // This is only needed if a new grouping set key is being created
    int groupingSetsPosition = -1;
    // For grouping sets, add a dummy grouping key
    if (groupingSetsPresent) {
        groupingSetsPosition = groupByKeys.size();
        // This function is called for GroupBy2 to add grouping id as part of the groupby keys
        if (!groupingSetsNeedAdditionalMRJob) {
            addGroupingSetKey(groupByKeys, groupByInputRowResolver, groupByOutputRowResolver, outputColumnNames, colExprMap);
        } else {
            // The grouping set has not yet been processed. Create a new grouping key
            // Consider the query: select a,b, count(1) from T group by a,b with cube;
            // where it is being executed in 2 map-reduce jobs
            // The plan for 1st MR is TableScan -> GroupBy1 -> ReduceSink -> GroupBy2 -> FileSink
            // GroupBy1/ReduceSink worked as if grouping sets were not present
            // This function is called for GroupBy2 to create new rows for grouping sets
            // For each input row (a,b), 4 rows are created for the example above:
            // (a,b), (a,null), (null, b), (null, null)
            createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
        }
    }
    HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
    // get the last colName for the reduce KEY
    // it represents the column name corresponding to distinct aggr, if any
    String lastKeyColName = null;
    List<ExprNodeDesc> reduceValues = null;
    if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) {
        List<String> inputKeyCols = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames();
        if (inputKeyCols.size() > 0) {
            lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
        }
        reduceValues = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getValueCols();
    }
    int numDistinctUDFs = 0;
    boolean containsDistinctAggr = false;
    for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
        ASTNode value = entry.getValue();
        String aggName = unescapeIdentifier(value.getChild(0).getText());
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI);
        containsDistinctAggr = containsDistinctAggr || isDistinct;
        // If the aggregation is DISTINCT, partial aggregation has not been done on the map
        // side, so always look for the full parameter expressions (eg: d+e)
        if (isDistinct) {
            // 0 is the function name
            for (int i = 1; i < value.getChildCount(); i++) {
                ASTNode paraExpr = (ASTNode) value.getChild(i);
                ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
                if (paraExprInfo == null) {
                    throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(paraExpr));
                }
                String paraExpression = paraExprInfo.getInternalName();
                assert (paraExpression != null);
                if (isDistinct && lastKeyColName != null) {
                    // if aggr is distinct, the parameter name is constructed as
                    // KEY.lastKeyColName:<tag>._colx
                    paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1);
                }
                ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
                ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(paraExprInfo.getInternalName(), reduceValues);
                if (reduceValue != null) {
                    // this parameter is a constant
                    expr = reduceValue;
                }
                aggParameters.add(expr);
            }
        } else {
            ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(value);
            if (paraExprInfo == null) {
                throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(value));
            }
            String paraExpression = paraExprInfo.getInternalName();
            assert (paraExpression != null);
            aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
        }
        if (isDistinct) {
            numDistinctUDFs++;
        }
        Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
        GenericUDAFEvaluator genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
        assert (genericUDAFEvaluator != null);
        GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
        aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && isDistinct), amode));
        String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
        outputColumnNames.add(field);
        groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
    }
    float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
    // Nothing special needs to be done for grouping sets if
    // this is the final group by operator, and multiple rows corresponding to the
    // grouping sets have been generated upstream.
    // However, if an additional MR job has been created to handle grouping sets,
    // additional rows corresponding to grouping sets need to be created here.
    Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, groupByMemoryUsage, memoryThreshold, groupingSets, groupingSetsPresent && groupingSetsNeedAdditionalMRJob, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), reduceSinkOperatorInfo), groupByOutputRowResolver);
    op.setColumnExprMap(colExprMap);
    return op;
}
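
The internal name built for a DISTINCT parameter on the reduce side follows the KEY.<lastKeyColName>:<tag>._col<x> pattern from the loop above. The standalone sketch below isolates just that string construction; the helper name and its example inputs are hypothetical, and only the format mirrors the code.

public class DistinctParamNameSketch {

    // Mirrors: Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":"
    //          + numDistinctUDFs + "." + getColumnInternalName(i - 1)
    static String distinctParamName(String lastKeyColName, int distinctTag, int paramIndex) {
        return "KEY." + lastKeyColName + ":" + distinctTag + "._col" + paramIndex;
    }

    public static void main(String[] args) {
        // First parameter (child index i = 1, hence _col0) of the first distinct UDAF.
        System.out.println(distinctParamName("_col2", 0, 0)); // KEY._col2:0._col0
    }
}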
