
Example 21 with GenericUDAFEvaluator

use of org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator in project hive by apache.

From class SemanticAnalyzer, method genGroupByPlanMapAggrNoSkew:

/**
   * Generate a Group-By plan using 1 map-reduce job. First perform a map-side
   * partial aggregation (to reduce the amount of data); at runtime, map-side
   * partial aggregation may be turned off based on its performance. Then
   * spray by the group by key, sort by the distinct key (if any), and
   * compute the actual aggregates on the reducer.
   *
   * The aggregation evaluation functions are as follows:
   *
   * No grouping sets:
   * Group By Operator:
   * grouping keys: group by expressions if no DISTINCT
   * grouping keys: group by expressions + distinct keys if DISTINCT
   * Mapper: iterate/terminatePartial (mode = HASH)
   * Partitioning Key: grouping key
   * Sorting Key: grouping key if no DISTINCT
   * grouping + distinct key if DISTINCT
   * Reducer: iterate/terminate if DISTINCT
   * merge/terminate if NO DISTINCT (mode = MERGEPARTIAL)
   *
   * Grouping Sets:
   * Group By Operator:
   * grouping keys: group by expressions + grouping id. if no DISTINCT
   * grouping keys: group by expressions + grouping id. + distinct keys if DISTINCT
   * Mapper: iterate/terminatePartial (mode = HASH)
   * Partitioning Key: grouping key + grouping id.
   * Sorting Key: grouping key + grouping id. if no DISTINCT
   * grouping + grouping id. + distinct key if DISTINCT
   * Reducer: iterate/terminate if DISTINCT
   * merge/terminate if NO DISTINCT (mode = MERGEPARTIAL)
   *
   * Grouping Sets with an additional MR job introduced (distincts are not allowed):
   * STAGE 1
   * Group By Operator:
   * grouping keys: group by expressions
   * Mapper: iterate/terminatePartial (mode = HASH)
   * Partitioning Key: grouping key
   * Sorting Key: grouping key
   * Reducer: merge/terminatePartial (mode = PARTIALS)
   * Group by Operator:
   * grouping keys: group by expressions + add a new grouping id. key
   *
   * STAGE 2
   * Partitioning Key: grouping key + grouping id.
   * Sorting Key: grouping key + grouping id.
   * Reducer: merge/terminate (mode = FINAL)
   * Group by Operator:
   * grouping keys: group by expressions + grouping id.
   */
@SuppressWarnings("nls")
private Operator genGroupByPlanMapAggrNoSkew(String dest, QB qb, Operator inputOperatorInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ObjectPair<List<ASTNode>, List<Integer>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
    List<ASTNode> grpByExprs = grpByExprsGroupingSets.getFirst();
    List<Integer> groupingSets = grpByExprsGroupingSets.getSecond();
    boolean groupingSetsPresent = !groupingSets.isEmpty();
    int newMRJobGroupingSetsThreshold = conf.getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);
    if (groupingSetsPresent) {
        checkExpressionsForGroupingSet(grpByExprs, parseInfo.getDistinctFuncExprsForClause(dest), parseInfo.getAggregationExprsForClause(dest), opParseCtx.get(inputOperatorInfo).getRowResolver());
    }
    // ////// Generate GroupbyOperator for a map-side partial aggregation
    Map<String, GenericUDAFEvaluator> genericUDAFEvaluators = new LinkedHashMap<String, GenericUDAFEvaluator>();
    // Is the grouping-sets data consumed in the current MR job, or
    // does it need an additional MR job?
    boolean groupingSetsNeedAdditionalMRJob =
        groupingSetsPresent && groupingSets.size() > newMRJobGroupingSetsThreshold;
    GroupByOperator groupByOperatorInfo = (GroupByOperator) genGroupByPlanMapGroupByOperator(qb, dest, grpByExprs, inputOperatorInfo, GroupByDesc.Mode.HASH, genericUDAFEvaluators, groupingSets, groupingSetsPresent && !groupingSetsNeedAdditionalMRJob);
    groupOpToInputTables.put(groupByOperatorInfo, opParseCtx.get(inputOperatorInfo).getRowResolver().getTableNames());
    int numReducers = -1;
    // Optimize the scenario when there are no grouping keys: only one reducer
    // is needed.
    if (grpByExprs.isEmpty()) {
        numReducers = 1;
    }
    // ////// Generate ReduceSink Operator
    boolean isDistinct = !qb.getParseInfo().getDistinctFuncExprsForClause(dest).isEmpty();
    // Distincts are not allowed with an additional MR job
    if (groupingSetsNeedAdditionalMRJob && isDistinct) {
        String errorMsg = "The number of rows per input row due to grouping sets is " + groupingSets.size();
        throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_DISTINCTS.getMsg(errorMsg));
    }
    Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs, grpByExprs.size(), true, numReducers, true, groupingSetsPresent && !groupingSetsNeedAdditionalMRJob);
    // Does it require a new MR job for grouping sets?
    if (!groupingSetsPresent || !groupingSetsNeedAdditionalMRJob) {
        // No additional MR job is needed; finish the aggregation (MERGEPARTIAL)
        // on the reducer.
        return genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.MERGEPARTIAL, genericUDAFEvaluators, groupingSets, groupingSetsPresent, groupingSetsNeedAdditionalMRJob);
    } else {
        // For each input row, emit 'n' rows, one per grouping set. Since map-side
        // aggregation has already been performed, the number of rows has already
        // been reduced. Moreover, rows with the same grouping keys arrive
        // together, so there is a higher chance of a hit in the reduce-side
        // hash table.
        Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.PARTIALS, genericUDAFEvaluators, groupingSets, groupingSetsPresent, groupingSetsNeedAdditionalMRJob);
        // ////// Generate ReduceSinkOperator2
        Operator reduceSinkOperatorInfo2 = genGroupByPlanReduceSinkOperator2MR(parseInfo, dest, groupByOperatorInfo2, grpByExprs.size() + 1, numReducers, groupingSetsPresent);
        // ////// Generate GroupbyOperator3
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo2, GroupByDesc.Mode.FINAL, genericUDAFEvaluators, groupingSetsPresent);
    }
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) LinkedHashMap(java.util.LinkedHashMap) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
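
The HASH / MERGEPARTIAL modes in the plan above ultimately drive calls on GenericUDAFEvaluator. Below is a minimal, self-contained sketch (not Hive planner code) of that two-phase lifecycle for the built-in sum UDAF: the map side runs in evaluator Mode.PARTIAL1 (iterate/terminatePartial) and the reduce side in Mode.FINAL (merge/terminate). The class name and the single long input column are illustrative assumptions.

// Sketch of the two-phase UDAF lifecycle behind the HASH / MERGEPARTIAL plan
// above. Not planner code; the class name, the single long input column, and
// the choice of the built-in "sum" UDAF are assumptions for illustration.
import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class UdafLifecycleSketch {
    public static void main(String[] args) throws Exception {
        ObjectInspector longOI = PrimitiveObjectInspectorFactory.javaLongObjectInspector;

        // Map side (GroupByDesc.Mode.HASH -> evaluator Mode.PARTIAL1):
        // iterate over raw rows, then emit a partial aggregation.
        GenericUDAFEvaluator mapEval = FunctionRegistry.getGenericUDAFEvaluator(
                "sum", Arrays.asList(longOI), false, false);
        ObjectInspector partialOI = mapEval.init(Mode.PARTIAL1, new ObjectInspector[] { longOI });
        AggregationBuffer mapBuf = mapEval.getNewAggregationBuffer();
        for (long v : new long[] { 1L, 2L, 3L }) {
            mapEval.iterate(mapBuf, new Object[] { v });
        }
        Object partial = mapEval.terminatePartial(mapBuf);

        // Reduce side (GroupByDesc.Mode.MERGEPARTIAL, no DISTINCT -> Mode.FINAL):
        // merge the partials, then terminate to the final value.
        GenericUDAFEvaluator reduceEval = FunctionRegistry.getGenericUDAFEvaluator(
                "sum", Arrays.asList(longOI), false, false);
        reduceEval.init(Mode.FINAL, new ObjectInspector[] { partialOI });
        AggregationBuffer reduceBuf = reduceEval.getNewAggregationBuffer();
        reduceEval.merge(reduceBuf, partial);
        System.out.println(reduceEval.terminate(reduceBuf)); // 6
    }
}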

Example 22 with GenericUDAFEvaluator

use of org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator in project hive by apache.

From class SemanticAnalyzer, method genGroupByPlanMapAggr2MR:

/**
   * Generate a Group-By plan using 2 map-reduce jobs. However, only 1
   * group-by plan is generated if the query involves no grouping key and no
   * distincts. In that case, the plan is the same as the one generated by
   * genGroupByPlanMapAggr1MR. Otherwise, the following plan is generated: First
   * perform a map-side partial aggregation (to reduce the amount of data). Then
   * spray by the grouping key and distinct key (or a random number, if no
   * distinct is present) in the hope of getting a uniform distribution, and
   * compute partial aggregates grouped by the reduction key (grouping key +
   * distinct key). Evaluate partial aggregates first, and spray by the grouping
   * key to compute the actual aggregates in the second phase.
   *
   * The aggregation evaluation functions are as follows:
   *
   * No grouping sets:
   * STAGE 1
   * Group by Operator:
   * grouping keys: group by expressions if no DISTINCT
   * grouping keys: group by expressions + distinct keys if DISTINCT
   * Mapper: iterate/terminatePartial (mode = HASH)
   * Partitioning Key: random() if no DISTINCT
   * grouping + distinct key if DISTINCT
   * Sorting Key: grouping key if no DISTINCT
   * grouping + distinct key if DISTINCT
   * Reducer: iterate/terminatePartial if DISTINCT
   * merge/terminatePartial if NO DISTINCT (mode = PARTIALS)
   * Group by Operator:
   * grouping keys: group by expressions
   *
   * STAGE 2
   * Partitioning Key: grouping key
   * Sorting Key: grouping key
   * Reducer: merge/terminate (mode = FINAL)
   *
   * In the presence of grouping sets, the aggregation evaluation functions are as follows:
   * STAGE 1
   * Group by Operator:
   * grouping keys: group by expressions + grouping id. if no DISTINCT
   * grouping keys: group by expressions + grouping id. + distinct keys if DISTINCT
   * Mapper: iterate/terminatePartial (mode = HASH)
   * Partitioning Key: random() if no DISTINCT
   * grouping + grouping id. + distinct key if DISTINCT
   * Sorting Key: grouping key + grouping id. if no DISTINCT
   * grouping + grouping id. + distinct key if DISTINCT
   * Reducer: iterate/terminatePartial if DISTINCT
   * merge/terminatePartial if NO DISTINCT (mode = PARTIALS)
   * Group by Operator:
   * grouping keys: group by expressions + grouping id.
   *
   * STAGE 2
   * Partitioning Key: grouping key
   * Sorting Key: grouping key + grouping id.
   * Reducer: merge/terminate (mode = FINAL)
   */
@SuppressWarnings("nls")
private Operator genGroupByPlanMapAggr2MR(String dest, QB qb, Operator inputOperatorInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ObjectPair<List<ASTNode>, List<Integer>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
    List<ASTNode> grpByExprs = grpByExprsGroupingSets.getFirst();
    List<Integer> groupingSets = grpByExprsGroupingSets.getSecond();
    boolean groupingSetsPresent = !groupingSets.isEmpty();
    if (groupingSetsPresent) {
        checkExpressionsForGroupingSet(grpByExprs, parseInfo.getDistinctFuncExprsForClause(dest), parseInfo.getAggregationExprsForClause(dest), opParseCtx.get(inputOperatorInfo).getRowResolver());
        int newMRJobGroupingSetsThreshold = conf.getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);
        // An additional MR job for grouping sets cannot be combined with skew
        // handling; fail if the grouping-set cardinality exceeds the threshold.
        if (groupingSets.size() > newMRJobGroupingSetsThreshold) {
            String errorMsg = "The number of rows per input row due to grouping sets is " + groupingSets.size();
            throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg));
        }
    }
    // ////// Generate GroupbyOperator for a map-side partial aggregation
    Map<String, GenericUDAFEvaluator> genericUDAFEvaluators = new LinkedHashMap<String, GenericUDAFEvaluator>();
    GroupByOperator groupByOperatorInfo = (GroupByOperator) genGroupByPlanMapGroupByOperator(qb, dest, grpByExprs, inputOperatorInfo, GroupByDesc.Mode.HASH, genericUDAFEvaluators, groupingSets, groupingSetsPresent);
    groupOpToInputTables.put(groupByOperatorInfo, opParseCtx.get(inputOperatorInfo).getRowResolver().getTableNames());
    // Optimize the scenario when there are no grouping keys and no distinct:
    // 2 MR jobs are not needed. For eg: select count(1) from T where t.ds = ....
    if (!optimizeMapAggrGroupBy(dest, qb)) {
        List<ASTNode> distinctFuncExprs = parseInfo.getDistinctFuncExprsForClause(dest);
        // ////// Generate ReduceSink Operator
        Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs, distinctFuncExprs.isEmpty() ? -1 : Integer.MAX_VALUE, false, -1, true, groupingSetsPresent);
        // ////// Generate GroupbyOperator for a partial aggregation
        Operator groupByOperatorInfo2 = genGroupByPlanGroupByOperator1(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.PARTIALS, genericUDAFEvaluators, groupingSets, groupingSetsPresent, false);
        int numReducers = -1;
        if (grpByExprs.isEmpty()) {
            numReducers = 1;
        }
        // ////// Generate ReduceSinkOperator2
        Operator reduceSinkOperatorInfo2 = genGroupByPlanReduceSinkOperator2MR(parseInfo, dest, groupByOperatorInfo2, grpByExprs.size(), numReducers, groupingSetsPresent);
        // ////// Generate GroupbyOperator3
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo2, GroupByDesc.Mode.FINAL, genericUDAFEvaluators, groupingSetsPresent);
    } else {
        // If there are no grouping keys, grouping sets cannot be present
        assert !groupingSetsPresent;
        // ////// Generate ReduceSink Operator
        Operator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, groupByOperatorInfo, grpByExprs, grpByExprs.size(), false, 1, true, groupingSetsPresent);
        return genGroupByPlanGroupByOperator2MR(parseInfo, dest, reduceSinkOperatorInfo, GroupByDesc.Mode.FINAL, genericUDAFEvaluators, false);
    }
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) LinkedHashMap(java.util.LinkedHashMap) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
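
When the plan has two MR jobs, the stage-1 reducer only compacts map-side partials into new partials (merge/terminatePartial); at the evaluator level that middle hop roughly corresponds to Mode.PARTIAL2, sitting between the map side's PARTIAL1 and the second job's FINAL. A hedged end-to-end sketch of the three hops, again with the built-in sum UDAF (the class and variable names are illustrative):

// Sketch of the 2-MR chain above: map HASH -> stage-1 reducer PARTIALS ->
// stage-2 reducer FINAL, expressed as evaluator modes PARTIAL1 -> PARTIAL2 ->
// FINAL. Names and the "sum" UDAF are assumptions for illustration.
import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class TwoMRJobSketch {
    public static void main(String[] args) throws Exception {
        ObjectInspector longOI = PrimitiveObjectInspectorFactory.javaLongObjectInspector;

        // Stage 1 map (HASH -> PARTIAL1): two "mappers" each produce a partial.
        ObjectInspector partialOI = null;
        Object[] mapPartials = new Object[2];
        long[][] mapInputs = { { 1L, 2L }, { 3L, 4L } };
        for (int m = 0; m < 2; m++) {
            GenericUDAFEvaluator e = FunctionRegistry.getGenericUDAFEvaluator(
                    "sum", Arrays.asList(longOI), false, false);
            partialOI = e.init(Mode.PARTIAL1, new ObjectInspector[] { longOI });
            AggregationBuffer b = e.getNewAggregationBuffer();
            for (long v : mapInputs[m]) {
                e.iterate(b, new Object[] { v });
            }
            mapPartials[m] = e.terminatePartial(b);
        }

        // Stage 1 reduce (PARTIALS -> PARTIAL2): merge partials, emit a partial.
        GenericUDAFEvaluator r1 = FunctionRegistry.getGenericUDAFEvaluator(
                "sum", Arrays.asList(longOI), false, false);
        ObjectInspector partial2OI = r1.init(Mode.PARTIAL2, new ObjectInspector[] { partialOI });
        AggregationBuffer b1 = r1.getNewAggregationBuffer();
        for (Object p : mapPartials) {
            r1.merge(b1, p);
        }
        Object partial2 = r1.terminatePartial(b1);

        // Stage 2 reduce (FINAL): merge and terminate to the final value.
        GenericUDAFEvaluator r2 = FunctionRegistry.getGenericUDAFEvaluator(
                "sum", Arrays.asList(longOI), false, false);
        r2.init(Mode.FINAL, new ObjectInspector[] { partial2OI });
        AggregationBuffer b2 = r2.getNewAggregationBuffer();
        r2.merge(b2, partial2);
        System.out.println(r2.terminate(b2)); // 10
    }
}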

Example 23 with GenericUDAFEvaluator

use of org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator in project SQLWindowing by hbutani.

From class WindowingTableFunction, method execute:

@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public void execute(PartitionIterator<Object> pItr, Partition outP) throws WindowingException {
    ArrayList<List<?>> oColumns = new ArrayList<List<?>>();
    Partition iPart = pItr.getPartition();
    StructObjectInspector inputOI;
    try {
        inputOI = (StructObjectInspector) iPart.getSerDe().getObjectInspector();
    } catch (SerDeException se) {
        throw new WindowingException(se);
    }
    try {
        for (WindowFunctionDef wFn : wFnDefs) {
            boolean processWindow = wFn.getWindow() != null;
            pItr.reset();
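            // No window specification: evaluate the UDAF once over the entire
            // partition, then broadcast the result to every row.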
            if (!processWindow) {
                GenericUDAFEvaluator fEval = wFn.getEvaluator();
                Object[] args = new Object[wFn.getArgs().size()];
                AggregationBuffer aggBuffer = fEval.getNewAggregationBuffer();
                while (pItr.hasNext()) {
                    Object row = pItr.next();
                    int i = 0;
                    for (ArgDef arg : wFn.getArgs()) {
                        args[i++] = arg.getExprEvaluator().evaluate(row);
                    }
                    fEval.aggregate(aggBuffer, args);
                }
                Object out = fEval.evaluate(aggBuffer);
                WindowFunctionInfo wFnInfo = FunctionRegistry.getWindowFunctionInfo(wFn.getSpec().getName());
                if (!wFnInfo.isPivotResult()) {
                    out = new SameList(iPart.size(), out);
                }
                oColumns.add((List<?>) out);
            } else {
                oColumns.add(executeFnwithWindow(getQueryDef(), wFn, iPart));
            }
        }
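        // Stitch the output rows together: each input row's fields followed by
        // one value from each computed function column.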
        for (int i = 0; i < iPart.size(); i++) {
            ArrayList oRow = new ArrayList();
            Object iRow = iPart.getAt(i);
            for (StructField f : inputOI.getAllStructFieldRefs()) {
                oRow.add(inputOI.getStructFieldData(iRow, f));
            }
            for (int j = 0; j < oColumns.size(); j++) {
                oRow.add(oColumns.get(j).get(i));
            }
            outP.append(oRow);
        }
    } catch (HiveException he) {
        throw new WindowingException(he);
    }
}
Also used : Partition(com.sap.hadoop.windowing.runtime2.Partition) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) ArrayList(java.util.ArrayList) ArgDef(com.sap.hadoop.windowing.query2.definition.ArgDef) WindowFunctionInfo(com.sap.hadoop.windowing.functions2.FunctionRegistry.WindowFunctionInfo) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) SameList(com.sap.hadoop.ds.SameList) WindowingException(com.sap.hadoop.windowing.WindowingException) ArrayList(java.util.ArrayList) SameList(com.sap.hadoop.ds.SameList) List(java.util.List) AggregationBuffer(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer) WindowFunctionDef(com.sap.hadoop.windowing.query2.definition.WindowFunctionDef) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
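
The !processWindow branch above is the single-pass, whole-partition pattern: one aggregation buffer, aggregate() per row, evaluate() at the end. In Mode.COMPLETE, the aggregate() convenience method dispatches to iterate() and evaluate() dispatches to terminate(). A minimal standalone sketch of the same shape with the built-in count UDAF (the class name and the in-memory rows are assumptions):

// Single-pass, whole-partition evaluation in Mode.COMPLETE, the same shape as
// the !processWindow branch above. The class name and the in-memory row list
// are assumptions for illustration.
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class PartitionAggregateSketch {
    public static void main(String[] args) throws Exception {
        ObjectInspector strOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        List<Object[]> partitionRows = Arrays.asList(
                new Object[] { "a" }, new Object[] { "b" }, new Object[] { "c" });

        // Mode.COMPLETE: the whole aggregation happens in one place, so
        // aggregate() dispatches to iterate() and evaluate() to terminate().
        GenericUDAFEvaluator eval = FunctionRegistry.getGenericUDAFEvaluator(
                "count", Arrays.asList(strOI), false, false);
        eval.init(Mode.COMPLETE, new ObjectInspector[] { strOI });
        AggregationBuffer buf = eval.getNewAggregationBuffer();
        for (Object[] row : partitionRows) {
            eval.aggregate(buf, row);
        }
        // A scalar (non-pivot) result would then be replicated across the
        // partition, as the SameList wrapper does in the snippet above.
        System.out.println(eval.evaluate(buf)); // 3
    }
}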

Aggregations

GenericUDAFEvaluator (org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator): 23 usages
ArrayList (java.util.ArrayList): 16 usages
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 8 usages
LinkedHashMap (java.util.LinkedHashMap): 7 usages
AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator): 7 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 7 usages
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 7 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 7 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 7 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 7 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 7 usages
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 7 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 7 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 7 usages
CalciteSemanticException (org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException): 7 usages
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 7 usages
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 7 usages
AggregationDesc (org.apache.hadoop.hive.ql.plan.AggregationDesc): 6 usages
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 6 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 6 usages