Search in sources :

Example 11 with OpAttr

use of org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr in project hive by apache.

the class HiveGBOpConvUtil method genMapSideRS.

private static OpAttr genMapSideRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    List<String> outputKeyColumnNames = new ArrayList<String>();
    List<String> outputValueColumnNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    String outputColName;
    // 1. Add GB Keys to reduce keys
    ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
        //gbInfo already has ExprNode for gbkeys
        reduceKeys.add(gbInfo.gbKeys.get(i));
        String colOutputName = SemanticAnalyzer.getColumnInternalName(i);
        outputKeyColumnNames.add(colOutputName);
        colInfoLst.add(new ColumnInfo(Utilities.ReduceField.KEY.toString() + "." + colOutputName, gbInfo.gbKeyTypes.get(i), "", false));
        colExprMap.put(colOutputName, gbInfo.gbKeys.get(i));
    }
    // Note: GROUPING SETS are not allowed with map side aggregation set to false so we don't have to worry about it
    int keyLength = reduceKeys.size();
    // 2. Add Dist UDAF args to reduce keys
    if (gbInfo.containsDistinctAggr) {
        // TODO: Why is this needed (doesn't represent any cols)
        String udafName = SemanticAnalyzer.getColumnInternalName(reduceKeys.size());
        outputKeyColumnNames.add(udafName);
        for (int i = 0; i < gbInfo.distExprNodes.size(); i++) {
            reduceKeys.add(gbInfo.distExprNodes.get(i));
            //this part of reduceKeys is later used to create column names strictly for non-distinct aggregates
            // with parameters same as distinct keys which expects _col0 at the end. So we always append
            // _col0 at the end instead of _col<i>
            outputColName = SemanticAnalyzer.getColumnInternalName(0);
            String field = Utilities.ReduceField.KEY.toString() + "." + udafName + ":" + i + "." + outputColName;
            ColumnInfo colInfo = new ColumnInfo(field, gbInfo.distExprNodes.get(i).getTypeInfo(), null, false);
            colInfoLst.add(colInfo);
            colExprMap.put(field, gbInfo.distExprNodes.get(i));
        }
    }
    // 3. Add UDAF args deduped to reduce values
    ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbInfo.deDupedNonDistIrefs.size(); i++) {
        reduceValues.add(gbInfo.deDupedNonDistIrefs.get(i));
        outputColName = SemanticAnalyzer.getColumnInternalName(reduceValues.size() - 1);
        outputValueColumnNames.add(outputColName);
        String field = Utilities.ReduceField.VALUE.toString() + "." + outputColName;
        colInfoLst.add(new ColumnInfo(field, reduceValues.get(reduceValues.size() - 1).getTypeInfo(), null, false));
        colExprMap.put(field, reduceValues.get(reduceValues.size() - 1));
    }
    // 4. Gen RS
    ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, keyLength, reduceValues, gbInfo.distColIndices, outputKeyColumnNames, outputValueColumnNames, true, -1, getNumPartFieldsForMapSideRS(gbInfo), getParallelismForMapSideRS(gbInfo), AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), inputOpAf.inputs.get(0));
    rsOp.setColumnExprMap(colExprMap);
    return new OpAttr("", new HashSet<Integer>(), rsOp);
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) OpAttr(org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)

Example 12 with OpAttr

use of org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr in project hive by apache.

the class HiveGBOpConvUtil method genReduceSideGB1.

private static OpAttr genReduceSideGB1(OpAttr inputOpAf, GBInfo gbInfo, boolean computeGrpSet, boolean propagateConstInDistinctUDAF, GroupByDesc.Mode gbMode) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    String colOutputName = null;
    ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
    List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
    ColumnInfo ci;
    boolean finalGB = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB);
    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, gbInfo.gbKeys.size() - 1, false, false);
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
        ci = rsColInfoLst.get(i);
        if (finalGB) {
            colOutputName = gbInfo.outputColNames.get(i);
        } else {
            colOutputName = SemanticAnalyzer.getColumnInternalName(i);
        }
        outputColNames.add(colOutputName);
        colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
        colExprMap.put(colOutputName, gbKeys.get(i));
    }
    // 1.2 Add GrpSet Col
    int groupingSetsColPosition = -1;
    if ((!finalGB && gbInfo.grpSets.size() > 0) || (finalGB && gbInfo.grpIdFunctionNeeded)) {
        groupingSetsColPosition = gbInfo.gbKeys.size();
        if (computeGrpSet) {
            // GrpSet Col needs to be constructed
            gbKeys.add(new ExprNodeConstantDesc("0"));
        } else {
            // GrpSet Col already part of input RS
            // TODO: Can't we just copy the ExprNodeDEsc from input (Do we need to
            // explicitly set table alias to null & VC to false
            gbKeys.addAll(ExprNodeDescUtils.genExprNodeDesc(rs, groupingSetsColPosition, groupingSetsColPosition, false, true));
        }
        colOutputName = SemanticAnalyzer.getColumnInternalName(groupingSetsColPosition);
        if (finalGB) {
            colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
        }
        outputColNames.add(colOutputName);
        colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
        colExprMap.put(colOutputName, gbKeys.get(groupingSetsColPosition));
    }
    // 2. Walk through UDAF and add them to GB
    String lastReduceKeyColName = null;
    if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
        lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames().get(rs.getConf().getOutputKeyColumnNames().size() - 1);
    }
    int numDistinctUDFs = 0;
    int distinctStartPosInReduceKeys = gbKeys.size();
    List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    int udafColStartPosInOriginalGB = (gbInfo.grpSets.size() > 0) ? gbInfo.gbKeys.size() * 2 : gbInfo.gbKeys.size();
    int udafColStartPosInRS = rs.getConf().getKeyCols().size();
    for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
        UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        if (udafAttr.isDistinctUDAF) {
            ColumnInfo rsDistUDAFParamColInfo;
            ExprNodeDesc distinctUDAFParam;
            ExprNodeDesc constantPropDistinctUDAFParam;
            for (int j = 0; j < udafAttr.udafParamsIndxInGBInfoDistExprs.size(); j++) {
                rsDistUDAFParamColInfo = rsColInfoLst.get(distinctStartPosInReduceKeys + j);
                String rsDistUDAFParamName = rsDistUDAFParamColInfo.getInternalName();
                // TODO: verify if this is needed
                if (lastReduceKeyColName != null) {
                    rsDistUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
                }
                distinctUDAFParam = new ExprNodeColumnDesc(rsDistUDAFParamColInfo.getType(), rsDistUDAFParamName, rsDistUDAFParamColInfo.getTabAlias(), rsDistUDAFParamColInfo.getIsVirtualCol());
                if (propagateConstInDistinctUDAF) {
                    // TODO: Implement propConstDistUDAFParams
                    constantPropDistinctUDAFParam = SemanticAnalyzer.isConstantParameterInAggregationParameters(rsDistUDAFParamColInfo.getInternalName(), reduceValues);
                    if (constantPropDistinctUDAFParam != null) {
                        distinctUDAFParam = constantPropDistinctUDAFParam;
                    }
                }
                aggParameters.add(distinctUDAFParam);
            }
            numDistinctUDFs++;
        } else {
            aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafColStartPosInRS + i)));
        }
        Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
        GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, aggParameters);
        aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (gbMode != GroupByDesc.Mode.FINAL && udafAttr.isDistinctUDAF), udafMode));
        if (finalGB) {
            colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + i);
        } else {
            colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() - 1);
        }
        colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
        outputColNames.add(colOutputName);
    }
    // Nothing special needs to be done for grouping sets if
    // this is the final group by operator, and multiple rows corresponding to
    // the
    // grouping sets have been generated upstream.
    // However, if an addition MR job has been created to handle grouping sets,
    // additional rows corresponding to grouping sets need to be created here.
    //TODO: Clean up/refactor assumptions
    boolean includeGrpSetInGBDesc = (gbInfo.grpSets.size() > 0) && !finalGB && !(gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT);
    Operator rsGBOp = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames, gbKeys, aggregations, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, gbInfo.grpSets, includeGrpSetInGBDesc, groupingSetsColPosition, gbInfo.containsDistinctAggr), new RowSchema(colInfoLst), rs);
    rsGBOp.setColumnExprMap(colExprMap);
    return new OpAttr("", new HashSet<Integer>(), rsGBOp);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashMap(java.util.HashMap) GenericUDAFInfo(org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) OpAttr(org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc)

Example 13 with OpAttr

use of org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr in project hive by apache.

the class HiveGBOpConvUtil method genReduceSideGB1NoMapGB.

/**
   * RS-GB0
   *
   * @param inputOpAf
   * @param gbInfo
   * @param gbMode
   * @return
   * @throws SemanticException
   */
private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo, GroupByDesc.Mode gbMode) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    String colOutputName = null;
    ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
    List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
    ColumnInfo ci;
    boolean useOriginalGBNames = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW);
    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, gbInfo.gbKeys.size() - 1, true, false);
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
        ci = rsColInfoLst.get(i);
        if (useOriginalGBNames) {
            colOutputName = gbInfo.outputColNames.get(i);
        } else {
            colOutputName = SemanticAnalyzer.getColumnInternalName(i);
        }
        outputColNames.add(colOutputName);
        colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), null, false));
        colExprMap.put(colOutputName, gbKeys.get(i));
    }
    // 2. Walk through UDAF and add them to GB
    String lastReduceKeyColName = null;
    if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
        lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames().get(rs.getConf().getOutputKeyColumnNames().size() - 1);
    }
    int numDistinctUDFs = 0;
    List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
    // the positions in rsColInfoLst are as follows
    // --grpkey--,--distkey--,--values--
    // but distUDAF may be before/after some non-distUDAF,
    // i.e., their positions can be mixed.
    // so for all UDAF we first check to see if it is groupby key, if not is it distinct key
    // if not it should be value
    List<Integer> distinctPositions = new ArrayList<>();
    Map<Integer, ArrayList<ExprNodeDesc>> indexToParameter = new TreeMap<>();
    for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
        UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        ColumnInfo rsUDAFParamColInfo;
        ExprNodeDesc udafParam;
        ExprNodeDesc constantPropDistinctUDAFParam;
        for (int j = 0; j < udafAttr.udafParams.size(); j++) {
            int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
            rsUDAFParamColInfo = rsColInfoLst.get(argPos);
            String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
            if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
                rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
            }
            udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName, rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol());
            constantPropDistinctUDAFParam = SemanticAnalyzer.isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(), reduceValues);
            if (constantPropDistinctUDAFParam != null) {
                udafParam = constantPropDistinctUDAFParam;
            }
            aggParameters.add(udafParam);
        }
        indexToParameter.put(i, aggParameters);
        if (udafAttr.isDistinctUDAF) {
            numDistinctUDFs++;
        }
    }
    for (int index : indexToParameter.keySet()) {
        UDAFAttrs udafAttr = gbInfo.udafAttrs.get(index);
        Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
        GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, indexToParameter.get(index));
        aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, udafAttr.isDistinctUDAF, udafMode));
        if (useOriginalGBNames) {
            colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + index);
        } else {
            colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() - 1);
        }
        colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
        outputColNames.add(colOutputName);
    }
    Operator rsGB1 = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames, gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, null, false, -1, numDistinctUDFs > 0), new RowSchema(colInfoLst), rs);
    rsGB1.setColumnExprMap(colExprMap);
    return new OpAttr("", new HashSet<Integer>(), rsGB1);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashMap(java.util.HashMap) GenericUDAFInfo(org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) OpAttr(org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) TreeMap(java.util.TreeMap) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc)

Aggregations

OpAttr (org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr)13 ArrayList (java.util.ArrayList)7 HashMap (java.util.HashMap)7 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)7 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)7 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)7 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)7 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)6 Operator (org.apache.hadoop.hive.ql.exec.Operator)4 GenericUDAFInfo (org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo)4 AggregationDesc (org.apache.hadoop.hive.ql.plan.AggregationDesc)4 GroupByDesc (org.apache.hadoop.hive.ql.plan.GroupByDesc)4 Mode (org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode)4 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)3 SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)2 HashSet (java.util.HashSet)1 TreeMap (java.util.TreeMap)1 ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc)1