Example 46 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class SplitSparkWorkResolver method splitBaseWork.

// Split work into multiple branches, one for each childWork in childWorks.
// It also sets up the connection between each parent work and child work.
private void splitBaseWork(SparkWork sparkWork, BaseWork parentWork, List<BaseWork> childWorks) {
    if (getAllReduceSinks(parentWork).size() <= 1) {
        return;
    }
    // Grand-parent works - we need to set these to be the parents of the cloned works.
    List<BaseWork> grandParentWorks = sparkWork.getParents(parentWork);
    boolean isFirst = true;
    for (BaseWork childWork : childWorks) {
        BaseWork clonedParentWork = SerializationUtilities.cloneBaseWork(parentWork);
        // Give the cloned work a different name by swapping in the next sequence number, e.g. "Reducer 2" -> "Reducer 7".
        clonedParentWork.setName(clonedParentWork.getName().replaceAll("^([a-zA-Z]+)(\\s+)(\\d+)", "$1$2" + GenSparkUtils.getUtils().getNextSeqNumber()));
        setStatistics(parentWork, clonedParentWork);
        String childReducerName = childWork.getName();
        SparkEdgeProperty clonedEdgeProperty = sparkWork.getEdgeProperty(parentWork, childWork);
        // Prune the cloned work's leaf branches: keep only the branch that ends in
        // the corresponding ReduceSinkOperator for this child work.
        for (Operator<?> op : clonedParentWork.getAllLeafOperators()) {
            if (op instanceof ReduceSinkOperator) {
                if (!((ReduceSinkOperator) op).getConf().getOutputName().equals(childReducerName)) {
                    removeOpRecursive(op);
                }
            } else if (!isFirst) {
                removeOpRecursive(op);
            }
        }
        isFirst = false;
        // Then, we need to set up the graph connection. Specifically:
        // 1, we need to connect this cloned parent work with all the grand-parent works.
        // 2, we need to connect this cloned parent work with the corresponding child work.
        sparkWork.add(clonedParentWork);
        for (BaseWork gpw : grandParentWorks) {
            sparkWork.connect(gpw, clonedParentWork, sparkWork.getEdgeProperty(gpw, parentWork));
        }
        sparkWork.connect(clonedParentWork, childWork, clonedEdgeProperty);
        sparkWork.getCloneToWork().put(clonedParentWork, parentWork);
    }
    sparkWork.remove(parentWork);
}
Also used : SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
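
The removeOpRecursive helper called above is not part of this snippet. Below is a minimal sketch of what such a helper could look like, assuming Hive's standard Operator parent/child API (getParentOperators, removeChild, getChildOperators); the actual implementation in SplitSparkWorkResolver may differ.

// Sketch only: detach 'operator' from its parents, then recursively remove
// any parent left without children, so the whole dead branch disappears.
private void removeOpRecursive(Operator<?> operator) {
    // Copy the parent list first, since removeChild mutates the linkage.
    List<Operator<?>> parents = new ArrayList<>(operator.getParentOperators());
    for (Operator<?> parent : parents) {
        parent.removeChild(operator);
    }
    for (Operator<?> parent : parents) {
        if (parent.getChildOperators() == null || parent.getChildOperators().isEmpty()) {
            removeOpRecursive(parent);
        }
    }
}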

Example 47 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class HiveGBOpConvUtil method genReduceSideGB2.

private static OpAttr genReduceSideGB2(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    String colOutputName = null;
    ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
    List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
    ColumnInfo ci;
    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, gbInfo.gbKeys.size() - 1, false, false);
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
        ci = rsColInfoLst.get(i);
        colOutputName = gbInfo.outputColNames.get(i);
        outputColNames.add(colOutputName);
        colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
        colExprMap.put(colOutputName, gbKeys.get(i));
    }
    // 1.2 Add GrpSet Col
    int groupingSetsPosition = -1;
    if (inclGrpSetInReduceSide(gbInfo) && gbInfo.grpIdFunctionNeeded) {
        groupingSetsPosition = gbKeys.size();
        ExprNodeDesc grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, rsColInfoLst.get(groupingSetsPosition).getInternalName(), null, false);
        gbKeys.add(grpSetColExpr);
        colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
        outputColNames.add(colOutputName);
        colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
        colExprMap.put(colOutputName, grpSetColExpr);
    }
    // 2. Add UDAF
    UDAFAttrs udafAttr;
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    int udafStartPosInGBInfOutputColNames = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size() : gbInfo.gbKeys.size() * 2;
    int udafStartPosInInputRS = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size() : gbInfo.gbKeys.size() + 1;
    for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
        udafAttr = gbInfo.udafAttrs.get(i);
        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafStartPosInInputRS + i)));
        colOutputName = gbInfo.outputColNames.get(udafStartPosInGBInfOutputColNames + i);
        outputColNames.add(colOutputName);
        Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.FINAL, udafAttr.isDistinctUDAF);
        GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, aggParameters);
        aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, false, udafMode));
        colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
    }
    Operator rsGBOp2 = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.FINAL, outputColNames, gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, null, false, groupingSetsPosition, gbInfo.containsDistinctAggr), new RowSchema(colInfoLst), rs);
    rsGBOp2.setColumnExprMap(colExprMap);
    // TODO: Shouldn't we propagate vc? Is it the vc col from the table, or all vcs?
    return new OpAttr("", new HashSet<Integer>(), rsGBOp2);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) HashMap(java.util.HashMap) GenericUDAFInfo(org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) OpAttr(org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc)
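
The two start positions above encode a layout invariant: with grouping sets, gbInfo.outputColNames reserves twice as many leading slots for the GB keys, while the input reduce sink carries the keys once plus a single grouping-set id column. Here is a self-contained sketch of that arithmetic for a hypothetical query with two GB keys; the counts are illustrative, not taken from a real plan.

// Illustrative only: mirrors udafStartPosInGBInfOutputColNames and
// udafStartPosInInputRS from genReduceSideGB2 for assumed column counts.
int numGbKeys = 2;
boolean hasGroupingSets = true;
// Output-name list: twice the key count is reserved when grouping sets are used.
int udafStartInOutputNames = hasGroupingSets ? numGbKeys * 2 : numGbKeys;
// Input RS: keys appear once, followed by one grouping-set id column.
int udafStartInInputRS = hasGroupingSets ? numGbKeys + 1 : numGbKeys;
System.out.println(udafStartInOutputNames); // 4
System.out.println(udafStartInInputRS);     // 3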

Example 48 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class HiveGBOpConvUtil method genMapSideGBRS.

private static OpAttr genMapSideGBRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    List<String> outputKeyColumnNames = new ArrayList<String>();
    List<String> outputValueColumnNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    GroupByOperator mapGB = (GroupByOperator) inputOpAf.inputs.get(0);
    ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForRS(mapGB, 0, gbInfo.gbKeys.size() - 1, outputKeyColumnNames, false, colInfoLst, colExprMap, false, false);
    int keyLength = reduceKeys.size();
    if (inclGrpSetInMapSide(gbInfo)) {
        addGrpSetCol(false, SemanticAnalyzer.getColumnInternalName(reduceKeys.size()), true, reduceKeys, outputKeyColumnNames, colInfoLst, colExprMap);
        keyLength++;
    }
    if (mapGB.getConf().getKeys().size() > reduceKeys.size()) {
        // NOTE: all distinct cols share a single output col name.
        reduceKeys.addAll(getReduceKeysForRS(mapGB, reduceKeys.size(), mapGB.getConf().getKeys().size() - 1, outputKeyColumnNames, true, colInfoLst, colExprMap, false, false));
    } else if (!gbInfo.distColIndices.isEmpty()) {
        // This is the case where distinct cols are part of the GB keys, in which
        // case we still need to add them to the output col names.
        outputKeyColumnNames.add(SemanticAnalyzer.getColumnInternalName(reduceKeys.size()));
    }
    ArrayList<ExprNodeDesc> reduceValues = getValueKeysForRS(mapGB, mapGB.getConf().getKeys().size(), outputValueColumnNames, colInfoLst, colExprMap, false, false);
    ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, keyLength, reduceValues, gbInfo.distColIndices, outputKeyColumnNames, outputValueColumnNames, true, -1, getNumPartFieldsForMapSideRS(gbInfo), getParallelismForMapSideRS(gbInfo), AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), mapGB);
    rsOp.setColumnExprMap(colExprMap);
    return new OpAttr("", new HashSet<Integer>(), rsOp);
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) OpAttr(org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
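
Note the distinction genMapSideGBRS maintains between keyLength and reduceKeys.size(): distinct columns may be appended as extra reduce-sink keys, but only the true GB keys (plus the optional grouping-set column) count toward keyLength, the key count handed to getReduceSinkDesc for distribution. Below is a toy sketch of that bookkeeping for a hypothetical SELECT key, count(DISTINCT val) FROM t GROUP BY key, using plain collections with assumed column names rather than Hive API.

// Toy illustration: a distinct col rides along as an extra RS key
// without widening the distribution-key prefix.
List<String> reduceKeys = new ArrayList<>(List.of("key"));
int keyLength = reduceKeys.size(); // 1: only true GB keys distribute rows
// The map-side GB emitted [key, val]; append the distinct col as an RS key.
reduceKeys.add("val");
// keyLength is still 1, so rows are grouped by 'key' alone while 'val'
// remains available to evaluate count(DISTINCT val) on the reduce side.
System.out.println(reduceKeys + ", distribution keys = " + keyLength);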

Example 49 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class AbstractSMBJoinProc method canConvertJoinToBucketMapJoin.

// Can the join operator be converted to a bucket map-merge join operator?
@SuppressWarnings("unchecked")
protected boolean canConvertJoinToBucketMapJoin(JoinOperator joinOp, SortBucketJoinProcCtx context) throws SemanticException {
    // This has already been inspected and rejected
    if (context.getRejectedJoinOps().contains(joinOp)) {
        return false;
    }
    if (!this.pGraphContext.getJoinOps().contains(joinOp)) {
        return false;
    }
    Class<? extends BigTableSelectorForAutoSMJ> bigTableMatcherClass = null;
    try {
        String selector = HiveConf.getVar(pGraphContext.getConf(), HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR);
        bigTableMatcherClass = JavaUtils.loadClass(selector);
    } catch (ClassNotFoundException e) {
        throw new SemanticException(e.getMessage());
    }
    BigTableSelectorForAutoSMJ bigTableMatcher = ReflectionUtils.newInstance(bigTableMatcherClass, null);
    JoinDesc joinDesc = joinOp.getConf();
    JoinCondDesc[] joinCondns = joinDesc.getConds();
    Set<Integer> joinCandidates = MapJoinProcessor.getBigTableCandidates(joinCondns);
    if (joinCandidates.isEmpty()) {
        // A full outer join yields no big-table candidates, so this can never be
        // converted to a map-join of any type. So return false.
        return false;
    }
    int bigTablePosition = bigTableMatcher.getBigTablePosition(pGraphContext, joinOp, joinCandidates);
    if (bigTablePosition < 0) {
        // No usable big table position (e.g., the join contains aliases from a sub-query).
        return false;
    }
    context.setBigTablePosition(bigTablePosition);
    String joinAlias = bigTablePosition == 0 ? joinOp.getConf().getLeftAlias() : joinOp.getConf().getRightAliases()[bigTablePosition - 1];
    joinAlias = QB.getAppendedAliasFromId(joinOp.getConf().getId(), joinAlias);
    Map<Byte, List<ExprNodeDesc>> keyExprMap = new HashMap<Byte, List<ExprNodeDesc>>();
    List<Operator<? extends OperatorDesc>> parentOps = joinOp.getParentOperators();
    // get the join keys from parent ReduceSink operators
    for (Operator<? extends OperatorDesc> parentOp : parentOps) {
        ReduceSinkDesc rsconf = ((ReduceSinkOperator) parentOp).getConf();
        Byte tag = (byte) rsconf.getTag();
        List<ExprNodeDesc> keys = rsconf.getKeyCols();
        keyExprMap.put(tag, keys);
    }
    context.setKeyExprMap(keyExprMap);
    // Make a deep copy of the aliases so that they are not changed in the context
    String[] joinSrcs = joinOp.getConf().getBaseSrc();
    String[] srcs = new String[joinSrcs.length];
    for (int srcPos = 0; srcPos < joinSrcs.length; srcPos++) {
        joinSrcs[srcPos] = QB.getAppendedAliasFromId(joinOp.getConf().getId(), joinSrcs[srcPos]);
        srcs[srcPos] = new String(joinSrcs[srcPos]);
    }
    // Check whether the join can actually be converted to a bucket map join,
    // given the big table chosen by the big table matcher.
    return checkConvertBucketMapJoin(context, joinOp.getConf().getAliasToOpInfo(), keyExprMap, joinAlias, Arrays.asList(srcs));
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) HashMap(java.util.HashMap) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ArrayList(java.util.ArrayList) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JoinCondDesc(org.apache.hadoop.hive.ql.plan.JoinCondDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
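
The big-table selector loaded reflectively above is pluggable through configuration. A minimal sketch of overriding it follows, using the same ConfVar the method reads; the selector class named below is believed to be Hive's default average-partition-size policy, so confirm the name against your Hive version.

// Sketch: point the optimizer at an explicit big-table selector.
HiveConf conf = new HiveConf();
conf.setVar(HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR,
    "org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeBasedBigTableSelectorForAutoSMJ");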

Example 50 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class ConvertJoinMapJoin method checkConvertJoinBucketMapJoin.

/*
   * If the parent reduce sink of the big table side has the same emit key cols as its parent, we
   * can create a bucket map join eliminating the reduce sink.
   */
private boolean checkConvertJoinBucketMapJoin(JoinOperator joinOp, OptimizeTezProcContext context, int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
    // The parents of the join must be reduce sinks; bail out otherwise (e.g., a
    // mux operator masks the emit keys of its constituent reduce sinks).
    if (!(joinOp.getParentOperators().get(0) instanceof ReduceSinkOperator)) {
        LOG.info("Operator is " + joinOp.getParentOperators().get(0).getName() + ". Cannot convert to bucket map join");
        return false;
    }
    ReduceSinkOperator rs = (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
    List<List<String>> parentColNames = rs.getOpTraits().getBucketColNames();
    Operator<? extends OperatorDesc> parentOfParent = rs.getParentOperators().get(0);
    List<List<String>> grandParentColNames = parentOfParent.getOpTraits().getBucketColNames();
    int numBuckets = parentOfParent.getOpTraits().getNumBuckets();
    // The grandparent's bucket columns must line up with the reduce sink's emit
    // keys; bail out unless all keys matched.
    if (!checkColEquality(grandParentColNames, parentColNames, rs.getColumnExprMap(), tezBucketJoinProcCtx, true)) {
        LOG.info("No info available to check for bucket map join. Cannot convert");
        return false;
    }
    /*
     * this is the case when the big table is a sub-query and is probably already bucketed by the
     * join column in say a group by operation
     */
    if (numBuckets < 0) {
        numBuckets = rs.getConf().getNumReducers();
    }
    tezBucketJoinProcCtx.setNumBuckets(numBuckets);
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) List(java.util.List)
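
One detail worth calling out: the reduce sink emits its keys under internal names, which is why rs.getColumnExprMap() is handed to checkColEquality, so those names can be translated back before they are compared with the grandparent's bucket columns. Below is a self-contained toy illustration of that translation step; the plain collections and the KEY._col0 naming are assumptions for illustration, not Hive API.

// Toy illustration: resolve an RS-internal key name to its source column
// before comparing it with the grandparent's bucket columns.
Map<String, String> columnExprMap = Map.of("KEY._col0", "userid");
List<String> grandParentBucketCols = List.of("userid");
String resolvedKey = columnExprMap.get("KEY._col0");
boolean keysMatch = grandParentBucketCols.contains(resolvedKey);
System.out.println(keysMatch); // true -> bucket map join remains viable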

Aggregations

ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 86
Operator (org.apache.hadoop.hive.ql.exec.Operator): 50
ArrayList (java.util.ArrayList): 48
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 45
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 35
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 31
HashMap (java.util.HashMap): 29
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 28
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 27
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 26
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 26
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 25
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 24
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 23
List (java.util.List): 19
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 19
LinkedHashMap (java.util.LinkedHashMap): 18
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 18
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 18
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 15