
Example 41 with ReduceSinkDesc

use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.

the class SemanticAnalyzer method genJoinOperatorChildren.

private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, Operator[] right, Set<Integer> omitOpts, ExprNodeDesc[][] joinKeys) throws SemanticException {
    RowResolver outputRR = new RowResolver();
    List<String> outputColumnNames = new ArrayList<String>();
    // all children are base classes
    Operator<?>[] rightOps = new Operator[right.length];
    Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
    Map<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    Map<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
    Map<Byte, List<ExprNodeDesc>> filterMap = new HashMap<Byte, List<ExprNodeDesc>>();
    // Only used for semijoin with residual predicates
    List<ColumnInfo> topSelectInputColumns = new ArrayList<>();
    for (int pos = 0; pos < right.length; ++pos) {
        Operator<?> input = right[pos] == null ? left : right[pos];
        if (input == null) {
            input = left;
        }
        ReduceSinkOperator rs = (ReduceSinkOperator) input;
        if (rs.getNumParent() != 1) {
            throw new SemanticException("RS should have single parent");
        }
        Operator<?> parent = rs.getParentOperators().get(0);
        ReduceSinkDesc rsDesc = (ReduceSinkDesc) (input.getConf());
        int[] index = rs.getValueIndex();
        List<ExprNodeDesc> valueDesc = new ArrayList<ExprNodeDesc>();
        List<ExprNodeDesc> filterDesc = new ArrayList<ExprNodeDesc>();
        Byte tag = (byte) rsDesc.getTag();
        // we will add a Select on top of the join
        if (omitOpts != null && omitOpts.contains(pos) && join.getPostJoinFilters().size() == 0) {
            exprMap.put(tag, valueDesc);
            filterMap.put(tag, filterDesc);
            rightOps[pos] = input;
            continue;
        }
        List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
        List<String> valColNames = rsDesc.getOutputValueColumnNames();
        // prepare output descriptors for the input opt
        RowResolver inputRR = opParseCtx.get(input).getRowResolver();
        RowResolver parentRR = opParseCtx.get(parent).getRowResolver();
        posToAliasMap.put(pos, new HashSet<String>(inputRR.getTableNames()));
        List<ColumnInfo> columns = parentRR.getColumnInfos();
        for (int i = 0; i < index.length; i++) {
            ColumnInfo prev = columns.get(i);
            String[] nm = parentRR.reverseLookup(prev.getInternalName());
            String[] nm2 = parentRR.getAlternateMappings(prev.getInternalName());
            if (outputRR.get(nm[0], nm[1]) != null) {
                continue;
            }
            ColumnInfo info = new ColumnInfo(prev);
            String field;
            if (index[i] >= 0) {
                field = Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]);
            } else {
                field = Utilities.ReduceField.VALUE + "." + valColNames.get(-index[i] - 1);
            }
            String internalName = getColumnInternalName(outputColumnNames.size());
            ExprNodeColumnDesc desc = new ExprNodeColumnDesc(info.getType(), field, info.getTabAlias(), info.getIsVirtualCol());
            info.setInternalName(internalName);
            colExprMap.put(internalName, desc);
            outputRR.put(nm[0], nm[1], info);
            if (nm2 != null) {
                outputRR.addMappingOnly(nm2[0], nm2[1], info);
            }
            valueDesc.add(desc);
            outputColumnNames.add(internalName);
            reversedExprs.put(internalName, tag);
            // Populate semijoin select if needed
            if (omitOpts == null || !omitOpts.contains(pos)) {
                topSelectInputColumns.add(info);
            }
        }
        for (ASTNode cond : join.getFilters().get(tag)) {
            filterDesc.add(genExprNodeDesc(cond, inputRR));
        }
        exprMap.put(tag, valueDesc);
        filterMap.put(tag, filterDesc);
        rightOps[pos] = input;
    }
    JoinCondDesc[] joinCondns = new JoinCondDesc[join.getJoinCond().length];
    for (int i = 0; i < join.getJoinCond().length; i++) {
        JoinCond condn = join.getJoinCond()[i];
        joinCondns[i] = new JoinCondDesc(condn);
    }
    JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, join.getNoOuterJoin(), joinCondns, filterMap, joinKeys, null);
    desc.setReversedExprs(reversedExprs);
    desc.setFilterMap(join.getFilterMap());
    // Add filters that apply to more than one input
    if (join.getPostJoinFilters().size() != 0 && (!join.getNoOuterJoin() || !join.getNoSemiJoin() || HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) {
        LOG.debug("Generate JOIN with post-filtering conditions");
        List<ExprNodeDesc> residualFilterExprs = new ArrayList<ExprNodeDesc>();
        for (ASTNode cond : join.getPostJoinFilters()) {
            residualFilterExprs.add(genExprNodeDesc(cond, outputRR, false, isCBOExecuted()));
        }
        desc.setResidualFilterExprs(residualFilterExprs);
        // Clean post-conditions
        join.getPostJoinFilters().clear();
    }
    JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(getOpContext(), desc, new RowSchema(outputRR.getColumnInfos()), rightOps);
    joinOp.setColumnExprMap(colExprMap);
    joinOp.setPosToAliasMap(posToAliasMap);
    if (join.getNullSafes() != null) {
        boolean[] nullsafes = new boolean[join.getNullSafes().size()];
        for (int i = 0; i < nullsafes.length; i++) {
            nullsafes[i] = join.getNullSafes().get(i);
        }
        desc.setNullSafes(nullsafes);
    }
    Operator<?> topOp = putOpInsertMap(joinOp, outputRR);
    if (omitOpts != null && !omitOpts.isEmpty() && desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) {
        // Adding a select operator to top of semijoin to ensure projection of only correct columns
        final List<ExprNodeDesc> topSelectExprs = new ArrayList<>();
        final List<String> topSelectOutputColNames = new ArrayList<>();
        final RowResolver topSelectRR = new RowResolver();
        final Map<String, ExprNodeDesc> topSelectColExprMap = new HashMap<String, ExprNodeDesc>();
        for (ColumnInfo colInfo : topSelectInputColumns) {
            ExprNodeColumnDesc columnExpr = new ExprNodeColumnDesc(colInfo);
            topSelectExprs.add(columnExpr);
            topSelectOutputColNames.add(colInfo.getInternalName());
            topSelectColExprMap.put(colInfo.getInternalName(), columnExpr);
            String[] nm = outputRR.reverseLookup(columnExpr.getColumn());
            String[] nm2 = outputRR.getAlternateMappings(columnExpr.getColumn());
            topSelectRR.put(nm[0], nm[1], colInfo);
            if (nm2 != null) {
                topSelectRR.addMappingOnly(nm2[0], nm2[1], colInfo);
            }
        }
        final SelectDesc topSelect = new SelectDesc(topSelectExprs, topSelectOutputColNames);
        topOp = putOpInsertMap(OperatorFactory.getAndMakeChild(topSelect, new RowSchema(topSelectRR.getColumnInfos()), topOp), topSelectRR);
        topOp.setColumnExprMap(topSelectColExprMap);
    }
    return topOp;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SortedSet(java.util.SortedSet) HashSet(java.util.HashSet) Set(java.util.Set) TreeSet(java.util.TreeSet) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) ValidTxnWriteIdList(org.apache.hadoop.hive.common.ValidTxnWriteIdList) ValidTxnList(org.apache.hadoop.hive.common.ValidTxnList) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) JoinCondDesc(org.apache.hadoop.hive.ql.plan.JoinCondDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) LateralViewJoinDesc(org.apache.hadoop.hive.ql.plan.LateralViewJoinDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc)
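
The loop above decodes ReduceSinkOperator.getValueIndex(): a non-negative entry means the column travels in the reduce KEY at that position, while a negative entry encodes a VALUE position as -index - 1. A minimal sketch of that decoding, factored into a hypothetical helper (the method name is illustrative and not part of SemanticAnalyzer):

// Assumed imports: java.util.List; org.apache.hadoop.hive.ql.exec.Utilities
private static String resolveReduceField(int[] index, int i, List<String> keyColNames, List<String> valColNames) {
    if (index[i] >= 0) {
        // column i is forwarded in the reduce key
        return Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]);
    }
    // a negative entry encodes a value-column position as -index[i] - 1
    return Utilities.ReduceField.VALUE + "." + valColNames.get(-index[i] - 1);
}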

Example 42 with ReduceSinkDesc

use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.

the class SemanticAnalyzer method genGroupByPlanGroupByOperator1.

/**
 * Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
 * The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
 *
 * @param parseInfo
 * @param dest
 * @param reduceSinkOperatorInfo
 * @param mode
 *          The mode of the aggregation (MERGEPARTIAL, PARTIAL2)
 * @param genericUDAFEvaluators
 *          The mapping from Aggregation StringTree to the
 *          genericUDAFEvaluator.
 * @param groupingSets
 *          list of grouping sets
 * @param groupingSetsPresent
 *          whether grouping sets are present in this query
 * @param groupingSetsNeedAdditionalMRJob
 *          whether an additional MR job is needed to process the grouping sets
 * @return the new GroupByOperator
 */
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Long> groupingSets, boolean groupingSetsPresent, boolean groupingSetsNeedAdditionalMRJob) throws SemanticException {
    List<String> outputColumnNames = new ArrayList<String>();
    RowResolver groupByInputRowResolver = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
    RowResolver groupByOutputRowResolver = new RowResolver();
    groupByOutputRowResolver.setIsExprResolver(true);
    List<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
    List<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    for (int i = 0; i < grpByExprs.size(); ++i) {
        ASTNode grpbyExpr = grpByExprs.get(i);
        ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
        if (exprInfo == null) {
            throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), grpbyExpr));
        }
        groupByKeys.add(new ExprNodeColumnDesc(exprInfo));
        String field = getColumnInternalName(i);
        outputColumnNames.add(field);
        ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
        groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
        addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo, groupByOutputRowResolver);
        colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
    }
    // This is only needed if a new grouping set key is being created
    int groupingSetsPosition = -1;
    // For grouping sets, add a dummy grouping key
    if (groupingSetsPresent) {
        groupingSetsPosition = groupByKeys.size();
        // This function is called for GroupBy2 to add grouping id as part of the groupby keys
        if (!groupingSetsNeedAdditionalMRJob) {
            addGroupingSetKey(groupByKeys, groupByInputRowResolver, groupByOutputRowResolver, outputColumnNames, colExprMap);
        } else {
            // The grouping set has not yet been processed. Create a new grouping key
            // Consider the query: select a,b, count(1) from T group by a,b with cube;
            // where it is being executed in 2 map-reduce jobs
            // The plan for 1st MR is TableScan -> GroupBy1 -> ReduceSink -> GroupBy2 -> FileSink
            // GroupBy1/ReduceSink worked as if grouping sets were not present
            // This function is called for GroupBy2 to create new rows for grouping sets
            // For each input row (a,b), 4 rows are created for the example above:
            // (a,b), (a,null), (null, b), (null, null)
            createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
        }
    }
    Map<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
    // get the last colName for the reduce KEY
    // it represents the column name corresponding to distinct aggr, if any
    String lastKeyColName = null;
    List<ExprNodeDesc> reduceValues = null;
    if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) {
        List<String> inputKeyCols = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames();
        if (inputKeyCols.size() > 0) {
            lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
        }
        reduceValues = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getValueCols();
    }
    int numDistinctUDFs = 0;
    boolean containsDistinctAggr = false;
    for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
        ASTNode value = entry.getValue();
        String aggName = unescapeIdentifier(value.getChild(0).getText());
        List<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
        boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI);
        containsDistinctAggr = containsDistinctAggr || isDistinct;
        // For a distinct aggregation, partial aggregation has not been done on the map
        // side, so the parameters (e.g. d+e in count(distinct d+e)) are looked up in the
        // reduce-side key rather than in the value columns.
        if (isDistinct) {
            // 0 is the function name
            for (int i = 1; i < value.getChildCount(); i++) {
                ASTNode paraExpr = (ASTNode) value.getChild(i);
                ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
                if (paraExprInfo == null) {
                    throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), paraExpr));
                }
                String paraExpression = paraExprInfo.getInternalName();
                assert (paraExpression != null);
                if (lastKeyColName != null) {
                    // if aggr is distinct, the parameter name is constructed as
                    // KEY.lastKeyColName:<tag>._colx
                    paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1);
                }
                ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
                ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(paraExprInfo.getInternalName(), reduceValues);
                if (reduceValue != null) {
                    // this parameter is a constant
                    expr = reduceValue;
                }
                aggParameters.add(expr);
            }
        } else {
            ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(value);
            if (paraExprInfo == null) {
                throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), value));
            }
            String paraExpression = paraExprInfo.getInternalName();
            assert (paraExpression != null);
            aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
        }
        if (isDistinct) {
            numDistinctUDFs++;
        }
        Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
        GenericUDAFEvaluator genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
        assert (genericUDAFEvaluator != null);
        GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
        aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && isDistinct), amode));
        String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
        outputColumnNames.add(field);
        groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
    }
    float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
    float minReductionHashAggr = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
    float minReductionHashAggrLowerBound = HiveConf.getFloatVar(conf, ConfVars.HIVEMAPAGGRHASHMINREDUCTIONLOWERBOUND);
    // Nothing special needs to be done for grouping sets if
    // this is the final group by operator, and multiple rows corresponding to the
    // grouping sets have been generated upstream.
    // However, if an additional MR job has been created to handle grouping sets,
    // additional rows corresponding to grouping sets need to be created here.
    Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, groupByMemoryUsage, memoryThreshold, minReductionHashAggr, minReductionHashAggrLowerBound, groupingSets, groupingSetsPresent && groupingSetsNeedAdditionalMRJob, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), reduceSinkOperatorInfo), groupByOutputRowResolver);
    op.setColumnExprMap(colExprMap);
    return op;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) GenericUDAFEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) TreeMap(java.util.TreeMap) ImmutableMap(com.google.common.collect.ImmutableMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap)
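
For a distinct aggregation the code above rewrites the parameter's internal name so that it points into the reduce key, following the KEY.<lastKeyColName>:<tag>._col<n> pattern. A small hedged sketch of that rewrite as a hypothetical helper (not part of SemanticAnalyzer; getColumnInternalName is the static helper used in Example 43):

// numDistinctUDFs serves as the tag; childIndex - 1 skips the function-name child of the AST node.
// Assumed import: org.apache.hadoop.hive.ql.exec.Utilities
private static String distinctParamName(String lastKeyColName, int numDistinctUDFs, int childIndex) {
    return Utilities.ReduceField.KEY.name() + "." + lastKeyColName
        + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(childIndex - 1);
}

For count(distinct d+e) with a single distinct aggregation this yields a name of the form KEY.<lastKeyColName>:0._col0.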

Example 43 with ReduceSinkDesc

use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.

the class SemiJoinReductionMerge method createReduceSink.

/**
 * Creates a reduce sink operator that emits all columns of the parent as values.
 */
private static ReduceSinkOperator createReduceSink(Operator<?> parentOp, NullOrdering nullOrder) throws SemanticException {
    List<ExprNodeDesc> valueCols = new ArrayList<>();
    RowSchema parentSchema = parentOp.getSchema();
    List<String> outColNames = new ArrayList<>();
    for (int i = 0; i < parentSchema.getSignature().size(); i++) {
        ColumnInfo colInfo = parentSchema.getSignature().get(i);
        ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(colInfo.getType(), colInfo.getInternalName(), "", false);
        valueCols.add(colExpr);
        outColNames.add(SemanticAnalyzer.getColumnInternalName(i));
    }
    ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(Collections.emptyList(), valueCols, outColNames, false, -1, 0, 1, AcidUtils.Operation.NOT_ACID, nullOrder);
    rsDesc.setColumnExprMap(Collections.emptyMap());
    return (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(parentSchema), parentOp);
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
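
A hedged usage sketch for the helper above; the parent operator and the NullOrdering value are assumptions, and the real call site in SemiJoinReductionMerge derives them from the plan being merged:

// Assuming 'sourceOp' is the operator whose full row should reach the reducer;
// NULLS_LAST is an arbitrary illustrative choice.
ReduceSinkOperator rs = createReduceSink(sourceOp, NullOrdering.NULLS_LAST);
// The resulting ReduceSink has no key columns and forwards every column of
// sourceOp's schema as a value, renamed _col0, _col1, ... via getColumnInternalName.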

Example 44 with ReduceSinkDesc

use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.

the class SetReducerParallelism method process.

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeTezProcContext context = (OptimizeTezProcContext) procContext;
    ReduceSinkOperator sink = (ReduceSinkOperator) nd;
    ReduceSinkDesc desc = sink.getConf();
    long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
    int maxReducers = context.conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    int constantReducers = context.conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
    if (context.visitedReduceSinks.contains(sink)) {
        // skip walking the children
        LOG.debug("Already processed reduce sink: " + sink.getName());
        return true;
    }
    context.visitedReduceSinks.add(sink);
    if (desc.getNumReducers() <= 0) {
        if (constantReducers > 0) {
            LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
            desc.setNumReducers(constantReducers);
        } else {
            long numberOfBytes = 0;
            // we need to add up all the estimates from the siblings of this reduce sink
            for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                if (sibling.getStatistics() != null) {
                    numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
                } else {
                    LOG.warn("No stats available from: " + sibling);
                }
            }
            int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
            LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers);
            desc.setNumReducers(numReducers);
            final Collection<ExprNodeDescEqualityWrapper> keyCols = ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
            final Collection<ExprNodeDescEqualityWrapper> partCols = ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
            if (keyCols != null && keyCols.equals(partCols)) {
                desc.setReducerTraits(EnumSet.of(UNIFORM, AUTOPARALLEL));
            } else {
                desc.setReducerTraits(EnumSet.of(AUTOPARALLEL));
            }
        }
    } else {
        LOG.info("Number of reducers determined to be: " + desc.getNumReducers());
        // usually controlled by bucketing
        desc.setReducerTraits(EnumSet.of(FIXED));
    }
    return false;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OptimizeTezProcContext(org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext) ExprNodeDescEqualityWrapper(org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper)
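
When no reducer count is forced, the estimate from Utilities.estimateReducers amounts to dividing the summed sibling data size by hive.exec.reducers.bytes.per.reducer and clamping the result to [1, hive.exec.reducers.max]. A simplified, hedged sketch of that calculation (the real method can additionally round to a power of two when its last argument is true):

// Simplified reducer estimate; mirrors the shape of Utilities.estimateReducers
// but omits the optional power-of-two rounding.
static int estimateReducersSimplified(long totalBytes, long bytesPerReducer, int maxReducers) {
    int reducers = (int) Math.ceil(totalBytes / (double) bytesPerReducer);
    reducers = Math.max(1, reducers);       // never fewer than one reducer
    return Math.min(reducers, maxReducers); // respect hive.exec.reducers.max
}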

Example 45 with ReduceSinkDesc

use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.

the class TopNKeyPushdownProcessor method pushdownThroughReduceSink.

/**
 * Push through ReduceSink. If the TopNKey expression is the same as the ReduceSink expression
 * and the order is the same, we can push it down and remove it from above the ReduceSink. If the
 * TopNKey expression shares a common prefix with the ReduceSink expression, including the same
 * order, the TopNKey can be pushed through the ReduceSink using that prefix and kept above it.
 *
 * @param topNKey TopNKey operator to push
 * @throws SemanticException when removeChildAndAdoptItsChildren was not successful
 */
private void pushdownThroughReduceSink(TopNKeyOperator topNKey) throws SemanticException {
    ReduceSinkOperator reduceSink = (ReduceSinkOperator) topNKey.getParentOperators().get(0);
    final ReduceSinkDesc reduceSinkDesc = reduceSink.getConf();
    final TopNKeyDesc topNKeyDesc = topNKey.getConf();
    CommonKeyPrefix commonKeyPrefix = CommonKeyPrefix.map(topNKeyDesc, reduceSinkDesc);
    if (commonKeyPrefix.isEmpty() || commonKeyPrefix.size() == topNKeyDesc.getPartitionKeyColumns().size()) {
        return;
    }
    LOG.debug("Pushing a copy of {} through {}", topNKey.getName(), reduceSink.getName());
    final TopNKeyDesc newTopNKeyDesc = topNKeyDesc.combine(commonKeyPrefix);
    pushdown((TopNKeyOperator) copyDown(reduceSink, newTopNKeyDesc));
    if (topNKeyDesc.getKeyColumns().size() == commonKeyPrefix.size()) {
        LOG.debug("Removing {} above {}", topNKey.getName(), reduceSink.getName());
        reduceSink.removeChildAndAdoptItsChildren(topNKey);
    }
}
Also used : TopNKeyDesc(org.apache.hadoop.hive.ql.plan.TopNKeyDesc) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
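
The decision hinges on the length of the ordered common prefix computed by CommonKeyPrefix.map. A worked illustration with assumed column names (the columns are hypothetical, not taken from the Hive sources):

// TopNKey orders by (a, b, c), ReduceSink keys are (a, b), same ordering:
//   common prefix = (a, b) -> a TopNKey(a, b) copy is pushed below the ReduceSink
//   and the original TopNKey(a, b, c) is kept above it.
// TopNKey orders by (a, b), ReduceSink keys are (a, b, c), same ordering:
//   the prefix covers every TopNKey key column, so after pushing the copy the
//   original operator is removed via removeChildAndAdoptItsChildren.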

Aggregations

ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 50
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 31
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 31
ArrayList (java.util.ArrayList): 29
Operator (org.apache.hadoop.hive.ql.exec.Operator): 21
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 20
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 19
HashMap (java.util.HashMap): 18
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 17
LinkedHashMap (java.util.LinkedHashMap): 16
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 16
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 16
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 14
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 14
SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc): 13
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 12
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 11
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 11
LimitOperator (org.apache.hadoop.hive.ql.exec.LimitOperator): 11
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 11