
Example 76 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class SemanticAnalyzer method genGroupByPlan1ReduceMultiGBY.

@SuppressWarnings({ "nls" })
private Operator genGroupByPlan1ReduceMultiGBY(List<String> dests, QB qb, Operator input, Map<String, Operator> aliasToOpInfo) throws SemanticException {
    QBParseInfo parseInfo = qb.getParseInfo();
    ExprNodeDesc previous = null;
    Operator selectInput = input;
    // In order to facilitate partition pruning, OR the where clauses together and put them at the
    // top of the operator tree; this can also reduce the amount of data going to the reducer
    List<ExprNodeDesc.ExprNodeDescEqualityWrapper> whereExpressions = new ArrayList<ExprNodeDesc.ExprNodeDescEqualityWrapper>();
    for (String dest : dests) {
        ObjectPair<List<ASTNode>, List<Long>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
        List<Long> groupingSets = grpByExprsGroupingSets.getSecond();
        if (!groupingSets.isEmpty()) {
            throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_AGGR_NOMAPAGGR_MULTIGBY.getMsg());
        }
        ASTNode whereExpr = parseInfo.getWhrForClause(dest);
        if (whereExpr != null) {
            OpParseContext inputCtx = opParseCtx.get(input);
            RowResolver inputRR = inputCtx.getRowResolver();
            ExprNodeDesc current = genExprNodeDesc((ASTNode) whereExpr.getChild(0), inputRR);
            // Check the list of where expressions already added so they aren't duplicated
            ExprNodeDesc.ExprNodeDescEqualityWrapper currentWrapped = new ExprNodeDesc.ExprNodeDescEqualityWrapper(current);
            if (!whereExpressions.contains(currentWrapped)) {
                whereExpressions.add(currentWrapped);
            } else {
                continue;
            }
            if (previous == null) {
                // If this is the first expression
                previous = current;
                continue;
            }
            GenericUDFOPOr or = new GenericUDFOPOr();
            List<ExprNodeDesc> expressions = new ArrayList<ExprNodeDesc>(2);
            expressions.add(current);
            expressions.add(previous);
            ExprNodeDesc orExpr = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, or, expressions);
            previous = orExpr;
        } else {
            // If a destination does not have a where clause, there can be no common filter
            previous = null;
            break;
        }
    }
    if (previous != null) {
        OpParseContext inputCtx = opParseCtx.get(input);
        RowResolver inputRR = inputCtx.getRowResolver();
        FilterDesc orFilterDesc = new FilterDesc(previous, false);
        orFilterDesc.setGenerated(true);
        selectInput = putOpInsertMap(OperatorFactory.getAndMakeChild(orFilterDesc, new RowSchema(inputRR.getColumnInfos()), input), inputRR);
    }
    // insert a select operator here used by the ColumnPruner to reduce
    // the data to shuffle
    Operator select = genSelectAllDesc(selectInput);
    // Generate ReduceSinkOperator
    ReduceSinkOperator reduceSinkOperatorInfo = genCommonGroupByPlanReduceSinkOperator(qb, dests, select);
    // It is assumed throughout the code that a reducer has a single child; add a
    // ForwardOperator so that we can add multiple filter/group-by operators as children
    RowResolver reduceSinkOperatorInfoRR = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
    Operator forwardOp = putOpInsertMap(OperatorFactory.getAndMakeChild(new ForwardDesc(), new RowSchema(reduceSinkOperatorInfoRR.getColumnInfos()), reduceSinkOperatorInfo), reduceSinkOperatorInfoRR);
    Operator curr = forwardOp;
    for (String dest : dests) {
        curr = forwardOp;
        if (parseInfo.getWhrForClause(dest) != null) {
            ASTNode whereExpr = qb.getParseInfo().getWhrForClause(dest);
            curr = genFilterPlan((ASTNode) whereExpr.getChild(0), qb, forwardOp, aliasToOpInfo, false, true);
        }
        // Generate GroupbyOperator
        Operator groupByOperatorInfo = genGroupByPlanGroupByOperator(parseInfo, dest, curr, reduceSinkOperatorInfo, GroupByDesc.Mode.COMPLETE, null);
        // TODO: should we pass curr instead of null?
        curr = genPostGroupByBodyPlan(groupByOperatorInfo, dest, qb, aliasToOpInfo, null);
    }
    return curr;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ArrayList(java.util.ArrayList) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) ForwardDesc(org.apache.hadoop.hive.ql.plan.ForwardDesc) LateralViewForwardDesc(org.apache.hadoop.hive.ql.plan.LateralViewForwardDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) GenericUDFOPOr(org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
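
What the loop above is doing, conceptually, is folding the per-destination WHERE predicates into one OR-ed filter that can sit above the shared ReduceSink, so partition pruning still applies and less data is shuffled; if any destination has no WHERE clause, no common filter is possible. A minimal, self-contained sketch of that fold, using plain strings in place of Hive's ExprNodeDesc (the class and method names below are illustrative only, not Hive APIs):

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class OrFoldSketch {
    // Fold per-destination predicates into one common OR filter.
    // Returns null if any destination lacks a predicate (no common filter possible).
    static String foldCommonFilter(List<String> perDestPredicates) {
        Set<String> seen = new LinkedHashSet<>();   // dedupe, like ExprNodeDescEqualityWrapper
        for (String p : perDestPredicates) {
            if (p == null) {
                return null;                        // one destination has no WHERE -> give up
            }
            seen.add(p);
        }
        String previous = null;
        for (String current : seen) {
            previous = (previous == null) ? current : "(" + current + ") OR (" + previous + ")";
        }
        return previous;
    }

    public static void main(String[] args) {
        List<String> preds = List.of("ds = '2024-01-01'", "ds = '2024-01-02'", "ds = '2024-01-01'");
        // Prints: (ds = '2024-01-02') OR (ds = '2024-01-01')
        System.out.println(foldCommonFilter(preds));
    }
}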

Example 77 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class SemanticAnalyzer method genJoinOperatorChildren.

private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, Operator[] right, HashSet<Integer> omitOpts, ExprNodeDesc[][] joinKeys) throws SemanticException {
    RowResolver outputRR = new RowResolver();
    ArrayList<String> outputColumnNames = new ArrayList<String>();
    // all children are base classes
    Operator<?>[] rightOps = new Operator[right.length];
    int outputPos = 0;
    Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
    HashMap<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    HashMap<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
    HashMap<Byte, List<ExprNodeDesc>> filterMap = new HashMap<Byte, List<ExprNodeDesc>>();
    // Only used for semijoin with residual predicates
    List<ColumnInfo> topSelectInputColumns = new ArrayList<>();
    for (int pos = 0; pos < right.length; ++pos) {
        Operator<?> input = right[pos] == null ? left : right[pos];
        if (input == null) {
            input = left;
        }
        ReduceSinkOperator rs = (ReduceSinkOperator) input;
        if (rs.getNumParent() != 1) {
            throw new SemanticException("RS should have single parent");
        }
        Operator<?> parent = rs.getParentOperators().get(0);
        ReduceSinkDesc rsDesc = (ReduceSinkDesc) (input.getConf());
        int[] index = rs.getValueIndex();
        ArrayList<ExprNodeDesc> valueDesc = new ArrayList<ExprNodeDesc>();
        ArrayList<ExprNodeDesc> filterDesc = new ArrayList<ExprNodeDesc>();
        Byte tag = (byte) rsDesc.getTag();
        // we will add a Select on top of the join
        if (omitOpts != null && omitOpts.contains(pos) && join.getPostJoinFilters().size() == 0) {
            exprMap.put(tag, valueDesc);
            filterMap.put(tag, filterDesc);
            rightOps[pos] = input;
            continue;
        }
        List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
        List<String> valColNames = rsDesc.getOutputValueColumnNames();
        // prepare output descriptors for the input operator
        RowResolver inputRR = opParseCtx.get(input).getRowResolver();
        RowResolver parentRR = opParseCtx.get(parent).getRowResolver();
        posToAliasMap.put(pos, new HashSet<String>(inputRR.getTableNames()));
        List<ColumnInfo> columns = parentRR.getColumnInfos();
        for (int i = 0; i < index.length; i++) {
            ColumnInfo prev = columns.get(i);
            String[] nm = parentRR.reverseLookup(prev.getInternalName());
            String[] nm2 = parentRR.getAlternateMappings(prev.getInternalName());
            if (outputRR.get(nm[0], nm[1]) != null) {
                continue;
            }
            ColumnInfo info = new ColumnInfo(prev);
            String field;
            if (index[i] >= 0) {
                field = Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]);
            } else {
                field = Utilities.ReduceField.VALUE + "." + valColNames.get(-index[i] - 1);
            }
            String internalName = getColumnInternalName(outputColumnNames.size());
            ExprNodeColumnDesc desc = new ExprNodeColumnDesc(info.getType(), field, info.getTabAlias(), info.getIsVirtualCol());
            info.setInternalName(internalName);
            colExprMap.put(internalName, desc);
            outputRR.put(nm[0], nm[1], info);
            if (nm2 != null) {
                outputRR.addMappingOnly(nm2[0], nm2[1], info);
            }
            valueDesc.add(desc);
            outputColumnNames.add(internalName);
            reversedExprs.put(internalName, tag);
            // Populate semijoin select if needed
            if (omitOpts == null || !omitOpts.contains(pos)) {
                topSelectInputColumns.add(info);
            }
        }
        for (ASTNode cond : join.getFilters().get(tag)) {
            filterDesc.add(genExprNodeDesc(cond, inputRR));
        }
        exprMap.put(tag, valueDesc);
        filterMap.put(tag, filterDesc);
        rightOps[pos] = input;
    }
    JoinCondDesc[] joinCondns = new JoinCondDesc[join.getJoinCond().length];
    for (int i = 0; i < join.getJoinCond().length; i++) {
        JoinCond condn = join.getJoinCond()[i];
        joinCondns[i] = new JoinCondDesc(condn);
    }
    JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, join.getNoOuterJoin(), joinCondns, filterMap, joinKeys, null);
    desc.setReversedExprs(reversedExprs);
    desc.setFilterMap(join.getFilterMap());
    // Add filters that apply to more than one input
    if (join.getPostJoinFilters().size() != 0 && (!join.getNoOuterJoin() || !join.getNoSemiJoin() || HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) {
        LOG.debug("Generate JOIN with post-filtering conditions");
        List<ExprNodeDesc> residualFilterExprs = new ArrayList<ExprNodeDesc>();
        for (ASTNode cond : join.getPostJoinFilters()) {
            residualFilterExprs.add(genExprNodeDesc(cond, outputRR, false, isCBOExecuted()));
        }
        desc.setResidualFilterExprs(residualFilterExprs);
        // Clean post-conditions
        join.getPostJoinFilters().clear();
    }
    JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(getOpContext(), desc, new RowSchema(outputRR.getColumnInfos()), rightOps);
    joinOp.setColumnExprMap(colExprMap);
    joinOp.setPosToAliasMap(posToAliasMap);
    if (join.getNullSafes() != null) {
        boolean[] nullsafes = new boolean[join.getNullSafes().size()];
        for (int i = 0; i < nullsafes.length; i++) {
            nullsafes[i] = join.getNullSafes().get(i);
        }
        desc.setNullSafes(nullsafes);
    }
    Operator<?> topOp = putOpInsertMap(joinOp, outputRR);
    if (omitOpts != null && !omitOpts.isEmpty() && desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) {
        // Add a select operator on top of the semijoin to ensure that only the correct columns are projected
        final List<ExprNodeDesc> topSelectExprs = new ArrayList<>();
        final List<String> topSelectOutputColNames = new ArrayList<>();
        final RowResolver topSelectRR = new RowResolver();
        final Map<String, ExprNodeDesc> topSelectColExprMap = new HashMap<String, ExprNodeDesc>();
        for (ColumnInfo colInfo : topSelectInputColumns) {
            ExprNodeColumnDesc columnExpr = new ExprNodeColumnDesc(colInfo);
            topSelectExprs.add(columnExpr);
            topSelectOutputColNames.add(colInfo.getInternalName());
            topSelectColExprMap.put(colInfo.getInternalName(), columnExpr);
            String[] nm = outputRR.reverseLookup(columnExpr.getColumn());
            String[] nm2 = outputRR.getAlternateMappings(columnExpr.getColumn());
            topSelectRR.put(nm[0], nm[1], colInfo);
            if (nm2 != null) {
                topSelectRR.addMappingOnly(nm2[0], nm2[1], colInfo);
            }
        }
        final SelectDesc topSelect = new SelectDesc(topSelectExprs, topSelectOutputColNames);
        topOp = putOpInsertMap(OperatorFactory.getAndMakeChild(topSelect, new RowSchema(topSelectRR.getColumnInfos()), topOp), topSelectRR);
        topOp.setColumnExprMap(topSelectColExprMap);
    }
    return topOp;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) HashSet(java.util.HashSet) ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) Set(java.util.Set) TreeSet(java.util.TreeSet) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) JoinCondDesc(org.apache.hadoop.hive.ql.plan.JoinCondDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) LateralViewJoinDesc(org.apache.hadoop.hive.ql.plan.LateralViewJoinDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc)
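
A detail worth calling out from the loop above is how the ReduceSink value index is decoded when rebuilding the join's output columns: a non-negative entry selects a key column (emitted as KEY.<name>), while a negative entry n selects value column -n - 1 (emitted as VALUE.<name>). A tiny standalone sketch of that mapping, with made-up column names rather than names from any real plan:

import java.util.List;

public class ValueIndexSketch {
    // Mirror of the decoding used when rebuilding join output columns:
    // index >= 0 -> key column at that position, index < 0 -> value column at (-index - 1).
    static String resolveField(int index, List<String> keyColNames, List<String> valColNames) {
        if (index >= 0) {
            return "KEY." + keyColNames.get(index);
        }
        return "VALUE." + valColNames.get(-index - 1);
    }

    public static void main(String[] args) {
        List<String> keys = List.of("reducesinkkey0", "reducesinkkey1");
        List<String> values = List.of("_col0", "_col1", "_col2");
        System.out.println(resolveField(1, keys, values));   // KEY.reducesinkkey1
        System.out.println(resolveField(-3, keys, values));  // VALUE._col2
    }
}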

Example 78 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class SetSparkReducerParallelism method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
    ReduceSinkOperator sink = (ReduceSinkOperator) nd;
    ReduceSinkDesc desc = sink.getConf();
    Set<ReduceSinkOperator> parentSinks = null;
    int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
    if (!useOpStats) {
        parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
        parentSinks.remove(sink);
        if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
            // We haven't processed all the parent sinks, and we need
            // them to be done in order to compute the parallelism for this sink.
            // In this case, skip. We should visit this again from another path.
            LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
            return false;
        }
    }
    if (context.getVisitedReduceSinks().contains(sink)) {
        // skip walking the children
        LOG.debug("Already processed reduce sink: " + sink.getName());
        return true;
    }
    context.getVisitedReduceSinks().add(sink);
    if (needSetParallelism(sink, context.getConf())) {
        if (constantReducers > 0) {
            LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
            desc.setNumReducers(constantReducers);
        } else {
            // If it's a FileSink to bucketed files, use the bucket count as the reducer number
            FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
            if (fso != null) {
                String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
                int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
                if (numBuckets > 0) {
                    LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
                    desc.setNumReducers(numBuckets);
                    return false;
                }
            }
            if (useOpStats || parentSinks.isEmpty()) {
                long numberOfBytes = 0;
                if (useOpStats) {
                    // we need to add up all the estimates from the siblings of this reduce sink
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        if (sibling.getStatistics() != null) {
                            numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
                            }
                        } else {
                            LOG.warn("No stats available from: " + sibling);
                        }
                    }
                } else {
                    // we should use TS stats to infer parallelism
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
                        for (TableScanOperator source : sources) {
                            if (source.getStatistics() != null) {
                                numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize());
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
                                }
                            } else {
                                LOG.warn("No stats available from table source: " + source);
                            }
                        }
                    }
                    LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
                }
                // Divide it by 2 so that we can have more reducers
                long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
                int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
                getSparkMemoryAndCores(context);
                if (sparkMemoryAndCores != null && sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) {
                    // warn the user if bytes per reducer is much larger than memory per task
                    if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) {
                        LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
                    }
                    // If there are more cores, use the number of cores
                    numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond());
                }
                numReducers = Math.min(numReducers, maxReducers);
                LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
                desc.setNumReducers(numReducers);
            } else {
                // Use the maximum parallelism from all parent reduce sinks
                int numberOfReducers = 0;
                for (ReduceSinkOperator parent : parentSinks) {
                    numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
                }
                desc.setNumReducers(numberOfReducers);
                LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
            }
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> keyCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> partCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
            if (keyCols != null && keyCols.equals(partCols)) {
                desc.setReducerTraits(EnumSet.of(UNIFORM));
            }
        }
    } else {
        LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
    }
    return false;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
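
The parallelism math above reduces to: sum the upstream data sizes, halve hive.exec.reducers.bytes.per.reducer, estimate reducers from that ratio, optionally raise the estimate to the available core count, and cap it at the configured maximum. A hedged arithmetic sketch under those assumptions (the rounding below is a simplification of Utilities.estimateReducers, which has further options such as power-of-two rounding):

public class ReducerEstimateSketch {
    // Simplified stand-in for Utilities.estimateReducers: ceil(bytes / bytesPerReducer),
    // clamped to at least 1 and at most maxReducers.
    static int estimateReducers(long totalBytes, long bytesPerReducer, int maxReducers) {
        int reducers = (int) Math.ceil((double) totalBytes / bytesPerReducer);
        reducers = Math.max(1, reducers);
        return Math.min(reducers, maxReducers);
    }

    public static void main(String[] args) {
        long totalBytes = 10L * 1024 * 1024 * 1024;          // 10 GB of upstream data
        long bytesPerReducer = (256L * 1024 * 1024) / 2;     // configured 256 MB, halved as in the rule above
        int maxReducers = 1009;
        int availableCores = 48;

        int numReducers = estimateReducers(totalBytes, bytesPerReducer, maxReducers); // 80
        numReducers = Math.max(numReducers, availableCores); // prefer using all available cores if higher
        numReducers = Math.min(numReducers, maxReducers);    // never exceed the configured maximum
        System.out.println(numReducers);                     // 80
    }
}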

Example 79 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class SparkSMBJoinHintOptimizer method removeSmallTableReduceSink.

/**
 * In a bucket map join, ReduceSinks mark a small-table parent (the ReduceSink is removed on the big-table side).
 * In an SMB join these are not expected on any parent, whether small or big table, so they are removed here.
 * @param mapJoinOp the map join operator whose small-table ReduceSinks are removed
 */
@SuppressWarnings("unchecked")
private void removeSmallTableReduceSink(MapJoinOperator mapJoinOp) {
    SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
    List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
    for (int i = 0; i < parentOperators.size(); i++) {
        Operator<? extends OperatorDesc> par = parentOperators.get(i);
        if (i != smbJoinDesc.getPosBigTable()) {
            if (par instanceof ReduceSinkOperator) {
                List<Operator<? extends OperatorDesc>> grandParents = par.getParentOperators();
                Preconditions.checkArgument(grandParents.size() == 1, "AssertionError: expect # of parents to be 1, but was " + grandParents.size());
                Operator<? extends OperatorDesc> grandParent = grandParents.get(0);
                grandParent.removeChild(par);
                grandParent.setChildOperators(Utilities.makeList(mapJoinOp));
                mapJoinOp.getParentOperators().set(i, grandParent);
            }
        }
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
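
Structurally, the method above splices each small-table ReduceSink out of the plan: the ReduceSink's single parent is rewired to feed the map join directly. A minimal sketch of that rewiring on a hypothetical node type (the Node class below is not a Hive class):

import java.util.ArrayList;
import java.util.List;

public class SpliceSketch {
    // Hypothetical plan node with parent/child links, standing in for Hive operators.
    static class Node {
        final String name;
        final List<Node> parents = new ArrayList<>();
        final List<Node> children = new ArrayList<>();
        Node(String name) { this.name = name; }
    }

    // Remove 'middle' from between its single parent and 'join', as removeSmallTableReduceSink does.
    static void splice(Node join, Node middle) {
        Node grandParent = middle.parents.get(0);     // asserted to be the only parent
        grandParent.children.remove(middle);
        grandParent.children.add(join);               // grandparent now feeds the join directly
        int pos = join.parents.indexOf(middle);
        join.parents.set(pos, grandParent);           // join keeps the same parent position
    }

    public static void main(String[] args) {
        Node scan = new Node("TS_smalltable");
        Node rs = new Node("RS_1");
        Node join = new Node("MAPJOIN_2");
        scan.children.add(rs); rs.parents.add(scan);
        rs.children.add(join); join.parents.add(rs);

        splice(join, rs);
        System.out.println(join.parents.get(0).name); // TS_smalltable
    }
}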

Example 80 with ReduceSinkOperator

use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

the class ReduceSinkDeDuplicationUtils method merge.

// For the JOIN-RS case, merging is generally not possible if the child has
// fewer key/partition columns than the parents
public static boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer) throws SemanticException {
    List<Operator<?>> parents = pJoin.getParentOperators();
    ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
    ReduceSinkDesc cRSc = cRS.getConf();
    for (ReduceSinkOperator pRSNs : pRSs) {
        ReduceSinkDesc pRSNc = pRSNs.getConf();
        if (cRSc.getKeyCols().size() != pRSNc.getKeyCols().size()) {
            return false;
        }
        if (cRSc.getPartitionCols().size() != pRSNc.getPartitionCols().size()) {
            return false;
        }
        Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRSNc.getNumReducers());
        if (moveReducerNumTo == null || moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) {
            return false;
        }
        Integer moveRSOrderTo = checkOrder(true, cRSc.getOrder(), pRSNc.getOrder(), cRSc.getNullOrder(), pRSNc.getNullOrder());
        if (moveRSOrderTo == null) {
            return false;
        }
    }
    boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);
    int cKeySize = cRSc.getKeyCols().size();
    for (int i = 0; i < cKeySize; i++) {
        ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
            pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
            return false;
        }
    }
    int cPartSize = cRSc.getPartitionCols().size();
    for (int i = 0; i < cPartSize; i++) {
        ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
            pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
            return false;
        }
    }
    for (ReduceSinkOperator pRS : pRSs) {
        pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) PTFOperator(org.apache.hadoop.hive.ql.exec.PTFOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
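
The eligibility check above is positional: the child ReduceSink can only be merged if, for every parent ReduceSink feeding the join, the key and partition column lists have the same length and each child expression resolves to the same position in every parent. A toy sketch of that positional comparison over plain strings (the real check additionally follows column lineage through CorrelationUtilities.indexOf):

import java.util.List;

public class PositionalMergeSketch {
    // Child columns must match the parents' columns position by position; any mismatch blocks the merge.
    static boolean samePositions(List<String> childCols, List<List<String>> parentCols) {
        for (List<String> p : parentCols) {
            if (p.size() != childCols.size()) {
                return false;
            }
        }
        for (int i = 0; i < childCols.size(); i++) {
            for (List<String> p : parentCols) {
                if (!p.get(i).equals(childCols.get(i))) {
                    return false;
                }
            }
        }
        return true;
    }

    public static void main(String[] args) {
        List<String> child = List.of("key0", "key1");
        List<List<String>> parents = List.of(List.of("key0", "key1"), List.of("key0", "key1"));
        System.out.println(samePositions(child, parents)); // true
    }
}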

Aggregations

ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 86 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 50 uses
ArrayList (java.util.ArrayList): 48 uses
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 45 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 35 uses
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 31 uses
HashMap (java.util.HashMap): 29 uses
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 28 uses
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 27 uses
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 26 uses
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 26 uses
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 25 uses
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 24 uses
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 23 uses
List (java.util.List): 19 uses
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 19 uses
LinkedHashMap (java.util.LinkedHashMap): 18 uses
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 18 uses
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 18 uses
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 15 uses