Example 46 with ReduceSinkDesc

Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in the Apache Hive project.

The class TopNKeyPushdownProcessor, method pushdownThroughLeftOuterJoin.

/**
 * Push through LOJ. If TopNKey expression refers fully to expressions from left input, push
 * with rewriting of expressions and remove from top of LOJ. If TopNKey expression has a prefix
 * that refers to expressions from left input, push with rewriting of those expressions and keep
 * on top of LOJ.
 *
 * @param topNKey TopNKey operator to push
 * @throws SemanticException when removeChildAndAdoptItsChildren was not successful
 */
private void pushdownThroughLeftOuterJoin(TopNKeyOperator topNKey) throws SemanticException {
    final TopNKeyDesc topNKeyDesc = topNKey.getConf();
    final CommonJoinOperator<? extends JoinDesc> join = (CommonJoinOperator<? extends JoinDesc>) topNKey.getParentOperators().get(0);
    final List<Operator<? extends OperatorDesc>> joinInputs = join.getParentOperators();
    final ReduceSinkOperator reduceSinkOperator = (ReduceSinkOperator) joinInputs.get(0);
    final ReduceSinkDesc reduceSinkDesc = reduceSinkOperator.getConf();
    CommonKeyPrefix commonKeyPrefix = CommonKeyPrefix.map(
        mapUntilColumnEquals(topNKeyDesc.getKeyColumns(), join.getColumnExprMap()),
        topNKeyDesc.getColumnSortOrder(), topNKeyDesc.getNullOrder(),
        reduceSinkDesc.getKeyCols(), reduceSinkDesc.getColumnExprMap(),
        reduceSinkDesc.getOrder(), reduceSinkDesc.getNullOrder());
    if (commonKeyPrefix.isEmpty() || commonKeyPrefix.size() == topNKeyDesc.getPartitionKeyColumns().size()) {
        return;
    }
    LOG.debug("Pushing a copy of {} through {} and {}", topNKey.getName(), join.getName(), reduceSinkOperator.getName());
    final TopNKeyDesc newTopNKeyDesc = topNKeyDesc.combine(commonKeyPrefix);
    pushdown((TopNKeyOperator) copyDown(reduceSinkOperator, newTopNKeyDesc));
    if (topNKeyDesc.getKeyColumns().size() == commonKeyPrefix.size()) {
        LOG.debug("Removing {} above {}", topNKey.getName(), join.getName());
        join.removeChildAndAdoptItsChildren(topNKey);
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) TopNKeyOperator(org.apache.hadoop.hive.ql.exec.TopNKeyOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TopNKeyDesc(org.apache.hadoop.hive.ql.plan.TopNKeyDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
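
The decision in pushdownThroughLeftOuterJoin hinges on how much of the TopNKey ordering is shared with the left-side ReduceSink keys. The standalone sketch below is not Hive code: the column lists and the commonPrefixSize helper are illustrative assumptions, whereas the real CommonKeyPrefix.map compares expression trees, sort orders and null orders. It shows the three outcomes: no shared prefix means nothing is pushed, a partial prefix means a copy is pushed below the join while the original stays on top, and a full match makes the TopNKey above the join redundant.

import java.util.List;

// Simplified illustration of the common-key-prefix rule behind
// pushdownThroughLeftOuterJoin; plain column names stand in for ExprNodeDesc trees.
public class CommonPrefixSketch {

    // Length of the longest shared prefix between the TopNKey columns and the
    // ReduceSink key columns (hypothetical helper, not the CommonKeyPrefix API).
    static int commonPrefixSize(List<String> topNKeyCols, List<String> reduceSinkKeyCols) {
        int n = Math.min(topNKeyCols.size(), reduceSinkKeyCols.size());
        int size = 0;
        while (size < n && topNKeyCols.get(size).equals(reduceSinkKeyCols.get(size))) {
            size++;
        }
        return size;
    }

    public static void main(String[] args) {
        List<String> topNKeyCols = List.of("a", "b", "c");
        List<String> leftJoinKeys = List.of("a", "b"); // keys of the left-side ReduceSink

        int prefix = commonPrefixSize(topNKeyCols, leftJoinKeys);
        if (prefix == 0) {
            System.out.println("nothing to push");
        } else if (prefix == topNKeyCols.size()) {
            System.out.println("push a copy below the join and remove the original TopNKey");
        } else {
            System.out.println("push a copy of the first " + prefix
                + " columns below the join, keep the original TopNKey on top");
        }
    }
}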

Example 47 with ReduceSinkDesc

Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in the Apache Hive project.

The class SemanticAnalyzer, method genJoinReduceSinkChild.

@SuppressWarnings("nls")
private Operator genJoinReduceSinkChild(ExprNodeDesc[] joinKeys, Operator<?> parent, String[] srcs, int tag) throws SemanticException {
    // dummy for backtracking
    Operator dummy = Operator.createDummy();
    dummy.setParentOperators(Arrays.asList(parent));
    RowResolver inputRR = opParseCtx.get(parent).getRowResolver();
    RowResolver outputRR = new RowResolver();
    List<String> outputColumns = new ArrayList<String>();
    List<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
    List<ExprNodeDesc> reduceKeysBack = new ArrayList<ExprNodeDesc>();
    // Compute join keys and store in reduceKeys
    for (ExprNodeDesc joinKey : joinKeys) {
        reduceKeys.add(joinKey);
        reduceKeysBack.add(ExprNodeDescUtils.backtrack(joinKey, dummy, parent));
    }
    // Walk over the input row resolver and copy in the output
    ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
    List<ColumnInfo> columns = inputRR.getColumnInfos();
    int[] index = new int[columns.size()];
    for (int i = 0; i < columns.size(); i++) {
        ColumnInfo colInfo = columns.get(i);
        String[] nm = inputRR.reverseLookup(colInfo.getInternalName());
        String[] nm2 = inputRR.getAlternateMappings(colInfo.getInternalName());
        ExprNodeDesc expr = new ExprNodeColumnDesc(colInfo);
        // backtrack can be null when input is script operator
        ExprNodeDesc exprBack = ExprNodeDescUtils.backtrack(expr, dummy, parent);
        int kindex;
        if (exprBack == null) {
            kindex = -1;
        } else if (ExprNodeDescUtils.isConstant(exprBack)) {
            kindex = reduceKeysBack.indexOf(exprBack);
        } else {
            kindex = ExprNodeDescUtils.indexOf(exprBack, reduceKeysBack);
        }
        if (kindex >= 0) {
            ColumnInfo newColInfo = new ColumnInfo(colInfo);
            String internalColName = Utilities.ReduceField.KEY + ".reducesinkkey" + kindex;
            newColInfo.setInternalName(internalColName);
            newColInfo.setTabAlias(nm[0]);
            outputRR.put(nm[0], nm[1], newColInfo);
            if (nm2 != null) {
                outputRR.addMappingOnly(nm2[0], nm2[1], newColInfo);
            }
            index[i] = kindex;
            continue;
        }
        index[i] = -reduceValues.size() - 1;
        String outputColName = getColumnInternalName(reduceValues.size());
        reduceValues.add(expr);
        ColumnInfo newColInfo = new ColumnInfo(colInfo);
        String internalColName = Utilities.ReduceField.VALUE + "." + outputColName;
        newColInfo.setInternalName(internalColName);
        newColInfo.setTabAlias(nm[0]);
        outputRR.put(nm[0], nm[1], newColInfo);
        if (nm2 != null) {
            outputRR.addMappingOnly(nm2[0], nm2[1], newColInfo);
        }
        outputColumns.add(outputColName);
    }
    dummy.setParentOperators(null);
    int numReds = -1;
    // Use only 1 reducer in case of cartesian product
    if (reduceKeys.size() == 0) {
        numReds = 1;
        String error = StrictChecks.checkCartesian(conf);
        if (error != null) {
            throw new SemanticException(error);
        }
    }
    ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumns, false, tag, reduceKeys.size(), numReds, AcidUtils.Operation.NOT_ACID, defaultNullOrder);
    Map<String, String> translatorMap = new HashMap<String, String>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
    for (int i = 0; i < keyColNames.size(); i++) {
        String oldName = keyColNames.get(i);
        String newName = Utilities.ReduceField.KEY + "." + oldName;
        colExprMap.put(newName, reduceKeys.get(i));
        translatorMap.put(oldName, newName);
    }
    List<String> valColNames = rsDesc.getOutputValueColumnNames();
    for (int i = 0; i < valColNames.size(); i++) {
        String oldName = valColNames.get(i);
        String newName = Utilities.ReduceField.VALUE + "." + oldName;
        colExprMap.put(newName, reduceValues.get(i));
        translatorMap.put(oldName, newName);
    }
    RowSchema defaultRs = new RowSchema(outputRR.getColumnInfos());
    List<ColumnInfo> newColumnInfos = new ArrayList<ColumnInfo>();
    for (ColumnInfo ci : outputRR.getColumnInfos()) {
        if (translatorMap.containsKey(ci.getInternalName())) {
            ci = new ColumnInfo(ci);
            ci.setInternalName(translatorMap.get(ci.getInternalName()));
        }
        newColumnInfos.add(ci);
    }
    ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(newColumnInfos), parent), outputRR);
    rsOp.setValueIndex(index);
    rsOp.setColumnExprMap(colExprMap);
    rsOp.setInputAliases(srcs);
    return rsOp;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
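
The index[] array and the renaming scheme built above are the subtle part of genJoinReduceSinkChild: a column that backtracks to a join key is renamed KEY.reducesinkkeyN and stored as the non-negative key position, while any other column becomes a value column (VALUE._colN in Hive's internal naming) and is stored as -(valuePosition + 1). The following self-contained sketch, with made-up column names, reproduces just that encoding and shows how to decode it; it is an illustration, not Hive code.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Standalone illustration of the index[] convention: a non-negative entry points
// into the reduce key columns, a negative entry i points to value column (-i - 1).
public class ReduceSinkIndexSketch {
    public static void main(String[] args) {
        List<String> inputCols = Arrays.asList("key_a", "col_x", "key_b", "col_y");
        List<String> joinKeys = Arrays.asList("key_a", "key_b");

        List<String> values = new ArrayList<>();
        int[] index = new int[inputCols.size()];
        for (int i = 0; i < inputCols.size(); i++) {
            int kindex = joinKeys.indexOf(inputCols.get(i));
            if (kindex >= 0) {
                index[i] = kindex;             // becomes KEY.reducesinkkey<kindex>
            } else {
                index[i] = -values.size() - 1; // becomes VALUE._col<position>
                values.add(inputCols.get(i));
            }
        }

        for (int i = 0; i < index.length; i++) {
            String name = index[i] >= 0
                ? "KEY.reducesinkkey" + index[i]
                : "VALUE._col" + (-index[i] - 1);
            System.out.println(inputCols.get(i) + " -> " + name);
        }
    }
}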

Example 48 with ReduceSinkDesc

Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in the Apache Hive project.

The class SetSparkReducerParallelism, method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
    ReduceSinkOperator sink = (ReduceSinkOperator) nd;
    ReduceSinkDesc desc = sink.getConf();
    Set<ReduceSinkOperator> parentSinks = null;
    int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
    if (!useOpStats) {
        parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
        parentSinks.remove(sink);
        if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
            // We haven't processed all the parent sinks, and we need
            // them to be done in order to compute the parallelism for this sink.
            // In this case, skip. We should visit this again from another path.
            LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
            return false;
        }
    }
    if (context.getVisitedReduceSinks().contains(sink)) {
        // skip walking the children
        LOG.debug("Already processed reduce sink: " + sink.getName());
        return true;
    }
    context.getVisitedReduceSinks().add(sink);
    if (needSetParallelism(sink, context.getConf())) {
        if (constantReducers > 0) {
            LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
            desc.setNumReducers(constantReducers);
        } else {
            // If it's a FileSink to bucketed files, use the bucket count as the reducer number
            FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
            if (fso != null) {
                String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
                int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
                if (numBuckets > 0) {
                    LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
                    desc.setNumReducers(numBuckets);
                    return false;
                }
            }
            if (useOpStats || parentSinks.isEmpty()) {
                long numberOfBytes = 0;
                if (useOpStats) {
                    // we need to add up all the estimates from the siblings of this reduce sink
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        if (sibling.getStatistics() != null) {
                            numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
                            }
                        } else {
                            LOG.warn("No stats available from: " + sibling);
                        }
                    }
                } else {
                    // we should use TS stats to infer parallelism
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
                        for (TableScanOperator source : sources) {
                            if (source.getStatistics() != null) {
                                numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize());
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
                                }
                            } else {
                                LOG.warn("No stats available from table source: " + source);
                            }
                        }
                    }
                    LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
                }
                // Divide it by 2 so that we can have more reducers
                long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
                int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
                getSparkMemoryAndCores(context);
                if (sparkMemoryAndCores != null && sparkMemoryAndCores.getLeft() > 0 && sparkMemoryAndCores.getRight() > 0) {
                    // warn the user if bytes per reducer is much larger than memory per task
                    if ((double) sparkMemoryAndCores.getLeft() / bytesPerReducer < 0.5) {
                        LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
                    }
                    // If there are more cores, use the number of cores
                    numReducers = Math.max(numReducers, sparkMemoryAndCores.getRight());
                }
                numReducers = Math.min(numReducers, maxReducers);
                LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
                desc.setNumReducers(numReducers);
            } else {
                // Use the maximum parallelism from all parent reduce sinks
                int numberOfReducers = 0;
                for (ReduceSinkOperator parent : parentSinks) {
                    numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
                }
                desc.setNumReducers(numberOfReducers);
                LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
            }
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> keyCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> partCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
            // When the sort keys match the partition columns, mark the sink's output as uniformly distributed.
            if (keyCols != null && keyCols.equals(partCols)) {
                desc.setReducerTraits(EnumSet.of(UNIFORM));
            }
        }
    } else {
        LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
    }
    return false;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
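
The "(calculated)" branch above boils down to simple arithmetic: halve hive.exec.reducers.bytes.per.reducer, divide the estimated input size by that, round up, optionally raise the result to the number of available Spark cores, and cap it at hive.exec.reducers.max. The sketch below is a rough approximation of that logic; it is not the actual Utilities.estimateReducers implementation (which adds refinements such as powers-of-two rounding), and the figures are made up.

// Rough approximation of how SetSparkReducerParallelism arrives at a reducer count.
public class ReducerEstimateSketch {

    static int estimate(long totalBytes, long bytesPerReducerConf, int maxReducers, int clusterCores) {
        long bytesPerReducer = bytesPerReducerConf / 2;  // "divide it by 2 so that we can have more reducers"
        int reducers = (int) Math.max(1, (totalBytes + bytesPerReducer - 1) / bytesPerReducer);
        reducers = Math.max(reducers, clusterCores);     // if there are more cores, use the number of cores
        return Math.min(reducers, maxReducers);          // never exceed hive.exec.reducers.max
    }

    public static void main(String[] args) {
        long totalBytes = 3L * 1024 * 1024 * 1024;       // 3 GB of estimated input
        long bytesPerReducerConf = 256L * 1024 * 1024;   // hive.exec.reducers.bytes.per.reducer
        System.out.println(estimate(totalBytes, bytesPerReducerConf, 1009, 16));
        // prints 24: ceil(3 GB / 128 MB), already above the 16 cores and below the cap
    }
}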

Example 49 with ReduceSinkDesc

Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in the Apache Hive project.

The class TestSharedWorkOptimizer, method ensureDeduplicate.

private void ensureDeduplicate(EnumSet<ReduceSinkDesc.ReducerTraits> traits1, int numReducers1, EnumSet<ReduceSinkDesc.ReducerTraits> traits2, int numReducers2, EnumSet<ReduceSinkDesc.ReducerTraits> expectedTraits, int expectedNumReducers) {
    ReduceSinkDesc rsConf1;
    ReduceSinkDesc rsConf2;
    boolean deduplicated;
    rsConf1 = new ReduceSinkDesc();
    rsConf1.setReducerTraits(traits1);
    rsConf1.setNumReducers(numReducers1);
    rsConf2 = new ReduceSinkDesc();
    rsConf2.setReducerTraits(traits2);
    rsConf2.setNumReducers(numReducers2);
    deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf1, rsConf2);
    assertTrue(deduplicated);
    assertEquals(expectedTraits, rsConf1.getReducerTraits());
    assertEquals(expectedNumReducers, rsConf1.getNumReducers());
    rsConf1 = new ReduceSinkDesc();
    rsConf1.setReducerTraits(traits1);
    rsConf1.setNumReducers(numReducers1);
    rsConf2 = new ReduceSinkDesc();
    rsConf2.setReducerTraits(traits2);
    rsConf2.setNumReducers(numReducers2);
    deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf2, rsConf1);
    assertTrue(deduplicated);
    assertEquals(expectedTraits, rsConf2.getReducerTraits());
    assertEquals(expectedNumReducers, rsConf2.getNumReducers());
}
Also used : ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
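
ensureDeduplicate runs the merge in both argument orders because, as its assertions show, deduplicateReduceTraits writes the merged traits and reducer count back into its first argument. The fragment below sketches that call shape directly on two ReduceSinkDesc instances; the trait and reducer values are arbitrary placeholders, and it assumes SharedWorkOptimizer.deduplicateReduceTraits is accessible from the calling package (in Hive it is exercised from a test in the same optimizer package).

import java.util.EnumSet;
import org.apache.hadoop.hive.ql.optimizer.SharedWorkOptimizer;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.UNIFORM;

// Sketch of calling the trait-merging routine directly; values are placeholders.
public class DeduplicateTraitsSketch {
    public static void main(String[] args) {
        ReduceSinkDesc first = new ReduceSinkDesc();
        first.setReducerTraits(EnumSet.of(UNIFORM));
        first.setNumReducers(2);

        ReduceSinkDesc second = new ReduceSinkDesc();
        second.setReducerTraits(EnumSet.of(UNIFORM));
        second.setNumReducers(4);

        // On success, the merged traits and reducer count end up on 'first'.
        boolean merged = SharedWorkOptimizer.deduplicateReduceTraits(first, second);
        System.out.println("deduplicated=" + merged
            + ", traits=" + first.getReducerTraits()
            + ", numReducers=" + first.getNumReducers());
    }
}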

Example 50 with ReduceSinkDesc

Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in the Apache Hive project.

The class TestSharedWorkOptimizer, method ensureNotDeduplicate.

private void ensureNotDeduplicate(EnumSet<ReduceSinkDesc.ReducerTraits> traits1, int numReducers1, EnumSet<ReduceSinkDesc.ReducerTraits> traits2, int numReducers2) {
    ReduceSinkDesc rsConf1;
    ReduceSinkDesc rsConf2;
    boolean deduplicated;
    rsConf1 = new ReduceSinkDesc();
    rsConf1.setReducerTraits(traits1);
    rsConf1.setNumReducers(numReducers1);
    rsConf2 = new ReduceSinkDesc();
    rsConf2.setReducerTraits(traits2);
    rsConf2.setNumReducers(numReducers2);
    deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf1, rsConf2);
    assertFalse(deduplicated);
    rsConf1 = new ReduceSinkDesc();
    rsConf1.setReducerTraits(traits1);
    rsConf1.setNumReducers(numReducers1);
    rsConf2 = new ReduceSinkDesc();
    rsConf2.setReducerTraits(traits2);
    rsConf2.setNumReducers(numReducers2);
    deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf1, rsConf2);
    assertFalse(deduplicated);
}
Also used : ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)

Aggregations

ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 50 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 31 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 31 usages
ArrayList (java.util.ArrayList): 29 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 21 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 20 usages
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 19 usages
HashMap (java.util.HashMap): 18 usages
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 17 usages
LinkedHashMap (java.util.LinkedHashMap): 16 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 16 usages
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 16 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 14 usages
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 14 usages
SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc): 13 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 12 usages
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 11 usages
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 11 usages
LimitOperator (org.apache.hadoop.hive.ql.exec.LimitOperator): 11 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 11 usages