
Example 1 with OptimizeTezProcContext

Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.

From the class ConvertJoinMapJoin, the process method:

@Override
/*
 * (non-Javadoc) we should ideally not modify the tree we traverse. However,
 * since we need to walk the tree at any time when we modify the operator, we
 * might as well do it here.
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    OptimizeTezProcContext context = (OptimizeTezProcContext) procCtx;
    hashTableLoadFactor = context.conf.getFloatVar(ConfVars.HIVEHASHTABLELOADFACTOR);
    fastHashTableAvailable = context.conf.getBoolVar(ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED);
    JoinOperator joinOp = (JoinOperator) nd;
    // adjust noconditional task size threshold for LLAP
    LlapClusterStateForCompile llapInfo = null;
    if ("llap".equalsIgnoreCase(context.conf.getVar(ConfVars.HIVE_EXECUTION_MODE))) {
        llapInfo = LlapClusterStateForCompile.getClusterInfo(context.conf);
        llapInfo.initClusterInfo();
    }
    MemoryMonitorInfo memoryMonitorInfo = getMemoryMonitorInfo(context.conf, llapInfo);
    joinOp.getConf().setMemoryMonitorInfo(memoryMonitorInfo);
    maxJoinMemory = memoryMonitorInfo.getAdjustedNoConditionalTaskSize();
    LOG.info("maxJoinMemory: {}", maxJoinMemory);
    hashMapDataStructure = HashMapDataStructureType.of(joinOp.getConf());
    TezBucketJoinProcCtx tezBucketJoinProcCtx = new TezBucketJoinProcCtx(context.conf);
    boolean hiveConvertJoin = context.conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN) && !context.parseContext.getDisableMapJoin();
    if (!hiveConvertJoin) {
        // we are just converting to a common merge join operator, i.e. the
        // shuffle join of the map-reduce case.
        Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
        if (retval == null) {
            return retval;
        } else {
            fallbackToReduceSideJoin(joinOp, context);
            return null;
        }
    }
    // if we have traits, and table info is present in the traits, we know the
    // exact number of buckets. Else choose the largest number of estimated
    // reducers from the parent operators.
    int numBuckets = -1;
    if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
        numBuckets = estimateNumBuckets(joinOp, true);
    } else {
        numBuckets = 1;
    }
    LOG.info("Estimated number of buckets " + numBuckets);
    MapJoinConversion mapJoinConversion = getMapJoinConversion(joinOp, context, numBuckets, false, maxJoinMemory, true);
    if (mapJoinConversion == null) {
        Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
        if (retval == null) {
            return retval;
        } else {
            // the only case here is a full outer join with SMB enabled, which
            // is not possible; convert to a regular join.
            fallbackToReduceSideJoin(joinOp, context);
            return null;
        }
    }
    if (numBuckets > 1) {
        if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
            // Check if we are in LLAP, if so it needs to be determined if we should use BMJ or DPHJ
            if (llapInfo != null) {
                if (selectJoinForLlap(context, joinOp, tezBucketJoinProcCtx, llapInfo, mapJoinConversion, numBuckets)) {
                    return null;
                }
            } else if (convertJoinBucketMapJoin(joinOp, context, mapJoinConversion, tezBucketJoinProcCtx)) {
                return null;
            }
        }
    }
    // check if we can convert to a map join without bucket scaling.
    LOG.info("Convert to non-bucketed map join");
    if (numBuckets != 1) {
        mapJoinConversion = getMapJoinConversion(joinOp, context, 1, false, maxJoinMemory, true);
    }
    if (mapJoinConversion == null) {
        // we are just converting to a common merge join operator, i.e. the
        // shuffle join of the map-reduce case.
        fallbackToReduceSideJoin(joinOp, context);
        return null;
    }
    // Currently this is a map-join path and we don't support FULL OUTER MapJoin yet.
    if (mapJoinConversion.getIsFullOuterJoin() && !mapJoinConversion.getIsFullOuterEnabledForMapJoin()) {
        fallbackToReduceSideJoin(joinOp, context);
        return null;
    }
    MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversion, true);
    if (mapJoinOp == null) {
        fallbackToReduceSideJoin(joinOp, context);
        return null;
    }
    // the map join operator by default has no bucket columns, and the number
    // of reduce sinks is reduced by 1
    mapJoinOp.setOpTraits(new OpTraits(null, -1, null, joinOp.getOpTraits().getNumReduceSinks()));
    preserveOperatorInfos(mapJoinOp, joinOp, context);
    // propagate this change till the next RS
    for (Operator<? extends OperatorDesc> childOp : mapJoinOp.getChildOperators()) {
        setAllChildrenTraits(childOp, mapJoinOp.getOpTraits());
    }
    return null;
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MemoryMonitorInfo(org.apache.hadoop.hive.ql.exec.MemoryMonitorInfo) LlapClusterStateForCompile(org.apache.hadoop.hive.ql.optimizer.physical.LlapClusterStateForCompile) OpTraits(org.apache.hadoop.hive.ql.plan.OpTraits) OptimizeTezProcContext(org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext)
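
A processor like ConvertJoinMapJoin never runs in isolation: the Tez compiler registers it in a rule map keyed by operator-name patterns, and a graph walker dispatches each matching JoinOperator node, passing the shared OptimizeTezProcContext as the NodeProcessorCtx seen above. Below is a minimal sketch of that wiring, assuming Hive 4's org.apache.hadoop.hive.ql.lib interfaces; the logJoins processor and the runJoinConversion method are illustrative stand-ins (TezCompiler registers ConvertJoinMapJoin itself at this spot).

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.ForwardWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.SemanticDispatcher;
import org.apache.hadoop.hive.ql.lib.SemanticGraphWalker;
import org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor;
import org.apache.hadoop.hive.ql.lib.SemanticRule;
import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;

// Sketch: how a node processor is attached to the operator-tree walk.
static void runJoinConversion(OptimizeTezProcContext procCtx) throws SemanticException {
    // illustrative processor; TezCompiler registers ConvertJoinMapJoin here instead
    SemanticNodeProcessor logJoins = (nd, stack, ctx, outputs) -> {
        System.out.println("visited join: " + nd.getName());
        return null;
    };
    Map<SemanticRule, SemanticNodeProcessor> opRules = new LinkedHashMap<>();
    // the trailing "%" matches any occurrence of a JoinOperator in the walked path
    opRules.put(new RuleRegExp("Join nodes", JoinOperator.getOperatorName() + "%"), logJoins);
    SemanticDispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    SemanticGraphWalker walker = new ForwardWalker(disp);
    // start from the table scans; the dispatcher fires process() on each match
    List<Node> topNodes = new ArrayList<>(procCtx.parseContext.getTopOps().values());
    walker.startWalking(topNodes, null);
}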

Example 2 with OptimizeTezProcContext

Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.

From the class DynamicPartitionPruningOptimization, the process method:

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    ParseContext parseContext;
    if (procCtx instanceof OptimizeTezProcContext) {
        parseContext = ((OptimizeTezProcContext) procCtx).parseContext;
    } else if (procCtx instanceof OptimizeSparkProcContext) {
        parseContext = ((OptimizeSparkProcContext) procCtx).getParseContext();
    } else {
        throw new IllegalArgumentException("expected parseContext to be either " + "OptimizeTezProcContext or OptimizeSparkProcContext, but found " + procCtx.getClass().getName());
    }
    FilterOperator filter = (FilterOperator) nd;
    FilterDesc desc = filter.getConf();
    if (!parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING) && !parseContext.getConf().isSparkDPPAny()) {
        // nothing to do when the optimization is off
        return null;
    }
    TableScanOperator ts = null;
    if (filter.getParentOperators().size() == 1 && filter.getParentOperators().get(0) instanceof TableScanOperator) {
        ts = (TableScanOperator) filter.getParentOperators().get(0);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Parent: " + filter.getParentOperators().get(0));
        LOG.debug("Filter: " + desc.getPredicateString());
        LOG.debug("TableScan: " + ts);
    }
    DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext();
    // collect the dynamic pruning conditions
    removerContext.dynLists.clear();
    GenTezUtils.collectDynamicPruningConditions(desc.getPredicate(), removerContext);
    if (ts == null) {
        // Replace the synthetic predicate with true and bail out
        for (DynamicListContext ctx : removerContext) {
            ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
            replaceExprNode(ctx, desc, constNode);
        }
        return false;
    }
    boolean semiJoin = parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION);
    if (HiveConf.getVar(parseContext.getConf(), HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
        // TODO HIVE-16862: Implement a similar feature like "hive.tez.dynamic.semijoin.reduction" in hive on spark
        semiJoin = false;
    }
    List<ExprNodeDesc> newBetweenNodes = new ArrayList<>();
    List<ExprNodeDesc> newBloomFilterNodes = new ArrayList<>();
    for (DynamicListContext ctx : removerContext) {
        if (ctx.desc.getTypeInfo().getCategory() != ObjectInspector.Category.PRIMITIVE) {
            // https://issues.apache.org/jira/browse/HIVE-24988
            continue;
        }
        String column = ExprNodeDescUtils.extractColName(ctx.parent);
        boolean semiJoinAttempted = false;
        ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
        if (column != null) {
            // Need unique IDs to refer to each min/max key value in the DynamicValueRegistry
            String keyBaseAlias = "";
            Table table = ts.getConf().getTableMetadata();
            boolean nonEquiJoin = isNonEquiJoin(ctx.parent);
            if (table != null && table.isPartitionKey(column) && !nonEquiJoin) {
                String columnType = table.getPartColByName(column).getType();
                String alias = ts.getConf().getAlias();
                PrunedPartitionList plist = parseContext.getPrunedPartitions(alias, ts);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("alias: " + alias);
                    LOG.debug("pruned partition list: ");
                    if (plist != null) {
                        for (Partition p : plist.getPartitions()) {
                            LOG.debug(p.getCompleteName());
                        }
                    }
                }
                // generate the event operator unless the pruner already removed
                // every partition; a null list means pruning has not run yet,
                // an empty list means all partitions have been already filtered
                if (plist == null || plist.getPartitions().size() != 0) {
                    LOG.info("Dynamic partitioning: " + table.getCompleteName() + "." + column);
                    generateEventOperatorPlan(ctx, parseContext, ts, column, columnType, null);
                } else {
                    // all partitions have been statically removed
                    LOG.debug("No partition pruning necessary.");
                }
            } else if (table.isNonNative() && table.getStorageHandler().addDynamicSplitPruningEdge(table, ctx.parent)) {
                generateEventOperatorPlan(ctx, parseContext, ts, column, table.getCols().stream().filter(e -> e.getName().equals(column)).map(e -> e.getType()).findFirst().get(), ctx.parent);
            } else {
                // semijoin
                LOG.debug("Column " + column + " is not a partition column");
                if (semiJoin && !disableSemiJoinOptDueToExternalTable(parseContext.getConf(), ts, ctx) && ts.getConf().getFilterExpr() != null && !nonEquiJoin) {
                    LOG.debug("Initiate semijoin reduction for " + column + " (" + ts.getConf().getFilterExpr().getExprString());
                    StringBuilder internalColNameBuilder = new StringBuilder();
                    StringBuilder colNameBuilder = new StringBuilder();
                    // Apply best effort to fetch the correct table alias. If not
                    // found, fallback to old logic.
                    StringBuilder tabAliasBuilder = new StringBuilder();
                    if (getColumnInfo(ctx, internalColNameBuilder, colNameBuilder, tabAliasBuilder)) {
                        String colName = colNameBuilder.toString();
                        String tableAlias;
                        if (tabAliasBuilder.length() > 0) {
                            tableAlias = tabAliasBuilder.toString();
                        } else {
                            // falling back
                            Operator<?> op = ctx.generator;
                            while (!(op == null || op instanceof TableScanOperator)) {
                                op = op.getParentOperators().get(0);
                            }
                            tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias());
                        }
                        // Use the tableAlias to generate keyBaseAlias
                        keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + colName;
                        Map<String, List<SemiJoinHint>> hints = parseContext.getSemiJoinHints();
                        if (hints != null) {
                            // Create semijoin optimizations ONLY for hinted columns
                            semiJoinAttempted = processSemiJoinHints(parseContext, ctx, hints, tableAlias, internalColNameBuilder.toString(), colName, ts, keyBaseAlias);
                        } else {
                            // fallback to regular logic
                            semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias, internalColNameBuilder.toString(), colName, null);
                        }
                    }
                }
            }
            // we always remove the condition by replacing it with "true"
            if (semiJoinAttempted) {
                List<ExprNodeDesc> betweenArgs = new ArrayList<ExprNodeDesc>();
                // Do not invert between result
                betweenArgs.add(new ExprNodeConstantDesc(Boolean.FALSE));
                // add column expression here
                betweenArgs.add(ctx.parent.getChildren().get(0));
                betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_min", ctx.desc.getTypeInfo())));
                betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_max", ctx.desc.getTypeInfo())));
                ExprNodeDesc betweenNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("between").getGenericUDF(), betweenArgs);
                // add column expression for bloom filter
                List<ExprNodeDesc> bloomFilterArgs = new ArrayList<ExprNodeDesc>();
                bloomFilterArgs.add(ctx.parent.getChildren().get(0));
                bloomFilterArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_bloom_filter", TypeInfoFactory.binaryTypeInfo)));
                ExprNodeDesc bloomFilterNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("in_bloom_filter").getGenericUDF(), bloomFilterArgs);
                newBetweenNodes.add(betweenNode);
                newBloomFilterNodes.add(bloomFilterNode);
            }
        }
        replaceExprNode(ctx, desc, constNode);
    }
    if (!newBetweenNodes.isEmpty()) {
        // We need to add the new nodes: first the between nodes, then the bloom filters
        if (FunctionRegistry.isOpAnd(desc.getPredicate())) {
            // AND
            desc.getPredicate().getChildren().addAll(newBetweenNodes);
            desc.getPredicate().getChildren().addAll(newBloomFilterNodes);
        } else {
            List<ExprNodeDesc> andArgs = new ArrayList<>();
            andArgs.add(desc.getPredicate());
            andArgs.addAll(newBetweenNodes);
            andArgs.addAll(newBloomFilterNodes);
            ExprNodeGenericFuncDesc andExpr = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("and").getGenericUDF(), andArgs);
            // Also pass in filter as tableScan filterExpr
            ts.getConf().setFilterExpr(andExpr);
            desc.setPredicate(andExpr);
        }
    }
    // if we pushed the predicate into the table scan we need to remove the
    // synthetic conditions there.
    cleanTableScanFilters(ts);
    return false;
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) SparkPartitionPruningSinkOperator(org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) Arrays(java.util.Arrays) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) SemanticAnalyzer(org.apache.hadoop.hive.ql.parse.SemanticAnalyzer) CombineEquivalentWorkResolver(org.apache.hadoop.hive.ql.optimizer.spark.CombineEquivalentWorkResolver) ConfVars(org.apache.hadoop.hive.conf.HiveConf.ConfVars) LoggerFactory(org.slf4j.LoggerFactory) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) FunctionRegistry(org.apache.hadoop.hive.ql.exec.FunctionRegistry) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ExprNodeDynamicValueDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc) OptimizeTezProcContext(org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext) RuntimeValuesInfo(org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo) Map(java.util.Map) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) NodeProcessorCtx(org.apache.hadoop.hive.ql.lib.NodeProcessorCtx) EnumSet(java.util.EnumSet) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) MetaStoreUtils(org.apache.hadoop.hive.metastore.utils.MetaStoreUtils) DynamicPartitionPrunerContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicPartitionPrunerContext) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) SparkUtilities(org.apache.hadoop.hive.ql.exec.spark.SparkUtilities) List(java.util.List) DynamicValue(org.apache.hadoop.hive.ql.plan.DynamicValue) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) GenericUDAFBloomFilterEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) GenericUDFIn(org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn) HashMap(java.util.HashMap) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) SemanticNodeProcessor(org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor) Stack(java.util.Stack) SemiJoinHint(org.apache.hadoop.hive.ql.parse.SemiJoinHint) ArrayList(java.util.ArrayList) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) Operation(org.apache.hadoop.hive.ql.io.AcidUtils.Operation) PlanUtils(org.apache.hadoop.hive.ql.plan.PlanUtils) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) NullOrdering(org.apache.hadoop.hive.ql.util.NullOrdering) Logger(org.slf4j.Logger) TypeInfoFactory(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Table(org.apache.hadoop.hive.ql.metadata.Table) GenTezUtils(org.apache.hadoop.hive.ql.parse.GenTezUtils) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Node(org.apache.hadoop.hive.ql.lib.Node) Partition(org.apache.hadoop.hive.ql.metadata.Partition) SparkPartitionPruningSinkDesc(org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc) DynamicListContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicListContext) OperatorFactory(org.apache.hadoop.hive.ql.exec.OperatorFactory) Preconditions(com.google.common.base.Preconditions) ExprNodeDescUtils(org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils) Collections(java.util.Collections)
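
The net effect of this pass: every synthetic dynamic-list predicate is replaced with true, and when a semijoin is attempted the filter gains two runtime-evaluated conditions, a BETWEEN over the broadcast min/max DynamicValues and an in_bloom_filter probe. Both behaviors hinge on configuration flags read near the top of process. A minimal sketch of those switches, assuming the standard HiveConf property names (the DppFlags class is illustrative, and the values shown are not recommendations):

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

// Sketch: the flags gating dynamic partition pruning and semijoin reduction on Tez.
public class DppFlags {
    public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // hive.tez.dynamic.partition.pruning: master switch checked at the top of process()
        conf.setBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING, true);
        // hive.tez.dynamic.semijoin.reduction: enables the min/max + bloom filter branch
        conf.setBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION, true);
        System.out.println("DPP enabled: " + conf.getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING));
    }
}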

Example 3 with OptimizeTezProcContext

Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.

From the class RemoveDynamicPruningBySize, the process method:

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeTezProcContext context = (OptimizeTezProcContext) procContext;
    AppMasterEventOperator event = (AppMasterEventOperator) nd;
    AppMasterEventDesc desc = event.getConf();
    if (desc.getStatistics().getDataSize() > context.conf.getLongVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE) && (context.pruningOpsRemovedByPriorOpt.isEmpty() || !context.pruningOpsRemovedByPriorOpt.contains(event))) {
        context.pruningOpsRemovedByPriorOpt.add(event);
        GenTezUtils.removeBranch(event);
        // at this point we've found the fork in the op pipeline that has the pruning as a child plan.
        LOG.info("Disabling dynamic pruning for: " + ((DynamicPruningEventDesc) desc).getTableScan().getName() + ". Expected data size is too big: " + desc.getStatistics().getDataSize());
    }
    return false;
}
Also used : AppMasterEventDesc(org.apache.hadoop.hive.ql.plan.AppMasterEventDesc) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) OptimizeTezProcContext(org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext)
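
In other words, this pass prunes the pruning: if the statistics attached to the AppMasterEventOperator predict more data than the configured ceiling, the whole event branch is removed. A minimal sketch of that size check, using the same ConfVars constant as above (survivesSizeCheck is a hypothetical helper, not Hive API):

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

// Sketch: the threshold check this processor applies to a pruning branch.
static boolean survivesSizeCheck(HiveConf conf, long estimatedDataSize) {
    // hive.tez.dynamic.partition.pruning.max.data.size
    long maxDataSize = conf.getLongVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE);
    // branches whose expected data size exceeds the ceiling are removed
    return estimatedDataSize <= maxDataSize;
}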

Example 4 with OptimizeTezProcContext

Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.

From the class SetReducerParallelism, the process method:

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeTezProcContext context = (OptimizeTezProcContext) procContext;
    ReduceSinkOperator sink = (ReduceSinkOperator) nd;
    ReduceSinkDesc desc = sink.getConf();
    long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
    int maxReducers = context.conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    int constantReducers = context.conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
    if (context.visitedReduceSinks.contains(sink)) {
        // skip walking the children
        LOG.debug("Already processed reduce sink: " + sink.getName());
        return true;
    }
    context.visitedReduceSinks.add(sink);
    if (desc.getNumReducers() <= 0) {
        if (constantReducers > 0) {
            LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
            desc.setNumReducers(constantReducers);
        } else {
            long numberOfBytes = 0;
            // we need to add up all the estimates from the siblings of this reduce sink
            for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                if (sibling.getStatistics() != null) {
                    numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
                } else {
                    LOG.warn("No stats available from: " + sibling);
                }
            }
            int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
            LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers);
            desc.setNumReducers(numReducers);
            final Collection<ExprNodeDescEqualityWrapper> keyCols = ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
            final Collection<ExprNodeDescEqualityWrapper> partCols = ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
            if (keyCols != null && keyCols.equals(partCols)) {
                desc.setReducerTraits(EnumSet.of(UNIFORM, AUTOPARALLEL));
            } else {
                desc.setReducerTraits(EnumSet.of(AUTOPARALLEL));
            }
        }
    } else {
        LOG.info("Number of reducers determined to be: " + desc.getNumReducers());
        // usually controlled by bucketing
        desc.setReducerTraits(EnumSet.of(FIXED));
    }
    return false;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OptimizeTezProcContext(org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext) ExprNodeDescEqualityWrapper(org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper)
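
When the user has not pinned a reducer count, the estimate reduces to a clamped ceiling division of the summed sibling data sizes by hive.exec.reducers.bytes.per.reducer, capped at hive.exec.reducers.max. A simplified re-derivation of what Utilities.estimateReducers computes (estimateReducersSketch is hypothetical; the real method additionally supports rounding the result to a power of two):

// Simplified sketch of the reducer estimate; not the actual Hive implementation.
static int estimateReducersSketch(long totalBytes, long bytesPerReducer, int maxReducers) {
    if (totalBytes <= 0) {
        // always schedule at least one reducer
        return 1;
    }
    // ceiling division: how many reducers are needed at bytesPerReducer each
    long needed = (totalBytes + bytesPerReducer - 1) / bytesPerReducer;
    // clamp to the configured maximum, but never below one
    return (int) Math.min(maxReducers, Math.max(1L, needed));
}
// e.g. 10 GB at 256 MB per reducer: ceil(10_000_000_000 / 256_000_000) = 40 reducers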

Aggregations

OptimizeTezProcContext (org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext): 4 uses
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 2 uses
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 2 uses
Preconditions (com.google.common.base.Preconditions): 1 use
ArrayList (java.util.ArrayList): 1 use
Arrays (java.util.Arrays): 1 use
Collections (java.util.Collections): 1 use
EnumSet (java.util.EnumSet): 1 use
HashMap (java.util.HashMap): 1 use
List (java.util.List): 1 use
Map (java.util.Map): 1 use
Stack (java.util.Stack): 1 use
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 1 use
ConfVars (org.apache.hadoop.hive.conf.HiveConf.ConfVars): 1 use
MetaStoreUtils (org.apache.hadoop.hive.metastore.utils.MetaStoreUtils): 1 use
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 1 use
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 1 use
CommonJoinOperator (org.apache.hadoop.hive.ql.exec.CommonJoinOperator): 1 use
CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator): 1 use
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 1 use