Search in sources:

Example 1 with DynamicListContext

Use of org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicListContext in project hive by apache.

The method process of the class DynamicPartitionPruningOptimization.

/**
 * Handles one filter operator: collects the synthetic dynamic-pruning
 * conditions in its predicate, generates the matching pruning/semijoin plans
 * where applicable, and replaces every processed condition with a constant
 * {@code true}.
 *
 * <p>For each collected condition on a primitive-typed expression:
 * <ul>
 *   <li>if the column is a partition key of the scanned table (and the join
 *       is an equi-join), an event operator plan is generated unless the
 *       pruned partition list is already empty;</li>
 *   <li>else, if the table is non-native and its storage handler accepts a
 *       dynamic split pruning edge, an event operator plan is generated with
 *       the column's declared type;</li>
 *   <li>otherwise a semijoin reduction is attempted when enabled; on success
 *       {@code between} and {@code in_bloom_filter} predicates over dynamic
 *       values are appended to the filter predicate.</li>
 * </ul>
 *
 * @param nd          the node being visited; cast to {@link FilterOperator}
 * @param stack       not read by this method
 * @param procCtx     must be an {@code OptimizeTezProcContext} or an
 *                    {@code OptimizeSparkProcContext}
 * @param nodeOutputs not read by this method
 * @return {@code null} when the optimization is switched off, otherwise
 *         {@code false}
 * @throws SemanticException        declared by the interface; presumably raised
 *                                  by the plan-generation helpers
 * @throws IllegalArgumentException if {@code procCtx} is of neither supported type
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    ParseContext parseContext;
    if (procCtx instanceof OptimizeTezProcContext) {
        parseContext = ((OptimizeTezProcContext) procCtx).parseContext;
    } else if (procCtx instanceof OptimizeSparkProcContext) {
        parseContext = ((OptimizeSparkProcContext) procCtx).getParseContext();
    } else {
        throw new IllegalArgumentException("expected parseContext to be either " + "OptimizeTezProcContext or OptimizeSparkProcContext, but found " + procCtx.getClass().getName());
    }
    FilterOperator filter = (FilterOperator) nd;
    FilterDesc desc = filter.getConf();
    if (!parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING) && !parseContext.getConf().isSparkDPPAny()) {
        // nothing to do when the optimization is off
        return null;
    }
    // Only a filter sitting directly on top of a single table scan can be optimized.
    TableScanOperator ts = null;
    if (filter.getParentOperators().size() == 1 && filter.getParentOperators().get(0) instanceof TableScanOperator) {
        ts = (TableScanOperator) filter.getParentOperators().get(0);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Parent: " + filter.getParentOperators().get(0));
        LOG.debug("Filter: " + desc.getPredicateString());
        LOG.debug("TableScan: " + ts);
    }
    DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext();
    // collect the dynamic pruning conditions
    removerContext.dynLists.clear();
    GenTezUtils.collectDynamicPruningConditions(desc.getPredicate(), removerContext);
    if (ts == null) {
        // Replace the synthetic predicate with true and bail out
        for (DynamicListContext ctx : removerContext) {
            ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
            replaceExprNode(ctx, desc, constNode);
        }
        return false;
    }
    boolean semiJoin = parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION);
    if (HiveConf.getVar(parseContext.getConf(), HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
        // TODO HIVE-16862: Implement a similar feature like "hive.tez.dynamic.semijoin.reduction" in hive on spark
        semiJoin = false;
    }
    // Predicates produced by successful semijoin attempts; appended to the filter after the loop.
    List<ExprNodeDesc> newBetweenNodes = new ArrayList<>();
    List<ExprNodeDesc> newBloomFilterNodes = new ArrayList<>();
    for (DynamicListContext ctx : removerContext) {
        if (ctx.desc.getTypeInfo().getCategory() != ObjectInspector.Category.PRIMITIVE) {
            // https://issues.apache.org/jira/browse/HIVE-24988
            // Non-primitive conditions are skipped entirely (and are not replaced with "true").
            continue;
        }
        String column = ExprNodeDescUtils.extractColName(ctx.parent);
        boolean semiJoinAttempted = false;
        ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
        if (column != null) {
            // Need unique IDs to refer to each min/max key value in the DynamicValueRegistry
            String keyBaseAlias = "";
            Table table = ts.getConf().getTableMetadata();
            boolean nonEquiJoin = isNonEquiJoin(ctx.parent);
            if (table != null && table.isPartitionKey(column) && !nonEquiJoin) {
                String columnType = table.getPartColByName(column).getType();
                String alias = ts.getConf().getAlias();
                PrunedPartitionList plist = parseContext.getPrunedPartitions(alias, ts);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("alias: " + alias);
                    LOG.debug("pruned partition list: ");
                    if (plist != null) {
                        for (Partition p : plist.getPartitions()) {
                            LOG.debug(p.getCompleteName());
                        }
                    }
                }
                // A non-null but empty pruned list means every partition has
                // already been filtered out statically; skip plan generation then.
                if (plist == null || plist.getPartitions().size() != 0) {
                    LOG.info("Dynamic partitioning: " + table.getCompleteName() + "." + column);
                    generateEventOperatorPlan(ctx, parseContext, ts, column, columnType, null);
                } else {
                    // all partitions have been statically removed
                    LOG.debug("No partition pruning necessary.");
                }
            } else if (table != null && table.isNonNative() && table.getStorageHandler().addDynamicSplitPruningEdge(table, ctx.parent)) {
                // The null check mirrors the guard in the branch above: without it a
                // missing table metadata would raise a NullPointerException here
                // instead of falling through to the semijoin handling below.
                generateEventOperatorPlan(ctx, parseContext, ts, column, table.getCols().stream().filter(e -> e.getName().equals(column)).map(e -> e.getType()).findFirst().get(), ctx.parent);
            } else {
                // semijoin
                LOG.debug("Column " + column + " is not a partition column");
                if (semiJoin && !disableSemiJoinOptDueToExternalTable(parseContext.getConf(), ts, ctx) && ts.getConf().getFilterExpr() != null && !nonEquiJoin) {
                    LOG.debug("Initiate semijoin reduction for " + column + " (" + ts.getConf().getFilterExpr().getExprString());
                    StringBuilder internalColNameBuilder = new StringBuilder();
                    StringBuilder colNameBuilder = new StringBuilder();
                    // Apply best effort to fetch the correct table alias. If not
                    // found, fallback to old logic.
                    StringBuilder tabAliasBuilder = new StringBuilder();
                    if (getColumnInfo(ctx, internalColNameBuilder, colNameBuilder, tabAliasBuilder)) {
                        String colName = colNameBuilder.toString();
                        String tableAlias;
                        if (tabAliasBuilder.length() > 0) {
                            tableAlias = tabAliasBuilder.toString();
                        } else {
                            // falling back: walk up the first-parent chain from the
                            // generator until a table scan (or the top) is reached
                            Operator<?> op = ctx.generator;
                            while (!(op == null || op instanceof TableScanOperator)) {
                                op = op.getParentOperators().get(0);
                            }
                            tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias());
                        }
                        // Use the tableAlias to generate keyBaseAlias
                        keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + colName;
                        Map<String, List<SemiJoinHint>> hints = parseContext.getSemiJoinHints();
                        if (hints != null) {
                            // Create semijoin optimizations ONLY for hinted columns
                            semiJoinAttempted = processSemiJoinHints(parseContext, ctx, hints, tableAlias, internalColNameBuilder.toString(), colName, ts, keyBaseAlias);
                        } else {
                            // fallback to regular logic
                            semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias, internalColNameBuilder.toString(), colName, null);
                        }
                    }
                }
            }
            // we always remove the condition by replacing it with "true"
            if (semiJoinAttempted) {
                List<ExprNodeDesc> betweenArgs = new ArrayList<ExprNodeDesc>();
                // Do not invert between result
                betweenArgs.add(new ExprNodeConstantDesc(Boolean.FALSE));
                // add column expression here
                betweenArgs.add(ctx.parent.getChildren().get(0));
                betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_min", ctx.desc.getTypeInfo())));
                betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_max", ctx.desc.getTypeInfo())));
                ExprNodeDesc betweenNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("between").getGenericUDF(), betweenArgs);
                // add column expression for bloom filter
                List<ExprNodeDesc> bloomFilterArgs = new ArrayList<ExprNodeDesc>();
                bloomFilterArgs.add(ctx.parent.getChildren().get(0));
                bloomFilterArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_bloom_filter", TypeInfoFactory.binaryTypeInfo)));
                ExprNodeDesc bloomFilterNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("in_bloom_filter").getGenericUDF(), bloomFilterArgs);
                newBetweenNodes.add(betweenNode);
                newBloomFilterNodes.add(bloomFilterNode);
            }
        }
        replaceExprNode(ctx, desc, constNode);
    }
    if (!newBetweenNodes.isEmpty()) {
        // We need to add the new nodes: first the between nodes, then the bloom filters
        if (FunctionRegistry.isOpAnd(desc.getPredicate())) {
            // AND
            desc.getPredicate().getChildren().addAll(newBetweenNodes);
            desc.getPredicate().getChildren().addAll(newBloomFilterNodes);
        } else {
            // Predicate is not a top-level AND: wrap it together with the new nodes in one.
            List<ExprNodeDesc> andArgs = new ArrayList<>();
            andArgs.add(desc.getPredicate());
            andArgs.addAll(newBetweenNodes);
            andArgs.addAll(newBloomFilterNodes);
            ExprNodeGenericFuncDesc andExpr = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("and").getGenericUDF(), andArgs);
            // Also pass in filter as tableScan filterExpr
            ts.getConf().setFilterExpr(andExpr);
            desc.setPredicate(andExpr);
        }
    }
    // if we pushed the predicate into the table scan we need to remove the
    // synthetic conditions there.
    cleanTableScanFilters(ts);
    return false;
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) SparkPartitionPruningSinkOperator(org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) Arrays(java.util.Arrays) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) SemanticAnalyzer(org.apache.hadoop.hive.ql.parse.SemanticAnalyzer) CombineEquivalentWorkResolver(org.apache.hadoop.hive.ql.optimizer.spark.CombineEquivalentWorkResolver) ConfVars(org.apache.hadoop.hive.conf.HiveConf.ConfVars) LoggerFactory(org.slf4j.LoggerFactory) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) FunctionRegistry(org.apache.hadoop.hive.ql.exec.FunctionRegistry) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ExprNodeDynamicValueDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc) OptimizeTezProcContext(org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext) RuntimeValuesInfo(org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo) Map(java.util.Map) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) NodeProcessorCtx(org.apache.hadoop.hive.ql.lib.NodeProcessorCtx) EnumSet(java.util.EnumSet) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) MetaStoreUtils(org.apache.hadoop.hive.metastore.utils.MetaStoreUtils) DynamicPartitionPrunerContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicPartitionPrunerContext) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) SparkUtilities(org.apache.hadoop.hive.ql.exec.spark.SparkUtilities) List(java.util.List) DynamicValue(org.apache.hadoop.hive.ql.plan.DynamicValue) 
SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) GenericUDAFBloomFilterEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) Mode(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) GenericUDFIn(org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn) HashMap(java.util.HashMap) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) SemanticNodeProcessor(org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor) Stack(java.util.Stack) SemiJoinHint(org.apache.hadoop.hive.ql.parse.SemiJoinHint) ArrayList(java.util.ArrayList) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) Operation(org.apache.hadoop.hive.ql.io.AcidUtils.Operation) PlanUtils(org.apache.hadoop.hive.ql.plan.PlanUtils) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) NullOrdering(org.apache.hadoop.hive.ql.util.NullOrdering) Logger(org.slf4j.Logger) TypeInfoFactory(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Table(org.apache.hadoop.hive.ql.metadata.Table) GenTezUtils(org.apache.hadoop.hive.ql.parse.GenTezUtils) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Node(org.apache.hadoop.hive.ql.lib.Node) Partition(org.apache.hadoop.hive.ql.metadata.Partition) 
SparkPartitionPruningSinkDesc(org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc) DynamicListContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicListContext) OperatorFactory(org.apache.hadoop.hive.ql.exec.OperatorFactory) Preconditions(com.google.common.base.Preconditions) ExprNodeDescUtils(org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils) Collections(java.util.Collections) SparkPartitionPruningSinkOperator(org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ExprNodeDynamicValueDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc) ArrayList(java.util.ArrayList) SemiJoinHint(org.apache.hadoop.hive.ql.parse.SemiJoinHint) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) Partition(org.apache.hadoop.hive.ql.metadata.Partition) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) Table(org.apache.hadoop.hive.ql.metadata.Table) DynamicListContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicListContext) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) OptimizeTezProcContext(org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) 
DynamicPartitionPrunerContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicPartitionPrunerContext) Map(java.util.Map) HashMap(java.util.HashMap) DynamicValue(org.apache.hadoop.hive.ql.plan.DynamicValue)

Example 2 with DynamicListContext

Use of org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicListContext in project hive by apache.

The method cleanTableScanFilters of the class DynamicPartitionPruningOptimization.

/**
 * Removes the synthetic dynamic-pruning conditions from the filter expression
 * stored on a table scan.
 *
 * <p>Each collected condition is swapped, in place, for a constant
 * {@code true} inside its grandparent's child list. A condition without a
 * grandparent is the whole expression, so the scan's filter expression is
 * cleared instead.
 *
 * @param ts the table scan to clean; a {@code null} scan, a scan without a
 *           configuration, or one without a filter expression is left untouched
 * @throws SemanticException declared; presumably raised while collecting the
 *                           pruning conditions
 */
private void cleanTableScanFilters(TableScanOperator ts) throws SemanticException {
    boolean hasFilterExpr = ts != null && ts.getConf() != null && ts.getConf().getFilterExpr() != null;
    if (!hasFilterExpr) {
        // nothing to clean
        return;
    }
    // Gather every dynamic pruning condition present in the scan's filter.
    DynamicPartitionPrunerContext pruningConditions = new DynamicPartitionPrunerContext();
    pruningConditions.dynLists.clear();
    GenTezUtils.collectDynamicPruningConditions(ts.getConf().getFilterExpr(), pruningConditions);
    for (DynamicListContext condition : pruningConditions) {
        // Constant "true" of the same type as the condition it stands in for.
        ExprNodeDesc alwaysTrue = new ExprNodeConstantDesc(condition.parent.getTypeInfo(), true);
        if (condition.grandParent != null) {
            // Substitute the condition at its original position among its siblings.
            List<ExprNodeDesc> siblings = condition.grandParent.getChildren();
            int position = siblings.indexOf(condition.parent);
            siblings.remove(position);
            siblings.add(position, alwaysTrue);
            continue;
        }
        // The condition is the entire expression: drop the filter altogether.
        ts.getConf().setFilterExpr(null);
    }
}
Also used : ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) DynamicListContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicListContext) DynamicPartitionPrunerContext(org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicPartitionPrunerContext) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SemiJoinHint(org.apache.hadoop.hive.ql.parse.SemiJoinHint)

Aggregations

DynamicListContext (org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicListContext): 2; DynamicPartitionPrunerContext (org.apache.hadoop.hive.ql.parse.GenTezUtils.DynamicPartitionPrunerContext): 2; SemiJoinHint (org.apache.hadoop.hive.ql.parse.SemiJoinHint): 2; ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc): 2; ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 2; Preconditions (com.google.common.base.Preconditions): 1; ArrayList (java.util.ArrayList): 1; Arrays (java.util.Arrays): 1; Collections (java.util.Collections): 1; EnumSet (java.util.EnumSet): 1; HashMap (java.util.HashMap): 1; List (java.util.List): 1; Map (java.util.Map): 1; Stack (java.util.Stack): 1; HiveConf (org.apache.hadoop.hive.conf.HiveConf): 1; ConfVars (org.apache.hadoop.hive.conf.HiveConf.ConfVars): 1; MetaStoreUtils (org.apache.hadoop.hive.metastore.utils.MetaStoreUtils): 1; ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 1; FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 1; FunctionRegistry (org.apache.hadoop.hive.ql.exec.FunctionRegistry): 1