Example 1 with SemiJoinBranchInfo

use of org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo in project hive by apache.

In class SharedWorkOptimizer, method gatherDPPTableScanOps.

/**
 * This method gathers the TS operators with DPP from the context and
 * stores them into the input optimization cache.
 */
private static void gatherDPPTableScanOps(ParseContext pctx, SharedWorkOptimizerCache optimizerCache) throws SemanticException {
    // Find TS operators with partition pruning enabled in the plan,
    // because these TS may potentially read different data for
    // different pipelines.
    // These can be:
    // 1) TS with DPP.
    // 2) TS with semijoin DPP.
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    Collection<Operator<? extends OperatorDesc>> tableScanOps = Lists.<Operator<?>>newArrayList(topOps.values());
    Set<AppMasterEventOperator> s = OperatorUtils.findOperators(tableScanOps, AppMasterEventOperator.class);
    for (AppMasterEventOperator a : s) {
        if (a.getConf() instanceof DynamicPruningEventDesc) {
            DynamicPruningEventDesc dped = (DynamicPruningEventDesc) a.getConf();
            optimizerCache.tableScanToDPPSource.put(dped.getTableScan(), a);
        }
    }
    for (Entry<ReduceSinkOperator, SemiJoinBranchInfo> e : pctx.getRsToSemiJoinBranchInfo().entrySet()) {
        optimizerCache.tableScanToDPPSource.put(e.getValue().getTsOp(), e.getKey());
    }
    LOG.debug("DPP information stored in the cache: {}", optimizerCache.tableScanToDPPSource);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)
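
The cache populated above keys each TableScanOperator to every operator that feeds it dynamic pruning input, whether an event-based DPP branch (AppMasterEventOperator) or a semijoin branch (the ReduceSinkOperator from SemiJoinBranchInfo). Below is a minimal, self-contained sketch of that multimap idiom; it uses String stand-ins for the operators, and the class name DppCacheSketch and the sample labels are hypothetical, not Hive APIs.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class DppCacheSketch {

    // Maps each table scan to the set of operators that feed it dynamic
    // pruning information (event-based DPP sources and semijoin reduce sinks).
    private final Map<String, Set<String>> tableScanToDPPSource = new HashMap<>();

    void put(String tableScan, String dppSource) {
        tableScanToDPPSource.computeIfAbsent(tableScan, k -> new HashSet<>()).add(dppSource);
    }

    public static void main(String[] args) {
        DppCacheSketch cache = new DppCacheSketch();
        // Event-based DPP: an AppMasterEventOperator targets the TS it prunes.
        cache.put("TS[store_sales]", "EVENT[date_dim]");
        // Semijoin DPP: the branch's final ReduceSinkOperator targets the TS
        // recorded in its SemiJoinBranchInfo.
        cache.put("TS[store_sales]", "RS[semijoin on date_dim]");
        // Both DPP sources are now registered under the same table scan.
        System.out.println(cache.tableScanToDPPSource);
    }
}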

Example 2 with SemiJoinBranchInfo

use of org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo in project hive by apache.

In class DynamicPartitionPruningOptimization, method createFinalRsForSemiJoinOp.

private void createFinalRsForSemiJoinOp(ParseContext parseContext, TableScanOperator ts, GroupByOperator gb, ExprNodeDesc key, String keyBaseAlias, ExprNodeDesc colExpr, boolean isHint) throws SemanticException {
    ArrayList<String> gbOutputNames = new ArrayList<>();
    // One each for min, max and bloom filter
    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0));
    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1));
    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2));
    int colPos = 0;
    ArrayList<ExprNodeDesc> rsValueCols = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbOutputNames.size() - 1; i++) {
        ExprNodeColumnDesc expr = new ExprNodeColumnDesc(key.getTypeInfo(), gbOutputNames.get(colPos++), "", false);
        rsValueCols.add(expr);
    }
    // Bloom Filter uses binary
    ExprNodeColumnDesc colBFExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo, gbOutputNames.get(colPos++), "", false);
    rsValueCols.add(colBFExpr);
    // Create the final Reduce Sink Operator
    ReduceSinkDesc rsDescFinal = PlanUtils.getReduceSinkDesc(new ArrayList<ExprNodeDesc>(), rsValueCols, gbOutputNames, false, -1, 0, 1, Operation.NOT_ACID);
    ReduceSinkOperator rsOpFinal = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDescFinal, new RowSchema(gb.getSchema()), gb);
    Map<String, ExprNodeDesc> columnExprMap = new HashMap<>();
    rsOpFinal.setColumnExprMap(columnExprMap);
    LOG.debug("DynamicSemiJoinPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts);
    SemiJoinBranchInfo sjInfo = new SemiJoinBranchInfo(ts, isHint);
    parseContext.getRsToSemiJoinBranchInfo().put(rsOpFinal, sjInfo);
    // Save the info that is required at query time to resolve dynamic/runtime values.
    RuntimeValuesInfo runtimeValuesInfo = new RuntimeValuesInfo();
    TableDesc rsFinalTableDesc = PlanUtils.getReduceValueTableDesc(PlanUtils.getFieldSchemasFromColumnList(rsValueCols, "_col"));
    List<String> dynamicValueIDs = new ArrayList<String>();
    dynamicValueIDs.add(keyBaseAlias + "_min");
    dynamicValueIDs.add(keyBaseAlias + "_max");
    dynamicValueIDs.add(keyBaseAlias + "_bloom_filter");
    runtimeValuesInfo.setTableDesc(rsFinalTableDesc);
    runtimeValuesInfo.setDynamicValueIDs(dynamicValueIDs);
    runtimeValuesInfo.setColExprs(rsValueCols);
    runtimeValuesInfo.setTsColExpr(colExpr);
    parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo);
    parseContext.getColExprToGBMap().put(key, gb);
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SemiJoinHint(org.apache.hadoop.hive.ql.parse.SemiJoinHint) RuntimeValuesInfo(org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
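
The three dynamic value IDs registered above follow a fixed naming convention: the key's base alias suffixed with _min, _max, and _bloom_filter, one per column produced by the group-by. A small sketch of the convention follows; the helper name dynamicValueIds and the sample alias are hypothetical.

import java.util.Arrays;
import java.util.List;

public class DynamicValueIdsSketch {

    // Builds the three runtime value IDs for a semijoin branch, mirroring the
    // naming used in createFinalRsForSemiJoinOp: min, max, and bloom filter.
    static List<String> dynamicValueIds(String keyBaseAlias) {
        return Arrays.asList(
                keyBaseAlias + "_min",
                keyBaseAlias + "_max",
                keyBaseAlias + "_bloom_filter");
    }

    public static void main(String[] args) {
        // Prints [k_min, k_max, k_bloom_filter]
        System.out.println(dynamicValueIds("k"));
    }
}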

Example 3 with SemiJoinBranchInfo

use of org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo in project hive by apache.

In class SharedWorkOptimizer, method findDescendantWorkOperators.

private static Set<Operator<?>> findDescendantWorkOperators(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, Operator<?> start, Set<Operator<?>> excludeOps) {
    // Find operators in work
    Set<Operator<?>> workOps = findWorkOperators(optimizerCache, start);
    // Gather operators from the output works
    Set<Operator<?>> result = new HashSet<Operator<?>>();
    Set<Operator<?>> set;
    while (!workOps.isEmpty()) {
        set = new HashSet<Operator<?>>();
        for (Operator<?> op : workOps) {
            if (excludeOps.contains(op)) {
                continue;
            }
            if (op instanceof ReduceSinkOperator) {
                if (op.getChildOperators() != null) {
                    // All children of RS are descendants
                    for (Operator<?> child : op.getChildOperators()) {
                        set.addAll(findWorkOperators(optimizerCache, child));
                    }
                }
                // Semijoin DPP work is considered a descendant because this work
                // needs to finish before it can execute
                SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
                if (sjbi != null) {
                    set.addAll(findWorkOperators(optimizerCache, sjbi.getTsOp()));
                }
            } else if (op.getConf() instanceof DynamicPruningEventDesc) {
                // DPP work is considered a descendant because this work needs
                // to finish before it can execute
                set.addAll(findWorkOperators(optimizerCache, ((DynamicPruningEventDesc) op.getConf()).getTableScan()));
            }
        }
        workOps = set;
        result.addAll(set);
    }
    return result;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)
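
The while loop above is a frontier expansion: each pass replaces the work set with everything reachable one work away (through RS children, semijoin branches, and DPP edges) and stops once a pass yields nothing new. The self-contained sketch below runs the same traversal over a toy work graph; unlike the original, which relies on the plan being a DAG to terminate, it guards against revisiting nodes explicitly. The work labels and the name findDescendants are illustrative only.

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class DescendantsSketch {

    // Level-by-level frontier expansion: collect every work transitively
    // reachable from start, skipping excluded works.
    static Set<String> findDescendants(Map<String, List<String>> edges, String start,
            Set<String> excludeOps) {
        Set<String> result = new LinkedHashSet<>();
        Set<String> frontier = new HashSet<>(Collections.singleton(start));
        while (!frontier.isEmpty()) {
            Set<String> next = new HashSet<>();
            for (String work : frontier) {
                if (excludeOps.contains(work)) {
                    continue;
                }
                for (String child : edges.getOrDefault(work, Collections.emptyList())) {
                    // Only expand works we have not seen before
                    if (result.add(child)) {
                        next.add(child);
                    }
                }
            }
            frontier = next;
        }
        return result;
    }

    public static void main(String[] args) {
        Map<String, List<String>> edges = new HashMap<>();
        edges.put("Map 1", Arrays.asList("Reducer 2"));
        // A semijoin branch makes the pruned scan's work a descendant as well.
        edges.put("Reducer 2", Arrays.asList("Reducer 3", "Map 4"));
        // Prints [Reducer 2, Reducer 3, Map 4]
        System.out.println(findDescendants(edges, "Map 1", Collections.emptySet()));
    }
}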

Example 4 with SemiJoinBranchInfo

use of org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo in project hive by apache.

In class SharedWorkOptimizer, method transform.

@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
    final Map<String, TableScanOperator> topOps = pctx.getTopOps();
    if (topOps.size() < 2) {
        // Nothing to do, bail out
        return pctx;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Before SharedWorkOptimizer:\n" + Operator.toString(pctx.getTopOps().values()));
    }
    // Cache to use during optimization
    SharedWorkOptimizerCache optimizerCache = new SharedWorkOptimizerCache();
    // Gather information about the DPP table scans and store it in the cache
    gatherDPPTableScanOps(pctx, optimizerCache);
    // Map of dbName.TblName -> TSOperator
    Multimap<String, TableScanOperator> tableNameToOps = splitTableScanOpsByTable(pctx);
    // We enforce a certain order when we do the reutilization.
    // In particular, we rank the tables by the size of the table
    // times the number of reads.
    List<Entry<String, Long>> sortedTables = rankTablesByAccumulatedSize(pctx);
    LOG.debug("Sorted tables by size: {}", sortedTables);
    // Execute optimization
    Multimap<String, TableScanOperator> existingOps = ArrayListMultimap.create();
    Set<Operator<?>> removedOps = new HashSet<>();
    for (Entry<String, Long> tablePair : sortedTables) {
        String tableName = tablePair.getKey();
        for (TableScanOperator discardableTsOp : tableNameToOps.get(tableName)) {
            if (removedOps.contains(discardableTsOp)) {
                LOG.debug("Skip {} as it has already been removed", discardableTsOp);
                continue;
            }
            Collection<TableScanOperator> prevTsOps = existingOps.get(tableName);
            for (TableScanOperator retainableTsOp : prevTsOps) {
                if (removedOps.contains(retainableTsOp)) {
                    LOG.debug("Skip {} as it has already been removed", retainableTsOp);
                    continue;
                }
                // First we quickly check if the two table scan operators can actually be merged
                boolean mergeable = areMergeable(pctx, optimizerCache, retainableTsOp, discardableTsOp);
                if (!mergeable) {
                    // Skip
                    LOG.debug("{} and {} cannot be merged", retainableTsOp, discardableTsOp);
                    continue;
                }
                // Secondly, we extract information about the part of the tree that can be merged
                // as well as some structural information (memory consumption) that needs to be
                // used to determine whether the merge can happen
                SharedResult sr = extractSharedOptimizationInfoForRoot(pctx, optimizerCache, retainableTsOp, discardableTsOp);
                // Check that the plan meets the preconditions before merging.
                if (!validPreConditions(pctx, optimizerCache, sr)) {
                    // Skip
                    LOG.debug("{} and {} do not meet preconditions", retainableTsOp, discardableTsOp);
                    continue;
                }
                // We can merge
                if (sr.retainableOps.size() > 1) {
                    // More than the TS operator is shared
                    Operator<?> lastRetainableOp = sr.retainableOps.get(sr.retainableOps.size() - 1);
                    Operator<?> lastDiscardableOp = sr.discardableOps.get(sr.discardableOps.size() - 1);
                    if (lastDiscardableOp.getNumChild() != 0) {
                        List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(lastDiscardableOp.getChildOperators());
                        for (Operator<? extends OperatorDesc> op : allChildren) {
                            lastDiscardableOp.getChildOperators().remove(op);
                            op.replaceParent(lastDiscardableOp, lastRetainableOp);
                            lastRetainableOp.getChildOperators().add(op);
                        }
                    }
                    LOG.debug("Merging subtree starting at {} into subtree starting at {}", discardableTsOp, retainableTsOp);
                } else {
                    // Only the TS operator is shared
                    ExprNodeGenericFuncDesc exprNode = null;
                    if (retainableTsOp.getConf().getFilterExpr() != null) {
                        // Push filter on top of children
                        pushFilterToTopOfTableScan(optimizerCache, retainableTsOp);
                        // Clone to push to table scan
                        exprNode = (ExprNodeGenericFuncDesc) retainableTsOp.getConf().getFilterExpr();
                    }
                    if (discardableTsOp.getConf().getFilterExpr() != null) {
                        // Push filter on top
                        pushFilterToTopOfTableScan(optimizerCache, discardableTsOp);
                        ExprNodeGenericFuncDesc tsExprNode = discardableTsOp.getConf().getFilterExpr();
                        if (exprNode != null && !exprNode.isSame(tsExprNode)) {
                            // We merge filters from previous scan by ORing with filters from current scan
                            if (exprNode.getGenericUDF() instanceof GenericUDFOPOr) {
                                List<ExprNodeDesc> newChildren = new ArrayList<>(exprNode.getChildren().size() + 1);
                                for (ExprNodeDesc childExprNode : exprNode.getChildren()) {
                                    if (childExprNode.isSame(tsExprNode)) {
                                        // We do not need to do anything, it is in the OR expression
                                        break;
                                    }
                                    newChildren.add(childExprNode);
                                }
                                if (exprNode.getChildren().size() == newChildren.size()) {
                                    newChildren.add(tsExprNode);
                                    exprNode = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), newChildren);
                                }
                            } else {
                                exprNode = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), Arrays.<ExprNodeDesc>asList(exprNode, tsExprNode));
                            }
                        }
                    }
                    // Replace filter
                    retainableTsOp.getConf().setFilterExpr(exprNode);
                    // Replace table scan operator
                    List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(discardableTsOp.getChildOperators());
                    for (Operator<? extends OperatorDesc> op : allChildren) {
                        discardableTsOp.getChildOperators().remove(op);
                        op.replaceParent(discardableTsOp, retainableTsOp);
                        retainableTsOp.getChildOperators().add(op);
                    }
                    LOG.debug("Merging {} into {}", discardableTsOp, retainableTsOp);
                }
                // First we remove the input operators of the subtree we are going to eliminate
                for (Operator<?> op : sr.discardableInputOps) {
                    OperatorUtils.removeOperator(op);
                    optimizerCache.removeOp(op);
                    removedOps.add(op);
                    // Remove DPP predicates
                    if (op instanceof ReduceSinkOperator) {
                        SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
                        if (sjbi != null && !sr.discardableOps.contains(sjbi.getTsOp()) && !sr.discardableInputOps.contains(sjbi.getTsOp())) {
                            GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) op, sjbi.getTsOp());
                            optimizerCache.tableScanToDPPSource.remove(sjbi.getTsOp(), op);
                        }
                    } else if (op instanceof AppMasterEventOperator) {
                        DynamicPruningEventDesc dped = (DynamicPruningEventDesc) op.getConf();
                        if (!sr.discardableOps.contains(dped.getTableScan()) && !sr.discardableInputOps.contains(dped.getTableScan())) {
                            GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) op, dped.getTableScan());
                            optimizerCache.tableScanToDPPSource.remove(dped.getTableScan(), op);
                        }
                    }
                    LOG.debug("Input operator removed: {}", op);
                }
                // Then we merge the operators of the works we are going to merge
                optimizerCache.removeOpAndCombineWork(discardableTsOp, retainableTsOp);
                removedOps.add(discardableTsOp);
                // Finally we remove the expression from the tree
                for (Operator<?> op : sr.discardableOps) {
                    OperatorUtils.removeOperator(op);
                    optimizerCache.removeOp(op);
                    removedOps.add(op);
                    if (sr.discardableOps.size() == 1) {
                        // If there is a single discardable operator, it is a TableScanOperator
                        // and it means that we have merged filter expressions for it. Thus, we
                        // might need to remove DPP predicates from the retainable TableScanOperator
                        Collection<Operator<?>> c = optimizerCache.tableScanToDPPSource.get((TableScanOperator) op);
                        for (Operator<?> dppSource : c) {
                            if (dppSource instanceof ReduceSinkOperator) {
                                GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) dppSource, (TableScanOperator) sr.retainableOps.get(0));
                                optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), dppSource);
                            } else if (dppSource instanceof AppMasterEventOperator) {
                                GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) dppSource, (TableScanOperator) sr.retainableOps.get(0));
                                optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), dppSource);
                            }
                        }
                    }
                    LOG.debug("Operator removed: {}", op);
                }
                break;
            }
            if (removedOps.contains(discardableTsOp)) {
                // This operator has been removed, remove it from the list of existing operators
                existingOps.remove(tableName, discardableTsOp);
            } else {
                // This operator has not been removed, include it in the list of existing operators
                existingOps.put(tableName, discardableTsOp);
            }
        }
    }
    // Remove unused table scan operators
    Iterator<Entry<String, TableScanOperator>> it = topOps.entrySet().iterator();
    while (it.hasNext()) {
        Entry<String, TableScanOperator> e = it.next();
        if (e.getValue().getNumChild() == 0) {
            it.remove();
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("After SharedWorkOptimizer:\n" + Operator.toString(pctx.getTopOps().values()));
    }
    if (pctx.getConf().getBoolVar(ConfVars.HIVE_SHARED_WORK_EXTENDED_OPTIMIZATION)) {
        // Gather RS operators that 1) belong to root works, i.e., works containing TS operators,
        // and 2) share the same input operator.
        // These will be the first target for extended shared work optimization
        Multimap<Operator<?>, ReduceSinkOperator> parentToRsOps = ArrayListMultimap.create();
        Set<Operator<?>> visited = new HashSet<>();
        for (Entry<String, TableScanOperator> e : topOps.entrySet()) {
            gatherReduceSinkOpsByInput(parentToRsOps, visited, findWorkOperators(optimizerCache, e.getValue()));
        }
        while (!parentToRsOps.isEmpty()) {
            // As above, we enforce a certain order when we do the reutilization.
            // In particular, we rank the groups by the size of the data in the RS
            // times the number of uses.
            List<Entry<Operator<?>, Long>> sortedRSGroups = rankOpsByAccumulatedSize(parentToRsOps.keySet());
            LOG.debug("Sorted operators by size: {}", sortedRSGroups);
            // Execute extended optimization
            // For each RS, check whether other RS in the same work could be merged into this one.
            // If they are merged, RS operators in the resulting work will be considered
            // mergeable in next loop iteration.
            Multimap<Operator<?>, ReduceSinkOperator> existingRsOps = ArrayListMultimap.create();
            for (Entry<Operator<?>, Long> rsGroupInfo : sortedRSGroups) {
                Operator<?> rsParent = rsGroupInfo.getKey();
                for (ReduceSinkOperator discardableRsOp : parentToRsOps.get(rsParent)) {
                    if (removedOps.contains(discardableRsOp)) {
                        LOG.debug("Skip {} as it has already been removed", discardableRsOp);
                        continue;
                    }
                    Collection<ReduceSinkOperator> otherRsOps = existingRsOps.get(rsParent);
                    for (ReduceSinkOperator retainableRsOp : otherRsOps) {
                        if (removedOps.contains(retainableRsOp)) {
                            LOG.debug("Skip {} as it has already been removed", retainableRsOp);
                            continue;
                        }
                        // First we quickly check if the two RS operators can actually be merged.
                        // We already know that these two RS operators have the same parent, but
                        // we need to check whether both RS are actually equal. Further, we check
                        // whether their child is also equal. If any of these conditions are not
                        // met, we are not going to try to merge.
                        boolean mergeable = compareOperator(pctx, retainableRsOp, discardableRsOp) && compareOperator(pctx, retainableRsOp.getChildOperators().get(0), discardableRsOp.getChildOperators().get(0));
                        if (!mergeable) {
                            // Skip
                            LOG.debug("{} and {} cannot be merged", retainableRsOp, discardableRsOp);
                            continue;
                        }
                        LOG.debug("Checking additional conditions for merging subtree starting at {}" + " into subtree starting at {}", discardableRsOp, retainableRsOp);
                        // Secondly, we extract information about the part of the tree that can be merged
                        // as well as some structural information (memory consumption) that needs to be
                        // used to determine whether the merge can happen
                        Operator<?> retainableRsOpChild = retainableRsOp.getChildOperators().get(0);
                        Operator<?> discardableRsOpChild = discardableRsOp.getChildOperators().get(0);
                        SharedResult sr = extractSharedOptimizationInfo(pctx, optimizerCache, retainableRsOp, discardableRsOp, retainableRsOpChild, discardableRsOpChild);
                        // Check that the plan meets the preconditions before merging.
                        if (sr.retainableOps.isEmpty() || !validPreConditions(pctx, optimizerCache, sr)) {
                            // Skip
                            LOG.debug("{} and {} do not meet preconditions", retainableRsOp, discardableRsOp);
                            continue;
                        }
                        // We can merge
                        Operator<?> lastRetainableOp = sr.retainableOps.get(sr.retainableOps.size() - 1);
                        Operator<?> lastDiscardableOp = sr.discardableOps.get(sr.discardableOps.size() - 1);
                        if (lastDiscardableOp.getNumChild() != 0) {
                            List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(lastDiscardableOp.getChildOperators());
                            for (Operator<? extends OperatorDesc> op : allChildren) {
                                lastDiscardableOp.getChildOperators().remove(op);
                                op.replaceParent(lastDiscardableOp, lastRetainableOp);
                                lastRetainableOp.getChildOperators().add(op);
                            }
                        }
                        LOG.debug("Merging subtree starting at {} into subtree starting at {}", discardableRsOp, retainableRsOp);
                        // First we remove the input operators of the subtree we are going to eliminate
                        for (Operator<?> op : sr.discardableInputOps) {
                            OperatorUtils.removeOperator(op);
                            optimizerCache.removeOp(op);
                            removedOps.add(op);
                            // Remove DPP predicates
                            if (op instanceof ReduceSinkOperator) {
                                SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
                                if (sjbi != null && !sr.discardableOps.contains(sjbi.getTsOp()) && !sr.discardableInputOps.contains(sjbi.getTsOp())) {
                                    GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) op, sjbi.getTsOp());
                                    optimizerCache.tableScanToDPPSource.remove(sjbi.getTsOp(), op);
                                }
                            } else if (op instanceof AppMasterEventOperator) {
                                DynamicPruningEventDesc dped = (DynamicPruningEventDesc) op.getConf();
                                if (!sr.discardableOps.contains(dped.getTableScan()) && !sr.discardableInputOps.contains(dped.getTableScan())) {
                                    GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) op, dped.getTableScan());
                                    optimizerCache.tableScanToDPPSource.remove(dped.getTableScan(), op);
                                }
                            }
                            LOG.debug("Input operator removed: {}", op);
                        }
                        // We remove the discardable RS operator
                        OperatorUtils.removeOperator(discardableRsOp);
                        optimizerCache.removeOp(discardableRsOp);
                        removedOps.add(discardableRsOp);
                        LOG.debug("Operator removed: {}", discardableRsOp);
                        // Then we merge the operators of the works we are going to merge
                        optimizerCache.removeOpAndCombineWork(discardableRsOpChild, retainableRsOpChild);
                        // Finally we remove the rest of the expression from the tree
                        for (Operator<?> op : sr.discardableOps) {
                            OperatorUtils.removeOperator(op);
                            optimizerCache.removeOp(op);
                            removedOps.add(op);
                            LOG.debug("Operator removed: {}", op);
                        }
                        break;
                    }
                    if (removedOps.contains(discardableRsOp)) {
                        // This operator has been removed, remove it from the list of existing operators
                        existingRsOps.remove(rsParent, discardableRsOp);
                    } else {
                        // This operator has not been removed, include it in the list of existing operators
                        existingRsOps.put(rsParent, discardableRsOp);
                    }
                }
            }
            // We gather the operators that will be used for next iteration of extended optimization
            // (if any)
            parentToRsOps = ArrayListMultimap.create();
            visited = new HashSet<>();
            for (Entry<Operator<?>, ReduceSinkOperator> e : existingRsOps.entries()) {
                if (removedOps.contains(e.getValue()) || e.getValue().getNumChild() < 1) {
                    // The operator was removed or it has no children (e.g., it is a
                    // semijoin RS), so we can quickly skip this one
                    continue;
                }
                gatherReduceSinkOpsByInput(parentToRsOps, visited, findWorkOperators(optimizerCache, e.getValue().getChildOperators().get(0)));
            }
        }
        // Remove unused table scan operators
        it = topOps.entrySet().iterator();
        while (it.hasNext()) {
            Entry<String, TableScanOperator> e = it.next();
            if (e.getValue().getNumChild() == 0) {
                it.remove();
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("After SharedWorkExtendedOptimizer:\n" + Operator.toString(pctx.getTopOps().values()));
        }
    }
    // In test mode, check that the cache contents match the actual plan, whether
    // we use the basic or the extended version of the optimizer.
    if (pctx.getConf().getBoolVar(ConfVars.HIVE_IN_TEST)) {
        Set<Operator<?>> visited = new HashSet<>();
        it = topOps.entrySet().iterator();
        while (it.hasNext()) {
            Entry<String, TableScanOperator> e = it.next();
            for (Operator<?> op : OperatorUtils.findOperators(e.getValue(), Operator.class)) {
                if (!visited.contains(op)) {
                    if (!findWorkOperators(optimizerCache, op).equals(findWorkOperators(op, new HashSet<Operator<?>>()))) {
                        throw new SemanticException("Error in shared work optimizer: operator cache contents and actual plan differ");
                    }
                    visited.add(op);
                }
            }
        }
    }
    return pctx;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) ArrayList(java.util.ArrayList) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) Entry(java.util.Map.Entry) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) GenericUDFOPOr(org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr)
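
One step of transform worth isolating is the filter merge for the single-TS case above: the retained scan's filter is ORed with the discarded scan's filter, flattening into an existing OR and skipping a branch that is already present (the isSame check). The sketch below reproduces that logic over a toy expression type; Expr and its methods are stand-ins for ExprNodeGenericFuncDesc, not Hive classes.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class FilterOrMergeSketch {

    // A tiny stand-in for an expression node: either a leaf predicate or an OR
    // over child expressions.
    static class Expr {
        final String leaf;           // set for leaf predicates
        final List<Expr> orChildren; // set for OR nodes

        Expr(String leaf) { this.leaf = leaf; this.orChildren = null; }
        Expr(List<Expr> orChildren) { this.leaf = null; this.orChildren = orChildren; }

        boolean isSame(Expr o) {
            if (leaf != null) {
                return leaf.equals(o.leaf);
            }
            if (o.orChildren == null || orChildren.size() != o.orChildren.size()) {
                return false;
            }
            for (int i = 0; i < orChildren.size(); i++) {
                if (!orChildren.get(i).isSame(o.orChildren.get(i))) {
                    return false;
                }
            }
            return true;
        }

        @Override public String toString() {
            return leaf != null ? leaf : "OR" + orChildren;
        }
    }

    // Mirrors the merge step: OR the two filters, flatten into an existing OR,
    // and keep the retained expression unchanged if the branch already exists.
    static Expr merge(Expr retained, Expr discarded) {
        if (retained == null) {
            return discarded;
        }
        if (discarded == null || retained.isSame(discarded)) {
            return retained;
        }
        if (retained.orChildren != null) {
            for (Expr child : retained.orChildren) {
                if (child.isSame(discarded)) {
                    return retained; // already one of the OR branches
                }
            }
            List<Expr> children = new ArrayList<>(retained.orChildren);
            children.add(discarded);
            return new Expr(children);
        }
        return new Expr(Arrays.asList(retained, discarded));
    }

    public static void main(String[] args) {
        Expr a = new Expr("ss_sold_date_sk = 2450815");
        Expr b = new Expr("ss_sold_date_sk = 2450816");
        Expr merged = merge(a, b);
        // Prints the OR of both predicates; merging b again is a no-op.
        System.out.println(merged);
        System.out.println(merge(merged, b).isSame(merged));
    }
}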

Example 5 with SemiJoinBranchInfo

use of org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo in project hive by apache.

In class SharedWorkOptimizer, method findChildWorkOperators.

private static Set<Operator<?>> findChildWorkOperators(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, Operator<?> start) {
    // Find operators in work
    Set<Operator<?>> workOps = findWorkOperators(optimizerCache, start);
    // Gather operators from the output works
    Set<Operator<?>> set = new HashSet<Operator<?>>();
    for (Operator<?> op : workOps) {
        if (op instanceof ReduceSinkOperator) {
            if (op.getChildOperators() != null) {
                // All children of RS are descendants
                for (Operator<?> child : op.getChildOperators()) {
                    set.addAll(findWorkOperators(optimizerCache, child));
                }
            }
            // Semijoin DPP work is considered a child because this work needs
            // to finish before it can execute
            SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
            if (sjbi != null) {
                set.addAll(findWorkOperators(optimizerCache, sjbi.getTsOp()));
            }
        } else if (op.getConf() instanceof DynamicPruningEventDesc) {
            // DPP work is considered a child because this work needs
            // to finish before it can execute
            set.addAll(findWorkOperators(optimizerCache, ((DynamicPruningEventDesc) op.getConf()).getTableScan()));
        }
    }
    return set;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)
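
findChildWorkOperators is the one-step counterpart of findDescendantWorkOperators from Example 3: it collects only the works directly reachable from the start work (RS children plus semijoin and DPP targets), without iterating to a fixpoint. A toy sketch of the distinction, with illustrative names:

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class ChildWorksSketch {

    // One step of the traversal from Example 3: direct children only, no recursion.
    static Set<String> findChildren(Map<String, List<String>> edges, String start) {
        return new HashSet<>(edges.getOrDefault(start, Collections.emptyList()));
    }

    public static void main(String[] args) {
        Map<String, List<String>> edges = new HashMap<>();
        edges.put("Map 1", Arrays.asList("Reducer 2", "Map 3"));
        edges.put("Reducer 2", Arrays.asList("Reducer 4"));
        // Prints only the direct children; Reducer 4 would appear only in the
        // transitive descendant set of Example 3.
        System.out.println(findChildren(edges, "Map 1"));
    }
}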

Aggregations

ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 5
SemiJoinBranchInfo (org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo): 5
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 4
DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator): 4
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 4
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 4
Operator (org.apache.hadoop.hive.ql.exec.Operator): 4
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 4
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 4
DynamicPruningEventDesc (org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc): 4
HashSet (java.util.HashSet): 3
LinkedHashSet (java.util.LinkedHashSet): 3
ArrayList (java.util.ArrayList): 2
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 2
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 2
HashMap (java.util.HashMap): 1
Entry (java.util.Map.Entry): 1
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 1
RuntimeValuesInfo (org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo): 1
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 1