Example 6 with ExecDriver

Use of org.apache.hadoop.hive.ql.exec.mr.ExecDriver in project hive by apache.

From the class BucketingSortingInferenceOptimizer, method inferBucketingSorting:

/**
   * For each map reduce task, if it has a reducer, infer whether the final output of the
   * reducer is bucketed and/or sorted.
   *
   * @param mapRedTasks the map reduce tasks whose final outputs are examined
   * @throws SemanticException
   */
private void inferBucketingSorting(List<ExecDriver> mapRedTasks) throws SemanticException {
    for (ExecDriver mapRedTask : mapRedTasks) {
        // Inference is only applied to the final map reduce task; it is not extended to
        // the outputs of intermediate map reduce jobs.
        if (!mapRedTask.getWork().isFinalMapRed()) {
            continue;
        }
        if (mapRedTask.getWork().getReduceWork() == null) {
            continue;
        }
        Operator<? extends OperatorDesc> reducer = mapRedTask.getWork().getReduceWork().getReducer();
        // If the map work uses sampling, the output is not bucketed, so disable
        // bucketing inference
        boolean disableBucketing = mapRedTask.getWork().getMapWork().getSamplingType() > 0;
        BucketingSortingCtx bCtx = new BucketingSortingCtx(disableBucketing);
        // RuleRegExp rules are used to match operators anywhere in the tree
        // RuleExactMatch rules are used to specify exactly what the tree should look like
        // In particular, this guarantees that the first operator is the reducer
        // (and its parent(s) are ReduceSinkOperators) since it begins walking the tree from
        // the reducer.
        Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
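        // Matches a SelectOperator wherever it appears during the walk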
        opRules.put(new RuleRegExp("R1", SelectOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getSelProc());
        // Matches only GroupByOperators which are reducers, rather than map group by operators,
        // or multi group by optimization specific operators
        opRules.put(new RuleExactMatch("R2", new String[] { GroupByOperator.getOperatorName() }), BucketingSortingOpProcFactory.getGroupByProc());
        // Matches only JoinOperators which are reducers, rather than map joins, SMB map joins, etc.
        opRules.put(new RuleExactMatch("R3", new String[] { JoinOperator.getOperatorName() }), BucketingSortingOpProcFactory.getJoinProc());
        opRules.put(new RuleRegExp("R5", FileSinkOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getFileSinkProc());
        opRules.put(new RuleRegExp("R7", FilterOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getFilterProc());
        opRules.put(new RuleRegExp("R8", LimitOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getLimitProc());
        opRules.put(new RuleRegExp("R9", LateralViewForwardOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getLateralViewForwardProc());
        opRules.put(new RuleRegExp("R10", LateralViewJoinOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getLateralViewJoinProc());
        // Matches only ForwardOperators which are preceded by some other operator in the tree,
        // in particular it can't be a reducer (and hence cannot be one of the ForwardOperators
        // added by the multi group by optimization)
        opRules.put(new RuleRegExp("R11", ".+" + ForwardOperator.getOperatorName() + "%"), BucketingSortingOpProcFactory.getForwardProc());
        // Matches only ForwardOperators which are reducers and are followed by GroupByOperators
        // (specific to the multi group by optimization)
        opRules.put(new RuleExactMatch("R12", new String[] { ForwardOperator.getOperatorName(), GroupByOperator.getOperatorName() }), BucketingSortingOpProcFactory.getMultiGroupByProc());
        // The dispatcher fires the processor corresponding to the closest matching rule and passes
        // the context along
        Dispatcher disp = new DefaultRuleDispatcher(BucketingSortingOpProcFactory.getDefaultProc(), opRules, bCtx);
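        // Walk the tree top-down from the reducer, visiting parents before children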
        GraphWalker ogw = new PreOrderWalker(disp);
        // Create the list of top operator nodes to start the walk from (here, just the reducer)
        ArrayList<Node> topNodes = new ArrayList<Node>();
        topNodes.add(reducer);
        ogw.startWalking(topNodes, null);
        mapRedTask.getWork().getMapWork().getBucketedColsByDirectory().putAll(bCtx.getBucketedColsByDirectory());
        mapRedTask.getWork().getMapWork().getSortedColsByDirectory().putAll(bCtx.getSortedColsByDirectory());
    }
}
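
The rules above hinge on the difference between RuleRegExp and RuleExactMatch: a RuleRegExp pattern such as SEL% is matched against the operator names accumulated on the walker's stack, so it can fire wherever the operator appears, while a RuleExactMatch fires only when the whole stack equals the given sequence of names. The following self-contained sketch (no Hive dependencies; the class and helper names are invented for illustration) mimics that matching on plain operator-name lists. The operator abbreviations (GBY, SEL, FOR) and the %-joined stack encoding follow Hive's convention, but the simplified suffix matching only approximates RuleRegExp's actual cost-based matching.

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

public class RuleMatchSketch {

    // Mimics RuleRegExp: each node on the stack contributes "name%", and the
    // rule fires when the pattern matches a suffix of the encoded stack.
    static boolean regExpMatches(String pattern, List<String> stack) {
        StringBuilder encoded = new StringBuilder();
        for (String op : stack) {
            encoded.append(op).append('%');
        }
        return Pattern.compile(pattern + "$").matcher(encoded).find();
    }

    // Mimics RuleExactMatch: the stack must equal the given sequence exactly.
    static boolean exactMatches(String[] pattern, List<String> stack) {
        return Arrays.asList(pattern).equals(stack);
    }

    public static void main(String[] args) {
        // The walk starts at the reducer, so a reduce-side GroupByOperator is
        // visited with only itself on the stack ...
        List<String> reducerGroupBy = Arrays.asList("GBY");
        // ... whereas under the multi group by optimization the reducer is a
        // ForwardOperator with GroupByOperators below it.
        List<String> multiGroupBy = Arrays.asList("FOR", "GBY");

        // R2 (exact match on {GBY}) fires only for the bare reduce-side group by.
        System.out.println(exactMatches(new String[] {"GBY"}, reducerGroupBy)); // true
        System.out.println(exactMatches(new String[] {"GBY"}, multiGroupBy));   // false

        // R12 (exact match on {FOR, GBY}) picks out the multi group by shape.
        System.out.println(exactMatches(new String[] {"FOR", "GBY"}, multiGroupBy)); // true

        // R1 ("SEL%") fires wherever a SelectOperator is on top of the stack.
        System.out.println(regExpMatches("SEL%", Arrays.asList("GBY", "SEL"))); // true

        // R11 (".+FOR%") requires at least one operator above the ForwardOperator,
        // so it does not fire when the ForwardOperator is itself the reducer.
        System.out.println(regExpMatches(".+FOR%", Arrays.asList("FOR")));        // false
        System.out.println(regExpMatches(".+FOR%", Arrays.asList("GBY", "FOR"))); // true
    }
}

As the comment before the DefaultRuleDispatcher notes, the closest matching rule wins, and any operator no rule matches falls through to BucketingSortingOpProcFactory.getDefaultProc().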
Also used:
NodeProcessor (org.apache.hadoop.hive.ql.lib.NodeProcessor)
DefaultRuleDispatcher (org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher)
Node (org.apache.hadoop.hive.ql.lib.Node)
RuleRegExp (org.apache.hadoop.hive.ql.lib.RuleRegExp)
ArrayList (java.util.ArrayList)
Dispatcher (org.apache.hadoop.hive.ql.lib.Dispatcher)
LinkedHashMap (java.util.LinkedHashMap)
RuleExactMatch (org.apache.hadoop.hive.ql.lib.RuleExactMatch)
ExecDriver (org.apache.hadoop.hive.ql.exec.mr.ExecDriver)
Rule (org.apache.hadoop.hive.ql.lib.Rule)
PreOrderWalker (org.apache.hadoop.hive.ql.lib.PreOrderWalker)
GraphWalker (org.apache.hadoop.hive.ql.lib.GraphWalker)

Aggregations

ExecDriver (org.apache.hadoop.hive.ql.exec.mr.ExecDriver): 6
Serializable (java.io.Serializable): 4
ArrayList (java.util.ArrayList): 4
Task (org.apache.hadoop.hive.ql.exec.Task): 4
LinkedList (java.util.LinkedList): 3
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 3
FetchTask (org.apache.hadoop.hive.ql.exec.FetchTask): 3
HashSet (java.util.HashSet): 2
List (java.util.List): 2
Path (org.apache.hadoop.fs.Path): 2
Context (org.apache.hadoop.hive.ql.Context): 2
ExplainTask (org.apache.hadoop.hive.ql.exec.ExplainTask): 2
Operator (org.apache.hadoop.hive.ql.exec.Operator): 2
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 2
IOException (java.io.IOException): 1
LinkedHashMap (java.util.LinkedHashMap): 1
LinkedHashSet (java.util.LinkedHashSet): 1
ContentSummary (org.apache.hadoop.fs.ContentSummary): 1
PathFilter (org.apache.hadoop.fs.PathFilter): 1
Warehouse (org.apache.hadoop.hive.metastore.Warehouse): 1