Search in sources :

Example 1 with SparkReduceSinkMapJoinProc

Use of org.apache.hadoop.hive.ql.optimizer.spark.SparkReduceSinkMapJoinProc in the Apache Hive project.

From the class SparkCompiler, method generateTaskTreeHelper.

/**
 * Registers the operator rules used for Spark task generation and walks the
 * operator tree starting at {@code topNodes}, dispatching each visited node to
 * the processor of the matching rule.
 *
 * @param procCtx  context handed to the dispatcher, the walker and every processor
 * @param topNodes nodes from which the walk starts
 * @throws SemanticException declared by the processors and the graph walker
 */
private void generateTaskTreeHelper(GenSparkProcContext procCtx, List<Node> topNodes) throws SemanticException {
    // Processor for union operators: it only records that a union was seen.
    final NodeProcessor unionRecorder = new NodeProcessor() {

        @Override
        public Object process(Node node, Stack<Node> path, NodeProcessorCtx ctx, Object... outputs) throws SemanticException {
            GenSparkProcContext sparkCtx = (GenSparkProcContext) ctx;
            UnionOperator seenUnion = (UnionOperator) node;
            sparkCtx.currentUnionOperators.add(seenUnion);
            return null;
        }
    };

    /*
     * SMB join case:   (Big)   (Small)  (Small)
     *                    TS       TS       TS
     *                     \       |       /
     *                      \      DS     DS
     *                        \   |    /
     *                        SMBJoinOP
     *
     * Other processors expect a single traversal beyond the SMBJoinOp, so the walk
     * may only continue past it when arriving from the big-table path; arriving
     * from a small-table path must stop at the SMBJoinOp. In both cases the current
     * root operator is recorded in the SMB join info kept on the context, so the
     * MapWork can be annotated later on.
     */
    final NodeProcessor smbJoinGate = new NodeProcessor() {

        @Override
        public Object process(Node node, Stack<Node> path, NodeProcessorCtx ctx, Object... outputs) throws SemanticException {
            GenSparkProcContext sparkCtx = (GenSparkProcContext) ctx;
            SMBMapJoinOperator smbJoin = (SMBMapJoinOperator) node;

            // Fetch the bookkeeping entry for this join, creating it on first visit.
            SparkSMBMapJoinInfo joinInfo = sparkCtx.smbMapJoinCtxMap.get(smbJoin);
            if (joinInfo == null) {
                joinInfo = new SparkSMBMapJoinInfo();
                sparkCtx.smbMapJoinCtxMap.put(smbJoin, joinInfo);
            }

            // A DummyStoreOperator anywhere on the stack means we came via a small table.
            boolean viaSmallTable = false;
            for (Node ancestor : path) {
                if (ancestor instanceof DummyStoreOperator) {
                    viaSmallTable = true;
                    break;
                }
            }

            if (viaSmallTable) {
                // Small-table side: remember the root and skip further traversal.
                joinInfo.smallTableRootOps.add(sparkCtx.currentRootOperator);
                return true;
            }
            // Big-table side: remember the root and let the traversal continue.
            joinInfo.bigTableRootOp = sparkCtx.currentRootOperator;
            return false;
        }
    };

    // The walker visits the tree depth-first while maintaining the operator stack;
    // the rules below (kept in insertion order) decide which processor builds the plan.
    final Map<Rule, NodeProcessor> rules = new LinkedHashMap<Rule, NodeProcessor>();
    final GenSparkWork workGenerator = new GenSparkWork(GenSparkUtils.getUtils());

    rules.put(
        new RuleRegExp("Split Work - ReduceSink", ReduceSinkOperator.getOperatorName() + "%"),
        workGenerator);
    rules.put(
        new RuleRegExp("Split Work - SparkPartitionPruningSink", SparkPartitionPruningSinkOperator.getOperatorName() + "%"),
        workGenerator);
    rules.put(
        new TypeRule(MapJoinOperator.class),
        new SparkReduceSinkMapJoinProc());
    rules.put(
        new RuleRegExp("Split Work + Move/Merge - FileSink", FileSinkOperator.getOperatorName() + "%"),
        new CompositeProcessor(new SparkFileSinkProcessor(), workGenerator));
    rules.put(
        new RuleRegExp("Handle Analyze Command", TableScanOperator.getOperatorName() + "%"),
        new SparkProcessAnalyzeTable(GenSparkUtils.getUtils()));
    rules.put(
        new RuleRegExp("Remember union", UnionOperator.getOperatorName() + "%"),
        unionRecorder);
    rules.put(
        new TypeRule(SMBMapJoinOperator.class),
        smbJoinGate);

    // The dispatcher fires the processor of the closest matching rule, passing the
    // context along; no default processor is supplied.
    Dispatcher dispatcher = new DefaultRuleDispatcher(null, rules, procCtx);
    GraphWalker walker = new GenSparkWorkWalker(dispatcher, procCtx);
    walker.startWalking(topNodes, null);
}
Also used : Node(org.apache.hadoop.hive.ql.lib.Node) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) LinkedHashMap(java.util.LinkedHashMap) NodeProcessorCtx(org.apache.hadoop.hive.ql.lib.NodeProcessorCtx) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) CompositeProcessor(org.apache.hadoop.hive.ql.lib.CompositeProcessor) SparkReduceSinkMapJoinProc(org.apache.hadoop.hive.ql.optimizer.spark.SparkReduceSinkMapJoinProc) Rule(org.apache.hadoop.hive.ql.lib.Rule) TypeRule(org.apache.hadoop.hive.ql.lib.TypeRule) TypeRule(org.apache.hadoop.hive.ql.lib.TypeRule)

Aggregations

LinkedHashMap (java.util.LinkedHashMap)1 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)1 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)1 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)1 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)1 CompositeProcessor (org.apache.hadoop.hive.ql.lib.CompositeProcessor)1 DefaultGraphWalker (org.apache.hadoop.hive.ql.lib.DefaultGraphWalker)1 DefaultRuleDispatcher (org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher)1 Dispatcher (org.apache.hadoop.hive.ql.lib.Dispatcher)1 GraphWalker (org.apache.hadoop.hive.ql.lib.GraphWalker)1 Node (org.apache.hadoop.hive.ql.lib.Node)1 NodeProcessor (org.apache.hadoop.hive.ql.lib.NodeProcessor)1 NodeProcessorCtx (org.apache.hadoop.hive.ql.lib.NodeProcessorCtx)1 Rule (org.apache.hadoop.hive.ql.lib.Rule)1 RuleRegExp (org.apache.hadoop.hive.ql.lib.RuleRegExp)1 TypeRule (org.apache.hadoop.hive.ql.lib.TypeRule)1 SparkReduceSinkMapJoinProc (org.apache.hadoop.hive.ql.optimizer.spark.SparkReduceSinkMapJoinProc)1 SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)1