Example 26 with Node

Use of org.apache.hadoop.hive.ql.lib.Node in project hive by apache.

The class SparkCompiler, method generateTaskTree.

/**
   * TODO: need to turn on rules that are commented out and add more if necessary.
   */
@Override
protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx, List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException {
    PERF_LOGGER.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
    GenSparkUtils utils = GenSparkUtils.getUtils();
    utils.resetSequenceNumber();
    ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
    GenSparkProcContext procCtx = new GenSparkProcContext(conf, tempParseContext, mvTask, rootTasks, inputs, outputs, pCtx.getTopOps());
    // -------------------------------- First Pass ---------------------------------- //
    // Identify SparkPartitionPruningSinkOperators, and break OP tree if necessary
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("Clone OP tree for PartitionPruningSink", SparkPartitionPruningSinkOperator.getOperatorName() + "%"), new SplitOpTreeForDPP());
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);
    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pCtx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    // -------------------------------- Second Pass ---------------------------------- //
    // Process operator tree in two steps: first we process the extra op trees generated
    // in the first pass. Then we process the main op tree, and the result task will depend
    // on the task generated in the first pass.
    topNodes.clear();
    topNodes.addAll(procCtx.topOps.values());
    generateTaskTreeHelper(procCtx, topNodes);
    // If this set is not empty, it means we need to generate a separate task for collecting
    // the partitions used.
    if (!procCtx.clonedPruningTableScanSet.isEmpty()) {
        SparkTask pruningTask = SparkUtilities.createSparkTask(conf);
        SparkTask mainTask = procCtx.currentTask;
        pruningTask.addDependentTask(procCtx.currentTask);
        procCtx.rootTasks.remove(procCtx.currentTask);
        procCtx.rootTasks.add(pruningTask);
        procCtx.currentTask = pruningTask;
        topNodes.clear();
        topNodes.addAll(procCtx.clonedPruningTableScanSet);
        generateTaskTreeHelper(procCtx, topNodes);
        procCtx.currentTask = mainTask;
    }
    // we still need to clone some operator plans and remove union operators
    for (BaseWork w : procCtx.workWithUnionOperators) {
        GenSparkUtils.getUtils().removeUnionOperators(procCtx, w);
    }
    // we need to fill MapWork with 'local' work and bucket information for SMB Join.
    GenSparkUtils.getUtils().annotateMapWork(procCtx);
    // finally make sure the file sink operators are set up right
    for (FileSinkOperator fileSink : procCtx.fileSinkSet) {
        GenSparkUtils.getUtils().processFileSink(procCtx, fileSink);
    }
    // Process partition pruning sinks
    for (Operator<?> prunerSink : procCtx.pruningSinkSet) {
        utils.processPartitionPruningSink(procCtx, (SparkPartitionPruningSinkOperator) prunerSink);
    }
    PERF_LOGGER.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
}
Also used: NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) LinkedHashMap(java.util.LinkedHashMap) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Rule(org.apache.hadoop.hive.ql.lib.Rule) TypeRule(org.apache.hadoop.hive.ql.lib.TypeRule) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker)
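
All five examples in this group share one skeleton from org.apache.hadoop.hive.ql.lib: build a Rule-to-NodeProcessor map, wrap it in a DefaultRuleDispatcher, hand that dispatcher to a GraphWalker, and start walking from the top operators of the parse context. The sketch below shows that skeleton in isolation; HypotheticalPass and the no-op anonymous processor are illustrative stand-ins, not Hive code.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class HypotheticalPass {

    public void run(ParseContext pctx) throws SemanticException {
        // 1. Map each Rule to the NodeProcessor that fires when the rule matches.
        Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
        opRules.put(new RuleRegExp("R1", ReduceSinkOperator.getOperatorName() + "%"),
                new NodeProcessor() {
                    @Override
                    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
                            Object... nodeOutputs) throws SemanticException {
                        // Illustrative no-op; a real pass would rewrite the matched operator here.
                        return null;
                    }
                });
        // 2. The dispatcher fires the processor of the closest matching rule for each node.
        Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null);
        // 3. The walker visits every node reachable from the top operators.
        GraphWalker ogw = new DefaultGraphWalker(disp);
        List<Node> topNodes = new ArrayList<Node>();
        topNodes.addAll(pctx.getTopOps().values());
        ogw.startWalking(topNodes, null);
    }
}

The LinkedHashMap preserves rule insertion order, which keeps matching deterministic when several rules could fire on the same node.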

Example 27 with Node

Use of org.apache.hadoop.hive.ql.lib.Node in project hive by apache.

The class SparkCompiler, method runJoinOptimizations.

private void runJoinOptimizations(OptimizeSparkProcContext procCtx) throws SemanticException {
    ParseContext pCtx = procCtx.getParseContext();
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new TypeRule(JoinOperator.class), new SparkJoinOptimizer(pCtx));
    opRules.put(new TypeRule(MapJoinOperator.class), new SparkJoinHintOptimizer(pCtx));
    opRules.put(new RuleRegExp("Disabling Dynamic Partition Pruning By Size", SparkPartitionPruningSinkOperator.getOperatorName() + "%"), new SparkRemoveDynamicPruningBySize());
    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);
    // Create a list of top operator nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pCtx.getTopOps().values());
    ogw.startWalking(topNodes, null);
}
Also used: MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) SparkRemoveDynamicPruningBySize(org.apache.hadoop.hive.ql.optimizer.SparkRemoveDynamicPruningBySize) SparkJoinHintOptimizer(org.apache.hadoop.hive.ql.optimizer.spark.SparkJoinHintOptimizer) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) SparkJoinOptimizer(org.apache.hadoop.hive.ql.optimizer.spark.SparkJoinOptimizer) LinkedHashMap(java.util.LinkedHashMap) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Rule(org.apache.hadoop.hive.ql.lib.Rule) TypeRule(org.apache.hadoop.hive.ql.lib.TypeRule) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker)
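
This example mixes the two rule flavors: TypeRule fires on the runtime class of the visited node, while RuleRegExp matches a pattern over the operator names on the walker's stack. A small sketch of registering both, where RuleFlavorsSketch and someProcessor are illustrative placeholders rather than Hive classes:

import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.TypeRule;

public class RuleFlavorsSketch {

    // someProcessor stands in for a real processor such as SparkJoinOptimizer.
    static Map<Rule, NodeProcessor> buildRules(NodeProcessor someProcessor) {
        Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
        // TypeRule: matches on the runtime class of the visited node.
        opRules.put(new TypeRule(JoinOperator.class), someProcessor);
        // RuleRegExp: matches the operator-name path on the walk stack,
        // here any node carrying MapJoinOperator's short name.
        opRules.put(new RuleRegExp("MapJoin rule",
                MapJoinOperator.getOperatorName() + "%"), someProcessor);
        return opRules;
    }
}

TypeRule is the simpler choice when the condition is just "any node of this class"; RuleRegExp is needed when the surrounding operator sequence matters.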

Example 28 with Node

Use of org.apache.hadoop.hive.ql.lib.Node in project hive by apache.

The class AnnotateReduceSinkOutputOperator, method transform.

@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
    // 1. We apply the transformation
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", "(" + ReduceSinkOperator.getOperatorName() + "%)"), new ReduceSinkOutputOperatorAnnotator());
    GraphWalker ogw = new DefaultGraphWalker(new DefaultRuleDispatcher(null, opRules, null));
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
}
Also used: NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) Rule(org.apache.hadoop.hive.ql.lib.Rule) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker) LinkedHashMap(java.util.LinkedHashMap)
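
ReduceSinkOutputOperatorAnnotator, bound to rule R1 above, is a NodeProcessor: the dispatcher invokes its process method for each node whose stack path matches the rule. A minimal sketch of that interface's shape, with an illustrative body in place of the real annotator logic:

import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class SketchAnnotator implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
            Object... nodeOutputs) throws SemanticException {
        // The matched node is the ReduceSinkOperator named in the rule pattern;
        // the real annotator would update its descriptor here.
        ReduceSinkOperator rs = (ReduceSinkOperator) nd;
        System.out.println("visited " + rs.getName());
        return null;
    }
}

The Object a processor returns can be collected per start node via the map passed to startWalking; the examples here all pass null and ignore those outputs.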

Example 29 with Node

Use of org.apache.hadoop.hive.ql.lib.Node in project hive by apache.

The class BucketMapJoinOptimizer, method transform.

public ParseContext transform(ParseContext pctx) throws SemanticException {
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    BucketJoinProcCtx bucketMapJoinOptimizeCtx = new BucketJoinProcCtx(pctx.getConf());
    // process map joins with no reducers pattern
    opRules.put(new RuleRegExp("R1", MapJoinOperator.getOperatorName() + "%"), getBucketMapjoinProc(pctx));
    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, bucketMapJoinOptimizeCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);
    // Create a list of top operator nodes
    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
}
Also used: NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) LinkedHashMap(java.util.LinkedHashMap) Rule(org.apache.hadoop.hive.ql.lib.Rule) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker)
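
Unlike the previous example, this one hands a non-null default processor (getDefaultProc()) to DefaultRuleDispatcher, which the dispatcher invokes for every walked node that no rule matches. A typical default is a no-op, sketched here under that assumption (NoOpDefaultProc is an illustrative name):

import java.util.Stack;

import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.SemanticException;

// Fires for every walked node that no registered rule matches; returning null
// keeps the walk moving without touching unmatched operators.
public class NoOpDefaultProc implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
            Object... nodeOutputs) throws SemanticException {
        return null;
    }
}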

Example 30 with Node

Use of org.apache.hadoop.hive.ql.lib.Node in project hive by apache.

The class BucketingSortingReduceSinkOptimizer, method transform.

@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    // process reduce sink added by hive.enforce.bucketing or hive.enforce.sorting
    opRules.put(new RuleRegExp("R1", ReduceSinkOperator.getOperatorName() + "%" + SelectOperator.getOperatorName() + "%" + FileSinkOperator.getOperatorName() + "%"), getBucketSortReduceSinkProc(pctx));
    // The dispatcher fires the processor corresponding to the closest matching rule
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
    GraphWalker ogw = new DefaultGraphWalker(disp);
    // Create a list of top nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
}
Also used: NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) Rule(org.apache.hadoop.hive.ql.lib.Rule) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker) LinkedHashMap(java.util.LinkedHashMap)
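
The R1 pattern in this example is stricter than the single-operator rules above: it fires only where a ReduceSinkOperator is immediately followed by a SelectOperator and then a FileSinkOperator on the walk stack. Assuming the standard short operator names (RS, SEL, FS), the concatenation builds the pattern string shown in this sketch; SequencePatternSketch is an illustrative wrapper, not Hive code.

import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;

public class SequencePatternSketch {
    public static void main(String[] args) {
        // Concatenating the short operator names yields "RS%SEL%FS%": the rule
        // fires only where a ReduceSink feeds a Select that feeds a FileSink.
        String pattern = ReduceSinkOperator.getOperatorName() + "%"
                + SelectOperator.getOperatorName() + "%"
                + FileSinkOperator.getOperatorName() + "%";
        RuleRegExp rule = new RuleRegExp("R1", pattern);
        System.out.println(rule.getName() + " -> " + pattern);
    }
}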

Aggregations

Node (org.apache.hadoop.hive.ql.lib.Node): 103 usages
ArrayList (java.util.ArrayList): 87 usages
Dispatcher (org.apache.hadoop.hive.ql.lib.Dispatcher): 78 usages
DefaultRuleDispatcher (org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher): 71 usages
GraphWalker (org.apache.hadoop.hive.ql.lib.GraphWalker): 70 usages
LinkedHashMap (java.util.LinkedHashMap): 60 usages
NodeProcessor (org.apache.hadoop.hive.ql.lib.NodeProcessor): 59 usages
Rule (org.apache.hadoop.hive.ql.lib.Rule): 58 usages
DefaultGraphWalker (org.apache.hadoop.hive.ql.lib.DefaultGraphWalker): 56 usages
RuleRegExp (org.apache.hadoop.hive.ql.lib.RuleRegExp): 50 usages
HashMap (java.util.HashMap): 18 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 14 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 13 usages
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 11 usages
ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc): 11 usages
TaskGraphWalker (org.apache.hadoop.hive.ql.lib.TaskGraphWalker): 10 usages
TypeRule (org.apache.hadoop.hive.ql.lib.TypeRule): 10 usages
List (java.util.List): 8 usages
RelNode (org.apache.calcite.rel.RelNode): 8 usages
ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc): 7 usages