
Example 1 with SparkTask

use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.

the class SparkCompiler method generateTaskTree.

/**
 * TODO: need to turn on rules that are commented out and add more if necessary.
 */
@Override
protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx, List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException {
    PERF_LOGGER.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
    GenSparkUtils utils = GenSparkUtils.getUtils();
    utils.resetSequenceNumber();
    ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
    GenSparkProcContext procCtx = new GenSparkProcContext(conf, tempParseContext, mvTask, rootTasks, inputs, outputs, pCtx.getTopOps());
    // -------------------------------- First Pass ---------------------------------- //
    // Identify SparkPartitionPruningSinkOperators, and break OP tree if necessary
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("Clone OP tree for PartitionPruningSink", SparkPartitionPruningSinkOperator.getOperatorName() + "%"), new SplitOpTreeForDPP());
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);
    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pCtx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    // -------------------------------- Second Pass ---------------------------------- //
    // Process operator tree in two steps: first we process the extra op trees generated
    // in the first pass. Then we process the main op tree, and the result task will depend
    // on the task generated in the first pass.
    topNodes.clear();
    topNodes.addAll(procCtx.topOps.values());
    generateTaskTreeHelper(procCtx, topNodes);
    // If this set is not empty, it means we need to generate a separate task for collecting
    // the partitions used.
    if (!procCtx.clonedPruningTableScanSet.isEmpty()) {
        SparkTask pruningTask = SparkUtilities.createSparkTask(conf);
        SparkTask mainTask = procCtx.currentTask;
        pruningTask.addDependentTask(procCtx.currentTask);
        procCtx.rootTasks.remove(procCtx.currentTask);
        procCtx.rootTasks.add(pruningTask);
        procCtx.currentTask = pruningTask;
        topNodes.clear();
        topNodes.addAll(procCtx.clonedPruningTableScanSet);
        generateTaskTreeHelper(procCtx, topNodes);
        procCtx.currentTask = mainTask;
    }
    // we need to clone some operator plans and remove union operators still
    for (BaseWork w : procCtx.workWithUnionOperators) {
        GenSparkUtils.getUtils().removeUnionOperators(procCtx, w);
    }
    // we need to fill MapWork with 'local' work and bucket information for SMB Join.
    GenSparkUtils.getUtils().annotateMapWork(procCtx);
    // finally make sure the file sink operators are set up right
    for (FileSinkOperator fileSink : procCtx.fileSinkSet) {
        GenSparkUtils.getUtils().processFileSink(procCtx, fileSink);
    }
    // Process partition pruning sinks
    for (Operator<?> prunerSink : procCtx.pruningSinkSet) {
        utils.processPartitionPruningSink(procCtx, (SparkPartitionPruningSinkOperator) prunerSink);
    }
    PERF_LOGGER.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
}
Also used : NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) LinkedHashMap(java.util.LinkedHashMap) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Rule(org.apache.hadoop.hive.ql.lib.Rule) TypeRule(org.apache.hadoop.hive.ql.lib.TypeRule) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker)
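
To make the walking machinery used in both passes easier to follow, here is a minimal sketch (not Hive source; Java 8+ for the lambda) of the same org.apache.hadoop.hive.ql.lib pattern: a NodeProcessor is bound to an operator-name regular expression, DefaultRuleDispatcher picks the processor for each visited node, and DefaultGraphWalker drives the traversal from the top operators. The class name OperatorWalkSketch, the rule name, and the processor body are illustrative placeholders.

// Minimal sketch (not Hive source): bind a NodeProcessor to an operator-name
// regex and walk the operator DAG, mirroring the first pass above.
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class OperatorWalkSketch {
    public static void walkTopOps(Collection<? extends Node> topOps, NodeProcessorCtx procCtx)
            throws SemanticException {
        Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
        // fire for every TableScanOperator reached during the walk
        opRules.put(new RuleRegExp("Visit TS", TableScanOperator.getOperatorName() + "%"),
            (nd, stack, ctx, nodeOutputs) -> {
                // inspect or rewrite the matched operator here (placeholder body)
                return null;
            });
        // no default processor: nodes matching no rule are simply traversed
        Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
        GraphWalker ogw = new DefaultGraphWalker(disp);
        List<Node> topNodes = new ArrayList<Node>(topOps);
        ogw.startWalking(topNodes, null);
    }
}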

Example 2 with SparkTask

use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.

the class SparkSkewJoinProcFactory method splitTask.

/**
 * If the join is not in a leaf ReduceWork, the spark task has to be split into 2 tasks.
 */
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork, ParseContext parseContext) throws SemanticException {
    SparkWork currentWork = currentTask.getWork();
    Set<Operator<?>> reduceSinkSet = OperatorUtils.getOp(reduceWork, ReduceSinkOperator.class);
    if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork) && reduceSinkSet.size() == 1) {
        ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
        BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
        SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
        // disconnect the reduce work from its child. this should produce two isolated sub graphs
        currentWork.disconnect(reduceWork, childWork);
        // move works following the current reduce work into a new spark work
        SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
        newWork.add(childWork);
        copyWorkGraph(currentWork, newWork, childWork);
        // remove them from current spark work
        for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
            currentWork.remove(baseWork);
            currentWork.getCloneToWork().remove(baseWork);
        }
        // create TS to read intermediate data
        Context baseCtx = parseContext.getContext();
        Path taskTmpDir = baseCtx.getMRTmpPath();
        Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
        TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
        // this will insert FS and TS between the RS and its parent
        TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
        // create new MapWork
        MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
        mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
        newWork.add(mapWork);
        newWork.connect(mapWork, childWork, originEdge);
        // setup the new map work
        String streamDesc = taskTmpDir.toUri().toString();
        if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
            Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
            String id = null;
            if (childReducer instanceof JoinOperator) {
                if (parseContext.getJoinOps().contains(childReducer)) {
                    id = ((JoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof MapJoinOperator) {
                if (parseContext.getMapJoinOps().contains(childReducer)) {
                    id = ((MapJoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof SMBMapJoinOperator) {
                if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
                    id = ((SMBMapJoinOperator) childReducer).getConf().getId();
                }
            }
            if (id != null) {
                streamDesc = id + ":$INTNAME";
            } else {
                streamDesc = "$INTNAME";
            }
            String origStreamDesc = streamDesc;
            int pos = 0;
            while (mapWork.getAliasToWork().get(streamDesc) != null) {
                streamDesc = origStreamDesc.concat(String.valueOf(++pos));
            }
        }
        GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
        // insert the new task between current task and its child
        @SuppressWarnings("unchecked") Task<?> newTask = TaskFactory.get(newWork);
        List<Task<?>> childTasks = currentTask.getChildTasks();
        // must have at most one child
        if (childTasks != null && childTasks.size() > 0) {
            Task<?> childTask = childTasks.get(0);
            currentTask.removeDependentTask(childTask);
            newTask.addDependentTask(childTask);
        }
        currentTask.addDependentTask(newTask);
        newTask.setFetchSource(currentTask.isFetchSource());
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Path(org.apache.hadoop.fs.Path) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Task(org.apache.hadoop.hive.ql.exec.Task) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
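
The tail of splitTask rewires task dependencies; below is a minimal sketch of just that splicing step (not Hive source; it assumes the Task<?> generics used in the snippet above, which differ across Hive versions). TaskSpliceSketch and splice are placeholder names.

// Minimal sketch (not Hive source) of the final splicing step in splitTask:
// the freshly created task is inserted between the current task and its single
// child by rewiring the dependency edges.
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Task;

public final class TaskSpliceSketch {
    private TaskSpliceSketch() {
    }

    /** Rewire so that currentTask -> newTask -> (former child of currentTask, if any). */
    public static void splice(Task<?> currentTask, Task<?> newTask) {
        List<Task<?>> childTasks = currentTask.getChildTasks();
        if (childTasks != null && !childTasks.isEmpty()) {
            // splitTask relies on the current task having at most one child
            Task<?> childTask = childTasks.get(0);
            currentTask.removeDependentTask(childTask);
            newTask.addDependentTask(childTask);
        }
        currentTask.addDependentTask(newTask);
        // keep fetch-source behaviour consistent with the task being split
        newTask.setFetchSource(currentTask.isFetchSource());
    }
}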

Example 3 with SparkTask

use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.

the class SparkSkewJoinProcFactory method supportRuntimeSkewJoin.

private static boolean supportRuntimeSkewJoin(JoinOperator joinOp, ReduceWork reduceWork, Task<?> currTask, HiveConf hiveConf) {
    if (currTask instanceof SparkTask && GenMRSkewJoinProcessor.skewJoinEnabled(hiveConf, joinOp)) {
        SparkWork sparkWork = ((SparkTask) currTask).getWork();
        List<Task<?>> children = currTask.getChildTasks();
        return !joinOp.getConf().isFixedAsSorted() && sparkWork.contains(reduceWork) && (children == null || children.size() <= 1) && OperatorUtils.getOp(reduceWork, CommonJoinOperator.class).size() == 1;
    }
    return false;
}
Also used : SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Task(org.apache.hadoop.hive.ql.exec.Task) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork)
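
As a reading aid, the compound return statement above can be decomposed into named conditions; the sketch below does so (not Hive source). The configuration-level skewJoinEnabled check stays with the caller, as in the original method, and SkewJoinPreconditionsSketch/structuralChecks are placeholder names.

// Minimal sketch (not Hive source) naming the structural conditions checked by
// supportRuntimeSkewJoin before the runtime skew-join rewrite is attempted.
import java.util.List;
import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SparkWork;

public final class SkewJoinPreconditionsSketch {
    private SkewJoinPreconditionsSketch() {
    }

    public static boolean structuralChecks(JoinOperator joinOp, ReduceWork reduceWork, Task<?> currTask) {
        if (!(currTask instanceof SparkTask)) {
            return false;                                             // only Spark tasks are rewritten
        }
        SparkWork sparkWork = ((SparkTask) currTask).getWork();
        List<Task<?>> children = currTask.getChildTasks();
        boolean notSortedJoin = !joinOp.getConf().isFixedAsSorted();  // sorted (SMB-style) joins are excluded
        boolean joinInThisWork = sparkWork.contains(reduceWork);      // the reduce work belongs to this task
        boolean atMostOneChildTask = children == null || children.size() <= 1;
        boolean singleJoinInWork =
            OperatorUtils.getOp(reduceWork, CommonJoinOperator.class).size() == 1;
        return notSortedJoin && joinInThisWork && atMostOneChildTask && singleJoinInWork;
    }
}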

Example 4 with SparkTask

use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.

the class TestUtilities method testGetTasksHaveNoRepeats.

/**
 * This test verifies that Utilities.get*Tasks do not repeat themselves in the process
 * of extracting tasks from a given set of root tasks when given DAGs that can have
 * multiple paths, such as the case with Diamond-shaped DAGs common to replication.
 */
@Test
public void testGetTasksHaveNoRepeats() {
    CountingWrappingTask mrTask = new CountingWrappingTask(new ExecDriver());
    CountingWrappingTask tezTask = new CountingWrappingTask(new TezTask());
    CountingWrappingTask sparkTask = new CountingWrappingTask(new SparkTask());
    // First check - we should not have repeats in results
    assertEquals("No repeated MRTasks from Utilities.getMRTasks", 1, Utilities.getMRTasks(getTestDiamondTaskGraph(mrTask)).size());
    assertEquals("No repeated TezTasks from Utilities.getTezTasks", 1, Utilities.getTezTasks(getTestDiamondTaskGraph(tezTask)).size());
    assertEquals("No repeated TezTasks from Utilities.getSparkTasks", 1, Utilities.getSparkTasks(getTestDiamondTaskGraph(sparkTask)).size());
    // Second check - the tasks we looked for must not have been accessed more than
    // once as a result of the traversal (note that we actually wind up accessing
    // 2 times, because each visit counts twice, once to check for existence, and
    // once to visit).
    assertEquals("MRTasks should have been visited only once", 2, mrTask.getDepCallCount());
    assertEquals("TezTasks should have been visited only once", 2, tezTask.getDepCallCount());
    assertEquals("SparkTasks should have been visited only once", 2, sparkTask.getDepCallCount());
}
Also used : SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ExecDriver(org.apache.hadoop.hive.ql.exec.mr.ExecDriver) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) Test(org.junit.Test)
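
For context, here is a minimal sketch of the diamond-shaped DAG the test depends on (this is not the actual getTestDiamondTaskGraph helper): two branches converge on one leaf task, so a naive traversal would reach the leaf twice unless results are de-duplicated. A raw List sidesteps differences in the Utilities.getSparkTasks generic signature across Hive versions, and the expected count reflects the de-duplication behaviour the test asserts.

// Minimal sketch (not the Hive test helper) of a diamond-shaped task DAG.
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;

public class DiamondDagSketch {
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static void main(String[] args) {
        SparkTask root = new SparkTask();
        SparkTask left = new SparkTask();
        SparkTask right = new SparkTask();
        SparkTask leaf = new SparkTask();
        root.addDependentTask(left);
        root.addDependentTask(right);
        left.addDependentTask(leaf);    // both branches share the same leaf task
        right.addDependentTask(leaf);
        List roots = Arrays.asList(root);
        // each task should be reported exactly once despite the two paths to leaf
        System.out.println(Utilities.getSparkTasks(roots).size());   // expected: 4
    }
}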

Example 5 with SparkTask

use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.

the class SparkCrossProductCheck method dispatch.

@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
    @SuppressWarnings("unchecked") Task<?> currTask = (Task<?>) nd;
    if (currTask instanceof SparkTask) {
        SparkWork sparkWork = ((SparkTask) currTask).getWork();
        checkShuffleJoin(sparkWork);
        checkMapJoin((SparkTask) currTask);
    } else if (currTask instanceof ConditionalTask) {
        List<Task<?>> taskList = ((ConditionalTask) currTask).getListTasks();
        for (Task<?> task : taskList) {
            dispatch(task, stack, nodeOutputs);
        }
    }
    return null;
}
Also used : SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ArrayList(java.util.ArrayList) List(java.util.List) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork)
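
For context, a minimal sketch (not Hive source) of how a task-level Dispatcher such as SparkCrossProductCheck is typically driven: a TaskGraphWalker is started from the root tasks of the physical plan and calls dispatch() for every task it reaches. TaskWalkSketch and run are placeholder names; in Hive this wiring is normally done by a physical plan resolver.

// Minimal sketch (not Hive source): drive a task-level Dispatcher over the task DAG.
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public final class TaskWalkSketch {
    private TaskWalkSketch() {
    }

    public static void run(List<Task<?>> rootTasks, Dispatcher checker) throws SemanticException {
        TaskGraphWalker walker = new TaskGraphWalker(checker);  // walks the task DAG, not operator trees
        List<Node> topNodes = new ArrayList<Node>(rootTasks);   // Task implements Node
        walker.startWalking(topNodes, null);                    // per-node outputs are not needed here
    }
}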

Aggregations

SparkTask (org.apache.hadoop.hive.ql.exec.spark.SparkTask): 12
SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork): 9
Task (org.apache.hadoop.hive.ql.exec.Task): 8
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 8
ArrayList (java.util.ArrayList): 7
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 6
List (java.util.List): 5
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 5
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 5
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 5
Operator (org.apache.hadoop.hive.ql.exec.Operator): 5
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 5
Serializable (java.io.Serializable): 4
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 4
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 4
Path (org.apache.hadoop.fs.Path): 3
CommonJoinOperator (org.apache.hadoop.hive.ql.exec.CommonJoinOperator): 3
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 3
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 3
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 3