Search in sources :

Example 6 with SparkWork

use of org.apache.hadoop.hive.ql.plan.SparkWork in project hive by apache.

the class SparkSkewJoinProcFactory method splitTask.

/**
 * If the join is not in a leaf ReduceWork, the spark task has to be split into 2 tasks.
 *
 * <p>The split is only performed when {@code reduceWork} has exactly one child work in the
 * current SparkWork, {@code canSplit(currentWork)} holds, and exactly one ReduceSinkOperator
 * is found for {@code reduceWork}; otherwise this method does nothing.
 *
 * <p>When the split happens, the child work and the works copied by {@code copyWorkGraph}
 * are moved into a new SparkWork, a temporary file (FS + TS) is inserted between the reduce
 * sink and its parent, a new MapWork reading that temporary data is connected to the child
 * work with the original edge property, and a new task wrapping the new SparkWork is
 * inserted between {@code currentTask} and its first child task.
 *
 * @param currentTask the spark task whose work graph contains {@code reduceWork}
 * @param reduceWork the reduce work after which the graph is cut
 * @param parseContext supplies the configuration, the compilation context (temp path) and
 *     the registered join operators
 * @throws SemanticException declared by the method; nothing in this body throws it directly,
 *     so it presumably comes from the called helpers
 */
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork, ParseContext parseContext) throws SemanticException {
    SparkWork currentWork = currentTask.getWork();
    // all ReduceSinkOperators found for this reduce work
    Set<Operator<?>> reduceSinkSet = SparkMapJoinResolver.getOp(reduceWork, ReduceSinkOperator.class);
    // only split when there is a single child work and a single reduce sink to cut at
    if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork) && reduceSinkSet.size() == 1) {
        ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
        BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
        // remember the edge so it can be re-used between the new map work and the child
        SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
        // disconnect the reduce work from its child. this should produce two isolated sub graphs
        currentWork.disconnect(reduceWork, childWork);
        // move works following the current reduce work into a new spark work
        SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
        newWork.add(childWork);
        copyWorkGraph(currentWork, newWork, childWork);
        // remove them from current spark work (and from its clone-to-work mapping)
        for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
            currentWork.remove(baseWork);
            currentWork.getCloneToWork().remove(baseWork);
        }
        // create TS to read intermediate data
        Context baseCtx = parseContext.getContext();
        Path taskTmpDir = baseCtx.getMRTmpPath();
        // only the first parent of the reduce sink is used to derive the intermediate schema
        Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
        TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
        // this will insert FS and TS between the RS and its parent
        TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
        // create new MapWork
        MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
        mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
        newWork.add(mapWork);
        // the new map work takes the place of the original reduce work as parent of the child
        newWork.connect(mapWork, childWork, originEdge);
        // setup the new map work
        // default alias for the intermediate data is the temp directory URI
        String streamDesc = taskTmpDir.toUri().toString();
        // NOTE(review): the casts below assume childWork is a ReduceWork - confirm against callers
        if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
            Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
            // id of the join operator, taken only if the operator is registered in the parse context
            String id = null;
            if (childReducer instanceof JoinOperator) {
                if (parseContext.getJoinOps().contains(childReducer)) {
                    id = ((JoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof MapJoinOperator) {
                if (parseContext.getMapJoinOps().contains(childReducer)) {
                    id = ((MapJoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof SMBMapJoinOperator) {
                if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
                    id = ((SMBMapJoinOperator) childReducer).getConf().getId();
                }
            }
            // when tagging is needed the alias is "$INTNAME", prefixed with the join id if known
            if (id != null) {
                streamDesc = id + ":$INTNAME";
            } else {
                streamDesc = "$INTNAME";
            }
            // make the alias unique within the map work by appending an increasing counter
            String origStreamDesc = streamDesc;
            int pos = 0;
            while (mapWork.getAliasToWork().get(streamDesc) != null) {
                streamDesc = origStreamDesc.concat(String.valueOf(++pos));
            }
        }
        // register the table scan over the temp directory under the chosen alias
        GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
        // insert the new task between current task and its child
        @SuppressWarnings("unchecked") Task<? extends Serializable> newTask = TaskFactory.get(newWork, parseContext.getConf());
        List<Task<? extends Serializable>> childTasks = currentTask.getChildTasks();
        // must have at most one child
        // NOTE(review): only the first child task is re-parented; any further children would
        // stay attached to currentTask - confirm callers guarantee at most one child
        if (childTasks != null && childTasks.size() > 0) {
            Task<? extends Serializable> childTask = childTasks.get(0);
            currentTask.removeDependentTask(childTask);
            newTask.addDependentTask(childTask);
        }
        currentTask.addDependentTask(newTask);
        // the new task inherits the fetch-source flag of the task it was split from
        newTask.setFetchSource(currentTask.isFetchSource());
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Path(org.apache.hadoop.fs.Path) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)

Example 7 with SparkWork

use of org.apache.hadoop.hive.ql.plan.SparkWork in project hive by apache.

the class SparkCrossProductCheck method dispatch.

/**
 * Visits one task node. For a SparkTask, runs {@code checkShuffleJoin} on its SparkWork and
 * {@code checkMapJoin} on the task itself. For a ConditionalTask, recursively dispatches each
 * task in its task list with the same stack and node outputs. Any other task is ignored.
 *
 * @param nd the node being visited; cast to a Task
 * @param stack passed through unchanged to recursive dispatch calls
 * @param nodeOutputs passed through unchanged to recursive dispatch calls
 * @return always {@code null}
 * @throws SemanticException if one of the checks or a nested dispatch throws it
 */
@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
    @SuppressWarnings("unchecked") Task<? extends Serializable> visited = (Task<? extends Serializable>) nd;

    if (visited instanceof SparkTask) {
        SparkTask sparkTask = (SparkTask) visited;
        checkShuffleJoin(sparkTask.getWork());
        checkMapJoin(sparkTask);
        return null;
    }

    if (visited instanceof ConditionalTask) {
        // a conditional task only wraps alternatives; check each of them in turn
        ConditionalTask conditional = (ConditionalTask) visited;
        for (Task<? extends Serializable> alternative : conditional.getListTasks()) {
            dispatch(alternative, stack, nodeOutputs);
        }
    }
    return null;
}
Also used : SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ArrayList(java.util.ArrayList) List(java.util.List) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork)

Example 8 with SparkWork

use of org.apache.hadoop.hive.ql.plan.SparkWork in project hive by apache.

the class SparkCompiler method setInputFormat.

/**
 * Recursively applies {@code setInputFormat(MapWork, Operator)} to every operator registered
 * in the alias-to-work map of every MapWork reachable from the given task.
 *
 * <p>For a SparkTask all works of its SparkWork are inspected; for a ConditionalTask each
 * listed task is processed recursively. In every case the child tasks of {@code task} are
 * processed recursively afterwards.
 *
 * @param task the task to start from
 */
@Override
protected void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof SparkTask) {
        SparkWork sparkWork = ((SparkTask) task).getWork();
        for (BaseWork unit : sparkWork.getAllWork()) {
            if (!(unit instanceof MapWork)) {
                continue;
            }
            MapWork mapWork = (MapWork) unit;
            // iterating an empty map is a no-op, so no explicit emptiness check is needed
            for (Operator<? extends OperatorDesc> aliasOp : mapWork.getAliasToWork().values()) {
                setInputFormat(mapWork, aliasOp);
            }
        }
    } else if (task instanceof ConditionalTask) {
        for (Task<? extends Serializable> alternative : ((ConditionalTask) task).getListTasks()) {
            setInputFormat(alternative);
        }
    }

    // descend into the dependent tasks regardless of the task's own type
    List<Task<? extends Serializable>> dependents = task.getChildTasks();
    if (dependents != null) {
        for (Task<? extends Serializable> dependent : dependents) {
            setInputFormat(dependent);
        }
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) List(java.util.List) ArrayList(java.util.ArrayList) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 9 with SparkWork

use of org.apache.hadoop.hive.ql.plan.SparkWork in project hive by apache.

the class GenSparkWork method process.

/**
 * Packs the operator pipeline between {@code context.currentRootOperator} and the visited
 * operator into a BaseWork of the current task's SparkWork, and records the bookkeeping
 * needed to connect that work with its parent and child works.
 *
 * <p>In order, the method: looks up or creates the work for the current root; registers the
 * work under the visited operator in {@code childToWorkMap}; links pending map joins;
 * detaches the root from its parent reduce sinks while remembering the following work;
 * records works containing union operators; connects the work to an already known
 * downstream work; and finally advances the context to the next root.
 *
 * @param nd the operator at which the current pipeline ends (cast to an Operator)
 * @param stack not used by this method
 * @param procContext must be a GenSparkProcContext with a non-null {@code currentTask} and
 *     {@code currentRootOperator}; verified with {@code Preconditions.checkArgument}
 * @param nodeOutputs not used by this method
 * @return always {@code null}
 * @throws SemanticException declared by the method; nothing in this body throws it directly,
 *     so it presumably comes from the called helpers
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    Preconditions.checkArgument(context != null, "AssertionError: expected context to be not null");
    Preconditions.checkArgument(context.currentTask != null, "AssertionError: expected context.currentTask to be not null");
    Preconditions.checkArgument(context.currentRootOperator != null, "AssertionError: expected context.currentRootOperator to be not null");
    // Operator is a file sink or reduce sink. Something that forces a new vertex.
    @SuppressWarnings("unchecked") Operator<? extends OperatorDesc> operator = (Operator<? extends OperatorDesc>) nd;
    // root is the start of the operator pipeline we're currently
    // packing into a vertex, typically a table scan, union or join
    Operator<?> root = context.currentRootOperator;
    LOG.debug("Root operator: " + root);
    LOG.debug("Leaf operator: " + operator);
    SparkWork sparkWork = context.currentTask.getWork();
    // non-null when an SMB map join operator is found below the root
    SMBMapJoinOperator smbOp = GenSparkUtils.getChildOperator(root, SMBMapJoinOperator.class);
    // Right now the work graph is pretty simple. If there is no
    // Preceding work we have a root and will generate a map
    // vertex. If there is a preceding work we will generate
    // a reduce vertex
    BaseWork work;
    if (context.rootToWorkMap.containsKey(root)) {
        // having seen the root operator before means there was a branch in the
        // operator graph. There's typically two reasons for that: a) mux/demux
        // b) multi insert. Mux/Demux will hit the same leaf again, multi insert
        // will result into a vertex with multiple FS or RS operators.
        // At this point we don't have to do anything special in this case. Just
        // run through the regular paces w/o creating a new task.
        work = context.rootToWorkMap.get(root);
    } else {
        // create a new vertex
        if (context.preceedingWork == null) {
            if (smbOp == null) {
                work = utils.createMapWork(context, root, sparkWork, null);
            } else {
                //save work to be initialized later with SMB information.
                work = utils.createMapWork(context, root, sparkWork, null, true);
                context.smbMapJoinCtxMap.get(smbOp).mapWork = (MapWork) work;
            }
        } else {
            work = utils.createReduceWork(context, root, sparkWork);
        }
        context.rootToWorkMap.put(root, work);
    }
    // remember that this leaf operator belongs to the work (a leaf may map to several works)
    if (!context.childToWorkMap.containsKey(operator)) {
        List<BaseWork> workItems = new LinkedList<BaseWork>();
        workItems.add(work);
        context.childToWorkMap.put(operator, workItems);
    } else {
        context.childToWorkMap.get(operator).add(work);
    }
    // remember which mapjoin operator links with which work
    if (!context.currentMapJoinOperators.isEmpty()) {
        for (MapJoinOperator mj : context.currentMapJoinOperators) {
            LOG.debug("Processing map join: " + mj);
            // remember the mapping in case we scan another branch of the mapjoin later
            if (!context.mapJoinWorkMap.containsKey(mj)) {
                List<BaseWork> workItems = new LinkedList<BaseWork>();
                workItems.add(work);
                context.mapJoinWorkMap.put(mj, workItems);
            } else {
                context.mapJoinWorkMap.get(mj).add(work);
            }
            /*
         * this happens in case of map join operations.
         * The tree looks like this:
         *
         *        RS <--- we are here perhaps
         *        |
         *     MapJoin
         *     /     \
         *   RS       TS
         *  /
         * TS
         *
         * If we are at the RS pointed above, and we may have already visited the
         * RS following the TS, we have already generated work for the TS-RS.
         * We need to hook the current work to this generated work.
         */
            if (context.linkOpWithWorkMap.containsKey(mj)) {
                Map<BaseWork, SparkEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
                if (linkWorkMap != null) {
                    // attach the dummy operators recorded for this map join to the current work
                    if (context.linkChildOpWithDummyOp.containsKey(mj)) {
                        for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
                            work.addDummyOp((HashTableDummyOperator) dummy);
                        }
                    }
                    for (Entry<BaseWork, SparkEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
                        BaseWork parentWork = parentWorkMap.getKey();
                        LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
                        SparkEdgeProperty edgeProp = parentWorkMap.getValue();
                        sparkWork.connect(parentWork, work, edgeProp);
                        // now that the downstream work is known, set its name as the output name
                        // of the reduce sinks recorded for the parent work
                        for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
                            if (r.getConf().getOutputName() != null) {
                                LOG.debug("Cloning reduce sink for multi-child broadcast edge");
                                // we've already set this one up. Need to clone for the next work.
                                r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(), r.getParentOperators());
                            }
                            r.getConf().setOutputName(work.getName());
                        }
                    }
                }
            }
        }
        // clear out the set. we don't need it anymore.
        context.currentMapJoinOperators.clear();
    }
    // Detach root from its parent operators. Each parent is expected to be a
    // ReduceSinkOperator; for each one we set its output name and key/value descriptors and
    // record the edge property together with this reduce work in leafOpToFollowingWorkInfo,
    // so that the work ending at that reduce sink can later be connected to the work
    // associated with this root operator.
    if (root.getNumParent() > 0) {
        Preconditions.checkArgument(work instanceof ReduceWork, "AssertionError: expected work to be a ReduceWork, but was " + work.getClass().getName());
        ReduceWork reduceWork = (ReduceWork) work;
        // iterate over a copy because removeParent is called on root inside the loop
        for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
            Preconditions.checkArgument(parent instanceof ReduceSinkOperator, "AssertionError: expected operator to be a ReduceSinkOperator, but was " + parent.getClass().getName());
            ReduceSinkOperator rsOp = (ReduceSinkOperator) parent;
            SparkEdgeProperty edgeProp = GenSparkUtils.getEdgeProperty(rsOp, reduceWork);
            rsOp.getConf().setOutputName(reduceWork.getName());
            GenMapRedUtils.setKeyAndValueDesc(reduceWork, rsOp);
            context.leafOpToFollowingWorkInfo.put(rsOp, ObjectPair.create(edgeProp, reduceWork));
            LOG.debug("Removing " + parent + " as parent from " + root);
            root.removeParent(parent);
        }
    }
    // If union operators were collected for the current pipeline, reset the collection and
    // remember this work in workWithUnionOperators.
    // NOTE(review): presumably this is so a later step can remove
    // the union operators from the operator tree later.
    if (!context.currentUnionOperators.isEmpty()) {
        context.currentUnionOperators.clear();
        context.workWithUnionOperators.add(work);
    }
    // If a following work has already been recorded for this leaf operator, connect the
    // current work to it. Note that the concept of leaf and root is reversed here for historical
    // reasons. Roots are data sources, leaves are data sinks. I know.
    if (context.leafOpToFollowingWorkInfo.containsKey(operator)) {
        ObjectPair<SparkEdgeProperty, ReduceWork> childWorkInfo = context.leafOpToFollowingWorkInfo.get(operator);
        SparkEdgeProperty edgeProp = childWorkInfo.getFirst();
        ReduceWork childWork = childWorkInfo.getSecond();
        LOG.debug("Second pass. Leaf operator: " + operator + " has common downstream work:" + childWork);
        // we don't want to connect them with the work associated with TS more than once.
        if (sparkWork.getEdgeProperty(work, childWork) == null) {
            sparkWork.connect(work, childWork, edgeProp);
        } else {
            LOG.debug("work " + work.getName() + " is already connected to " + childWork.getName() + " before");
        }
    } else {
        LOG.debug("First pass. Leaf operator: " + operator);
    }
    // If the operator still has a child (exactly one is expected), the traversal continues:
    // the current work becomes the preceding work and
    // the next item will be a new root.
    if (!operator.getChildOperators().isEmpty()) {
        Preconditions.checkArgument(operator.getChildOperators().size() == 1, "AssertionError: expected operator.getChildOperators().size() to be 1, but was " + operator.getChildOperators().size());
        context.parentOfRoot = operator;
        context.currentRootOperator = operator.getChildOperators().get(0);
        context.preceedingWork = work;
    }
    return null;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) ArrayList(java.util.ArrayList) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) LinkedList(java.util.LinkedList) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)

Example 10 with SparkWork

use of org.apache.hadoop.hive.ql.plan.SparkWork in project hive by apache.

the class SparkSkewJoinProcFactory method supportRuntimeSkewJoin.

/**
 * Decides whether the runtime skew join handling can be applied to the given join.
 *
 * <p>All of the following must hold: the task is a SparkTask, skew join is enabled for the
 * join according to {@code GenMRSkewJoinProcessor.skewJoinEnabled}, the join is not fixed as
 * sorted, the task's SparkWork contains {@code reduceWork}, the task has at most one child
 * task, and exactly one CommonJoinOperator is found for {@code reduceWork}.
 *
 * @param joinOp the join operator under consideration
 * @param reduceWork the reduce work expected to hold the join
 * @param currTask the task that should own {@code reduceWork}
 * @param hiveConf configuration consulted for the skew join setting
 * @return {@code true} only when every condition above is satisfied
 */
private static boolean supportRuntimeSkewJoin(JoinOperator joinOp, ReduceWork reduceWork, Task<? extends Serializable> currTask, HiveConf hiveConf) {
    if (!(currTask instanceof SparkTask) || !GenMRSkewJoinProcessor.skewJoinEnabled(hiveConf, joinOp)) {
        return false;
    }

    SparkWork owningWork = ((SparkTask) currTask).getWork();
    List<Task<? extends Serializable>> dependents = currTask.getChildTasks();

    // the checks below are evaluated in the same order as the original conjunction
    if (joinOp.getConf().isFixedAsSorted()) {
        return false;
    }
    if (!owningWork.contains(reduceWork)) {
        return false;
    }
    if (dependents != null && dependents.size() > 1) {
        return false;
    }
    return SparkMapJoinResolver.getOp(reduceWork, CommonJoinOperator.class).size() == 1;
}
Also used : SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork)

Aggregations

SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork)13 BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)7 Serializable (java.io.Serializable)6 ArrayList (java.util.ArrayList)6 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)6 List (java.util.List)5 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)5 Operator (org.apache.hadoop.hive.ql.exec.Operator)5 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)5 Task (org.apache.hadoop.hive.ql.exec.Task)5 SparkTask (org.apache.hadoop.hive.ql.exec.spark.SparkTask)5 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)5 ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask)4 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)4 SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty)4 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)3 HashTableDummyOperator (org.apache.hadoop.hive.ql.exec.HashTableDummyOperator)3 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)3 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)3 MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)3