
Example 21 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class SparkSkewJoinProcFactory method splitTask.

/**
   * If the join is not in a leaf ReduceWork, the Spark task has to be split into two tasks.
   */
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork, ParseContext parseContext) throws SemanticException {
    SparkWork currentWork = currentTask.getWork();
    Set<Operator<?>> reduceSinkSet = SparkMapJoinResolver.getOp(reduceWork, ReduceSinkOperator.class);
    if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork) && reduceSinkSet.size() == 1) {
        ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
        BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
        SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
        // disconnect the reduce work from its child. this should produce two isolated sub graphs
        currentWork.disconnect(reduceWork, childWork);
        // move works following the current reduce work into a new spark work
        SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
        newWork.add(childWork);
        copyWorkGraph(currentWork, newWork, childWork);
        // remove them from current spark work
        for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
            currentWork.remove(baseWork);
            currentWork.getCloneToWork().remove(baseWork);
        }
        // create TS to read intermediate data
        Context baseCtx = parseContext.getContext();
        Path taskTmpDir = baseCtx.getMRTmpPath();
        Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
        TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
        // this will insert FS and TS between the RS and its parent
        TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
        // create new MapWork
        MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
        mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
        newWork.add(mapWork);
        newWork.connect(mapWork, childWork, originEdge);
        // setup the new map work
        String streamDesc = taskTmpDir.toUri().toString();
        if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
            Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
            String id = null;
            if (childReducer instanceof JoinOperator) {
                if (parseContext.getJoinOps().contains(childReducer)) {
                    id = ((JoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof MapJoinOperator) {
                if (parseContext.getMapJoinOps().contains(childReducer)) {
                    id = ((MapJoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof SMBMapJoinOperator) {
                if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
                    id = ((SMBMapJoinOperator) childReducer).getConf().getId();
                }
            }
            if (id != null) {
                streamDesc = id + ":$INTNAME";
            } else {
                streamDesc = "$INTNAME";
            }
            String origStreamDesc = streamDesc;
            int pos = 0;
            while (mapWork.getAliasToWork().get(streamDesc) != null) {
                streamDesc = origStreamDesc.concat(String.valueOf(++pos));
            }
        }
        GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
        // insert the new task between current task and its child
        @SuppressWarnings("unchecked") Task<? extends Serializable> newTask = TaskFactory.get(newWork, parseContext.getConf());
        List<Task<? extends Serializable>> childTasks = currentTask.getChildTasks();
        // must have at most one child
        if (childTasks != null && childTasks.size() > 0) {
            Task<? extends Serializable> childTask = childTasks.get(0);
            currentTask.removeDependentTask(childTask);
            newTask.addDependentTask(childTask);
        }
        currentTask.addDependentTask(newTask);
        newTask.setFetchSource(currentTask.isFetchSource());
    }
}
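To separate the graph surgery from the operator-level details (intermediate file table, tagging), the core of the split can be summarized in a minimal sketch. This is not Hive code: copySubGraph is a hypothetical stand-in for the private copyWorkGraph helper used above, and only SparkWork methods that appear in the snippet are used; it also assumes the downstream work graph is a tree.

import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SparkEdgeProperty;
import org.apache.hadoop.hive.ql.plan.SparkWork;

public class SplitSketch {
    // Move everything below reduceWork into a new SparkWork, fronted by a MapWork
    // that will re-read the intermediate data written between the two tasks.
    static SparkWork split(SparkWork currentWork, ReduceWork reduceWork, MapWork newMapWork, String queryId) {
        BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
        SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
        // cut the graph: reduceWork keeps its upstream, childWork loses its parent
        currentWork.disconnect(reduceWork, childWork);
        // move the downstream sub-graph into a new SparkWork
        SparkWork newWork = new SparkWork(queryId);
        newWork.add(childWork);
        copySubGraph(currentWork, newWork, childWork);
        for (BaseWork w : newWork.getAllWorkUnsorted()) {
            currentWork.remove(w);
        }
        // the new SparkWork starts from a MapWork that scans the temporary files
        newWork.add(newMapWork);
        newWork.connect(newMapWork, childWork, originEdge);
        return newWork;
    }

    // hypothetical helper: copy the edges reachable from start into the target work
    static void copySubGraph(SparkWork from, SparkWork to, BaseWork start) {
        for (BaseWork child : from.getChildren(start)) {
            to.add(child);
            to.connect(start, child, from.getEdgeProperty(start, child));
            copySubGraph(from, to, child);
        }
    }
}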
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Path(org.apache.hadoop.fs.Path) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)

Example 22 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class SortMergeJoinTaskDispatcher method convertSMBTaskToMapJoinTask.

// create a map join task, treating the table at bigTablePosition as the big table
private MapRedTask convertSMBTaskToMapJoinTask(MapredWork origWork, int bigTablePosition, SMBMapJoinOperator smbJoinOp) throws UnsupportedEncodingException, SemanticException {
    // deep copy a new mapred work
    MapredWork newWork = SerializationUtilities.clonePlan(origWork);
    // create a mapred task for this work
    MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext.getParseContext().getConf());
    // generate the map join operator; already checked the map join
    MapJoinOperator newMapJoinOp = getMapJoinOperator(newTask, newWork, smbJoinOp, bigTablePosition);
    // The reducer needs to be restored - Consider a query like:
    // select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
    // The reducer contains a groupby, which needs to be restored.
    ReduceWork rWork = newWork.getReduceWork();
    // create the local work for this plan
    MapJoinProcessor.genLocalWorkForMapJoin(newWork, newMapJoinOp, bigTablePosition);
    // restore the reducer
    newWork.setReduceWork(rWork);
    return newTask;
}
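The design choice worth noting: the original SMB work is never modified. The whole plan is deep-copied, the copy is rewritten in place, and any piece the rewrite clobbers (here, the reduce stage holding the group-by) is put back afterwards. A stripped-down sketch of that save/restore step, with mutate standing in for a rewriting helper such as genLocalWorkForMapJoin:

import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.ReduceWork;

class RestoreReduceSketch {
    // mutate is a placeholder for an in-place rewrite that may reset the reduce stage
    static void rewritePreservingReducer(MapredWork work, Runnable mutate) {
        ReduceWork saved = work.getReduceWork();  // remember the reduce stage
        mutate.run();                             // rewrite the map side in place
        work.setReduceWork(saved);                // restore whatever the rewrite dropped
    }
}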
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)

Example 23 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class GenMapRedUtils method initUnionPlan.

/**
   * Initialize the current union plan.
   *
   * @param op
   *          the reduce sink operator encountered
   * @param currUnionOp
   *          the union operator currently being processed
   * @param opProcCtx
   *          processing context
   * @param unionTask
   *          the task that will execute the union plan
   */
public static void initUnionPlan(ReduceSinkOperator op, UnionOperator currUnionOp, GenMRProcContext opProcCtx, Task<? extends Serializable> unionTask) throws SemanticException {
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    MapredWork plan = (MapredWork) unionTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap();
    opTaskMap.put(reducer, unionTask);
    plan.setReduceWork(new ReduceWork());
    plan.getReduceWork().setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();
    plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
    if (needsTagging(plan.getReduceWork())) {
        plan.getReduceWork().setNeedsTagging(true);
    }
    initUnionPlan(opProcCtx, currUnionOp, unionTask, false);
}
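Stripped of the opTaskMap bookkeeping and the tagging check, the reduce-side wiring above reduces to a few calls. The sketch below assumes, as the method does, that the reduce sink has exactly one child operator, which becomes the reducer of the plan's reduce stage:

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;

class UnionReduceSketch {
    static ReduceWork attachReducer(MapredWork plan, ReduceSinkOperator rs) {
        Operator<? extends OperatorDesc> reducer = rs.getChildOperators().get(0);
        ReduceWork rWork = new ReduceWork();
        rWork.setReducer(reducer);                                 // operator tree executed on the reduce side
        rWork.setNumReduceTasks(rs.getConf().getNumReducers());    // parallelism requested by the sink
        plan.setReduceWork(rWork);
        return rWork;
    }
}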
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) Serializable(java.io.Serializable) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 24 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class GenSparkWork method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    Preconditions.checkArgument(context != null, "AssertionError: expected context to be not null");
    Preconditions.checkArgument(context.currentTask != null, "AssertionError: expected context.currentTask to be not null");
    Preconditions.checkArgument(context.currentRootOperator != null, "AssertionError: expected context.currentRootOperator to be not null");
    // Operator is a file sink or reduce sink. Something that forces a new vertex.
    @SuppressWarnings("unchecked") Operator<? extends OperatorDesc> operator = (Operator<? extends OperatorDesc>) nd;
    // root is the start of the operator pipeline we're currently
    // packing into a vertex, typically a table scan, union or join
    Operator<?> root = context.currentRootOperator;
    LOG.debug("Root operator: " + root);
    LOG.debug("Leaf operator: " + operator);
    SparkWork sparkWork = context.currentTask.getWork();
    SMBMapJoinOperator smbOp = GenSparkUtils.getChildOperator(root, SMBMapJoinOperator.class);
    // Right now the work graph is pretty simple. If there is no
    // preceding work we have a root and will generate a map
    // vertex. If there is preceding work we will generate
    // a reduce vertex.
    BaseWork work;
    if (context.rootToWorkMap.containsKey(root)) {
        // having seen the root operator before means there was a branch in the
        // operator graph. There's typically two reasons for that: a) mux/demux
        // b) multi insert. Mux/Demux will hit the same leaf again, multi insert
        // will result into a vertex with multiple FS or RS operators.
        // At this point we don't have to do anything special in this case. Just
        // run through the regular paces w/o creating a new task.
        work = context.rootToWorkMap.get(root);
    } else {
        // create a new vertex
        if (context.preceedingWork == null) {
            if (smbOp == null) {
                work = utils.createMapWork(context, root, sparkWork, null);
            } else {
                //save work to be initialized later with SMB information.
                work = utils.createMapWork(context, root, sparkWork, null, true);
                context.smbMapJoinCtxMap.get(smbOp).mapWork = (MapWork) work;
            }
        } else {
            work = utils.createReduceWork(context, root, sparkWork);
        }
        context.rootToWorkMap.put(root, work);
    }
    if (!context.childToWorkMap.containsKey(operator)) {
        List<BaseWork> workItems = new LinkedList<BaseWork>();
        workItems.add(work);
        context.childToWorkMap.put(operator, workItems);
    } else {
        context.childToWorkMap.get(operator).add(work);
    }
    // remember which mapjoin operator links with which work
    if (!context.currentMapJoinOperators.isEmpty()) {
        for (MapJoinOperator mj : context.currentMapJoinOperators) {
            LOG.debug("Processing map join: " + mj);
            // remember the mapping in case we scan another branch of the mapjoin later
            if (!context.mapJoinWorkMap.containsKey(mj)) {
                List<BaseWork> workItems = new LinkedList<BaseWork>();
                workItems.add(work);
                context.mapJoinWorkMap.put(mj, workItems);
            } else {
                context.mapJoinWorkMap.get(mj).add(work);
            }
            /*
         * this happens in case of map join operations.
         * The tree looks like this:
         *
         *        RS <--- we are here perhaps
         *        |
         *     MapJoin
         *     /     \
         *   RS       TS
         *  /
         * TS
         *
         * If we are at the RS pointed above, and we may have already visited the
         * RS following the TS, we have already generated work for the TS-RS.
         * We need to hook the current work to this generated work.
         */
            if (context.linkOpWithWorkMap.containsKey(mj)) {
                Map<BaseWork, SparkEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
                if (linkWorkMap != null) {
                    if (context.linkChildOpWithDummyOp.containsKey(mj)) {
                        for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
                            work.addDummyOp((HashTableDummyOperator) dummy);
                        }
                    }
                    for (Entry<BaseWork, SparkEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
                        BaseWork parentWork = parentWorkMap.getKey();
                        LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
                        SparkEdgeProperty edgeProp = parentWorkMap.getValue();
                        sparkWork.connect(parentWork, work, edgeProp);
                        // now that we know the name of the downstream work, set it as the
                        // output name of the reduce sinks that feed into it
                        for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
                            if (r.getConf().getOutputName() != null) {
                                LOG.debug("Cloning reduce sink for multi-child broadcast edge");
                                // we've already set this one up. Need to clone for the next work.
                                r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(), r.getParentOperators());
                            }
                            r.getConf().setOutputName(work.getName());
                        }
                    }
                }
            }
        }
        // clear out the set. we don't need it anymore.
        context.currentMapJoinOperators.clear();
    }
    // Disconnect the current root from its parent reduce sinks, remembering the
    // edge information so parent works can later be connected with the work associated
    // with this root operator.
    if (root.getNumParent() > 0) {
        Preconditions.checkArgument(work instanceof ReduceWork, "AssertionError: expected work to be a ReduceWork, but was " + work.getClass().getName());
        ReduceWork reduceWork = (ReduceWork) work;
        for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
            Preconditions.checkArgument(parent instanceof ReduceSinkOperator, "AssertionError: expected operator to be a ReduceSinkOperator, but was " + parent.getClass().getName());
            ReduceSinkOperator rsOp = (ReduceSinkOperator) parent;
            SparkEdgeProperty edgeProp = GenSparkUtils.getEdgeProperty(rsOp, reduceWork);
            rsOp.getConf().setOutputName(reduceWork.getName());
            GenMapRedUtils.setKeyAndValueDesc(reduceWork, rsOp);
            context.leafOpToFollowingWorkInfo.put(rsOp, ObjectPair.create(edgeProp, reduceWork));
            LOG.debug("Removing " + parent + " as parent from " + root);
            root.removeParent(parent);
        }
    }
    // remember any work that contains union operators, so that we can remove
    // the union operators from the operator tree later.
    if (!context.currentUnionOperators.isEmpty()) {
        context.currentUnionOperators.clear();
        context.workWithUnionOperators.add(work);
    }
    // Note: the concept of leaf and root is reversed in Hive for historical
    // reasons. Roots are data sources, leaves are data sinks. I know.
    if (context.leafOpToFollowingWorkInfo.containsKey(operator)) {
        ObjectPair<SparkEdgeProperty, ReduceWork> childWorkInfo = context.leafOpToFollowingWorkInfo.get(operator);
        SparkEdgeProperty edgeProp = childWorkInfo.getFirst();
        ReduceWork childWork = childWorkInfo.getSecond();
        LOG.debug("Second pass. Leaf operator: " + operator + " has common downstream work:" + childWork);
        // the same downstream work can be reached through several branches;
        // we don't want to connect them with the work associated with TS more than once.
        if (sparkWork.getEdgeProperty(work, childWork) == null) {
            sparkWork.connect(work, childWork, edgeProp);
        } else {
            LOG.debug("work " + work.getName() + " is already connected to " + childWork.getName() + " before");
        }
    } else {
        LOG.debug("First pass. Leaf operator: " + operator);
    }
    // if this operator has a child, cut the operator tree here and continue from it:
    // the next item will be a new root.
    if (!operator.getChildOperators().isEmpty()) {
        Preconditions.checkArgument(operator.getChildOperators().size() == 1, "AssertionError: expected operator.getChildOperators().size() to be 1, but was " + operator.getChildOperators().size());
        context.parentOfRoot = operator;
        context.currentRootOperator = operator.getChildOperators().get(0);
        context.preceedingWork = work;
    }
    return null;
}
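Two pieces of bookkeeping above (childToWorkMap and mapJoinWorkMap) follow the same "append to a list keyed by an operator" pattern, written out with explicit containsKey checks. Purely as an illustration of that pattern, the same thing can be expressed with computeIfAbsent; the key and value types here stand in for Operator<?> and BaseWork:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class MultiMapSketch {
    // append value to the list stored under key, creating the list on first use
    static <K, V> void addMapping(Map<K, List<V>> map, K key, V value) {
        map.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
    }

    public static void main(String[] args) {
        Map<String, List<String>> childToWork = new HashMap<>();
        addMapping(childToWork, "RS_3", "Reducer 2");
        addMapping(childToWork, "RS_3", "Reducer 3");
        System.out.println(childToWork);  // {RS_3=[Reducer 2, Reducer 3]}
    }
}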
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) ArrayList(java.util.ArrayList) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) LinkedList(java.util.LinkedList) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)

Example 25 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class SplitSparkWorkResolver method setStatistics.

// we lost statistics & opTraits through cloning, try to get them back
private void setStatistics(BaseWork origin, BaseWork clone) {
    if (origin instanceof MapWork && clone instanceof MapWork) {
        MapWork originMW = (MapWork) origin;
        MapWork cloneMW = (MapWork) clone;
        for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : originMW.getAliasToWork().entrySet()) {
            String alias = entry.getKey();
            Operator<? extends OperatorDesc> cloneOP = cloneMW.getAliasToWork().get(alias);
            if (cloneOP != null) {
                setStatistics(entry.getValue(), cloneOP);
            }
        }
    } else if (origin instanceof ReduceWork && clone instanceof ReduceWork) {
        setStatistics(((ReduceWork) origin).getReducer(), ((ReduceWork) clone).getReducer());
    }
}
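The operator-level overload of setStatistics is not shown in this snippet. A plausible sketch of what it has to do, assuming OperatorDesc exposes the usual statistics and traits accessors (getStatistics/setStatistics, getTraits/setTraits) and that the cloned operator tree has the same shape as the original:

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

class OperatorStatsSketch {
    static void copyStatistics(Operator<? extends OperatorDesc> origin, Operator<? extends OperatorDesc> clone) {
        // copy what was lost through cloning: statistics and operator traits
        clone.getConf().setStatistics(origin.getConf().getStatistics());
        clone.getConf().setTraits(origin.getConf().getTraits());
        // recurse only while the two trees still line up child-for-child
        if (origin.getChildOperators() != null && clone.getChildOperators() != null
                && origin.getChildOperators().size() == clone.getChildOperators().size()) {
            for (int i = 0; i < origin.getChildOperators().size(); i++) {
                copyStatistics(origin.getChildOperators().get(i), clone.getChildOperators().get(i));
            }
        }
    }
}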
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) Map(java.util.Map) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Aggregations

ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 34 usages
ArrayList (java.util.ArrayList): 12 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 12 usages
Path (org.apache.hadoop.fs.Path): 11 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10 usages
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 10 usages
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 9 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 8 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 7 usages
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 7 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 6 usages
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 6 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 6 usages
SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc): 6 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 5 usages
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 5 usages
JobConf (org.apache.hadoop.mapred.JobConf): 5 usages
IOException (java.io.IOException): 4 usages
CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator): 4 usages