Search in sources :

Example 6 with MergeJoinWork

Use of org.apache.hadoop.hive.ql.plan.MergeJoinWork in the Apache Hive project.

Class DagUtils, method createVertex.

/**
   * Builds the vertex for one unit of work by dispatching to the createVertex
   * overload that matches the concrete type of {@code work} (MapWork, ReduceWork
   * or MergeJoinWork). When the work gathers stats, a stats publisher is
   * initialized as well, and a vertex without children receives a data sink
   * named {@code "out_" + work.getName()}.
   *
   * @param conf JobConf to be used for this execution unit
   * @param work The instance of BaseWork representing the actual work to be performed
   * by this vertex.
   * @param scratchDir HDFS scratch dir for this execution unit.
   * @param appJarLr Local resource for hive-exec.
   * @param additionalLr further local resources, forwarded unchanged to the
   * type-specific overload
   * @param fileSystem FS corresponding to scratchDir and LocalResources
   * @param ctx This query's context
   * @param hasChildren when false, a data sink is attached to the vertex before
   * it is returned
   * @param tezWork not read by this method
   * @param vertexType forwarded to the MapWork and MergeJoinWork overloads
   * @return the newly created vertex
   * @throws HiveException if {@code work} is none of the supported types, or if
   * the stats publisher fails to initialize while HIVE_STATS_RELIABLE is enabled
   */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType) throws Exception {
    // Pick the overload that corresponds to the concrete subtype of BaseWork.
    final Vertex vertex;
    if (work instanceof MapWork) {
        vertex = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else if (work instanceof ReduceWork) {
        vertex = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
    } else if (work instanceof MergeJoinWork) {
        vertex = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else {
        // An unknown work type means something is seriously wrong.
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }

    // Set up the stats publisher, but only for work that actually gathers stats.
    if (work.isGatheringStats()) {
        StatsFactory statsFactory = StatsFactory.newFactory(conf);
        if (statsFactory != null) {
            StatsCollectionContext statsContext = new StatsCollectionContext(conf);
            statsContext.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
            StatsPublisher publisher = statsFactory.getStatsPublisher();
            boolean initialized = publisher.init(statsContext);
            // A failed init is fatal only when reliable stats were requested;
            // otherwise it is tolerated.
            if (!initialized && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
            }
        }
    }

    if (hasChildren) {
        return vertex;
    }

    // Final vertices need to have at least one output.
    String sinkName = "out_" + work.getName();
    DataSinkDescriptor sink = new DataSinkDescriptor(OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf)), null, null);
    vertex.addDataSink(sinkName, sink);
    return vertex;
}
Also used : StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor)

Example 7 with MergeJoinWork

Use of org.apache.hadoop.hive.ql.plan.MergeJoinWork in the Apache Hive project.

Class MergeJoinProc, method process.

/**
   * Handles a CommonMergeJoinOperator encountered during the operator walk.
   *
   * Looks up (or creates and registers) the MergeJoinWork for the operator. If
   * the operator directly below the top of the stack is a DummyStoreOperator,
   * the work that operator belongs to is folded into the merge work: its edges
   * in the TezWork graph are re-pointed at the merge work, it is removed from
   * the graph, and the dummy store is unlinked from the merge join operator.
   *
   * @param nd the node being processed; cast to CommonMergeJoinOperator
   * @param stack the current walk stack; the entry just below the top is treated
   * as the parent operator
   * @param procCtx the processing context; cast to GenTezProcContext
   * @param nodeOutputs not read by this method
   * @return {@code null} when the stack holds fewer than two entries or the
   * parent is not a DummyStoreOperator; {@code true} once a dummy store's work
   * has been merged
   * @throws SemanticException declared by the overridden method
   */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    final GenTezProcContext context = (GenTezProcContext) procCtx;
    final CommonMergeJoinOperator mergeJoinOp = (CommonMergeJoinOperator) nd;

    final int depth = stack.size();
    if (depth < 2) {
        // Safety check before the parent operator is read from the stack below.
        // It is very unlikely that the stack holds fewer than two entries, i.e.
        // that the MergeJoinOperator is the only element on it.
        context.currentMergeJoinOperator = mergeJoinOp;
        return null;
    }

    final TezWork tezWork = context.currentTask.getWork();
    // The entry directly below the top of the stack is the parent operator.
    final Node parentNode = stack.get(depth - 2);
    @SuppressWarnings("unchecked") Operator<? extends OperatorDesc> parentOp = (Operator<? extends OperatorDesc>) parentNode;

    // The merge work may already have been created as part of the dummy store
    // walk. Reuse it when present; otherwise create one, add it to the tez work
    // and remember it for this merge join operator.
    final MergeJoinWork mergeWork;
    if (!context.opMergeJoinWorkMap.containsKey(mergeJoinOp)) {
        mergeWork = new MergeJoinWork();
        tezWork.add(mergeWork);
        context.opMergeJoinWorkMap.put(mergeJoinOp, mergeWork);
    } else {
        mergeWork = context.opMergeJoinWorkMap.get(mergeJoinOp);
    }

    if (!(parentNode instanceof DummyStoreOperator)) {
        /* The parent is not a dummy store. This may happen when the walk
         * arrives over the second branch of a plan such as:
         *
         *   TS[0], FIL[26], SEL[2], DUMMY_STORE[30], MERGEJOIN[29]
         *                                           /
         *   TS[3], FIL[27], SEL[5], ----------------
         */
        context.currentMergeJoinOperator = mergeJoinOp;
        mergeWork.setTag(mergeJoinOp.getTagForOperator(parentOp));
        return null;
    }
    final DummyStoreOperator dummyOp = (DummyStoreOperator) parentNode;

    // Guaranteed to be just 1 because each DummyStoreOperator can be part of only one work.
    final BaseWork parentWork = context.childToWorkMap.get(parentOp).get(0);
    mergeWork.addMergedWork(null, parentWork, context.leafOperatorToFollowingWork);
    mergeWork.setMergeJoinOperator(mergeJoinOp);
    tezWork.setVertexType(mergeWork, VertexType.MULTI_INPUT_UNINITIALIZED_EDGES);

    // Re-point every incoming edge of the parent work at the merge work.
    for (BaseWork upstream : tezWork.getParents(parentWork)) {
        TezEdgeProperty incomingEdge = tezWork.getEdgeProperty(upstream, parentWork);
        tezWork.disconnect(upstream, parentWork);
        tezWork.connect(upstream, mergeWork, incomingEdge);
    }
    // Likewise for every outgoing edge of the parent work.
    for (BaseWork downstream : tezWork.getChildren(parentWork)) {
        TezEdgeProperty outgoingEdge = tezWork.getEdgeProperty(parentWork, downstream);
        tezWork.disconnect(parentWork, downstream);
        tezWork.connect(mergeWork, downstream, outgoingEdge);
    }
    tezWork.remove(parentWork);

    // The tag is looked up while the dummy store is still registered as a
    // parent of the merge join operator; only afterwards is it unlinked.
    parentWork.setTag(mergeJoinOp.getTagForOperator(dummyOp));
    mergeJoinOp.getParentOperators().remove(dummyOp);
    dummyOp.getChildOperators().clear();
    return true;
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) TezEdgeProperty(org.apache.hadoop.hive.ql.plan.TezEdgeProperty) GenTezProcContext(org.apache.hadoop.hive.ql.parse.GenTezProcContext) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)

Aggregations

MergeJoinWork (org.apache.hadoop.hive.ql.plan.MergeJoinWork)7 BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)6 ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork)5 ArrayList (java.util.ArrayList)4 CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator)3 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)3 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)3 TezWork (org.apache.hadoop.hive.ql.plan.TezWork)3 StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext)3 StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory)3 StatsPublisher (org.apache.hadoop.hive.ql.stats.StatsPublisher)3 DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor)3 PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex)3 Vertex (org.apache.tez.dag.api.Vertex)3 LinkedList (java.util.LinkedList)2 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)2 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)2 Operator (org.apache.hadoop.hive.ql.exec.Operator)2 TezEdgeProperty (org.apache.hadoop.hive.ql.plan.TezEdgeProperty)2 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)2