
Example 31 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

The class DagUtils, method createVertex.

private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
    Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
    if (mergeJoinWork.getMainWork() instanceof MapWork) {
        List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
        MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
        Vertex mergeVx = createVertex(conf, mapWork, appJarLr, additionalLr, fs, mrScratchDir, ctx, vertexType);
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set
        // to false when using this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
        for (int i = 0; i < mapWorkList.size(); i++) {
            mapWork = (MapWork) (mapWorkList.get(i));
            conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
            conf.set(Utilities.INPUT_NAME, mapWork.getName());
            LOG.info("Going through each work and adding MultiMRInput");
            mergeVx.addDataSource(mapWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
        }
        VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
        // the +1 to the size is because of the main work.
        CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1);
        DataOutputBuffer dob = new DataOutputBuffer();
        vertexConf.write(dob);
        byte[] userPayload = dob.getData();
        desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
        mergeVx.setVertexManagerPlugin(desc);
        return mergeVx;
    } else {
        Vertex mergeVx = createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), appJarLr, additionalLr, fs, mrScratchDir, ctx);
        return mergeVx;
    }
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) VertexManagerPluginDescriptor(org.apache.tez.dag.api.VertexManagerPluginDescriptor) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
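
The payload wiring at the end of this example is a reusable Tez pattern: serialize a Writable configuration with a DataOutputBuffer and hand it to the vertex manager plugin as a UserPayload. A minimal sketch of just that step is below; PayloadUtil is a hypothetical helper name, not part of Hive or Tez.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.tez.dag.api.UserPayload;

// Hypothetical helper, for illustration only.
public final class PayloadUtil {
    // Serialize any Hadoop Writable (such as the CustomVertexConfiguration above) into a Tez UserPayload.
    public static UserPayload toUserPayload(Writable conf) throws IOException {
        DataOutputBuffer dob = new DataOutputBuffer();
        // The Writable writes itself into the buffer.
        conf.write(dob);
        // Wrap only the valid portion of the buffer; getData() may be longer than getLength().
        return UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength()));
    }
}

The example above wraps getData() directly, which also works; limiting the wrap to getLength() merely avoids shipping the buffer's unused tail.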

Example 32 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

The class SplitSparkWorkResolver, method splitSparkWork.

private void splitSparkWork(SparkWork sparkWork) {
    Queue<BaseWork> queue = new LinkedList<BaseWork>();
    Set<BaseWork> visited = new HashSet<BaseWork>();
    queue.addAll(sparkWork.getRoots());
    while (!queue.isEmpty()) {
        BaseWork work = queue.poll();
        if (!visited.add(work)) {
            continue;
        }
        List<BaseWork> childWorks = sparkWork.getChildren(work);
        // First, add all children of this work into queue, to be processed later.
        for (BaseWork w : childWorks) {
            queue.add(w);
        }
        // Second, check if this work has multiple reduceSinks. If so, do split.
        splitBaseWork(sparkWork, work, childWorks);
    }
}
Also used : BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) LinkedList(java.util.LinkedList) HashSet(java.util.HashSet)
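
The loop in splitSparkWork is a plain breadth-first traversal of the SparkWork DAG, with a visited set so a work reachable from two parents is processed only once. The same shape, written generically against a hypothetical children function rather than SparkWork itself (GraphWalk and bfs are illustrative names, not Hive APIs):

import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Function;

// Hypothetical helper, for illustration only.
final class GraphWalk {
    static <T> void bfs(Collection<T> roots, Function<T, List<T>> children, Consumer<T> visit) {
        Queue<T> queue = new LinkedList<>(roots);
        Set<T> visited = new HashSet<>();
        while (!queue.isEmpty()) {
            T node = queue.poll();
            if (!visited.add(node)) {
                continue; // already handled; the work graph is a DAG, not a tree
            }
            // Enqueue children before processing the node, mirroring splitSparkWork.
            queue.addAll(children.apply(node));
            visit.accept(node); // in the example this is splitBaseWork(sparkWork, work, childWorks)
        }
    }
}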

Example 33 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

The class SplitSparkWorkResolver, method splitBaseWork.

// Split the work into multiple branches, one for each childWork in childWorks.
// It also sets up the connection between each cloned parent work and its child work.
private void splitBaseWork(SparkWork sparkWork, BaseWork parentWork, List<BaseWork> childWorks) {
    if (getAllReduceSinks(parentWork).size() <= 1) {
        return;
    }
    // Grand-parent works - we need to set these to be the parents of the cloned works.
    List<BaseWork> grandParentWorks = sparkWork.getParents(parentWork);
    boolean isFirst = true;
    for (BaseWork childWork : childWorks) {
        BaseWork clonedParentWork = SerializationUtilities.cloneBaseWork(parentWork);
        // give the cloned work a different name
        clonedParentWork.setName(clonedParentWork.getName().replaceAll("^([a-zA-Z]+)(\\s+)(\\d+)", "$1$2" + GenSparkUtils.getUtils().getNextSeqNumber()));
        setStatistics(parentWork, clonedParentWork);
        String childReducerName = childWork.getName();
        SparkEdgeProperty clonedEdgeProperty = sparkWork.getEdgeProperty(parentWork, childWork);
        // Prune the cloned work's leaf operators: keep only the ReduceSinkOperator whose output
        // name matches this child work, and on every clone after the first also drop leaves that
        // are not ReduceSinks.
        for (Operator<?> op : clonedParentWork.getAllLeafOperators()) {
            if (op instanceof ReduceSinkOperator) {
                if (!((ReduceSinkOperator) op).getConf().getOutputName().equals(childReducerName)) {
                    removeOpRecursive(op);
                }
            } else if (!isFirst) {
                removeOpRecursive(op);
            }
        }
        isFirst = false;
        // Then, we need to set up the graph connection. Especially:
        // 1, we need to connect this cloned parent work with all the grand-parent works.
        // 2, we need to connect this cloned parent work with the corresponding child work.
        sparkWork.add(clonedParentWork);
        for (BaseWork gpw : grandParentWorks) {
            sparkWork.connect(gpw, clonedParentWork, sparkWork.getEdgeProperty(gpw, parentWork));
        }
        sparkWork.connect(clonedParentWork, childWork, clonedEdgeProperty);
        sparkWork.getCloneToWork().put(clonedParentWork, parentWork);
    }
    sparkWork.remove(parentWork);
}
Also used : SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
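
The rename of the cloned work relies on a small regex trick: work names follow a "word, whitespace, number" shape, and replaceAll keeps the word and whitespace (groups 1 and 2) while substituting a fresh sequence number for group 3. A standalone illustration is below; the literal name and number are made up, and in the real code they come from clonedParentWork.getName() and GenSparkUtils.getUtils().getNextSeqNumber().

// Throwaway demo class, for illustration only.
public class RenameDemo {
    public static void main(String[] args) {
        String name = "Reducer 2"; // example input, assuming names of the form "Map 1", "Reducer 2"
        int nextSeq = 7;           // stands in for the next sequence number
        String renamed = name.replaceAll("^([a-zA-Z]+)(\\s+)(\\d+)", "$1$2" + nextSeq);
        System.out.println(renamed); // prints "Reducer 7"
    }
}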

Example 34 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

The class GenTezWork, method getFollowingWorkIndex.

private int getFollowingWorkIndex(TezWork tezWork, UnionWork unionWork, ReduceSinkOperator rs) throws SemanticException {
    int index = 0;
    for (BaseWork baseWork : tezWork.getChildren(unionWork)) {
        TezEdgeProperty edgeProperty = tezWork.getEdgeProperty(unionWork, baseWork);
        if (edgeProperty.getEdgeType() != TezEdgeProperty.EdgeType.CONTAINS) {
            return index;
        }
        index++;
    }
    throw new SemanticException("Following work not found for the reduce sink: " + rs.getName());
}
Also used : TezEdgeProperty(org.apache.hadoop.hive.ql.plan.TezEdgeProperty) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)

Example 35 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

The class TestGenTezWork, method testCreateMap.

@Test
public void testCreateMap() throws SemanticException {
    proc.process(rs, null, ctx, (Object[]) null);
    assertNotNull(ctx.currentTask);
    assertTrue(ctx.rootTasks.contains(ctx.currentTask));
    TezWork work = ctx.currentTask.getWork();
    assertEquals(work.getAllWork().size(), 1);
    BaseWork w = work.getAllWork().get(0);
    assertTrue(w instanceof MapWork);
    MapWork mw = (MapWork) w;
    // need to make sure names are set for tez to connect things right
    assertNotNull(w.getName());
    // map work should start with our ts op
    assertSame(mw.getAliasToWork().entrySet().iterator().next().getValue(), ts);
    // preceding work must be set to the newly generated map
    assertSame(ctx.preceedingWork, mw);
    // should have a new root now
    assertSame(ctx.currentRootOperator, fs);
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) Test(org.junit.Test)

Aggregations

BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 54
ArrayList (java.util.ArrayList): 16
Operator (org.apache.hadoop.hive.ql.exec.Operator): 14
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 14
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 11
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 11
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 10
LinkedList (java.util.LinkedList): 9
HashTableDummyOperator (org.apache.hadoop.hive.ql.exec.HashTableDummyOperator): 9
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 9
TezWork (org.apache.hadoop.hive.ql.plan.TezWork): 9
List (java.util.List): 8
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 8
JobConf (org.apache.hadoop.mapred.JobConf): 8
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 7
SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty): 7
SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork): 7
CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator): 6
DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator): 6
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 6