
Example 6 with Edge

use of org.apache.tez.dag.api.Edge in project tez by apache.

the class TestMRRJobsDAGApi method testMRRSleepJobDagSubmitCore.

public State testMRRSleepJobDagSubmitCore(boolean dagViaRPC, boolean killDagWhileRunning, boolean closeSessionBeforeSubmit, TezClient reUseTezSession, boolean genSplitsInAM, Class<? extends InputInitializer> initializerClass, Map<String, LocalResource> additionalLocalResources) throws IOException, InterruptedException, TezException, ClassNotFoundException, YarnException {
    LOG.info("\n\n\nStarting testMRRSleepJobDagSubmit().");
    JobConf stage1Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage2Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage3Conf = new JobConf(mrrTezCluster.getConfig());
    stage1Conf.setLong(MRRSleepJob.MAP_SLEEP_TIME, 1);
    stage1Conf.setInt(MRRSleepJob.MAP_SLEEP_COUNT, 1);
    stage1Conf.setInt(MRJobConfig.NUM_MAPS, 1);
    stage1Conf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
    stage1Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
    stage2Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage2Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage2Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage2Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
    stage3Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage3Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage3Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage3Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    MRHelpers.translateMRConfToTez(stage1Conf);
    MRHelpers.translateMRConfToTez(stage2Conf);
    MRHelpers.translateMRConfToTez(stage3Conf);
    MRHelpers.configureMRApiUsage(stage1Conf);
    MRHelpers.configureMRApiUsage(stage2Conf);
    MRHelpers.configureMRApiUsage(stage3Conf);
    Path remoteStagingDir = remoteFs.makeQualified(new Path("/tmp", String.valueOf(new Random().nextInt(100000))));
    TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);
    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
    UserPayload stage2Payload = TezUtils.createUserPayloadFromConf(stage2Conf);
    UserPayload stage3Payload = TezUtils.createUserPayloadFromConf(stage3Conf);
    DAG dag = DAG.create("testMRRSleepJobDagSubmit-" + random.nextInt(1000));
    Class<? extends InputInitializer> inputInitializerClazz = genSplitsInAM ? (initializerClass == null ? MRInputAMSplitGenerator.class : initializerClass) : null;
    LOG.info("Using initializer class: " + initializerClass);
    DataSourceDescriptor dsd;
    if (!genSplitsInAM) {
        dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, remoteStagingDir, true);
    } else {
        if (initializerClass == null) {
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).build();
        } else {
            InputInitializerDescriptor iid = InputInitializerDescriptor.create(inputInitializerClazz.getName());
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).setCustomInitializerDescriptor(iid).build();
        }
    }
    Vertex stage1Vertex = Vertex.create("map", ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(stage1Payload), dsd.getNumberOfShards(), Resource.newInstance(256, 1));
    stage1Vertex.addDataSource("MRInput", dsd);
    Vertex stage2Vertex = Vertex.create("ireduce", ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage2Payload), 1, Resource.newInstance(256, 1));
    Vertex stage3Vertex = Vertex.create("reduce", ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage3Payload), 1, Resource.newInstance(256, 1));
    stage3Conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
    DataSinkDescriptor dataSinkDescriptor = MROutputLegacy.createConfigBuilder(stage3Conf, NullOutputFormat.class).build();
    Assert.assertFalse(dataSinkDescriptor.getOutputDescriptor().getHistoryText().isEmpty());
    stage3Vertex.addDataSink("MROutput", dataSinkDescriptor);
    // TODO env, resources
    dag.addVertex(stage1Vertex);
    dag.addVertex(stage2Vertex);
    dag.addVertex(stage3Vertex);
    Edge edge1 = Edge.create(stage1Vertex, stage2Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage2Payload), InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage2Payload)));
    Edge edge2 = Edge.create(stage2Vertex, stage3Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage3Payload), InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage3Payload)));
    dag.addEdge(edge1);
    dag.addEdge(edge2);
    TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());
    DAGClient dagClient = null;
    boolean reuseSession = reUseTezSession != null;
    TezClient tezSession = null;
    if (!dagViaRPC) {
        Preconditions.checkArgument(!reuseSession);
    }
    if (!reuseSession) {
        TezConfiguration tempTezconf = new TezConfiguration(tezConf);
        tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, dagViaRPC);
        tezSession = TezClient.create("testsession", tempTezconf);
        tezSession.start();
    } else {
        tezSession = reUseTezSession;
    }
    if (!dagViaRPC) {
        // TODO Use utility method post TEZ-205 to figure out AM arguments etc.
        dagClient = tezSession.submitDAG(dag);
    }
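    // For the close-before-submit case: wait for the AM to reach RUNNING, stop the
    // session, then assert the application winds down as FINISHED/SUCCEEDED.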
    if (dagViaRPC && closeSessionBeforeSubmit) {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(mrrTezCluster.getConfig());
        yarnClient.start();
        boolean sentKillSession = false;
        while (true) {
            Thread.sleep(500L);
            ApplicationReport appReport = yarnClient.getApplicationReport(tezSession.getAppMasterApplicationId());
            if (appReport == null) {
                continue;
            }
            YarnApplicationState appState = appReport.getYarnApplicationState();
            if (!sentKillSession) {
                if (appState == YarnApplicationState.RUNNING) {
                    tezSession.stop();
                    sentKillSession = true;
                }
            } else {
                if (appState == YarnApplicationState.FINISHED || appState == YarnApplicationState.KILLED || appState == YarnApplicationState.FAILED) {
                    LOG.info("Application completed after sending session shutdown" + ", yarnApplicationState=" + appState + ", finalAppStatus=" + appReport.getFinalApplicationStatus());
                    Assert.assertEquals(YarnApplicationState.FINISHED, appState);
                    Assert.assertEquals(FinalApplicationStatus.SUCCEEDED, appReport.getFinalApplicationStatus());
                    break;
                }
            }
        }
        yarnClient.stop();
        return null;
    }
    if (dagViaRPC) {
        LOG.info("Submitting dag to tez session with appId=" + tezSession.getAppMasterApplicationId() + " and Dag Name=" + dag.getName());
        if (additionalLocalResources != null) {
            tezSession.addAppMasterLocalFiles(additionalLocalResources);
        }
        dagClient = tezSession.submitDAG(dag);
        Assert.assertEquals(TezAppMasterStatus.RUNNING, tezSession.getAppMasterStatus());
    }
    DAGStatus dagStatus = dagClient.getDAGStatus(null);
    while (!dagStatus.isCompleted()) {
        LOG.info("Waiting for job to complete. Sleeping for 500ms." + " Current state: " + dagStatus.getState());
        Thread.sleep(500L);
        if (killDagWhileRunning && dagStatus.getState() == DAGStatus.State.RUNNING) {
            LOG.info("Killing running dag/session");
            if (dagViaRPC) {
                tezSession.stop();
            } else {
                dagClient.tryKillDAG();
            }
        }
        dagStatus = dagClient.getDAGStatus(null);
    }
    if (!reuseSession) {
        tezSession.stop();
    }
    return dagStatus.getState();
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) YarnApplicationState(org.apache.hadoop.yarn.api.records.YarnApplicationState) TezClient(org.apache.tez.client.TezClient) Random(java.util.Random) SleepInputFormat(org.apache.tez.mapreduce.examples.MRRSleepJob.SleepInputFormat) DAGStatus(org.apache.tez.dag.api.client.DAGStatus) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) MapProcessor(org.apache.tez.mapreduce.processor.map.MapProcessor) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) ISleepReducer(org.apache.tez.mapreduce.examples.MRRSleepJob.ISleepReducer) Path(org.apache.hadoop.fs.Path) UserPayload(org.apache.tez.dag.api.UserPayload) DAG(org.apache.tez.dag.api.DAG) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) ReduceProcessor(org.apache.tez.mapreduce.processor.reduce.ReduceProcessor) ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) SleepReducer(org.apache.tez.mapreduce.examples.MRRSleepJob.SleepReducer) ISleepReducer(org.apache.tez.mapreduce.examples.MRRSleepJob.ISleepReducer) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) DAGClient(org.apache.tez.dag.api.client.DAGClient) SleepMapper(org.apache.tez.mapreduce.examples.MRRSleepJob.SleepMapper) MRRSleepJobPartitioner(org.apache.tez.mapreduce.examples.MRRSleepJob.MRRSleepJobPartitioner) Edge(org.apache.tez.dag.api.Edge) NullOutputFormat(org.apache.hadoop.mapreduce.lib.output.NullOutputFormat)
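
Stripped of its toggles, the core pattern this test exercises is: start a session-mode TezClient, submit the DAG, and poll DAGStatus until completion. A minimal sketch, assuming a pre-built DAG named dag and using the same types as the import list above:

// Minimal sketch: submit a DAG to a session-mode TezClient and poll until done.
TezConfiguration tezConf = new TezConfiguration();
tezConf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
TezClient session = TezClient.create("sketch-session", tezConf);
session.start();
try {
    DAGClient dagClient = session.submitDAG(dag);
    DAGStatus dagStatus = dagClient.getDAGStatus(null);
    while (!dagStatus.isCompleted()) {
        // Poll on the same 500ms cadence the test uses.
        Thread.sleep(500L);
        dagStatus = dagClient.getDAGStatus(null);
    }
} finally {
    session.stop();
}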

Example 7 with Edge

use of org.apache.tez.dag.api.Edge in project hive by apache.

the class TestTezTask method setUp.

@Before
public void setUp() throws Exception {
    utils = mock(DagUtils.class);
    fs = mock(FileSystem.class);
    path = mock(Path.class);
    when(path.getFileSystem(any())).thenReturn(fs);
    when(utils.getTezDir(any())).thenReturn(path);
    when(utils.createVertex(any(), any(BaseWork.class), any(Path.class), any(TezWork.class), anyMap())).thenAnswer(new Answer<Vertex>() {

        @Override
        public Vertex answer(InvocationOnMock invocation) throws Throwable {
            Object[] args = invocation.getArguments();
            return Vertex.create(((BaseWork) args[1]).getName(), mock(ProcessorDescriptor.class), 0, mock(Resource.class));
        }
    });
    when(utils.createEdge(any(), any(Vertex.class), any(Vertex.class), any(TezEdgeProperty.class), any(BaseWork.class), any(TezWork.class))).thenAnswer(new Answer<Edge>() {

        @Override
        public Edge answer(InvocationOnMock invocation) throws Throwable {
            Object[] args = invocation.getArguments();
            return Edge.create((Vertex) args[1], (Vertex) args[2], mock(EdgeProperty.class));
        }
    });
    work = new TezWork("", null);
    mws = new MapWork[] { new MapWork(), new MapWork() };
    rws = new ReduceWork[] { new ReduceWork(), new ReduceWork() };
    work.addAll(mws);
    work.addAll(rws);
    int i = 0;
    for (BaseWork w : work.getAllWork()) {
        w.setName("Work " + (++i));
    }
    op = mock(Operator.class);
    Map<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    map.put("foo", op);
    mws[0].setAliasToWork(map);
    mws[1].setAliasToWork(map);
    Map<Path, List<String>> pathMap = new LinkedHashMap<>();
    List<String> aliasList = new ArrayList<String>();
    aliasList.add("foo");
    pathMap.put(new Path("foo"), aliasList);
    mws[0].setPathToAliases(pathMap);
    mws[1].setPathToAliases(pathMap);
    rws[0].setReducer(op);
    rws[1].setReducer(op);
    TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.SIMPLE_EDGE);
    work.connect(mws[0], rws[0], edgeProp);
    work.connect(mws[1], rws[0], edgeProp);
    work.connect(rws[0], rws[1], edgeProp);
    task = new TezTask(utils);
    task.setWork(work);
    task.setConsole(mock(LogHelper.class));
    QueryPlan mockQueryPlan = mock(QueryPlan.class);
    doReturn(UUID.randomUUID().toString()).when(mockQueryPlan).getQueryId();
    task.setQueryPlan(mockQueryPlan);
    conf = new JobConf();
    appLr = createResource("foo.jar");
    HiveConf hiveConf = new HiveConf();
    hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
    SessionState.start(hiveConf);
    session = mock(TezClient.class);
    sessionState = mock(TezSessionState.class);
    when(sessionState.getSession()).thenReturn(session);
    when(sessionState.reopen()).thenReturn(sessionState);
    when(session.submitDAG(any(DAG.class))).thenThrow(new SessionNotRunning("")).thenReturn(mock(DAGClient.class));
}
Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) Vertex(org.apache.tez.dag.api.Vertex) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) TezEdgeProperty(org.apache.hadoop.hive.ql.plan.TezEdgeProperty) ArrayList(java.util.ArrayList) Mockito.anyString(org.mockito.Mockito.anyString) QueryPlan(org.apache.hadoop.hive.ql.QueryPlan) LinkedHashMap(java.util.LinkedHashMap) TezClient(org.apache.tez.client.TezClient) SessionNotRunning(org.apache.tez.dag.api.SessionNotRunning) FileSystem(org.apache.hadoop.fs.FileSystem) List(java.util.List) ArrayList(java.util.ArrayList) HiveConf(org.apache.hadoop.hive.conf.HiveConf) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InvocationOnMock(org.mockito.invocation.InvocationOnMock) DAGClient(org.apache.tez.dag.api.client.DAGClient) Edge(org.apache.tez.dag.api.Edge) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) Before(org.junit.Before)
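
Since Mockito's Answer is a single-method interface, the two anonymous classes above collapse to lambdas on Java 8+; the stubbing behavior is unchanged:

// Lambda form of the createVertex/createEdge stubs above (same Mockito API).
when(utils.createVertex(any(), any(BaseWork.class), any(Path.class), any(TezWork.class), anyMap()))
    .thenAnswer(inv -> Vertex.create(((BaseWork) inv.getArguments()[1]).getName(),
        mock(ProcessorDescriptor.class), 0, mock(Resource.class)));
when(utils.createEdge(any(), any(Vertex.class), any(Vertex.class), any(TezEdgeProperty.class),
    any(BaseWork.class), any(TezWork.class)))
    .thenAnswer(inv -> Edge.create((Vertex) inv.getArguments()[1], (Vertex) inv.getArguments()[2],
        mock(EdgeProperty.class)));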

Example 8 with Edge

use of org.apache.tez.dag.api.Edge in project hive by apache.

the class TezTask method build.

DAG build(JobConf conf, TezWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, Context ctx) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    // getAllWork returns a topologically sorted list, which we use to make
    // sure that vertices are created before they are used in edges.
    List<BaseWork> ws = work.getAllWork();
    Collections.reverse(ws);
    FileSystem fs = scratchDir.getFileSystem(conf);
    // the name of the dag is what is displayed in the AM/Job UI
    String dagName = utils.createDagName(conf, queryPlan);
    LOG.info("Dag name: " + dagName);
    DAG dag = DAG.create(dagName);
    // set some info for the query
    JSONObject json = new JSONObject(new LinkedHashMap()).put("context", "Hive").put("description", ctx.getCmd());
    String dagInfo = json.toString();
    if (LOG.isDebugEnabled()) {
        LOG.debug("DagInfo: " + dagInfo);
    }
    dag.setDAGInfo(dagInfo);
    dag.setCredentials(conf.getCredentials());
    setAccessControlsForCurrentUser(dag, queryPlan.getQueryId(), conf);
    for (BaseWork w : ws) {
        boolean isFinal = work.getLeaves().contains(w);
        // translate work to vertex
        perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
        if (w instanceof UnionWork) {
            // Special case for unions. These items translate to VertexGroups
            List<BaseWork> unionWorkItems = new LinkedList<BaseWork>();
            List<BaseWork> children = new LinkedList<BaseWork>();
            // proper children of the union
            for (BaseWork v : work.getChildren(w)) {
                EdgeType type = work.getEdgeProperty(w, v).getEdgeType();
                if (type == EdgeType.CONTAINS) {
                    unionWorkItems.add(v);
                } else {
                    children.add(v);
                }
            }
            // create VertexGroup
            Vertex[] vertexArray = new Vertex[unionWorkItems.size()];
            int i = 0;
            for (BaseWork v : unionWorkItems) {
                vertexArray[i++] = workToVertex.get(v);
            }
            VertexGroup group = dag.createVertexGroup(w.getName(), vertexArray);
            // For a vertex group, all Outputs use the same Key-class, Val-class and partitioner.
            // Pick any one source vertex to figure out the Edge configuration.
            JobConf parentConf = workToConf.get(unionWorkItems.get(0));
            // now hook up the children
            for (BaseWork v : children) {
                // finally we can create the grouped edge
                GroupInputEdge e = utils.createEdge(group, parentConf, workToVertex.get(v), work.getEdgeProperty(w, v), work.getVertexType(v));
                dag.addEdge(e);
            }
        } else {
            // Regular vertices
            JobConf wxConf = utils.initializeVertexConf(conf, ctx, w);
            Vertex wx = utils.createVertex(wxConf, w, scratchDir, appJarLr, additionalLr, fs, ctx, !isFinal, work, work.getVertexType(w));
            if (w.getReservedMemoryMB() > 0) {
                // If reservedMemoryMB is set, make memory allocation fraction adjustment as needed
                double frac = DagUtils.adjustMemoryReserveFraction(w.getReservedMemoryMB(), super.conf);
                LOG.info("Setting " + TEZ_MEMORY_RESERVE_FRACTION + " to " + frac);
                wx.setConf(TEZ_MEMORY_RESERVE_FRACTION, Double.toString(frac));
            }
            // Otherwise just leave it up to Tez to decide how much memory to allocate
            dag.addVertex(wx);
            utils.addCredentials(w, dag);
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
            workToVertex.put(w, wx);
            workToConf.put(w, wxConf);
            // add all dependencies (i.e.: edges) to the graph
            for (BaseWork v : work.getChildren(w)) {
                assert workToVertex.containsKey(v);
                Edge e = null;
                TezEdgeProperty edgeProp = work.getEdgeProperty(w, v);
                e = utils.createEdge(wxConf, wx, workToVertex.get(v), edgeProp, work.getVertexType(v));
                dag.addEdge(e);
            }
        }
    }
    // Clear the work map after build. TODO: remove caching instead?
    Utilities.clearWorkMap(conf);
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    return dag;
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) TezEdgeProperty(org.apache.hadoop.hive.ql.plan.TezEdgeProperty) UnionWork(org.apache.hadoop.hive.ql.plan.UnionWork) DAG(org.apache.tez.dag.api.DAG) EdgeType(org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType) LinkedList(java.util.LinkedList) LinkedHashMap(java.util.LinkedHashMap) VertexGroup(org.apache.tez.dag.api.VertexGroup) JSONObject(org.json.JSONObject) FileSystem(org.apache.hadoop.fs.FileSystem) GroupInputEdge(org.apache.tez.dag.api.GroupInputEdge) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) JobConf(org.apache.hadoop.mapred.JobConf) Edge(org.apache.tez.dag.api.Edge) GroupInputEdge(org.apache.tez.dag.api.GroupInputEdge)
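
The union branch delegates edge creation to Hive's DagUtils, but at the Tez API level the wiring is a VertexGroup plus a GroupInputEdge. A sketch with placeholder descriptor names (in the style of the TestPreemption example below), assuming member vertices v1, v2 and a consumer vertex v3 already added to the DAG:

// Sketch: a VertexGroup of two union members feeding one consumer vertex.
VertexGroup union = dag.createVertexGroup("union", v1, v2);
GroupInputEdge groupEdge = GroupInputEdge.create(union, v3,
    EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
        SchedulingType.SEQUENTIAL,
        OutputDescriptor.create("O.class"), InputDescriptor.create("I.class")),
    // Merged input that presents the group's outputs as one logical input;
    // the class name here is a placeholder.
    InputDescriptor.create("MergedInput.class"));
dag.addEdge(groupEdge);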

Example 9 with Edge

use of org.apache.tez.dag.api.Edge in project tez by apache.

the class TestPreemption method createDAG.

DAG createDAG(DataMovementType dmType) {
    DAG dag = DAG.create("test-" + dagCount++);
    Vertex vA = Vertex.create("A", ProcessorDescriptor.create("Proc.class"), 5);
    Vertex vB = Vertex.create("B", ProcessorDescriptor.create("Proc.class"), 5);
    Edge eAB = Edge.create(vA, vB, EdgeProperty.create(dmType, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create("O.class"), InputDescriptor.create("I.class")));
    dag.addVertex(vA).addVertex(vB).addEdge(eAB);
    return dag;
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) DAG(org.apache.tez.dag.api.DAG) Edge(org.apache.tez.dag.api.Edge)

Example 10 with Edge

use of org.apache.tez.dag.api.Edge in project tez by apache.

the class SortMergeJoinExample method createDag.

/**
 * v1 v2 <br>
 * &nbsp;\&nbsp;/ <br>
 * &nbsp;&nbsp;v3 <br>
 *
 * @param tezConf
 * @param inputPath1
 * @param inputPath2
 * @param outPath
 * @param numPartitions
 * @return dag
 * @throws IOException
 */
private DAG createDag(TezConfiguration tezConf, Path inputPath1, Path inputPath2, Path outPath, int numPartitions) throws IOException {
    DAG dag = DAG.create("SortMergeJoinExample");
    /**
     * This vertex represents the one side of the join. It reads text data using
     * the TextInputFormat. ForwardingProcessor simply forwards the data
     * downstream as is.
     */
    Vertex inputVertex1 = Vertex.create("input1", ProcessorDescriptor.create(ForwardingProcessor.class.getName())).addDataSource(inputFile, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1.toUri().toString()).groupSplits(!isDisableSplitGrouping()).generateSplitsInAM(!isGenerateSplitInClient()).build());
    /**
     * The other vertex represents the other side of the join. It reads text
     * data using the TextInputFormat. ForwardingProcessor simply forwards the
     * data downstream as is.
     */
    Vertex inputVertex2 = Vertex.create("input2", ProcessorDescriptor.create(ForwardingProcessor.class.getName())).addDataSource(inputFile, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2.toUri().toString()).groupSplits(!isDisableSplitGrouping()).generateSplitsInAM(!isGenerateSplitInClient()).build());
    /**
     * This vertex represents the join operation. It writes the join output as
     * text using the TextOutputFormat. The JoinProcessor is going to perform
     * the join of the two sorted outputs from inputVertex1 and inputVertex2. It
     * is load balanced across numPartitions.
     */
    Vertex joinVertex = Vertex.create(joiner, ProcessorDescriptor.create(SortMergeJoinProcessor.class.getName()), numPartitions).setVertexManagerPlugin(ShuffleVertexManager.createConfigBuilder(tezConf).setAutoReduceParallelism(true).build()).addDataSink(joinOutput, MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outPath.toUri().toString()).build());
    /**
     * The output of inputVertex1 and inputVertex2 will be partitioned into
     * fragments with the same keys going to the same fragments using hash
     * partitioning. The data to be joined is the key itself and so the value is
     * null. And these outputs will be sorted before feeding them to
     * JoinProcessor. The number of fragments is initially inferred from the
     * number of tasks running in the join vertex because each task will be
     * handling one fragment.
     * Edge config options are derived from client-side tez-site.xml (recommended). Optionally
     * invoke setFromConfiguration to override these config options via commandline arguments.
     */
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(tezConf).build();
    /**
     * Connect the join vertex with inputVertex1 with the EdgeProperty created
     * from {@link OrderedPartitionedKVEdgeConfig} so that the output of
     * inputVertex1 is sorted before feeding it to JoinProcessor
     */
    Edge e1 = Edge.create(inputVertex1, joinVertex, edgeConf.createDefaultEdgeProperty());
    /**
     * Connect the join vertex with inputVertex2 with the EdgeProperty created
     * from {@link OrderedPartitionedKVEdgeConfig} so that the output of
     * inputVertex2 is sorted before feeding it to JoinProcessor
     */
    Edge e2 = Edge.create(inputVertex2, joinVertex, edgeConf.createDefaultEdgeProperty());
    dag.addVertex(inputVertex1).addVertex(inputVertex2).addVertex(joinVertex).addEdge(e1).addEdge(e2);
    return dag;
}
Also used : OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) ForwardingProcessor(org.apache.tez.examples.HashJoinExample.ForwardingProcessor) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) NullWritable(org.apache.hadoop.io.NullWritable) Edge(org.apache.tez.dag.api.Edge)
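
The same builder pattern covers other edge flavors. For contrast, a sketch of the broadcast variant, assuming the Tez runtime library's UnorderedKVEdgeConfig as used in HashJoinExample:

// Sketch: broadcast variant of the edge-config builder pattern above.
UnorderedKVEdgeConfig broadcastConf = UnorderedKVEdgeConfig
    .newBuilder(Text.class.getName(), NullWritable.class.getName())
    .setFromConfiguration(tezConf)
    .build();
// A broadcast edge replicates the full output of the source vertex to
// every task of the consumer, instead of scatter-gather partitioning.
Edge broadcastEdge = Edge.create(inputVertex1, joinVertex,
    broadcastConf.createDefaultBroadcastEdgeProperty());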

Aggregations

Edge (org.apache.tez.dag.api.Edge) 12
Vertex (org.apache.tez.dag.api.Vertex) 12
DAG (org.apache.tez.dag.api.DAG) 11
TezConfiguration (org.apache.tez.dag.api.TezConfiguration) 7
Configuration (org.apache.hadoop.conf.Configuration) 6
JobConf (org.apache.hadoop.mapred.JobConf) 6
Text (org.apache.hadoop.io.Text) 5
FileSystem (org.apache.hadoop.fs.FileSystem) 4
Path (org.apache.hadoop.fs.Path) 4
TezClient (org.apache.tez.client.TezClient) 4
UserPayload (org.apache.tez.dag.api.UserPayload) 4
DAGClient (org.apache.tez.dag.api.client.DAGClient) 4
TreeMap (java.util.TreeMap) 3
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork) 3
TezEdgeProperty (org.apache.hadoop.hive.ql.plan.TezEdgeProperty) 3
NullWritable (org.apache.hadoop.io.NullWritable) 3
DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor) 3
DAGStatus (org.apache.tez.dag.api.client.DAGStatus) 3
OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) 3
LinkedHashMap (java.util.LinkedHashMap) 2