
Example 16 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

From class YARNRunner, method configureMRInputWithLegacySplitsGenerated.

@Private
private static DataSourceDescriptor configureMRInputWithLegacySplitsGenerated(Configuration conf,
        boolean useLegacyInput) {
    InputDescriptor inputDescriptor;
    try {
        // Choose the legacy or current MRInput and attach the serialized MR configuration.
        inputDescriptor = InputDescriptor.create(
                useLegacyInput ? MRInputLegacy.class.getName() : MRInput.class.getName())
                .setUserPayload(MRInputHelpersInternal.createMRInputPayload(conf, null));
    } catch (IOException e) {
        throw new TezUncheckedException(e);
    }
    // No InputInitializer and no credentials: splits are assumed to have been generated already.
    DataSourceDescriptor dsd = DataSourceDescriptor.create(inputDescriptor, null, null);
    if (conf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT,
            TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT)) {
        dsd.getInputDescriptor().setHistoryText(TezUtils.convertToHistoryText(conf));
    }
    return dsd;
}
Also used : InputDescriptor(org.apache.tez.dag.api.InputDescriptor) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) IOException(java.io.IOException) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) LimitedPrivate(org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate) Private(org.apache.hadoop.classification.InterfaceAudience.Private)
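
A hedged sketch of how the returned descriptor is consumed. The processor class name and numMapTasks are hypothetical placeholders, and the direct call only compiles inside YARNRunner since the method is private:

// Sketch only: "com.example.MyMapProcessor" and numMapTasks are hypothetical.
DataSourceDescriptor dsd = configureMRInputWithLegacySplitsGenerated(conf, true);
Vertex mapVertex = Vertex.create("initialmap",
        ProcessorDescriptor.create("com.example.MyMapProcessor"), numMapTasks);
mapVertex.addDataSource("MRInput", dsd);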

Example 17 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

From class TezClientUtils, method setupDAGCredentials.

/**
 * Obtains tokens for the DAG based on the list of URIs setup in the DAG. The
 * fetched credentials are populated back into the DAG and can be retrieved
 * via dag.getCredentials().
 *
 * @param dag
 *          the dag for which credentials need to be setup
 * @param sessionCredentials
 *          session credentials which have already been obtained, and will be
 *          required for the DAG
 * @param conf
 *          configuration used when contacting file systems to obtain tokens
 * @throws IOException if tokens cannot be obtained for the configured paths
 */
@Private
static Credentials setupDAGCredentials(DAG dag, Credentials sessionCredentials, Configuration conf) throws IOException {
    Preconditions.checkNotNull(sessionCredentials);
    TezCommonUtils.logCredentials(LOG, sessionCredentials, "session");
    Credentials dagCredentials = new Credentials();
    // All session creds are required for the DAG.
    dagCredentials.mergeAll(sessionCredentials);
    // Obtain credentials for any additional URIs or paths the user has configured on the DAG.
    addFileSystemCredentialsFromURIs(dag.getURIsForCredentials(), dagCredentials, conf);
    // Obtain Credentials for the local resources configured on the DAG
    try {
        Set<Path> lrPaths = new HashSet<Path>();
        for (Vertex v : dag.getVertices()) {
            for (LocalResource lr : v.getTaskLocalFiles().values()) {
                lrPaths.add(ConverterUtils.getPathFromYarnURL(lr.getResource()));
            }
            List<DataSourceDescriptor> dataSources = v.getDataSources();
            for (DataSourceDescriptor dataSource : dataSources) {
                addFileSystemCredentialsFromURIs(dataSource.getURIsForCredentials(), dagCredentials, conf);
            }
            List<DataSinkDescriptor> dataSinks = v.getDataSinks();
            for (DataSinkDescriptor dataSink : dataSinks) {
                addFileSystemCredentialsFromURIs(dataSink.getURIsForCredentials(), dagCredentials, conf);
            }
        }
        for (LocalResource lr : dag.getTaskLocalFiles().values()) {
            lrPaths.add(ConverterUtils.getPathFromYarnURL(lr.getResource()));
        }
        Path[] paths = lrPaths.toArray(new Path[lrPaths.size()]);
        TokenCache.obtainTokensForFileSystems(dagCredentials, paths, conf);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    return dagCredentials;
}
Also used : Path(org.apache.hadoop.fs.Path) Vertex(org.apache.tez.dag.api.Vertex) URISyntaxException(java.net.URISyntaxException) IOException(java.io.IOException) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) Credentials(org.apache.hadoop.security.Credentials) HashSet(java.util.HashSet) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) Private(org.apache.hadoop.classification.InterfaceAudience.Private)
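
For orientation, a minimal sketch of the caller-side setup that feeds this method: URIs registered on the DAG are exactly what addFileSystemCredentialsFromURIs walks. The HDFS paths below are hypothetical, and setupDAGCredentials itself is invoked internally by TezClient at submission time:

// Sketch, assuming a secure cluster; both URIs are hypothetical examples.
DAG dag = DAG.create("CredentialedDAG");
dag.addURIsForCredentials(Arrays.asList(
        URI.create("hdfs://nn1:8020/data/input"),
        URI.create("hdfs://nn2:8020/data/lookup")));
// At submit time the tokens for these URIs, plus those for all task local
// resources and data source/sink URIs, end up in the DAG's Credentials.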

Example 18 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

From class TestDAGRecovery, method testBasicRecovery.

@Test(timeout = 120000)
public void testBasicRecovery() throws Exception {
    DAG dag = MultiAttemptDAG.createDAG("TestBasicRecovery", null);
    // add input to v1 to make sure that there will be init events for v1 (TEZ-1345)
    DataSourceDescriptor dataSource = DataSourceDescriptor.create(
            InputDescriptor.create(NoOpInput.class.getName()),
            InputInitializerDescriptor.create(TestRootInputInitializer.class.getName()), null);
    dag.getVertex("v1").addDataSource("Input", dataSource);
    runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED);
}
Also used : MultiAttemptDAG(org.apache.tez.test.dag.MultiAttemptDAG) DAG(org.apache.tez.dag.api.DAG) SimpleVTestDAG(org.apache.tez.test.dag.SimpleVTestDAG) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) Test(org.junit.Test)
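
TestRootInputInitializer lives in the Tez test tree; as a rough sketch of the shape such a root input initializer takes against the public InputInitializer API (illustrative only, not the actual test class):

// Illustrative sketch; needs org.apache.tez.runtime.api.* and java.util imports.
public static class SketchRootInputInitializer extends InputInitializer {

    public SketchRootInputInitializer(InputInitializerContext context) {
        super(context);
    }

    @Override
    public List<Event> initialize() throws Exception {
        // Even an empty event list forces input-init events for the vertex (see TEZ-1345).
        return Collections.emptyList();
    }

    @Override
    public void handleInputInitializerEvent(List<InputInitializerEvent> events) throws Exception {
        // No-op in this sketch.
    }
}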

Example 19 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

From class FilterLinesByWord, method run.

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Credentials credentials = new Credentials();
    boolean generateSplitsInClient = false;
    SplitsInClientOptionParser splitCmdLineParser = new SplitsInClientOptionParser();
    try {
        generateSplitsInClient = splitCmdLineParser.parse(otherArgs, false);
        otherArgs = splitCmdLineParser.getRemainingArgs();
    } catch (ParseException e1) {
        System.err.println("Invalid options");
        printUsage();
        return 2;
    }
    if (otherArgs.length != 3) {
        printUsage();
        return 2;
    }
    String inputPath = otherArgs[0];
    String outputPath = otherArgs[1];
    String filterWord = otherArgs[2];
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(outputPath))) {
        System.err.println("Output directory : " + outputPath + " already exists");
        return 2;
    }
    TezConfiguration tezConf = new TezConfiguration(conf);
    Path stagingDir = new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString());
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDir.toString());
    TezClientUtils.ensureStagingDirExists(tezConf, stagingDir);
    String jarPath = ClassUtil.findContainingJar(FilterLinesByWord.class);
    if (jarPath == null) {
        throw new TezUncheckedException("Could not find any jar containing " + FilterLinesByWord.class.getName() + " in the classpath");
    }
    Path remoteJarPath = fs.makeQualified(new Path(stagingDir, "dag_job.jar"));
    fs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
    FileStatus remoteJarStatus = fs.getFileStatus(remoteJarPath);
    TokenCache.obtainTokensForNamenodes(credentials, new Path[] { remoteJarPath }, conf);
    Map<String, LocalResource> commonLocalResources = new TreeMap<String, LocalResource>();
    LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, remoteJarStatus.getLen(), remoteJarStatus.getModificationTime());
    commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);
    TezClient tezSession = TezClient.create("FilterLinesByWordSession", tezConf, commonLocalResources, credentials);
    // A session must be started before DAGs can be submitted to it.
    tezSession.start();
    Configuration stage1Conf = new JobConf(conf);
    stage1Conf.set(FILTER_PARAM_NAME, filterWord);
    Configuration stage2Conf = new JobConf(conf);
    stage2Conf.set(FileOutputFormat.OUTDIR, outputPath);
    stage2Conf.setBoolean("mapred.mapper.new-api", false);
    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
    // Setup stage1 Vertex
    Vertex stage1Vertex = Vertex.create("stage1", ProcessorDescriptor.create(FilterByWordInputProcessor.class.getName()).setUserPayload(stage1Payload)).addTaskLocalFiles(commonLocalResources);
    DataSourceDescriptor dsd;
    if (generateSplitsInClient) {
        // TODO TEZ-1406. Don't use MRInputLegacy
        stage1Conf.set(FileInputFormat.INPUT_DIR, inputPath);
        stage1Conf.setBoolean("mapred.mapper.new-api", false);
        dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, stagingDir, true);
    } else {
        dsd = MRInputLegacy.createConfigBuilder(stage1Conf, TextInputFormat.class, inputPath).groupSplits(false).build();
    }
    stage1Vertex.addDataSource("MRInput", dsd);
    // Setup stage2 Vertex
    Vertex stage2Vertex = Vertex.create("stage2", ProcessorDescriptor.create(FilterByWordOutputProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)), 1);
    stage2Vertex.addTaskLocalFiles(commonLocalResources);
    // Configure the Output for stage2
    OutputDescriptor od = OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf));
    OutputCommitterDescriptor ocd = OutputCommitterDescriptor.create(MROutputCommitter.class.getName());
    stage2Vertex.addDataSink("MROutput", DataSinkDescriptor.create(od, ocd, null));
    UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), TextLongPair.class.getName()).setFromConfiguration(tezConf).build();
    DAG dag = DAG.create("FilterLinesByWord");
    Edge edge = Edge.create(stage1Vertex, stage2Vertex, edgeConf.createDefaultBroadcastEdgeProperty());
    dag.addVertex(stage1Vertex).addVertex(stage2Vertex).addEdge(edge);
    LOG.info("Submitting DAG to Tez Session");
    DAGClient dagClient = tezSession.submitDAG(dag);
    LOG.info("Submitted DAG to Tez Session");
    DAGStatus dagStatus = null;
    String[] vNames = { "stage1", "stage2" };
    try {
        while (true) {
            dagStatus = dagClient.getDAGStatus(null);
            DAGStatus.State state = dagStatus.getState();
            if (state == DAGStatus.State.RUNNING || state == DAGStatus.State.SUCCEEDED
                    || state == DAGStatus.State.FAILED || state == DAGStatus.State.KILLED
                    || state == DAGStatus.State.ERROR) {
                break;
            }
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                // Interrupted while waiting; loop and poll again.
            }
        }
        while (dagStatus.getState() == DAGStatus.State.RUNNING) {
            try {
                ExampleDriver.printDAGStatus(dagClient, vNames);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // Interrupted while waiting; loop and poll again.
                }
                dagStatus = dagClient.getDAGStatus(null);
            } catch (TezException e) {
                LOG.error("Failed to get application progress. Exiting");
                return -1;
            }
        }
        dagStatus = dagClient.getDAGStatus(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
    } finally {
        fs.delete(stagingDir, true);
        tezSession.stop();
    }
    ExampleDriver.printDAGStatus(dagClient, vNames, true, true);
    LOG.info("Application completed. " + "FinalState=" + dagStatus.getState());
    return dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1;
}
Also used : TezException(org.apache.tez.dag.api.TezException) Vertex(org.apache.tez.dag.api.Vertex) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) FilterByWordOutputProcessor(org.apache.tez.mapreduce.examples.processor.FilterByWordOutputProcessor) TezClient(org.apache.tez.client.TezClient) UnorderedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) FileSystem(org.apache.hadoop.fs.FileSystem) DAGStatus(org.apache.tez.dag.api.client.DAGStatus) JobConf(org.apache.hadoop.mapred.JobConf) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) Path(org.apache.hadoop.fs.Path) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) UserPayload(org.apache.tez.dag.api.UserPayload) OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) TreeMap(java.util.TreeMap) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) SplitsInClientOptionParser(org.apache.tez.mapreduce.examples.helpers.SplitsInClientOptionParser) DAGClient(org.apache.tez.dag.api.client.DAGClient) ParseException(org.apache.commons.cli.ParseException) MROutputCommitter(org.apache.tez.mapreduce.committer.MROutputCommitter) Edge(org.apache.tez.dag.api.Edge) Credentials(org.apache.hadoop.security.Credentials) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
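
FilterLinesByWord implements the Hadoop Tool interface, so an entry point along these lines drives run() (a sketch; the constructor arguments of the real example class may differ):

// Sketch of a ToolRunner entry point; args are <input> <output> <filter-word>,
// optionally preceded by the generate-splits-in-client option.
public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new FilterLinesByWord(), args);
    System.exit(exitCode);
}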

Example 20 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

From class TestMRInputAMSplitGenerator, method testGroupSplitsAndSortSplits.

private void testGroupSplitsAndSortSplits(boolean groupSplitsEnabled, boolean sortSplitsEnabled) throws Exception {
    Configuration conf = new Configuration();
    String[] splitLengths = new String[50];
    for (int i = 0; i < splitLengths.length; i++) {
        splitLengths[i] = Integer.toString(1000 * (i + 1));
    }
    conf.setStrings(SPLITS_LENGTHS, splitLengths);
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, InputFormatForTest.class)
            .groupSplits(groupSplitsEnabled).sortSplits(sortSplitsEnabled).build();
    UserPayload userPayload = dataSource.getInputDescriptor().getUserPayload();
    InputInitializerContext context = new TezTestUtils.TezRootInputInitializerContextForTest(userPayload);
    MRInputAMSplitGenerator splitGenerator = new MRInputAMSplitGenerator(context);
    List<Event> events = splitGenerator.initialize();
    assertTrue(events.get(0) instanceof InputConfigureVertexTasksEvent);
    boolean shuffled = false;
    InputSplit previousIs = null;
    int numRawInputSplits = 0;
    for (int i = 1; i < events.size(); i++) {
        assertTrue(events.get(i) instanceof InputDataInformationEvent);
        InputDataInformationEvent diEvent = (InputDataInformationEvent) (events.get(i));
        assertNull(diEvent.getDeserializedUserPayload());
        assertNotNull(diEvent.getUserPayload());
        MRSplitProto eventProto = MRSplitProto.parseFrom(ByteString.copyFrom(diEvent.getUserPayload()));
        InputSplit is = MRInputUtils.getNewSplitDetailsFromEvent(eventProto, new Configuration());
        if (groupSplitsEnabled) {
            numRawInputSplits += ((TezGroupedSplit) is).getGroupedSplits().size();
            for (InputSplit inputSplit : ((TezGroupedSplit) is).getGroupedSplits()) {
                assertTrue(inputSplit instanceof InputSplitForTest);
            }
        } else {
            numRawInputSplits++;
            assertTrue(is instanceof InputSplitForTest);
        }
        // Verify split ordering: descending by length when sorting is enabled;
        // otherwise just record whether the original order was shuffled.
        if (previousIs != null) {
            if (sortSplitsEnabled) {
                assertTrue(is.getLength() <= previousIs.getLength());
            } else {
                shuffled |= (is.getLength() > previousIs.getLength());
            }
        }
        previousIs = is;
    }
    assertEquals(splitLengths.length, numRawInputSplits);
    if (!sortSplitsEnabled) {
        assertTrue(shuffled);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) UserPayload(org.apache.tez.dag.api.UserPayload) TezGroupedSplit(org.apache.hadoop.mapreduce.split.TezGroupedSplit) ByteString(com.google.protobuf.ByteString) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) Event(org.apache.tez.runtime.api.Event) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputSplit(org.apache.hadoop.mapreduce.InputSplit) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)
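
The private helper above is parameterized over both flags; hedged sketches of the public test methods that would drive it (the actual method names in the Tez test may differ):

@Test(timeout = 10000)
public void testGroupedAndSortedSplits() throws Exception {
    // Grouped splits, emitted in descending order of length.
    testGroupSplitsAndSortSplits(true, true);
}

@Test(timeout = 10000)
public void testUngroupedUnsortedSplits() throws Exception {
    // Raw splits in their original (shuffled) order.
    testGroupSplitsAndSortSplits(false, false);
}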

Aggregations

DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor): 24
Vertex (org.apache.tez.dag.api.Vertex): 14
Configuration (org.apache.hadoop.conf.Configuration): 10
Path (org.apache.hadoop.fs.Path): 10
DAG (org.apache.tez.dag.api.DAG): 10
UserPayload (org.apache.tez.dag.api.UserPayload): 10
LocalResource (org.apache.hadoop.yarn.api.records.LocalResource): 8
IOException (java.io.IOException): 7
FileSystem (org.apache.hadoop.fs.FileSystem): 7
DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor): 7
TezConfiguration (org.apache.tez.dag.api.TezConfiguration): 7
Test (org.junit.Test): 7
IntWritable (org.apache.hadoop.io.IntWritable): 5
Text (org.apache.hadoop.io.Text): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
InputDescriptor (org.apache.tez.dag.api.InputDescriptor): 5
InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor): 5
TezUncheckedException (org.apache.tez.dag.api.TezUncheckedException): 5
OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig): 5
TezClient (org.apache.tez.client.TezClient): 4