Usage of org.apache.tez.dag.api.DataSourceDescriptor in the Apache Tez project.
Class YARNRunner, method configureMRInputWithLegacySplitsGenerated.
/**
 * Creates a {@link DataSourceDescriptor} wrapping an MR input descriptor whose
 * user payload is built from the given configuration.
 *
 * @param conf
 *          configuration used to build the input payload; also consulted to
 *          decide whether history text is attached to the input descriptor
 * @param useLegacyInput
 *          when true the descriptor names {@code MRInputLegacy}, otherwise
 *          {@code MRInput}
 * @return the data source descriptor; the initializer and credential arguments
 *         are both passed as null
 * @throws TezUncheckedException
 *           wrapping any IOException raised while creating the payload
 */
@Private
private static DataSourceDescriptor configureMRInputWithLegacySplitsGenerated(Configuration conf, boolean useLegacyInput) {
  // Pick the input implementation first; this part cannot fail.
  String inputClassName;
  if (useLegacyInput) {
    inputClassName = MRInputLegacy.class.getName();
  } else {
    inputClassName = MRInput.class.getName();
  }

  InputDescriptor descriptor;
  try {
    descriptor = InputDescriptor.create(inputClassName)
        .setUserPayload(MRInputHelpersInternal.createMRInputPayload(conf, null));
  } catch (IOException ioe) {
    // Callers of this helper do not declare IOException, so rethrow unchecked.
    throw new TezUncheckedException(ioe);
  }

  DataSourceDescriptor dataSource = DataSourceDescriptor.create(descriptor, null, null);

  boolean attachHistoryText = conf.getBoolean(
      TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT,
      TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT);
  if (attachHistoryText) {
    dataSource.getInputDescriptor().setHistoryText(TezUtils.convertToHistoryText(conf));
  }
  return dataSource;
}
Usage of org.apache.tez.dag.api.DataSourceDescriptor in the Apache Tez project.
Class TezClientUtils, method setupDAGCredentials.
/**
 * Builds the set of credentials for a DAG and returns it to the caller.
 * <p>
 * The result starts with every session credential, then gains credentials for
 * the URIs registered on the DAG, on each vertex's data sources and on each
 * vertex's data sinks. Finally, file system tokens are obtained for the paths
 * of all local resources configured on the vertices and on the DAG itself.
 *
 * @param dag
 *          the dag for which credentials need to be set up
 * @param sessionCredentials
 *          session credentials which have already been obtained; must not be
 *          null, and all of them are merged into the result
 * @param conf
 *          configuration passed through to the credential and token helpers
 * @return a new Credentials object holding the combined credentials
 * @throws IOException
 *           if a helper fails, or wrapping a URISyntaxException raised while
 *           converting a local resource URL to a path
 */
@Private
static Credentials setupDAGCredentials(DAG dag, Credentials sessionCredentials, Configuration conf) throws IOException {
  Preconditions.checkNotNull(sessionCredentials);
  TezCommonUtils.logCredentials(LOG, sessionCredentials, "session");

  // Everything the session already holds is needed by the DAG as well.
  Credentials combined = new Credentials();
  combined.mergeAll(sessionCredentials);

  // Credentials for any URIs the user registered directly on the DAG.
  addFileSystemCredentialsFromURIs(dag.getURIsForCredentials(), combined, conf);

  // Local resource paths are collected first and resolved in one call below.
  Set<Path> localResourcePaths = new HashSet<Path>();
  try {
    for (Vertex vertex : dag.getVertices()) {
      for (LocalResource resource : vertex.getTaskLocalFiles().values()) {
        localResourcePaths.add(ConverterUtils.getPathFromYarnURL(resource.getResource()));
      }

      List<DataSourceDescriptor> sources = vertex.getDataSources();
      for (DataSourceDescriptor source : sources) {
        addFileSystemCredentialsFromURIs(source.getURIsForCredentials(), combined, conf);
      }

      List<DataSinkDescriptor> sinks = vertex.getDataSinks();
      for (DataSinkDescriptor sink : sinks) {
        addFileSystemCredentialsFromURIs(sink.getURIsForCredentials(), combined, conf);
      }
    }

    for (LocalResource resource : dag.getTaskLocalFiles().values()) {
      localResourcePaths.add(ConverterUtils.getPathFromYarnURL(resource.getResource()));
    }

    Path[] pathArray = localResourcePaths.toArray(new Path[0]);
    TokenCache.obtainTokensForFileSystems(combined, pathArray, conf);
  } catch (URISyntaxException badUrl) {
    throw new IOException(badUrl);
  }

  return combined;
}
Usage of org.apache.tez.dag.api.DataSourceDescriptor in the Apache Tez project.
Class TestDAGRecovery, method testBasicRecovery.
/**
 * Runs the "TestBasicRecovery" multi-attempt DAG with an extra data source on
 * vertex v1 and expects the DAG to finish in the SUCCEEDED state.
 */
@Test(timeout = 120000)
public void testBasicRecovery() throws Exception {
  DAG dag = MultiAttemptDAG.createDAG("TestBasicRecovery", null);

  // Give v1 an input so that init events are produced for it (TEZ-1345).
  InputDescriptor noOpInput = InputDescriptor.create(NoOpInput.class.getName());
  InputInitializerDescriptor initializer =
      InputInitializerDescriptor.create(TestRootInputInitializer.class.getName());
  DataSourceDescriptor v1Source = DataSourceDescriptor.create(noOpInput, initializer, null);
  dag.getVertex("v1").addDataSource("Input", v1Source);

  runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED);
}
Usage of org.apache.tez.dag.api.DataSourceDescriptor in the Apache Tez project.
Class FilterLinesByWord, method run.
/**
 * Builds and runs the two-stage "FilterLinesByWord" DAG in a Tez session.
 * <p>
 * After generic options and the splits-in-client option have been consumed,
 * exactly three arguments must remain: the input path, the output path and the
 * word to filter on. The job jar is copied into a freshly created staging
 * directory, a session is started, the DAG is submitted and its status is
 * polled until it leaves the RUNNING state.
 * <p>
 * The staging directory is deleted, and the session stopped if it was started,
 * on every exit path once the staging directory has been created, including
 * failures while preparing or submitting the DAG.
 *
 * @param args
 *          raw command line arguments
 * @return 2 for invalid options, a wrong argument count or an already existing
 *         output directory; -1 if polling the running DAG fails with a
 *         TezException; otherwise 0 when the DAG SUCCEEDED and 1 for any other
 *         final state
 * @throws Exception
 *           on any other failure while setting up, submitting or monitoring
 *           the DAG
 */
@Override
public int run(String[] args) throws Exception {
  Configuration conf = getConf();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  Credentials credentials = new Credentials();
  boolean generateSplitsInClient = false;
  SplitsInClientOptionParser splitCmdLineParser = new SplitsInClientOptionParser();
  try {
    generateSplitsInClient = splitCmdLineParser.parse(otherArgs, false);
    otherArgs = splitCmdLineParser.getRemainingArgs();
  } catch (ParseException e1) {
    System.err.println("Invalid options");
    printUsage();
    return 2;
  }
  if (otherArgs.length != 3) {
    printUsage();
    return 2;
  }
  String inputPath = otherArgs[0];
  String outputPath = otherArgs[1];
  String filterWord = otherArgs[2];
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(new Path(outputPath))) {
    System.err.println("Output directory : " + outputPath + " already exists");
    return 2;
  }
  TezConfiguration tezConf = new TezConfiguration(conf);
  Path stagingDir = new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString());
  tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDir.toString());
  TezClientUtils.ensureStagingDirExists(tezConf, stagingDir);

  // Declared outside the try block because they are read after cleanup.
  TezClient tezSession = null;
  boolean sessionStarted = false;
  DAGClient dagClient;
  DAGStatus dagStatus = null;
  String[] vNames = { "stage1", "stage2" };

  // From here on the staging directory exists, so everything runs under the
  // cleanup block: a failure anywhere below must not leak it or the session.
  try {
    String jarPath = ClassUtil.findContainingJar(FilterLinesByWord.class);
    if (jarPath == null) {
      throw new TezUncheckedException("Could not find any jar containing " + FilterLinesByWord.class.getName() + " in the classpath");
    }
    Path remoteJarPath = fs.makeQualified(new Path(stagingDir, "dag_job.jar"));
    fs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
    FileStatus remoteJarStatus = fs.getFileStatus(remoteJarPath);
    TokenCache.obtainTokensForNamenodes(credentials, new Path[] { remoteJarPath }, conf);
    Map<String, LocalResource> commonLocalResources = new TreeMap<String, LocalResource>();
    LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, remoteJarStatus.getLen(), remoteJarStatus.getModificationTime());
    commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);

    tezSession = TezClient.create("FilterLinesByWordSession", tezConf, commonLocalResources, credentials);
    // Start the session before a DAG is submitted to it.
    tezSession.start();
    sessionStarted = true;

    Configuration stage1Conf = new JobConf(conf);
    stage1Conf.set(FILTER_PARAM_NAME, filterWord);
    Configuration stage2Conf = new JobConf(conf);
    stage2Conf.set(FileOutputFormat.OUTDIR, outputPath);
    stage2Conf.setBoolean("mapred.mapper.new-api", false);
    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);

    // Setup stage1 Vertex
    Vertex stage1Vertex = Vertex.create("stage1", ProcessorDescriptor.create(FilterByWordInputProcessor.class.getName()).setUserPayload(stage1Payload)).addTaskLocalFiles(commonLocalResources);
    DataSourceDescriptor dsd;
    if (generateSplitsInClient) {
      // TODO TEZ-1406. Don't use MRInputLegacy
      stage1Conf.set(FileInputFormat.INPUT_DIR, inputPath);
      stage1Conf.setBoolean("mapred.mapper.new-api", false);
      dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, stagingDir, true);
    } else {
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, TextInputFormat.class, inputPath).groupSplits(false).build();
    }
    stage1Vertex.addDataSource("MRInput", dsd);

    // Setup stage2 Vertex
    Vertex stage2Vertex = Vertex.create("stage2", ProcessorDescriptor.create(FilterByWordOutputProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)), 1);
    stage2Vertex.addTaskLocalFiles(commonLocalResources);

    // Configure the Output for stage2
    OutputDescriptor od = OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf));
    OutputCommitterDescriptor ocd = OutputCommitterDescriptor.create(MROutputCommitter.class.getName());
    stage2Vertex.addDataSink("MROutput", DataSinkDescriptor.create(od, ocd, null));

    // stage1 feeds stage2 over the edge config's default broadcast edge.
    UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), TextLongPair.class.getName()).setFromConfiguration(tezConf).build();
    DAG dag = DAG.create("FilterLinesByWord");
    Edge edge = Edge.create(stage1Vertex, stage2Vertex, edgeConf.createDefaultBroadcastEdgeProperty());
    dag.addVertex(stage1Vertex).addVertex(stage2Vertex).addEdge(edge);

    LOG.info("Submitting DAG to Tez Session");
    dagClient = tezSession.submitDAG(dag);
    LOG.info("Submitted DAG to Tez Session");

    // Phase 1: wait until the DAG is running or has already reached a final state.
    while (true) {
      dagStatus = dagClient.getDAGStatus(null);
      if (dagStatus.getState() == DAGStatus.State.RUNNING || dagStatus.getState() == DAGStatus.State.SUCCEEDED || dagStatus.getState() == DAGStatus.State.FAILED || dagStatus.getState() == DAGStatus.State.KILLED || dagStatus.getState() == DAGStatus.State.ERROR) {
        break;
      }
      try {
        Thread.sleep(500);
      } catch (InterruptedException ignored) {
        // Deliberately ignored: an interrupt only shortens this poll interval.
      }
    }

    // Phase 2: report progress once per second while the DAG is running.
    while (dagStatus.getState() == DAGStatus.State.RUNNING) {
      try {
        ExampleDriver.printDAGStatus(dagClient, vNames);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
          // Deliberately ignored: an interrupt only shortens this poll interval.
        }
        dagStatus = dagClient.getDAGStatus(null);
      } catch (TezException e) {
        // Log the cause as well so the failure can be diagnosed.
        LOG.error("Failed to get application progress. Exiting", e);
        return -1;
      }
    }

    // Fetch the final status once more, this time including counters.
    dagStatus = dagClient.getDAGStatus(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
  } finally {
    try {
      fs.delete(stagingDir, true);
    } finally {
      // Runs even if deleting the staging directory fails.
      if (sessionStarted) {
        tezSession.stop();
      }
    }
  }

  ExampleDriver.printDAGStatus(dagClient, vNames, true, true);
  LOG.info("Application completed. " + "FinalState=" + dagStatus.getState());
  return dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1;
}
Usage of org.apache.tez.dag.api.DataSourceDescriptor in the Apache Tez project.
Class TestMRInputAMSplitGenerator, method testGroupSplitsAndSortSplits.
/**
 * Runs the AM split generator over 50 test splits with lengths 1000, 2000, ...,
 * 50000 and checks the events it produces.
 * <p>
 * The first event must be an InputConfigureVertexTasksEvent and every other
 * event an InputDataInformationEvent carrying a serialized split. The number
 * of raw splits seen must equal the number configured.
 *
 * @param groupSplitsEnabled
 *          whether split grouping is enabled on the input builder; when true
 *          each event's split is expected to be a TezGroupedSplit
 * @param sortSplitsEnabled
 *          whether split sorting is enabled on the input builder; when true
 *          split lengths must be non-increasing across events, otherwise at
 *          least one split must be longer than the one before it
 */
private void testGroupSplitsAndSortSplits(boolean groupSplitsEnabled, boolean sortSplitsEnabled) throws Exception {
  final int splitCount = 50;
  String[] splitLengths = new String[splitCount];
  for (int idx = 0; idx < splitCount; idx++) {
    splitLengths[idx] = String.valueOf(1000 * (idx + 1));
  }
  Configuration conf = new Configuration();
  conf.setStrings(SPLITS_LENGTHS, splitLengths);

  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, InputFormatForTest.class)
      .groupSplits(groupSplitsEnabled)
      .sortSplits(sortSplitsEnabled)
      .build();
  UserPayload payload = dataSource.getInputDescriptor().getUserPayload();
  InputInitializerContext initContext = new TezTestUtils.TezRootInputInitializerContextForTest(payload);
  List<Event> events = new MRInputAMSplitGenerator(initContext).initialize();

  assertTrue(events.get(0) instanceof InputConfigureVertexTasksEvent);

  boolean sawLongerThanPrevious = false;
  InputSplit previousSplit = null;
  int rawSplitTotal = 0;

  // Every event after the first one is expected to carry one split.
  for (Event event : events.subList(1, events.size())) {
    assertTrue(event instanceof InputDataInformationEvent);
    InputDataInformationEvent dataEvent = (InputDataInformationEvent) event;
    assertNull(dataEvent.getDeserializedUserPayload());
    assertNotNull(dataEvent.getUserPayload());

    MRSplitProto splitProto = MRSplitProto.parseFrom(ByteString.copyFrom(dataEvent.getUserPayload()));
    InputSplit currentSplit = MRInputUtils.getNewSplitDetailsFromEvent(splitProto, new Configuration());

    if (!groupSplitsEnabled) {
      rawSplitTotal++;
      assertTrue(currentSplit instanceof InputSplitForTest);
    } else {
      TezGroupedSplit grouped = (TezGroupedSplit) currentSplit;
      rawSplitTotal += grouped.getGroupedSplits().size();
      for (InputSplit member : grouped.getGroupedSplits()) {
        assertTrue(member instanceof InputSplitForTest);
      }
      assertTrue(grouped.getGroupedSplits().get(0) instanceof InputSplitForTest);
    }

    // Compare against the previous split: sorted output must never grow in
    // length; unsorted output is only tracked for at least one growing pair.
    if (previousSplit != null) {
      if (!sortSplitsEnabled) {
        if (currentSplit.getLength() > previousSplit.getLength()) {
          sawLongerThanPrevious = true;
        }
      } else {
        assertTrue(currentSplit.getLength() <= previousSplit.getLength());
      }
    }
    previousSplit = currentSplit;
  }

  assertEquals(splitLengths.length, rawSplitTotal);
  if (!sortSplitsEnabled) {
    assertTrue(sawLongerThanPrevious);
  }
}
Aggregations