use of org.apache.tez.dag.api.ProcessorDescriptor in project tez by apache.
the class TestOrderedWordCount method createDAG.
@VisibleForTesting
public DAG createDAG(FileSystem fs, Configuration conf, Map<String, LocalResource> commonLocalResources, Path stagingDir, int dagIndex, String inputPath, String outputPath, boolean generateSplitsInClient, boolean useMRSettings, int intermediateNumReduceTasks, int maxDataLengthThroughIPC, int exceedDataLimit) throws Exception {
Configuration mapStageConf = new JobConf(conf);
mapStageConf.set(MRJobConfig.MAP_CLASS_ATTR, TokenizerMapper.class.getName());
MRHelpers.translateMRConfToTez(mapStageConf, !useMRSettings);
Configuration iReduceStageConf = new JobConf(conf);
// TODO replace with auto-reduce parallelism
iReduceStageConf.setInt(MRJobConfig.NUM_REDUCES, 2);
iReduceStageConf.set(MRJobConfig.REDUCE_CLASS_ATTR, IntSumReducer.class.getName());
iReduceStageConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
iReduceStageConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
iReduceStageConf.setBoolean("mapred.mapper.new-api", true);
MRHelpers.translateMRConfToTez(iReduceStageConf, !useMRSettings);
Configuration finalReduceConf = new JobConf(conf);
finalReduceConf.setInt(MRJobConfig.NUM_REDUCES, 1);
finalReduceConf.set(MRJobConfig.REDUCE_CLASS_ATTR, MyOrderByNoOpReducer.class.getName());
finalReduceConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, IntWritable.class.getName());
finalReduceConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, Text.class.getName());
MRHelpers.translateMRConfToTez(finalReduceConf, !useMRSettings);
MRHelpers.configureMRApiUsage(mapStageConf);
MRHelpers.configureMRApiUsage(iReduceStageConf);
MRHelpers.configureMRApiUsage(finalReduceConf);
List<Vertex> vertices = new ArrayList<Vertex>();
String mapStageHistoryText = TezUtils.convertToHistoryText("Initial Tokenizer Vertex", mapStageConf);
DataSourceDescriptor dsd;
if (generateSplitsInClient) {
mapStageConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class.getName());
mapStageConf.set(FileInputFormat.INPUT_DIR, inputPath);
mapStageConf.setBoolean("mapred.mapper.new-api", true);
dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(mapStageConf, stagingDir, true);
} else {
dsd = MRInputLegacy.createConfigBuilder(mapStageConf, TextInputFormat.class, inputPath).build();
}
dsd.getInputDescriptor().setHistoryText(TezUtils.convertToHistoryText("HDFS Input " + inputPath, mapStageConf));
Map<String, String> mapEnv = Maps.newHashMap();
MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, mapEnv, true);
Map<String, String> reduceEnv = Maps.newHashMap();
MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, reduceEnv, false);
Configuration copyMapStageConf = new Configuration(mapStageConf);
setMaxDataLengthConf(copyMapStageConf, maxDataLengthThroughIPC, exceedDataLimit);
Vertex mapVertex;
ProcessorDescriptor mapProcessorDescriptor = ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(copyMapStageConf)).setHistoryText(mapStageHistoryText);
if (!useMRSettings) {
mapVertex = Vertex.create("initialmap", mapProcessorDescriptor);
} else {
mapVertex = Vertex.create("initialmap", mapProcessorDescriptor, -1, MRHelpers.getResourceForMRMapper(mapStageConf));
mapVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRMapper(mapStageConf));
mapVertex.setTaskEnvironment(mapEnv);
}
mapVertex.addTaskLocalFiles(commonLocalResources).addDataSource("MRInput", dsd);
vertices.add(mapVertex);
Configuration copyiReduceStageConf = new Configuration(iReduceStageConf);
setMaxDataLengthConf(copyiReduceStageConf, maxDataLengthThroughIPC, exceedDataLimit);
String iReduceStageHistoryText = TezUtils.convertToHistoryText("Intermediate Summation Vertex", iReduceStageConf);
ProcessorDescriptor iReduceProcessorDescriptor = ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(copyiReduceStageConf)).setHistoryText(iReduceStageHistoryText);
Vertex intermediateVertex;
if (!useMRSettings) {
intermediateVertex = Vertex.create("intermediate_reducer", iReduceProcessorDescriptor, intermediateNumReduceTasks);
} else {
intermediateVertex = Vertex.create("intermediate_reducer", iReduceProcessorDescriptor, intermediateNumReduceTasks, MRHelpers.getResourceForMRReducer(iReduceStageConf));
intermediateVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(iReduceStageConf));
intermediateVertex.setTaskEnvironment(reduceEnv);
}
intermediateVertex.addTaskLocalFiles(commonLocalResources);
vertices.add(intermediateVertex);
Configuration copyFinalReduceConf = new Configuration(finalReduceConf);
setMaxDataLengthConf(copyFinalReduceConf, maxDataLengthThroughIPC, exceedDataLimit);
String finalReduceStageHistoryText = TezUtils.convertToHistoryText("Final Sorter Vertex", finalReduceConf);
UserPayload finalReducePayload = TezUtils.createUserPayloadFromConf(copyFinalReduceConf);
Vertex finalReduceVertex;
ProcessorDescriptor finalReduceProcessorDescriptor = ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(finalReducePayload).setHistoryText(finalReduceStageHistoryText);
if (!useMRSettings) {
finalReduceVertex = Vertex.create("finalreduce", finalReduceProcessorDescriptor, 1);
} else {
finalReduceVertex = Vertex.create("finalreduce", finalReduceProcessorDescriptor, 1, MRHelpers.getResourceForMRReducer(finalReduceConf));
finalReduceVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(finalReduceConf));
finalReduceVertex.setTaskEnvironment(reduceEnv);
}
finalReduceVertex.addTaskLocalFiles(commonLocalResources);
finalReduceVertex.addDataSink("MROutput", MROutputLegacy.createConfigBuilder(finalReduceConf, TextOutputFormat.class, outputPath).build());
finalReduceVertex.getDataSinks().get(0).getOutputDescriptor().setHistoryText(TezUtils.convertToHistoryText("HDFS Output " + outputPath, finalReduceConf));
vertices.add(finalReduceVertex);
DAG dag = DAG.create("OrderedWordCount" + dagIndex);
for (int i = 0; i < vertices.size(); ++i) {
dag.addVertex(vertices.get(i));
}
OrderedPartitionedKVEdgeConfig edgeConf1 = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(iReduceStageConf).configureInput().useLegacyInput().done().build();
dag.addEdge(Edge.create(dag.getVertex("initialmap"), dag.getVertex("intermediate_reducer"), edgeConf1.createDefaultEdgeProperty()));
OrderedPartitionedKVEdgeConfig edgeConf2 = OrderedPartitionedKVEdgeConfig.newBuilder(IntWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(finalReduceConf).configureInput().useLegacyInput().done().build();
dag.addEdge(Edge.create(dag.getVertex("intermediate_reducer"), dag.getVertex("finalreduce"), edgeConf2.createDefaultEdgeProperty()));
updateDAGACls(conf, dag, dagIndex);
return dag;
}
use of org.apache.tez.dag.api.ProcessorDescriptor in project tez by apache.
the class TestVertexImpl method testFailuresMaxPercentSourceTaskAttemptCompletionEvents.
@Test(timeout = 5000)
public void testFailuresMaxPercentSourceTaskAttemptCompletionEvents() throws TezException {
LOG.info("Testing testFailuresMaxPercentSourceTaskAttemptCompletionEvents");
// Override the basic setup for this test to inject the specific config setting needed for this test
useCustomInitializer = false;
customInitializer = null;
setupPreDagCreation();
conf.setFloat(TezConfiguration.TEZ_VERTEX_FAILURES_MAXPERCENT, 50.0f);
conf.setInt(TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, 1);
dagPlan = createTestDAGPlan();
setupPostDagCreation();
initAllVertices(VertexState.INITED);
VertexImpl v4 = vertices.get("vertex4");
VertexImpl v5 = vertices.get("vertex5");
VertexImpl v6 = vertices.get("vertex6");
startVertex(vertices.get("vertex1"));
startVertex(vertices.get("vertex2"));
dispatcher.await();
LOG.info("Verifying v6 state " + v6.getState());
Assert.assertEquals(VertexState.RUNNING, v6.getState());
TezTaskID t1_v4 = TezTaskID.getInstance(v4.getVertexId(), 0);
TezTaskID t2_v4 = TezTaskID.getInstance(v4.getVertexId(), 1);
TezTaskID t1_v5 = TezTaskID.getInstance(v5.getVertexId(), 0);
TezTaskID t2_v5 = TezTaskID.getInstance(v5.getVertexId(), 1);
TezTaskAttemptID ta1_t1_v4 = TezTaskAttemptID.getInstance(t1_v4, 0);
TezTaskAttemptID ta1_t2_v4 = TezTaskAttemptID.getInstance(t2_v4, 0);
TezTaskAttemptID ta1_t1_v5 = TezTaskAttemptID.getInstance(t1_v5, 0);
TezTaskAttemptID ta1_t2_v5 = TezTaskAttemptID.getInstance(t2_v5, 0);
TaskSpec taskSpec = new TaskSpec("dag", "vertex", 2, new ProcessorDescriptor(), new ArrayList<InputSpec>(), new ArrayList<OutputSpec>(), null, conf);
TaskLocationHint locationHint = TaskLocationHint.createTaskLocationHint(null, null);
// Tasks can only succeed from a scheduled or running state
dispatcher.getEventHandler().handle(new TaskEventScheduleTask(t1_v4, taskSpec, locationHint, false));
dispatcher.getEventHandler().handle(new TaskEventScheduleTask(t2_v4, taskSpec, locationHint, false));
// Completed tasks are less that the max percent failure
dispatcher.getEventHandler().handle(new TaskEventTAFailed(ta1_t1_v4, TaskFailureType.NON_FATAL, null));
dispatcher.getEventHandler().handle(new TaskEventTASucceeded(ta1_t2_v4));
dispatcher.getEventHandler().handle(new TaskEventTASucceeded(ta1_t1_v5));
dispatcher.getEventHandler().handle(new TaskEventTAFailed(ta1_t2_v5, TaskFailureType.NON_FATAL, null));
dispatcher.await();
Assert.assertEquals(VertexState.SUCCEEDED, v4.getState());
Assert.assertEquals(VertexState.SUCCEEDED, v5.getState());
Assert.assertEquals(VertexState.RUNNING, v6.getState());
Assert.assertEquals(4, v6.numSuccessSourceAttemptCompletions);
}
use of org.apache.tez.dag.api.ProcessorDescriptor in project tez by apache.
the class MapUtils method createLogicalTask.
public static LogicalIOProcessorRuntimeTask createLogicalTask(FileSystem fs, Path workDir, JobConf jobConf, int mapId, Path mapInput, TezUmbilical umbilical, String dagName, String vertexName, List<InputSpec> inputSpecs, List<OutputSpec> outputSpecs, TezSharedExecutor sharedExecutor) throws Exception {
jobConf.setInputFormat(SequenceFileInputFormat.class);
ProcessorDescriptor mapProcessorDesc = ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));
Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>();
TaskSpec taskSpec = new TaskSpec(TezTestUtils.getMockTaskAttemptId(0, 0, mapId, 0), dagName, vertexName, -1, mapProcessorDesc, inputSpecs, outputSpecs, null, null);
Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
String auxiliaryService = jobConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
serviceConsumerMetadata.put(auxiliaryService, ShuffleUtils.convertJobTokenToBytes(shuffleToken));
Map<String, String> envMap = new HashMap<String, String>();
ByteBuffer shufflePortBb = ByteBuffer.allocate(4).putInt(0, 8000);
AuxiliaryServiceHelper.setServiceDataIntoEnv(auxiliaryService, shufflePortBb, envMap);
LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask(taskSpec, 0, jobConf, new String[] { workDir.toString() }, umbilical, serviceConsumerMetadata, envMap, HashMultimap.<String, String>create(), null, "", new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory(), true, new DefaultHadoopShim(), sharedExecutor);
return task;
}
use of org.apache.tez.dag.api.ProcessorDescriptor in project tez by apache.
the class TestReduceProcessor method testReduceProcessor.
@Test(timeout = 5000)
public void testReduceProcessor() throws Exception {
final String dagName = "mrdag0";
String mapVertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
String reduceVertexName = MultiStageMRConfigUtil.getFinalReduceVertexName();
JobConf jobConf = new JobConf(defaultConf);
setUpJobConf(jobConf);
MRHelpers.translateMRConfToTez(jobConf);
jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, new Path(workDir, "localized-resources").toUri().toString());
jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);
Path mapInput = new Path(workDir, "map0");
MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput, 10);
InputSpec mapInputSpec = new InputSpec("NullSrcVertex", InputDescriptor.create(MRInputLegacy.class.getName()).setUserPayload(UserPayload.create(ByteBuffer.wrap(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf)).build().toByteArray()))), 1);
OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex", OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1);
// Run a map
TestUmbilical testUmbilical = new TestUmbilical();
TezSharedExecutor sharedExecutor = new TezSharedExecutor(jobConf);
LogicalIOProcessorRuntimeTask mapTask = MapUtils.createLogicalTask(localFs, workDir, jobConf, 0, mapInput, testUmbilical, dagName, mapVertexName, Collections.singletonList(mapInputSpec), Collections.singletonList(mapOutputSpec), sharedExecutor);
mapTask.initialize();
mapTask.run();
mapTask.close();
// One VME, One DME
Assert.assertEquals(2, testUmbilical.getEvents().size());
Assert.assertEquals(EventType.VERTEX_MANAGER_EVENT, testUmbilical.getEvents().get(0).getEventType());
Assert.assertEquals(EventType.COMPOSITE_DATA_MOVEMENT_EVENT, testUmbilical.getEvents().get(1).getEventType());
CompositeDataMovementEvent cdmEvent = (CompositeDataMovementEvent) testUmbilical.getEvents().get(1).getEvent();
Assert.assertEquals(1, cdmEvent.getCount());
DataMovementEvent dme = cdmEvent.getEvents().iterator().next();
dme.setTargetIndex(0);
LOG.info("Starting reduce...");
JobTokenIdentifier identifier = new JobTokenIdentifier(new Text(dagName));
JobTokenSecretManager jobTokenSecretManager = new JobTokenSecretManager();
Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(identifier, jobTokenSecretManager);
shuffleToken.setService(identifier.getJobId());
jobConf.setOutputFormat(SequenceFileOutputFormat.class);
jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, new Path(workDir, "localized-resources").toUri().toString());
jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_OPTIMIZE_LOCAL_FETCH, true);
FileOutputFormat.setOutputPath(jobConf, new Path(workDir, "output"));
ProcessorDescriptor reduceProcessorDesc = ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));
InputSpec reduceInputSpec = new InputSpec(mapVertexName, InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1);
OutputSpec reduceOutputSpec = new OutputSpec("NullDestinationVertex", OutputDescriptor.create(MROutputLegacy.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1);
// Now run a reduce
TaskSpec taskSpec = new TaskSpec(TezTestUtils.getMockTaskAttemptId(0, 1, 0, 0), dagName, reduceVertexName, -1, reduceProcessorDesc, Collections.singletonList(reduceInputSpec), Collections.singletonList(reduceOutputSpec), null, null);
Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
String auxiliaryService = jobConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
serviceConsumerMetadata.put(auxiliaryService, ShuffleUtils.convertJobTokenToBytes(shuffleToken));
Map<String, String> serviceProviderEnvMap = new HashMap<String, String>();
ByteBuffer shufflePortBb = ByteBuffer.allocate(4).putInt(0, 8000);
AuxiliaryServiceHelper.setServiceDataIntoEnv(auxiliaryService, shufflePortBb, serviceProviderEnvMap);
LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask(taskSpec, 0, jobConf, new String[] { workDir.toString() }, new TestUmbilical(), serviceConsumerMetadata, serviceProviderEnvMap, HashMultimap.<String, String>create(), null, "", new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory(), true, new DefaultHadoopShim(), sharedExecutor);
List<Event> destEvents = new LinkedList<Event>();
destEvents.add(dme);
task.initialize();
OrderedGroupedInputLegacy sortedOut = (OrderedGroupedInputLegacy) task.getInputs().values().iterator().next();
sortedOut.handleEvents(destEvents);
task.run();
task.close();
sharedExecutor.shutdownNow();
// MRTask mrTask = (MRTask)t.getProcessor();
// TODO NEWTEZ Verify the partitioner has not been created
// Likely not applicable anymore.
// Assert.assertNull(mrTask.getPartitioner());
// Only a task commit happens, hence the data is still in the temporary directory.
Path reduceOutputDir = new Path(new Path(workDir, "output"), "_temporary/0/" + IDConverter.toMRTaskIdForOutput(TezTestUtils.getMockTaskId(0, 1, 0)));
Path reduceOutputFile = new Path(reduceOutputDir, "part-v001-o000-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(localFs, reduceOutputFile, jobConf);
LongWritable key = new LongWritable();
Text value = new Text();
long prev = Long.MIN_VALUE;
while (reader.next(key, value)) {
if (prev != Long.MIN_VALUE) {
Assert.assertTrue(prev < key.get());
prev = key.get();
}
}
reader.close();
}
use of org.apache.tez.dag.api.ProcessorDescriptor in project tez by apache.
the class TestTaskExecution2 method createTaskRunner.
private TezTaskRunner2 createTaskRunner(ApplicationId appId, TaskExecutionTestHelpers.TezTaskUmbilicalForTest umbilical, TaskReporter taskReporter, ListeningExecutorService executor, String processorClass, byte[] processorConf, boolean testRunner, boolean updateSysCounters) throws IOException {
TezConfiguration tezConf = new TezConfiguration(defaultConf);
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
Path testDir = new Path(workDir, UUID.randomUUID().toString());
String[] localDirs = new String[] { testDir.toString() };
TezDAGID dagId = TezDAGID.getInstance(appId, 1);
TezVertexID vertexId = TezVertexID.getInstance(dagId, 1);
TezTaskID taskId = TezTaskID.getInstance(vertexId, 1);
TezTaskAttemptID taskAttemptId = TezTaskAttemptID.getInstance(taskId, 1);
ProcessorDescriptor processorDescriptor = ProcessorDescriptor.create(processorClass).setUserPayload(UserPayload.create(ByteBuffer.wrap(processorConf)));
TaskSpec taskSpec = new TaskSpec(taskAttemptId, "dagName", "vertexName", -1, processorDescriptor, new ArrayList<InputSpec>(), new ArrayList<OutputSpec>(), null, null);
TezExecutors sharedExecutor = new TezSharedExecutor(tezConf);
TezTaskRunner2 taskRunner;
if (testRunner) {
taskRunner = new TezTaskRunner2ForTest(tezConf, ugi, localDirs, taskSpec, 1, new HashMap<String, ByteBuffer>(), new HashMap<String, String>(), HashMultimap.<String, String>create(), taskReporter, executor, null, "", new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory(), updateSysCounters, sharedExecutor);
} else {
taskRunner = new TezTaskRunner2(tezConf, ugi, localDirs, taskSpec, 1, new HashMap<String, ByteBuffer>(), new HashMap<String, String>(), HashMultimap.<String, String>create(), taskReporter, executor, null, "", new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory(), updateSysCounters, new DefaultHadoopShim(), sharedExecutor);
}
return taskRunner;
}
Aggregations