use of org.apache.tez.dag.api.Edge in project tez by apache.
the class FilterLinesByWordOneToOne method run.
/**
 * Builds and runs the two-vertex "FilterLinesByWord" DAG, connecting stage1 to stage2 with a
 * one-to-one edge, and polls the DAG until it leaves the RUNNING state.
 *
 * <p>Expected arguments (after the split-generation option has been stripped by the
 * {@code SplitsInClientOptionParser}): input path, output path, filter word.
 *
 * <p>Once the Tez session has been started, the staging directory is deleted and the session is
 * stopped on every exit path, including failures while the DAG is being built or submitted.
 *
 * @param otherArgs command-line arguments
 * @return 2 for invalid options, a wrong argument count, or an already existing output
 *         directory; -1 if polling for progress fails with a {@code TezException}; 0 if the DAG
 *         finished in state SUCCEEDED; 1 otherwise
 * @throws Exception if setup, submission, or cleanup fails
 */
@Override
public int run(String[] otherArgs) throws Exception {
  boolean generateSplitsInClient = false;
  SplitsInClientOptionParser splitCmdLineParser = new SplitsInClientOptionParser();
  try {
    generateSplitsInClient = splitCmdLineParser.parse(otherArgs, false);
    otherArgs = splitCmdLineParser.getRemainingArgs();
  } catch (ParseException e1) {
    System.err.println("Invalid options");
    printUsage();
    return 2;
  }

  if (otherArgs.length != 3) {
    printUsage();
    return 2;
  }

  String inputPath = otherArgs[0];
  String outputPath = otherArgs[1];
  String filterWord = otherArgs[2];

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(new Path(outputPath))) {
    System.err.println("Output directory : " + outputPath + " already exists");
    return 2;
  }

  TezConfiguration tezConf = new TezConfiguration(conf);

  // A uniquely named staging directory under the file system's working directory.
  Path stagingDir = new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString());
  tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDir.toString());
  TezClientUtils.ensureStagingDirExists(tezConf, stagingDir);

  String jarPath = ClassUtil.findContainingJar(FilterLinesByWordOneToOne.class);
  if (jarPath == null) {
    throw new TezUncheckedException("Could not find any jar containing "
        + FilterLinesByWordOneToOne.class.getName() + " in the classpath");
  }

  // Copy the job jar into the staging directory and register it as a local resource.
  Path remoteJarPath = fs.makeQualified(new Path(stagingDir, "dag_job.jar"));
  fs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
  FileStatus remoteJarStatus = fs.getFileStatus(remoteJarPath);

  Map<String, LocalResource> commonLocalResources = new TreeMap<String, LocalResource>();
  LocalResource dagJarLocalRsrc = LocalResource.newInstance(
      ConverterUtils.getYarnUrlFromPath(remoteJarPath),
      LocalResourceType.FILE,
      LocalResourceVisibility.APPLICATION,
      remoteJarStatus.getLen(),
      remoteJarStatus.getModificationTime());
  commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);

  TezClient tezSession =
      TezClient.create("FilterLinesByWordSession", tezConf, commonLocalResources, null);
  tezSession.start();

  DAGClient dagClient = null;
  DAGStatus dagStatus = null;
  String[] vNames = { "stage1", "stage2" };

  // Everything after start() runs under try/finally so that a failure while building or
  // submitting the DAG does not leak the running session or the staging directory.
  try {
    Configuration stage1Conf = new JobConf(conf);
    stage1Conf.set(FILTER_PARAM_NAME, filterWord);

    Configuration stage2Conf = new JobConf(conf);
    stage2Conf.set(FileOutputFormat.OUTDIR, outputPath);
    stage2Conf.setBoolean("mapred.mapper.new-api", false);

    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);

    // Setup stage1 Vertex
    Vertex stage1Vertex = Vertex.create("stage1",
        ProcessorDescriptor.create(FilterByWordInputProcessor.class.getName())
            .setUserPayload(stage1Payload))
        .addTaskLocalFiles(commonLocalResources);

    DataSourceDescriptor dsd;
    if (generateSplitsInClient) {
      // TODO TEZ-1406. Don't use MRInputLegacy
      stage1Conf.set(FileInputFormat.INPUT_DIR, inputPath);
      stage1Conf.setBoolean("mapred.mapper.new-api", false);
      dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, stagingDir, true);
    } else {
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, TextInputFormat.class, inputPath)
          .groupSplits(false)
          .build();
    }
    stage1Vertex.addDataSource("MRInput", dsd);

    // Setup stage2 Vertex; its parallelism is taken from the data source's shard count.
    Vertex stage2Vertex = Vertex.create("stage2",
        ProcessorDescriptor.create(FilterByWordOutputProcessor.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)),
        dsd.getNumberOfShards());
    stage2Vertex.addTaskLocalFiles(commonLocalResources);

    // Configure the Output for stage2
    stage2Vertex.addDataSink("MROutput",
        DataSinkDescriptor.create(
            OutputDescriptor.create(MROutput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)),
            OutputCommitterDescriptor.create(MROutputCommitter.class.getName()),
            null));

    UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig
        .newBuilder(Text.class.getName(), TextLongPair.class.getName())
        .setFromConfiguration(tezConf)
        .build();

    DAG dag = DAG.create("FilterLinesByWord");
    Edge edge =
        Edge.create(stage1Vertex, stage2Vertex, edgeConf.createDefaultOneToOneEdgeProperty());
    dag.addVertex(stage1Vertex).addVertex(stage2Vertex).addEdge(edge);

    LOG.info("Submitting DAG to Tez Session");
    dagClient = tezSession.submitDAG(dag);
    LOG.info("Submitted DAG to Tez Session");

    // Wait until the DAG is either running or already in a terminal state.
    while (true) {
      dagStatus = dagClient.getDAGStatus(null);
      if (dagStatus.getState() == DAGStatus.State.RUNNING
          || dagStatus.getState() == DAGStatus.State.SUCCEEDED
          || dagStatus.getState() == DAGStatus.State.FAILED
          || dagStatus.getState() == DAGStatus.State.KILLED
          || dagStatus.getState() == DAGStatus.State.ERROR) {
        break;
      }
      try {
        Thread.sleep(500);
      } catch (InterruptedException ignored) {
        // Deliberately ignored: keep polling. The interrupt flag is not restored here because
        // that would make every subsequent sleep() return immediately and spin this loop.
      }
    }

    // Report progress once per second while the DAG is running.
    while (dagStatus.getState() == DAGStatus.State.RUNNING) {
      try {
        ExampleDriver.printDAGStatus(dagClient, vNames);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
          // Deliberately ignored: keep polling (see the note in the loop above).
        }
        dagStatus = dagClient.getDAGStatus(null);
      } catch (TezException e) {
        LOG.error("Failed to get application progress. Exiting");
        return -1;
      }
    }
  } finally {
    // Same order as before (delete staging dir, then stop the session), but the session is
    // now stopped even if deleting the staging directory throws.
    try {
      fs.delete(stagingDir, true);
    } finally {
      tezSession.stop();
    }
  }

  ExampleDriver.printDAGStatus(dagClient, vNames);
  LOG.info("Application completed. " + "FinalState=" + dagStatus.getState());
  return dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1;
}
use of org.apache.tez.dag.api.Edge in project tez by apache.
the class YARNRunner method createDAG.
/**
 * Creates a DAG with one vertex per entry of {@code stageConfs}, linking each vertex to its
 * predecessor with an ordered, partitioned key-value edge.
 *
 * <p>The DAG is named after {@code MRJobConfig.JOB_NAME} from the first stage configuration,
 * falling back to {@code YarnConfiguration.DEFAULT_APPLICATION_NAME}. Only the first vertex
 * receives location hints (derived from the input splits); later vertices are given
 * {@code null}.
 *
 * @param fs file system passed to the split-location lookup
 * @param jobId id of the job whose input splits are inspected
 * @param stageConfs one configuration per stage, in execution order
 * @param jobSubmitDir job submission directory passed to the split-location lookup
 * @param ts credentials; not referenced by this method
 * @param jobLocalResources local resources handed to every stage vertex
 * @return the assembled DAG
 * @throws IOException declared by this method; presumably raised by the split-location
 *         lookup or vertex creation
 */
private DAG createDAG(FileSystem fs, JobID jobId, Configuration[] stageConfs, String jobSubmitDir, Credentials ts, Map<String, LocalResource> jobLocalResources) throws IOException {
  String dagName =
      stageConfs[0].get(MRJobConfig.JOB_NAME, YarnConfiguration.DEFAULT_APPLICATION_NAME);
  DAG dag = DAG.create(dagName);
  LOG.info("Number of stages: " + stageConfs.length);

  List<TaskLocationHint> mapHints =
      getMapLocationHintsFromInputSplits(jobId, fs, stageConfs[0], jobSubmitDir);
  List<TaskLocationHint> reduceHints = null;

  // First pass: build a vertex for every stage.
  Vertex[] stageVertices = new Vertex[stageConfs.length];
  for (int stage = 0; stage < stageConfs.length; stage++) {
    List<TaskLocationHint> hints;
    if (stage == 0) {
      hints = mapHints;
    } else {
      hints = reduceHints;
    }
    stageVertices[stage] = createVertexForStage(
        stageConfs[stage], jobLocalResources, hints, stage, stageConfs.length);
  }

  // Second pass: register the vertices and chain each one to the stage before it.
  for (int stage = 0; stage < stageVertices.length; stage++) {
    dag.addVertex(stageVertices[stage]);
    if (stage == 0) {
      // The first stage has no upstream vertex, hence no incoming edge.
      continue;
    }

    // The edge is configured from the upstream (producer) stage's configuration, because the
    // MapReduce properties that matter here (compression and the like, e.g.
    // MAP_OUTPUT_COMPRESS) are expressed in terms of that stage's outputs.
    Configuration upstreamConf = stageConfs[stage - 1];

    Map<String, String> partitionerConf = null;
    if (upstreamConf != null) {
      partitionerConf = Maps.newHashMap();
      for (Map.Entry<String, String> property : upstreamConf) {
        partitionerConf.put(property.getKey(), property.getValue());
      }
    }

    // NOTE(review): upstreamConf is dereferenced unconditionally below even though it was
    // null-checked above - confirm whether a null stage configuration can actually occur.
    String keyClassName = upstreamConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
    String valueClassName = upstreamConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(keyClassName, valueClassName, MRPartitioner.class.getName(), partitionerConf)
        .setFromConfigurationUnfiltered(upstreamConf)
        .configureInput()
        .useLegacyInput()
        .done()
        .build();

    dag.addEdge(Edge.create(
        stageVertices[stage - 1], stageVertices[stage], edgeConf.createDefaultEdgeProperty()));
  }
  return dag;
}
use of org.apache.tez.dag.api.Edge in project tez by apache.
the class HashJoinExample method createDag.
/**
 * Builds the hash-join DAG: two text-reading source vertices (the hash side and the streaming
 * side) feeding one join vertex that writes text output.
 *
 * @param tezConf configuration copied for every input/output and applied to the edge configs
 * @param streamPath location of the streamed data set
 * @param hashPath location of the data set that is accumulated into a hash table
 * @param outPath location the join output is written to
 * @param numPartitions parallelism of the join vertex
 * @param doBroadcast if {@code true} the hash side is broadcast to the join vertex; otherwise
 *        it is hash-partitioned exactly like the streaming side
 * @return the assembled DAG
 * @throws IOException declared by this method; the visible code does not show which call
 *         raises it
 */
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath, int numPartitions, boolean doBroadcast) throws IOException {
  String dagName = "HashJoinExample";
  if (doBroadcast) {
    dagName += "-WithBroadcast";
  }
  DAG dag = DAG.create(dagName);

  // Hash side: the data set that gets accumulated in a hash table so the other side can be
  // joined against it. It is read as text via TextInputFormat, and ForwardingProcessor simply
  // passes the records downstream unchanged.
  Vertex hashFileVertex = Vertex
      .create(hashSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(
                  new Configuration(tezConf), TextInputFormat.class, hashPath.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());

  // Streaming side: the data set that is streamed through and joined against the hash table
  // built from the other side. Also read as text and forwarded unchanged.
  Vertex streamFileVertex = Vertex
      .create(streamingSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(
                  new Configuration(tezConf), TextInputFormat.class, streamPath.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());

  // Join vertex: HashJoinProcessor joins the streaming side with the hash side and the result
  // is written as text via TextOutputFormat. The work is spread over numPartitions tasks.
  Vertex joinVertex = Vertex
      .create(joiner, ProcessorDescriptor.create(HashJoinProcessor.class.getName()), numPartitions)
      .addDataSink(joinOutput,
          MROutput.createConfigBuilder(
                  new Configuration(tezConf), TextOutputFormat.class, outPath.toUri().toString())
              .build());

  // The streamed side is hash-partitioned into fragments so that equal keys land in the same
  // fragment. The key is the data to be joined, so the value type is NullWritable. The number
  // of fragments is initially inferred from the number of join tasks, since each task handles
  // one fragment. setFromConfiguration is optional; it lets command-line parameters override
  // the config options.
  UnorderedPartitionedKVEdgeConfig streamConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Stream side -> join vertex.
  Edge streamToJoin =
      Edge.create(streamFileVertex, joinVertex, streamConf.createDefaultEdgeProperty());

  final EdgeProperty hashSideEdgeProperty;
  if (!doBroadcast) {
    // Partition the hash side with the same hash partitioning as the stream side. Keys with
    // the same hash value from both inputs then reach the same fragment, so the join task
    // responsible for that fragment can join the two data set fragments.
    hashSideEdgeProperty = streamConf.createDefaultEdgeProperty();
  } else {
    // Useful when the hash side is small: its complete output is broadcast over an
    // unpartitioned edge to all fragments of the streamed side, so each join task can join
    // its fragment of keys against every key of the hash side. This avoids re-partitioning
    // the stream-side fragments to match the hash side's partitioning scheme and the costly
    // network transfer that goes with it. (For brevity this example still partitions the
    // stream side in both cases.) Again the key is the data, so the value is NullWritable,
    // and setFromConfiguration is optional and allows command-line overrides.
    hashSideEdgeProperty = UnorderedKVEdgeConfig
        .newBuilder(Text.class.getName(), NullWritable.class.getName())
        .setFromConfiguration(tezConf)
        .build()
        .createDefaultBroadcastEdgeProperty();
  }

  // Hash side -> join vertex. The join vertex therefore has two upstream vertices supplying
  // its inputs.
  Edge hashToJoin = Edge.create(hashFileVertex, joinVertex, hashSideEdgeProperty);

  // Wire everything into the DAG.
  dag.addVertex(streamFileVertex)
      .addVertex(hashFileVertex)
      .addVertex(joinVertex)
      .addEdge(streamToJoin)
      .addEdge(hashToJoin);
  return dag;
}
use of org.apache.tez.dag.api.Edge in project tez by apache.
the class JoinValidate method createDag.
/**
 * Builds the join-validation DAG: an "lhs" and an "rhs" text source vertex, each connected by
 * an ordered, partitioned edge to a single "joinvalidate" vertex.
 *
 * @param tezConf configuration copied for each input and applied to the edge config
 * @param lhs location of the left-hand data set
 * @param rhs location of the right-hand data set
 * @param numPartitions parallelism of the validating vertex
 * @return the assembled DAG
 * @throws IOException declared by this method; the visible code does not show which call
 *         raises it
 */
@VisibleForTesting
DAG createDag(TezConfiguration tezConf, Path lhs, Path rhs, int numPartitions) throws IOException {
  DAG validationDag = DAG.create(getDagName());
  if (getDefaultExecutionContext() != null) {
    validationDag.setExecutionContext(getDefaultExecutionContext());
  }

  // Intermediate-output configuration used by both source vertices. Ideally this would copy
  // only selected keys from the underlying conf; revisit once there is a better mechanism for
  // configuring the IOs. setFromConfiguration is optional and lets command-line parameters
  // override the config options.
  OrderedPartitionedKVEdgeConfig sharedEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Left-hand input: text records forwarded unchanged.
  Vertex leftSource = Vertex
      .create(LHS_INPUT_NAME, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource("lhs",
          MRInput.createConfigBuilder(
                  new Configuration(tezConf), TextInputFormat.class, lhs.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  setVertexExecutionContext(leftSource, getLhsExecutionContext());

  // Right-hand input: configured the same way as the left-hand one.
  Vertex rightSource = Vertex
      .create(RHS_INPUT_NAME, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource("rhs",
          MRInput.createConfigBuilder(
                  new Configuration(tezConf), TextInputFormat.class, rhs.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  setVertexExecutionContext(rightSource, getRhsExecutionContext());

  // Consumer of both inputs.
  Vertex validator = Vertex.create(
      "joinvalidate",
      ProcessorDescriptor.create(JoinValidateProcessor.class.getName()),
      numPartitions);
  setVertexExecutionContext(validator, getValidateExecutionContext());

  // Each edge gets its own EdgeProperty instance created from the shared config.
  Edge leftEdge = Edge.create(leftSource, validator, sharedEdgeConf.createDefaultEdgeProperty());
  Edge rightEdge = Edge.create(rightSource, validator, sharedEdgeConf.createDefaultEdgeProperty());

  validationDag
      .addVertex(leftSource)
      .addVertex(rightSource)
      .addVertex(validator)
      .addEdge(leftEdge)
      .addEdge(rightEdge);
  return validationDag;
}
use of org.apache.tez.dag.api.Edge in project tez by apache.
the class TestFaultTolerance method testCartesianProduct.
/**
 * For an unpartitioned cartesian product, the failure fraction should be the number of unique
 * failures divided by the number of consumers that depend on the source task. This test
 * sets up a 2x2 cartesian product and makes the 4th destination task fail. With the failure
 * fraction limit configured as 0.25, the resulting failure fraction should be 1/2 rather
 * than 1/4.
 *
 * @throws Exception
 */
@Test
public void testCartesianProduct() throws Exception {
  // NOTE(review): dagConf is populated here but is not passed to the DAG or to
  // runDAGAndVerify anywhere in this method - confirm how the limit is meant to take effect.
  Configuration dagConf = new Configuration();
  dagConf.setDouble(TezConfiguration.TEZ_TASK_MAX_ALLOWED_OUTPUT_FAILURES_FRACTION, 0.25);

  DAG dag = DAG.create("dag");

  // Processor settings for v3: verification targets task index 3 and expects value 5.
  Configuration processorConf = new Configuration();
  processorConf.setInt(
      TestProcessor.getVertexConfName(
          TestProcessor.TEZ_FAILING_PROCESSOR_VERIFY_TASK_INDEX, "v3"),
      3);
  processorConf.setInt(
      TestProcessor.getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_VERIFY_VALUE, "v3"),
      5);
  UserPayload processorPayload = TezUtils.createUserPayloadFromConf(processorConf);
  ProcessorDescriptor sharedProcessor =
      ProcessorDescriptor.create(TestProcessor.class.getName()).setUserPayload(processorPayload);

  // Two source vertices with two tasks each; v3 is created without an explicit parallelism.
  Vertex v1 = Vertex.create("v1", sharedProcessor, 2);
  Vertex v2 = Vertex.create("v2", sharedProcessor, 2);
  Vertex v3 = Vertex.create("v3", sharedProcessor);

  // Cartesian product of v1 and v2: a single partition, grouping disabled.
  CartesianProductConfig productConfig = new CartesianProductConfig(Arrays.asList("v1", "v2"));
  TezConfiguration tezConf = new TezConfiguration();
  tezConf.setInt(CartesianProductVertexManager.TEZ_CARTESIAN_PRODUCT_NUM_PARTITIONS, 1);
  tezConf.setBoolean(CartesianProductVertexManager.TEZ_CARTESIAN_PRODUCT_ENABLE_GROUPING, false);
  UserPayload productPayload = productConfig.toUserPayload(tezConf);

  v3.setVertexManagerPlugin(
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
          .setUserPayload(productPayload));
  EdgeManagerPluginDescriptor edgeManager =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName())
          .setUserPayload(productPayload);

  // Input failure injection for v3: task index 3, task attempt 0, input index 0, failing up
  // to input attempt 0.
  Configuration failingInputConf = new Configuration();
  failingInputConf.setBoolean(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v3"), true);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v3"), 3);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v3"), 0);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v3"), 0);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v3"),
      0);
  UserPayload failingInputPayload = TezUtils.createUserPayloadFromConf(failingInputConf);

  // Both incoming edges of v3 share one custom edge property.
  EdgeProperty productEdge = EdgeProperty.create(
      edgeManager,
      DataMovementType.CUSTOM,
      DataSourceType.PERSISTED,
      SchedulingType.SEQUENTIAL,
      TestOutput.getOutputDesc(null),
      TestInput.getInputDesc(failingInputPayload));
  Edge fromV1 = Edge.create(v1, v3, productEdge);
  Edge fromV2 = Edge.create(v2, v3, productEdge);

  dag.addVertex(v1).addVertex(v2).addVertex(v3);
  dag.addEdge(fromV1).addEdge(fromV2);

  // run dag
  runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED);
}
Aggregations