use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class SortMergeJoinExample method createDag.
/**
 * Builds the sort-merge join DAG: two input vertices feeding one join vertex.
 * <pre>
 * v1   v2
 *   \ /
 *    v3
 * </pre>
 *
 * @param tezConf configuration copied into the MRInput/MROutput builders and passed to the
 *          ShuffleVertexManager and edge config builders
 * @param inputPath1 location read by the "input1" vertex
 * @param inputPath2 location read by the "input2" vertex
 * @param outPath location the join vertex writes its text output to
 * @param numPartitions parallelism requested for the join vertex
 * @return the assembled DAG with three vertices and two edges
 * @throws IOException declared by this method's contract for failures while building the DAG
 */
private DAG createDag(TezConfiguration tezConf, Path inputPath1, Path inputPath2, Path outPath, int numPartitions) throws IOException {
  DAG dag = DAG.create("SortMergeJoinExample");

  // The two sides of the join. Each reads text data using the TextInputFormat and
  // ForwardingProcessor simply forwards the data downstream as is.
  Vertex inputVertex1 = createInputVertex("input1", tezConf, inputPath1);
  Vertex inputVertex2 = createInputVertex("input2", tezConf, inputPath2);

  // This vertex represents the join operation. It writes the join output as text using the
  // TextOutputFormat. The processor performs the join of the two sorted outputs from
  // inputVertex1 and inputVertex2. It is load balanced across numPartitions.
  Vertex joinVertex = Vertex.create(joiner, ProcessorDescriptor.create(SortMergeJoinProcessor.class.getName()), numPartitions)
      .setVertexManagerPlugin(ShuffleVertexManager.createConfigBuilder(tezConf).setAutoReduceParallelism(true).build())
      .addDataSink(joinOutput,
          MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outPath.toUri().toString()).build());

  // The output of inputVertex1 and inputVertex2 will be partitioned into fragments with the
  // same keys going to the same fragments using hash partitioning. The data to be joined is
  // the key itself and so the value is null. These outputs will be sorted before feeding them
  // to the join processor. The number of fragments is initially inferred from the number of
  // tasks running in the join vertex because each task will be handling one fragment.
  // Edge config options are derived from client-side tez-site.xml (recommended). Optionally
  // invoke setFromConfiguration to override these config options via commandline arguments.
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Connect inputVertex1 to the join vertex with the EdgeProperty created from
  // OrderedPartitionedKVEdgeConfig so that the output of inputVertex1 is sorted before
  // feeding it to the join processor.
  Edge e1 = Edge.create(inputVertex1, joinVertex, edgeConf.createDefaultEdgeProperty());

  // Connect inputVertex2 to the join vertex the same way so that the output of inputVertex2
  // is sorted before feeding it to the join processor.
  Edge e2 = Edge.create(inputVertex2, joinVertex, edgeConf.createDefaultEdgeProperty());

  dag.addVertex(inputVertex1).addVertex(inputVertex2).addVertex(joinVertex).addEdge(e1).addEdge(e2);
  return dag;
}

/**
 * Creates one input side of the join: a vertex running ForwardingProcessor with a
 * TextInputFormat data source registered under {@code inputFile}.
 *
 * @param vertexName name given to the new vertex
 * @param tezConf configuration copied for the MRInput builder
 * @param inputPath location of the text data to read
 * @return the configured input vertex
 * @throws IOException declared to match the caller's contract for builder failures
 */
private Vertex createInputVertex(String vertexName, TezConfiguration tezConf, Path inputPath) throws IOException {
  return Vertex.create(vertexName, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
}
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class TestTezJobs method testVertexFailuresMaxPercent.
/**
 * Submits a "Parent" -> "Child" DAG in which both vertices run FailingAttemptProcessor with
 * two tasks each, and asserts that the DAG still finishes in the SUCCEEDED state.
 * The configuration sets TEZ_VERTEX_FAILURES_MAXPERCENT to "50.0f" and
 * TEZ_AM_TASK_MAX_FAILED_ATTEMPTS to 1. The client is always stopped afterwards.
 */
@Test(timeout = 60000)
public void testVertexFailuresMaxPercent() throws TezException, InterruptedException, IOException {
  TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
  tezConf.set(TezConfiguration.TEZ_VERTEX_FAILURES_MAXPERCENT, "50.0f");
  tezConf.setInt(TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, 1);

  TezClient tezClient = TezClient.create("TestVertexFailuresMaxPercent", tezConf);
  tezClient.start();
  try {
    DAG dag = DAG.create("TestVertexFailuresMaxPercent");

    // Both vertices run the same processor class; each gets its own descriptor instance.
    String failingProcessor = FailingAttemptProcessor.class.getName();
    Vertex parent = Vertex.create("Parent", ProcessorDescriptor.create(failingProcessor), 2);
    Vertex child = Vertex.create("Child", ProcessorDescriptor.create(failingProcessor), 2);

    OrderedPartitionedKVEdgeConfig shuffleConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
        .setFromConfiguration(tezConf)
        .build();
    Edge parentToChild = Edge.create(parent, child, shuffleConf.createDefaultEdgeProperty());

    // Vertices must be registered before the edge that joins them.
    dag.addVertex(parent);
    dag.addVertex(child);
    dag.addEdge(parentToChild);

    DAGClient dagClient = tezClient.submitDAG(dag);
    dagClient.waitForCompletion();

    DAGStatus.State finalState = dagClient.getDAGStatus(null).getState();
    Assert.assertEquals(DAGStatus.State.SUCCEEDED, finalState);
  } finally {
    tezClient.stop();
  }
}
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class TestHistoryParser method runWordCount.
/**
 * Builds and runs a two-vertex word-count DAG (tokenizer -> summation) and returns the
 * id of the DAG that was run.
 *
 * @param tokenizerProcessor class name of the processor used by the tokenizer vertex
 * @param summationProcessor class name of the processor used by the summation vertex
 * @param dagName name given to the DAG
 * @param withTimeline forwarded to {@code getTezClient} when obtaining the client
 * @return string form of the TezDAGID built from the client's AM application id and id 1
 * @throws Exception if building, submitting or waiting on the DAG fails; the client is
 *           stopped in that case as well
 */
private String runWordCount(String tokenizerProcessor, String summationProcessor, String dagName, boolean withTimeline) throws Exception {
  // HDFS path
  Path outputLoc = new Path("/tmp/outPath_" + System.currentTimeMillis());
  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, TextInputFormat.class, inputLoc.toString()).build();
  DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf, TextOutputFormat.class, outputLoc.toString()).build();
  Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(tokenizerProcessor)).addDataSource(INPUT, dataSource);
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()).build();
  Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(summationProcessor), 1).addDataSink(OUTPUT, dataSink);
  // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex).addVertex(summationVertex).addEdge(Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
  TezClient tezClient = getTezClient(withTimeline);
  try {
    // Update Caller Context
    CallerContext callerContext = CallerContext.create("TezExamples", "Tez WordCount Example Job");
    ApplicationId appId = tezClient.getAppMasterApplicationId();
    if (appId == null) {
      // No application id available yet: fall back to a placeholder id for the caller context.
      appId = ApplicationId.newInstance(1001L, 1);
    }
    callerContext.setCallerIdAndType(appId.toString(), "TezApplication");
    dag.setCallerContext(callerContext);
    DAGClient client = tezClient.submitDAG(dag);
    client.waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
    TezDAGID tezDAGID = TezDAGID.getInstance(tezClient.getAppMasterApplicationId(), 1);
    return tezDAGID.toString();
  } finally {
    // Stop the client on every path so a failed submit/wait does not leave it running.
    tezClient.stop();
  }
}
Aggregations