Use of org.apache.tez.examples.HashJoinExample.ForwardingProcessor in the Apache Tez project.
From the class SortMergeJoinExample, method createDag.
/**
 * Assembles the sort-merge join DAG: two text-reading input vertices whose
 * sorted, hash-partitioned output is consumed by a single join vertex.
 *
 * <pre>
 * input1   input2
 *      \   /
 *      join
 * </pre>
 *
 * @param tezConf base configuration; copied for every MR input/output and
 *          also used to configure the shuffle vertex manager and the edges
 * @param inputPath1 location of the text data read by the "input1" vertex
 * @param inputPath2 location of the text data read by the "input2" vertex
 * @param outPath location the join vertex writes its text output to
 * @param numPartitions parallelism handed to the join vertex on creation
 * @return the DAG containing the three vertices and the two connecting edges
 * @throws IOException declared by the signature (NOTE(review): no call in
 *           this body visibly throws it - confirm whether it is still needed)
 */
private DAG createDag(TezConfiguration tezConf, Path inputPath1, Path inputPath2, Path outPath, int numPartitions) throws IOException {
  /**
   * One side of the join. The vertex reads text via TextInputFormat and its
   * ForwardingProcessor passes the records downstream unchanged.
   */
  ProcessorDescriptor firstForwarder = ProcessorDescriptor.create(ForwardingProcessor.class.getName());
  Configuration firstInputConf = new Configuration(tezConf);
  String firstInputUri = inputPath1.toUri().toString();
  Vertex inputVertex1 = Vertex.create("input1", firstForwarder)
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(firstInputConf, TextInputFormat.class, firstInputUri)
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());

  /**
   * The other side of the join, configured the same way as the first but
   * reading from inputPath2.
   */
  ProcessorDescriptor secondForwarder = ProcessorDescriptor.create(ForwardingProcessor.class.getName());
  Configuration secondInputConf = new Configuration(tezConf);
  String secondInputUri = inputPath2.toUri().toString();
  Vertex inputVertex2 = Vertex.create("input2", secondForwarder)
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(secondInputConf, TextInputFormat.class, secondInputUri)
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());

  /**
   * The join itself. SortMergeJoinProcessor joins the two sorted outputs of
   * inputVertex1 and inputVertex2 and the result is written as text via
   * TextOutputFormat. The vertex starts with numPartitions tasks and has
   * auto reduce parallelism enabled on its ShuffleVertexManager.
   */
  ProcessorDescriptor joinProcessor = ProcessorDescriptor.create(SortMergeJoinProcessor.class.getName());
  Configuration outputConf = new Configuration(tezConf);
  String outputUri = outPath.toUri().toString();
  Vertex joinVertex = Vertex.create(joiner, joinProcessor, numPartitions)
      .setVertexManagerPlugin(
          ShuffleVertexManager.createConfigBuilder(tezConf)
              .setAutoReduceParallelism(true)
              .build())
      .addDataSink(joinOutput,
          MROutput.createConfigBuilder(outputConf, TextOutputFormat.class, outputUri)
              .build());

  /**
   * Both inputs are hash partitioned so that equal keys land in the same
   * fragment, and sorted before they reach the join processor. The joined
   * datum is the key itself, hence the NullWritable value. The number of
   * fragments is initially inferred from the number of tasks in the join
   * vertex, since each task handles one fragment.
   * Edge options come from the client-side tez-site.xml (recommended);
   * setFromConfiguration additionally lets command-line arguments override
   * them.
   */
  String keyClassName = Text.class.getName();
  String valueClassName = NullWritable.class.getName();
  String partitionerClassName = HashPartitioner.class.getName();
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(keyClassName, valueClassName, partitionerClassName)
      .setFromConfiguration(tezConf)
      .build();

  /**
   * Edge inputVertex1 -> joinVertex, using the EdgeProperty derived from
   * {@link OrderedPartitionedKVEdgeConfig} so the output of inputVertex1 is
   * sorted before the join processor sees it.
   */
  Edge firstEdge = Edge.create(inputVertex1, joinVertex, edgeConf.createDefaultEdgeProperty());

  /**
   * Edge inputVertex2 -> joinVertex, using the EdgeProperty derived from
   * {@link OrderedPartitionedKVEdgeConfig} so the output of inputVertex2 is
   * sorted before the join processor sees it.
   */
  Edge secondEdge = Edge.create(inputVertex2, joinVertex, edgeConf.createDefaultEdgeProperty());

  // Register vertices first, then the edges that reference them.
  DAG dag = DAG.create("SortMergeJoinExample");
  dag.addVertex(inputVertex1)
      .addVertex(inputVertex2)
      .addVertex(joinVertex)
      .addEdge(firstEdge)
      .addEdge(secondEdge);
  return dag;
}
Aggregations