use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class YARNRunner method createDAG.
private DAG createDAG(FileSystem fs, JobID jobId, Configuration[] stageConfs, String jobSubmitDir,
    Credentials ts, Map<String, LocalResource> jobLocalResources) throws IOException {
  String jobName = stageConfs[0].get(MRJobConfig.JOB_NAME, YarnConfiguration.DEFAULT_APPLICATION_NAME);
  DAG dag = DAG.create(jobName);
  LOG.info("Number of stages: " + stageConfs.length);
  List<TaskLocationHint> mapInputLocations =
      getMapLocationHintsFromInputSplits(jobId, fs, stageConfs[0], jobSubmitDir);
  List<TaskLocationHint> reduceInputLocations = null;
  Vertex[] vertices = new Vertex[stageConfs.length];
  for (int i = 0; i < stageConfs.length; i++) {
    vertices[i] = createVertexForStage(stageConfs[i], jobLocalResources,
        i == 0 ? mapInputLocations : reduceInputLocations, i, stageConfs.length);
  }
  for (int i = 0; i < vertices.length; i++) {
    dag.addVertex(vertices[i]);
    if (i > 0) {
      // Set edge conf based on Input conf (compression etc properties for MapReduce are
      // w.r.t Outputs - MAP_OUTPUT_COMPRESS for example)
      Map<String, String> partitionerConf = null;
      if (stageConfs[i - 1] != null) {
        partitionerConf = Maps.newHashMap();
        for (Map.Entry<String, String> entry : stageConfs[i - 1]) {
          partitionerConf.put(entry.getKey(), entry.getValue());
        }
      }
      OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
          .newBuilder(stageConfs[i - 1].get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS),
              stageConfs[i - 1].get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS),
              MRPartitioner.class.getName(), partitionerConf)
          .setFromConfigurationUnfiltered(stageConfs[i - 1])
          .configureInput().useLegacyInput().done()
          .build();
      Edge edge = Edge.create(vertices[i - 1], vertices[i], edgeConf.createDefaultEdgeProperty());
      dag.addEdge(edge);
    }
  }
  return dag;
}
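Pulled out of the loop, the per-stage edge setup above boils down to roughly the following sketch. Here stageConf, mapVertex and reduceVertex are hypothetical stand-ins for stageConfs[i - 1], vertices[i - 1] and vertices[i]:

// Sketch only: assumes stageConf is the upstream stage's MR configuration and
// mapVertex/reduceVertex are the producer and consumer vertices for this edge.
Map<String, String> partitionerConf = Maps.newHashMap();
for (Map.Entry<String, String> entry : stageConf) {
  partitionerConf.put(entry.getKey(), entry.getValue());
}
OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
    .newBuilder(stageConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS),
        stageConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS),
        MRPartitioner.class.getName(), partitionerConf)
    .setFromConfigurationUnfiltered(stageConf)   // carry the stage's settings over unfiltered
    .configureInput().useLegacyInput().done()    // consume with the MR-style (legacy) shuffle input
    .build();
dag.addEdge(Edge.create(mapVertex, reduceVertex, edgeConf.createDefaultEdgeProperty()));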
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class JoinValidate method createDag.
@VisibleForTesting
DAG createDag(TezConfiguration tezConf, Path lhs, Path rhs, int numPartitions) throws IOException {
  DAG dag = DAG.create(getDagName());
  if (getDefaultExecutionContext() != null) {
    dag.setExecutionContext(getDefaultExecutionContext());
  }
  // Configuration for intermediate output - shared by Vertex1 and Vertex2
  // This should only be setting selective keys from the underlying conf. Fix after there's a
  // better mechanism to configure the IOs. The setFromConfiguration call is optional and allows
  // overriding the config options with command line parameters.
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  Vertex lhsVertex = Vertex.create(LHS_INPUT_NAME,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource("lhs",
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
              lhs.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  setVertexExecutionContext(lhsVertex, getLhsExecutionContext());
  Vertex rhsVertex = Vertex.create(RHS_INPUT_NAME,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource("rhs",
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
              rhs.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  setVertexExecutionContext(rhsVertex, getRhsExecutionContext());
  Vertex joinValidateVertex = Vertex.create("joinvalidate",
      ProcessorDescriptor.create(JoinValidateProcessor.class.getName()), numPartitions);
  setVertexExecutionContext(joinValidateVertex, getValidateExecutionContext());
  Edge e1 = Edge.create(lhsVertex, joinValidateVertex, edgeConf.createDefaultEdgeProperty());
  Edge e2 = Edge.create(rhsVertex, joinValidateVertex, edgeConf.createDefaultEdgeProperty());
  dag.addVertex(lhsVertex).addVertex(rhsVertex).addVertex(joinValidateVertex)
      .addEdge(e1).addEdge(e2);
  return dag;
}
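The DAG returned here still has to be submitted to run. A minimal sketch of doing that with TezClient, assuming a configured tezConf and the createDag arguments from above (error handling and session reuse omitted):

// Sketch only: the enclosing method would need to declare or handle
// TezException, IOException and InterruptedException.
TezClient tezClient = TezClient.create("JoinValidate", tezConf);
tezClient.start();
try {
  DAG dag = createDag(tezConf, lhs, rhs, numPartitions);
  DAGClient dagClient = tezClient.submitDAG(dag);
  DAGStatus status = dagClient.waitForCompletion();
  System.out.println("DAG finished with state: " + status.getState());
} finally {
  tezClient.stop();
}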
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class OrderedWordCount method createDAG.
public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient, String dagName)
    throws IOException {
  DataSourceDescriptor dataSource = MRInput
      .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
      .groupSplits(!disableSplitGrouping)
      .generateSplitsInAM(!isGenerateSplitInClient)
      .build();
  DataSinkDescriptor dataSink = MROutput
      .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build();
  Vertex tokenizerVertex = Vertex.create(TOKENIZER,
      ProcessorDescriptor.create(TokenProcessor.class.getName()));
  tokenizerVertex.addDataSource(INPUT, dataSource);
  // Use a Text key and IntWritable value to bring the counts for each word into the same partition.
  // The setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  // This vertex will be reading intermediate data via an input edge and writing intermediate data
  // via an output edge.
  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions);
  // Use an IntWritable key and Text value to bring all words with the same count into the same
  // partition. The data will be ordered by count and words grouped by count. The
  // setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(IntWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  // Use 1 task to bring all the data to one place for globally sorted order. Essentially the number
  // of partitions is 1, so the NoOpSorter can be used to produce the globally ordered output.
  Vertex sorterVertex = Vertex.create(SORTER,
      ProcessorDescriptor.create(NoOpSorter.class.getName()), 1);
  sorterVertex.addDataSink(OUTPUT, dataSink);
  // No need to add the jar containing this class, as it is assumed to be part of the Tez jars.
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex).addVertex(summationVertex).addVertex(sorterVertex)
      .addEdge(Edge.create(tokenizerVertex, summationVertex,
          summationEdgeConf.createDefaultEdgeProperty()))
      .addEdge(Edge.create(summationVertex, sorterVertex,
          sorterEdgeConf.createDefaultEdgeProperty()));
  return dag;
}
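The setFromConfiguration calls are what let command line overrides reach the edges. As a rough, hypothetical illustration, a runtime option such as compression set on tezConf (for example via -Dtez.runtime.compress=true when launching the job) would be picked up like this:

// Sketch only: illustrates how a tez.runtime.* setting flows into the edge config.
tezConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, true);
OrderedPartitionedKVEdgeConfig compressedEdgeConf = OrderedPartitionedKVEdgeConfig
    .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
    .setFromConfiguration(tezConf)   // picks up tez.runtime.* settings such as compression
    .build();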
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class WordCount method createDAG.
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions) throws IOException {
  // Create the descriptor that describes the input data to Tez. Using MRInput to read text
  // data from the given input path. The TextInputFormat is used to read the text data.
  DataSourceDescriptor dataSource = MRInput
      .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
      .groupSplits(!isDisableSplitGrouping())
      .generateSplitsInAM(!isGenerateSplitInClient())
      .build();
  // Create a descriptor that describes the output data to Tez. Using MROutput to write text
  // data to the given output path. The TextOutputFormat is used to write the text data.
  DataSinkDescriptor dataSink = MROutput
      .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build();
  // Create a vertex that reads the data from the data source and tokenizes it using the
  // TokenProcessor. The number of tasks that will do the work for this vertex will be decided
  // using the information provided by the data source descriptor.
  Vertex tokenizerVertex = Vertex.create(TOKENIZER,
      ProcessorDescriptor.create(TokenProcessor.class.getName()))
      .addDataSource(INPUT, dataSource);
  // Create the edge that represents the movement and semantics of data between the producer
  // Tokenizer vertex and the consumer Summation vertex. In order to perform the summation in
  // parallel, the tokenized data will be partitioned by word such that a given word goes to the
  // same partition. The counts for the words should be grouped together per word. To achieve this
  // we can use an edge that contains an input/output pair that handles partitioning and grouping
  // of key value data. We use the helper OrderedPartitionedKVEdgeConfig to create such an
  // edge. Internally, it sets up matching Tez inputs and outputs that can perform this logic.
  // We specify the key, value and partitioner type. Here the key type is Text (for word), the
  // value type is IntWritable (for count) and we use a hash-based partitioner. This is a helper
  // object; the edge can also be configured by setting up the input, output etc. individually,
  // without using this helper. The setFromConfiguration call is optional and allows overriding
  // the config options with command line parameters.
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  // Create a vertex that reads the tokenized data and calculates the sum using the SumProcessor.
  // The number of tasks that do the work of this vertex depends on the number of partitions used
  // to distribute the sum processing. In this case, it has been made configurable via the
  // numPartitions parameter.
  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions)
      .addDataSink(OUTPUT, dataSink);
  // No need to add the jar containing this class, as it is assumed to be part of the Tez jars.
  // Otherwise we would have to add the jars for this code as local files to the vertices.
  // Create the DAG, add the vertices, and connect the producer and consumer vertices via the edge.
  DAG dag = DAG.create("WordCount");
  dag.addVertex(tokenizerVertex).addVertex(summationVertex)
      .addEdge(Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
  return dag;
}
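As the comment above notes, the same edge can also be assembled without the helper. A rough sketch of what that could look like, using the ordered partitioned output and grouped input classes from the Tez runtime library; the helper's actual internal wiring may differ, so treat this as illustrative only:

// Sketch only: approximates what OrderedPartitionedKVEdgeConfig sets up.
// EdgeProperty, OutputDescriptor, InputDescriptor, UserPayload come from org.apache.tez.dag.api;
// TezUtils from org.apache.tez.common; the IO classes from the Tez runtime library.
Configuration edgeCfg = new Configuration(tezConf);
edgeCfg.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
edgeCfg.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
edgeCfg.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS, HashPartitioner.class.getName());
UserPayload payload = TezUtils.createUserPayloadFromConf(edgeCfg);
EdgeProperty manualEdgeProperty = EdgeProperty.create(
    EdgeProperty.DataMovementType.SCATTER_GATHER,   // partitioned shuffle from producer to consumer
    EdgeProperty.DataSourceType.PERSISTED,          // producer output is persisted before consumption
    EdgeProperty.SchedulingType.SEQUENTIAL,         // consumer tasks are scheduled after producers
    OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(payload),
    InputDescriptor.create(OrderedGroupedKVInput.class.getName()).setUserPayload(payload));
Edge manualEdge = Edge.create(tokenizerVertex, summationVertex, manualEdgeProperty);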
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class UnionExample method createDAG.
private DAG createDAG(FileSystem fs, TezConfiguration tezConf, Map<String, LocalResource> localResources,
    Path stagingDir, String inputPath, String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");
  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();
  Vertex mapVertex1 = Vertex.create("map1",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex2 = Vertex.create("map2",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex3 = Vertex.create("map3",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex checkerVertex = Vertex.create("checker",
      ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);
  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);
  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput
      .createConfigBuilder(allPartsConf, TextOutputFormat.class, outputPath + "-all-parts")
      .build();
  checkerVertex.addDataSink("all-parts", od2);
  Configuration partsConf = new Configuration(tezConf);
  DataSinkDescriptor od1 = MROutput
      .createConfigBuilder(partsConf, TextOutputFormat.class, outputPath + "-parts")
      .build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
      .build();
  dag.addVertex(mapVertex1).addVertex(mapVertex2).addVertex(mapVertex3).addVertex(checkerVertex)
      .addEdge(Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
          InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;
}
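Note that this example deliberately drives MRInput and MROutput through the old mapred.* API ("new-api" is set to false and the format classes are passed via configuration). With the new-API TextInputFormat, the same data source could be configured the way the other examples here do it; a sketch, assuming the mapreduce.lib input format class:

// Sketch only: new-API variant of the data source above, for comparison.
DataSourceDescriptor newApiSource = MRInput
    .createConfigBuilder(new Configuration(tezConf),
        org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class, inputPath)
    .generateSplitsInAM(true)
    .build();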