use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class YARNRunner method createDAG.
private DAG createDAG(FileSystem fs, JobID jobId, Configuration[] stageConfs, String jobSubmitDir,
    Credentials ts, Map<String, LocalResource> jobLocalResources) throws IOException {
  String jobName = stageConfs[0].get(MRJobConfig.JOB_NAME, YarnConfiguration.DEFAULT_APPLICATION_NAME);
  DAG dag = DAG.create(jobName);
  LOG.info("Number of stages: " + stageConfs.length);
  List<TaskLocationHint> mapInputLocations =
      getMapLocationHintsFromInputSplits(jobId, fs, stageConfs[0], jobSubmitDir);
  List<TaskLocationHint> reduceInputLocations = null;
  Vertex[] vertices = new Vertex[stageConfs.length];
  for (int i = 0; i < stageConfs.length; i++) {
    vertices[i] = createVertexForStage(stageConfs[i], jobLocalResources,
        i == 0 ? mapInputLocations : reduceInputLocations, i, stageConfs.length);
  }
  for (int i = 0; i < vertices.length; i++) {
    dag.addVertex(vertices[i]);
    if (i > 0) {
      // Set edge conf based on Input conf (compression etc properties for MapReduce are
      // w.r.t Outputs - MAP_OUTPUT_COMPRESS for example)
      Map<String, String> partitionerConf = null;
      if (stageConfs[i - 1] != null) {
        partitionerConf = Maps.newHashMap();
        for (Map.Entry<String, String> entry : stageConfs[i - 1]) {
          partitionerConf.put(entry.getKey(), entry.getValue());
        }
      }
      OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
          .newBuilder(stageConfs[i - 1].get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS),
              stageConfs[i - 1].get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS),
              MRPartitioner.class.getName(), partitionerConf)
          .setFromConfigurationUnfiltered(stageConfs[i - 1])
          .configureInput().useLegacyInput().done()
          .build();
      Edge edge = Edge.create(vertices[i - 1], vertices[i], edgeConf.createDefaultEdgeProperty());
      dag.addEdge(edge);
    }
  }
  return dag;
}
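Pulled out of the loop, the per-stage edge setup above boils down to roughly the following sketch. Here stageConf, mapVertex and reduceVertex are hypothetical stand-ins for stageConfs[i - 1], vertices[i - 1] and vertices[i]:

// Sketch only: assumes stageConf is the upstream stage's MR configuration and
// mapVertex/reduceVertex are the producer and consumer vertices for this edge.
Map<String, String> partitionerConf = Maps.newHashMap();
for (Map.Entry<String, String> entry : stageConf) {
  partitionerConf.put(entry.getKey(), entry.getValue());
}
OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
    .newBuilder(stageConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS),
        stageConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS),
        MRPartitioner.class.getName(), partitionerConf)
    .setFromConfigurationUnfiltered(stageConf)   // carry the stage's settings over unfiltered
    .configureInput().useLegacyInput().done()    // consume with the MR-style (legacy) shuffle input
    .build();
dag.addEdge(Edge.create(mapVertex, reduceVertex, edgeConf.createDefaultEdgeProperty()));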
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class JoinValidate method createDag.
@VisibleForTesting
DAG createDag(TezConfiguration tezConf, Path lhs, Path rhs, int numPartitions) throws IOException {
  DAG dag = DAG.create(getDagName());
  if (getDefaultExecutionContext() != null) {
    dag.setExecutionContext(getDefaultExecutionContext());
  }
  // Configuration for intermediate output - shared by Vertex1 and Vertex2
  // This should only be setting selective keys from the underlying conf. Fix after there's a
  // better mechanism to configure the IOs. The setFromConfiguration call is optional and allows
  // overriding the config options with command line parameters.
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  Vertex lhsVertex = Vertex.create(LHS_INPUT_NAME,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource("lhs",
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
              lhs.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  setVertexExecutionContext(lhsVertex, getLhsExecutionContext());
  Vertex rhsVertex = Vertex.create(RHS_INPUT_NAME,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource("rhs",
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
              rhs.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  setVertexExecutionContext(rhsVertex, getRhsExecutionContext());
  Vertex joinValidateVertex = Vertex.create("joinvalidate",
      ProcessorDescriptor.create(JoinValidateProcessor.class.getName()), numPartitions);
  setVertexExecutionContext(joinValidateVertex, getValidateExecutionContext());
  Edge e1 = Edge.create(lhsVertex, joinValidateVertex, edgeConf.createDefaultEdgeProperty());
  Edge e2 = Edge.create(rhsVertex, joinValidateVertex, edgeConf.createDefaultEdgeProperty());
  dag.addVertex(lhsVertex).addVertex(rhsVertex).addVertex(joinValidateVertex)
      .addEdge(e1).addEdge(e2);
  return dag;
}
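The DAG returned here still has to be submitted to run. A minimal sketch of doing that with TezClient, assuming a configured tezConf and the createDag arguments from above (error handling and session reuse omitted):

// Sketch only: the enclosing method would need to declare or handle
// TezException, IOException and InterruptedException.
TezClient tezClient = TezClient.create("JoinValidate", tezConf);
tezClient.start();
try {
  DAG dag = createDag(tezConf, lhs, rhs, numPartitions);
  DAGClient dagClient = tezClient.submitDAG(dag);
  DAGStatus status = dagClient.waitForCompletion();
  System.out.println("DAG finished with state: " + status.getState());
} finally {
  tezClient.stop();
}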
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class OrderedWordCount method createDAG.
public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient, String dagName)
    throws IOException {
  DataSourceDescriptor dataSource = MRInput
      .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
      .groupSplits(!disableSplitGrouping)
      .generateSplitsInAM(!isGenerateSplitInClient)
      .build();
  DataSinkDescriptor dataSink = MROutput
      .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build();
  Vertex tokenizerVertex = Vertex.create(TOKENIZER,
      ProcessorDescriptor.create(TokenProcessor.class.getName()));
  tokenizerVertex.addDataSource(INPUT, dataSource);
  // Use a Text key and IntWritable value to bring the counts for each word into the same partition.
  // The setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  // This vertex will be reading intermediate data via an input edge and writing intermediate data
  // via an output edge.
  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions);
  // Use an IntWritable key and Text value to bring all words with the same count into the same
  // partition. The data will be ordered by count and words grouped by count. The
  // setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(IntWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  // Use 1 task to bring all the data to one place for globally sorted order. Essentially the number
  // of partitions is 1, so the NoOpSorter can be used to produce the globally ordered output.
  Vertex sorterVertex = Vertex.create(SORTER,
      ProcessorDescriptor.create(NoOpSorter.class.getName()), 1);
  sorterVertex.addDataSink(OUTPUT, dataSink);
  // No need to add the jar containing this class, as it is assumed to be part of the Tez jars.
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex).addVertex(summationVertex).addVertex(sorterVertex)
      .addEdge(Edge.create(tokenizerVertex, summationVertex,
          summationEdgeConf.createDefaultEdgeProperty()))
      .addEdge(Edge.create(summationVertex, sorterVertex,
          sorterEdgeConf.createDefaultEdgeProperty()));
  return dag;
}
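The setFromConfiguration calls are what let command line overrides reach the edges. As a rough, hypothetical illustration, a runtime option such as compression set on tezConf (for example via -Dtez.runtime.compress=true when launching the job) would be picked up like this:

// Sketch only: illustrates how a tez.runtime.* setting flows into the edge config.
tezConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, true);
OrderedPartitionedKVEdgeConfig compressedEdgeConf = OrderedPartitionedKVEdgeConfig
    .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
    .setFromConfiguration(tezConf)   // picks up tez.runtime.* settings such as compression
    .build();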
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class WordCount method createDAG.
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions) throws IOException {
  // Create the descriptor that describes the input data to Tez. Using MRInput to read text
  // data from the given input path. The TextInputFormat is used to read the text data.
  DataSourceDescriptor dataSource = MRInput
      .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
      .groupSplits(!isDisableSplitGrouping())
      .generateSplitsInAM(!isGenerateSplitInClient())
      .build();
  // Create a descriptor that describes the output data to Tez. Using MROutput to write text
  // data to the given output path. The TextOutputFormat is used to write the text data.
  DataSinkDescriptor dataSink = MROutput
      .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build();
  // Create a vertex that reads the data from the data source and tokenizes it using the
  // TokenProcessor. The number of tasks that will do the work for this vertex will be decided
  // using the information provided by the data source descriptor.
  Vertex tokenizerVertex = Vertex.create(TOKENIZER,
      ProcessorDescriptor.create(TokenProcessor.class.getName()))
      .addDataSource(INPUT, dataSource);
  // Create the edge that represents the movement and semantics of data between the producer
  // Tokenizer vertex and the consumer Summation vertex. In order to perform the summation in
  // parallel, the tokenized data will be partitioned by word such that a given word goes to the
  // same partition. The counts for the words should be grouped together per word. To achieve this
  // we can use an edge that contains an input/output pair that handles partitioning and grouping
  // of key value data. We use the helper OrderedPartitionedKVEdgeConfig to create such an
  // edge. Internally, it sets up matching Tez inputs and outputs that can perform this logic.
  // We specify the key, value and partitioner type. Here the key type is Text (for word), the
  // value type is IntWritable (for count) and we use a hash-based partitioner. This is a helper
  // object; the edge can also be configured by setting up the input, output etc. individually,
  // without using this helper. The setFromConfiguration call is optional and allows overriding
  // the config options with command line parameters.
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  // Create a vertex that reads the tokenized data and calculates the sum using the SumProcessor.
  // The number of tasks that do the work of this vertex depends on the number of partitions used
  // to distribute the sum processing. In this case, it has been made configurable via the
  // numPartitions parameter.
  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions)
      .addDataSink(OUTPUT, dataSink);
  // No need to add the jar containing this class, as it is assumed to be part of the Tez jars.
  // Otherwise we would have to add the jars for this code as local files to the vertices.
  // Create the DAG, add the vertices, and connect the producer and consumer vertices via the edge.
  DAG dag = DAG.create("WordCount");
  dag.addVertex(tokenizerVertex).addVertex(summationVertex)
      .addEdge(Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
  return dag;
}
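As the comment above notes, the same edge can also be assembled without the helper. A rough sketch of what that could look like, using the ordered partitioned output and grouped input classes from the Tez runtime library; the helper's actual internal wiring may differ, so treat this as illustrative only:

// Sketch only: approximates what OrderedPartitionedKVEdgeConfig sets up.
// EdgeProperty, OutputDescriptor, InputDescriptor, UserPayload come from org.apache.tez.dag.api;
// TezUtils from org.apache.tez.common; the IO classes from the Tez runtime library.
Configuration edgeCfg = new Configuration(tezConf);
edgeCfg.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
edgeCfg.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
edgeCfg.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS, HashPartitioner.class.getName());
UserPayload payload = TezUtils.createUserPayloadFromConf(edgeCfg);
EdgeProperty manualEdgeProperty = EdgeProperty.create(
    EdgeProperty.DataMovementType.SCATTER_GATHER,   // partitioned shuffle from producer to consumer
    EdgeProperty.DataSourceType.PERSISTED,          // producer output is persisted before consumption
    EdgeProperty.SchedulingType.SEQUENTIAL,         // consumer tasks are scheduled after producers
    OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(payload),
    InputDescriptor.create(OrderedGroupedKVInput.class.getName()).setUserPayload(payload));
Edge manualEdge = Edge.create(tokenizerVertex, summationVertex, manualEdgeProperty);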
use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.
the class UnionExample method createDAG.
private DAG createDAG(FileSystem fs, TezConfiguration tezConf, Map<String, LocalResource> localResources,
    Path stagingDir, String inputPath, String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");
  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();
  Vertex mapVertex1 = Vertex.create("map1",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex2 = Vertex.create("map2",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex3 = Vertex.create("map3",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex checkerVertex = Vertex.create("checker",
      ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);
  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);
  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput
      .createConfigBuilder(allPartsConf, TextOutputFormat.class, outputPath + "-all-parts")
      .build();
  checkerVertex.addDataSink("all-parts", od2);
  Configuration partsConf = new Configuration(tezConf);
  DataSinkDescriptor od1 = MROutput
      .createConfigBuilder(partsConf, TextOutputFormat.class, outputPath + "-parts")
      .build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
      .build();
  dag.addVertex(mapVertex1).addVertex(mapVertex2).addVertex(mapVertex3).addVertex(checkerVertex)
      .addEdge(Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
          InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;
}
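Note that this example deliberately drives MRInput and MROutput through the old mapred.* API ("new-api" is set to false and the format classes are passed via configuration). With the new-API TextInputFormat, the same data source could be configured the way the other examples here do it; a sketch, assuming the mapreduce.lib input format class:

// Sketch only: new-API variant of the data source above, for comparison.
DataSourceDescriptor newApiSource = MRInput
    .createConfigBuilder(new Configuration(tezConf),
        org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class, inputPath)
    .generateSplitsInAM(true)
    .build();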