Example 6 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

From class TestMROutputLegacy, method testNewAPI_MR.

// simulate the behavior of translating MR to DAG using MR new API
@Test(timeout = 5000)
public void testNewAPI_MR() throws Exception {
    String outputPath = "/tmp/output";
    Job job = Job.getInstance();
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().setBoolean("mapred.reducer.new-api", true);
    // the output is attached to reducer
    job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
    UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName()).setUserPayload(vertexPayload);
    DataSinkDescriptor sink = DataSinkDescriptor.create(od, OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);
    OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
    MROutputLegacy output = new MROutputLegacy(outputContext, 2);
    output.initialize();
    assertEquals(true, output.useNewApi);
    assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Also used: Path(org.apache.hadoop.fs.Path) UserPayload(org.apache.tez.dag.api.UserPayload) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) Job(org.apache.hadoop.mapreduce.Job) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) OutputContext(org.apache.tez.runtime.api.OutputContext) Test(org.junit.Test)
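
The createMockOutputContext helper called above is defined elsewhere in TestMROutputLegacy and is not shown on this page. Below is a minimal sketch of such a helper, assuming Mockito is available; the stubbed identifier values are illustrative, and the real helper may stub additional methods:

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.tez.common.counters.TezCounters;

private OutputContext createMockOutputContext(UserPayload payload) {
    OutputContext outputContext = mock(OutputContext.class);
    // MROutputLegacy.initialize() deserializes the JobConf from this payload
    when(outputContext.getUserPayload()).thenReturn(payload);
    // Identifiers consumed when the output builds its task attempt context
    when(outputContext.getApplicationId()).thenReturn(ApplicationId.newInstance(System.currentTimeMillis(), 1));
    when(outputContext.getTaskVertexIndex()).thenReturn(1);
    when(outputContext.getTaskAttemptNumber()).thenReturn(1);
    when(outputContext.getCounters()).thenReturn(new TezCounters());
    return outputContext;
}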

Example 7 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

From class TestMROutputLegacy, method testNewAPI_MapperOnly.

// simulate the behavior of translating mapper-only job to DAG using MR new API
@Test(timeout = 5000)
public void testNewAPI_MapperOnly() throws Exception {
    String outputPath = "/tmp/output";
    Job job = Job.getInstance();
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().setBoolean("mapred.mapper.new-api", true);
    // the output is attached to mapper
    job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
    UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName()).setUserPayload(vertexPayload);
    DataSinkDescriptor sink = DataSinkDescriptor.create(od, OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);
    OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
    MROutputLegacy output = new MROutputLegacy(outputContext, 2);
    output.initialize();
    assertEquals(true, output.useNewApi);
    assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Also used: Path(org.apache.hadoop.fs.Path) UserPayload(org.apache.tez.dag.api.UserPayload) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) Job(org.apache.hadoop.mapreduce.Job) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) OutputContext(org.apache.tez.runtime.api.OutputContext) Test(org.junit.Test)
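
Outside a unit test, the same DataSinkDescriptor would be attached to a real vertex rather than a mocked OutputContext. A hedged sketch of that wiring for the mapper-only case; the vertex name, parallelism, and choice of MapProcessor (org.apache.tez.mapreduce.processor.map.MapProcessor) are illustrative assumptions:

// Attach the sink built above to a map vertex in a real DAG.
Vertex mapVertex = Vertex.create("map",
    ProcessorDescriptor.create(MapProcessor.class.getName()), 2);
// MROutputLegacy on each task reads the serialized job configuration from
// the sink's OutputDescriptor payload, exactly as the test simulates.
mapVertex.addDataSink("MROutput", sink);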

Example 8 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

From class TestMockDAGAppMaster, method createDAG.

private DAG createDAG(String dagName, boolean uv12CommitFail, boolean v3CommitFail) {
    DAG dag = DAG.create(dagName);
    Vertex v1 = Vertex.create("v1", ProcessorDescriptor.create("Proc"), 1);
    Vertex v2 = Vertex.create("v2", ProcessorDescriptor.create("Proc"), 1);
    Vertex v3 = Vertex.create("v3", ProcessorDescriptor.create("Proc"), 1);
    VertexGroup uv12 = dag.createVertexGroup("uv12", v1, v2);
    DataSinkDescriptor uv12DataSink = DataSinkDescriptor.create(OutputDescriptor.create("dummy output"), createOutputCommitterDesc(uv12CommitFail), null);
    uv12.addDataSink("uv12Out", uv12DataSink);
    DataSinkDescriptor v3DataSink = DataSinkDescriptor.create(OutputDescriptor.create("dummy output"), createOutputCommitterDesc(v3CommitFail), null);
    v3.addDataSink("v3Out", v3DataSink);
    GroupInputEdge e1 = GroupInputEdge.create(uv12, v3, EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create("dummy output class"), InputDescriptor.create("dummy input class")), InputDescriptor.create("merge.class"));
    dag.addVertex(v1).addVertex(v2).addVertex(v3).addEdge(e1);
    return dag;
}
Also used: Vertex(org.apache.tez.dag.api.Vertex) VertexGroup(org.apache.tez.dag.api.VertexGroup) DAG(org.apache.tez.dag.api.DAG) GroupInputEdge(org.apache.tez.dag.api.GroupInputEdge) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor)
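
The createOutputCommitterDesc helper called above is not reproduced on this page. A minimal sketch under stated assumptions: OutputCommitterDescriptor.create and setUserPayload are the real Tez APIs, but FailingOutputCommitter is a stand-in name for a test committer, and encoding the failure flag as a one-byte payload is an assumption; the real helper in TestMockDAGAppMaster may differ.

import java.nio.ByteBuffer;

private OutputCommitterDescriptor createOutputCommitterDesc(boolean failOnCommit) {
    // FailingOutputCommitter is a hypothetical test committer class name
    OutputCommitterDescriptor desc =
        OutputCommitterDescriptor.create(FailingOutputCommitter.class.getName());
    // Encode the failure flag as a one-byte payload the committer can read back
    UserPayload payload = UserPayload.create(
        ByteBuffer.wrap(new byte[] { (byte) (failOnCommit ? 1 : 0) }));
    return desc.setUserPayload(payload);
}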

Example 9 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

From class OrderedWordCount, method createDAG.

public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath, int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient, String dagName) throws IOException {
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath).groupSplits(!disableSplitGrouping).generateSplitsInAM(!isGenerateSplitInClient).build();
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath).build();
    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(TokenProcessor.class.getName()));
    tokenizerVertex.addDataSource(INPUT, dataSource);
    // Use Text key and IntWritable value to bring counts for each word in the same partition
    // The setFromConfiguration call is optional and allows overriding the config options with
    // command line parameters.
    OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(tezConf).build();
    // This vertex will be reading intermediate data via an input edge and writing intermediate data
    // via an output edge.
    Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions);
    // Use IntWritable key and Text value to bring all words with the same count in the same
    // partition. The data will be ordered by count and words grouped by count. The
    // setFromConfiguration call is optional and allows overriding the config options with
    // command line parameters.
    OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(IntWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(tezConf).build();
    // Use 1 task to bring all the data to one place for a globally sorted order. Since the number
    // of partitions is effectively 1, the NoOpSorter can be used to produce the globally ordered output.
    Vertex sorterVertex = Vertex.create(SORTER, ProcessorDescriptor.create(NoOpSorter.class.getName()), 1);
    sorterVertex.addDataSink(OUTPUT, dataSink);
    // No need to add the jar containing this class, as it is assumed to be part of the Tez jars.
    DAG dag = DAG.create(dagName);
    dag.addVertex(tokenizerVertex).addVertex(summationVertex).addVertex(sorterVertex).addEdge(Edge.create(tokenizerVertex, summationVertex, summationEdgeConf.createDefaultEdgeProperty())).addEdge(Edge.create(summationVertex, sorterVertex, sorterEdgeConf.createDefaultEdgeProperty()));
    return dag;
}
Also used: OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) TextOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) IntWritable(org.apache.hadoop.io.IntWritable) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)
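
createDAG only builds the DAG; the OrderedWordCount driver also submits it through TezClient. A minimal driver sketch using the standard TezClient/DAGClient APIs; the input path, output path, and partition count are illustrative (the real example wires these from command-line arguments):

public static void main(String[] args) throws Exception {
    TezConfiguration tezConf = new TezConfiguration();
    TezClient tezClient = TezClient.create("OrderedWordCount", tezConf);
    tezClient.start();
    try {
        tezClient.waitTillReady();
        // Argument values here are illustrative placeholders
        DAG dag = createDAG(tezConf, "/tmp/in", "/tmp/out", 2, false, false, "OrderedWordCount");
        DAGClient dagClient = tezClient.submitDAG(dag);
        // Block until the DAG reaches a terminal state, then check it
        DAGStatus status = dagClient.waitForCompletion();
        if (status.getState() != DAGStatus.State.SUCCEEDED) {
            System.err.println("OrderedWordCount failed: " + status.getDiagnostics());
        }
    } finally {
        tezClient.stop();
    }
}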

Example 10 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

From class WordCount, method createDAG.

private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath, int numPartitions) throws IOException {
    // Create the descriptor that describes the input data to Tez. Using MRInput to read text
    // data from the given input path. The TextInputFormat is used to read the text data.
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath).groupSplits(!isDisableSplitGrouping()).generateSplitsInAM(!isGenerateSplitInClient()).build();
    // Create a descriptor that describes the output data to Tez. Using MROutput to write text
    // data to the given output path. The TextOutputFormat is used to write the text data.
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath).build();
    // Create a vertex that reads the data from the data source and tokenizes it using the
    // TokenProcessor. The number of tasks that will do the work for this vertex will be decided
    // using the information provided by the data source descriptor.
    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(TokenProcessor.class.getName())).addDataSource(INPUT, dataSource);
    // Create the edge that represents the movement and semantics of data between the producer
    // Tokenizer vertex and the consumer Summation vertex. In order to perform the summation in
    // parallel the tokenized data will be partitioned by word such that a given word goes to the
    // same partition. The counts for the words should be grouped together per word. To achieve this
    // we can use an edge that contains an input/output pair that handles partitioning and grouping
    // of key value data. We use the helper OrderedPartitionedKVEdgeConfig to create such an
    // edge. Internally, it sets up matching Tez inputs and outputs that can perform this logic.
    // We specify the key, value and partitioner type. Here the key type is Text (for word), the
    // value type is IntWritable (for count) and we using a hash based partitioner. This is a helper
    // object. The edge can be configured by configuring the input, output etc individually without
    // using this helper. The setFromConfiguration call is optional and allows overriding the config
    // options with command line parameters.
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(tezConf).build();
    // Create a vertex that reads the tokenized data and calculates the sum using the SumProcessor.
    // The number of tasks that do the work of this vertex depends on the number of partitions used
    // to distribute the sum processing. In this case, it has been made configurable via the
    // numPartitions parameter.
    Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions).addDataSink(OUTPUT, dataSink);
    // No need to add the jar containing this class, as it is assumed to be part of the Tez jars. Otherwise
    // we would have to add the jars for this code as local files to the vertices.
    // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
    DAG dag = DAG.create("WordCount");
    dag.addVertex(tokenizerVertex).addVertex(summationVertex).addEdge(Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
    return dag;
}
Also used: OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) TextOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) IntWritable(org.apache.hadoop.io.IntWritable) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)
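
The TokenProcessor referenced above belongs to the WordCount example but is not reproduced on this page. A sketch of what such a processor looks like, modeled on SimpleProcessor from tez-runtime-library; the real class may differ in details:

import java.util.StringTokenizer;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.processor.SimpleProcessor;

public static class TokenProcessor extends SimpleProcessor {
    public TokenProcessor(ProcessorContext context) {
        super(context);
    }

    @Override
    public void run() throws Exception {
        // INPUT and SUMMATION are the source/vertex names used in createDAG above
        KeyValueReader kvReader = (KeyValueReader) getInputs().get(INPUT).getReader();
        KeyValueWriter kvWriter = (KeyValueWriter) getOutputs().get(SUMMATION).getWriter();
        while (kvReader.next()) {
            StringTokenizer itr = new StringTokenizer(kvReader.getCurrentValue().toString());
            while (itr.hasMoreTokens()) {
                // Emit (word, 1); the partitioned edge then groups counts per word
                kvWriter.write(new Text(itr.nextToken()), new IntWritable(1));
            }
        }
    }
}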

Aggregations

Types co-occurring with DataSinkDescriptor across the indexed examples, with usage counts:

DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor): 23
Vertex (org.apache.tez.dag.api.Vertex): 12
OutputContext (org.apache.tez.runtime.api.OutputContext): 11
Test (org.junit.Test): 10
Configuration (org.apache.hadoop.conf.Configuration): 8
Path (org.apache.hadoop.fs.Path): 7
DAG (org.apache.tez.dag.api.DAG): 7
DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor): 7
TezConfiguration (org.apache.tez.dag.api.TezConfiguration): 7
UserPayload (org.apache.tez.dag.api.UserPayload): 6
JobConf (org.apache.hadoop.mapred.JobConf): 5
TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat): 5
OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor): 5
OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig): 5
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 3
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 3
MergeJoinWork (org.apache.hadoop.hive.ql.plan.MergeJoinWork): 3
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 3
StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext): 3
StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory): 3