Search in sources :

Example 21 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in the Apache Tez project.

From the class TestMROutputLegacy, method testOldAPI_MR.

// Mimics what happens when an MR job written against the old (mapred) API is
// translated to a DAG: the job conf is packed into the output's user payload
// and MROutputLegacy is initialized from it. Here the output hangs off the reducer.
@Test(timeout = 5000)
public void testOldAPI_MR() throws Exception {
    final Path outputDir = new Path("/tmp/output");
    final Class<?> expectedFormat = org.apache.hadoop.mapred.SequenceFileOutputFormat.class;

    // Old-API job configuration: NullWritable keys, Text values, sequence-file output.
    JobConf jobConf = new JobConf();
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat.class);
    org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(jobConf, outputDir);
    // IS_MAP_PROCESSOR = false: the output is attached to reducer
    jobConf.setBoolean(MRConfig.IS_MAP_PROCESSOR, false);

    // Wrap the conf in a payload and build the sink the same way a translated DAG would.
    UserPayload payload = TezUtils.createUserPayloadFromConf(jobConf);
    OutputDescriptor outputDescriptor =
        OutputDescriptor.create(MROutputLegacy.class.getName()).setUserPayload(payload);
    OutputCommitterDescriptor committerDescriptor =
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName());
    DataSinkDescriptor dataSink = DataSinkDescriptor.create(outputDescriptor, committerDescriptor, null);

    // Feed the payload carried by the sink back into a mocked context and initialize the output.
    UserPayload sinkPayload = dataSink.getOutputDescriptor().getUserPayload();
    OutputContext mockContext = createMockOutputContext(sinkPayload);
    MROutputLegacy legacyOutput = new MROutputLegacy(mockContext, 2);
    legacyOutput.initialize();

    // Only the old-API members may be populated; every new-API counterpart stays null.
    assertEquals(false, legacyOutput.useNewApi);
    assertEquals(expectedFormat, legacyOutput.oldOutputFormat.getClass());
    assertNull(legacyOutput.newOutputFormat);
    assertEquals(NullWritable.class, legacyOutput.oldApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, legacyOutput.oldApiTaskAttemptContext.getOutputValueClass());
    assertNull(legacyOutput.newApiTaskAttemptContext);
    assertNotNull(legacyOutput.oldRecordWriter);
    assertNull(legacyOutput.newRecordWriter);
    assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, legacyOutput.committer.getClass());
}
Also used : Path(org.apache.hadoop.fs.Path) UserPayload(org.apache.tez.dag.api.UserPayload) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) JobConf(org.apache.hadoop.mapred.JobConf) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) OutputContext(org.apache.tez.runtime.api.OutputContext) Test(org.junit.Test)

Example 22 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in the Apache Tez project.

From the class TestMROutputLegacy, method testOldAPI_MapperOnly.

// Reproduces the translation of a mapper-only job (old MR API) into a DAG:
// the output is configured through a JobConf, shipped as a user payload, and
// MROutputLegacy is expected to come up in old-API mode.
@Test(timeout = 5000)
public void testOldAPI_MapperOnly() throws Exception {
    String outputLocation = "/tmp/output";

    JobConf mapperOnlyConf = new JobConf();
    mapperOnlyConf.setOutputKeyClass(NullWritable.class);
    mapperOnlyConf.setOutputValueClass(Text.class);
    mapperOnlyConf.setOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat.class);
    org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(
        mapperOnlyConf, new Path(outputLocation));
    // the output is attached to mapper, so flag this as a map processor
    mapperOnlyConf.setBoolean(MRConfig.IS_MAP_PROCESSOR, true);

    // Descriptor chain: conf -> payload -> output descriptor -> data sink.
    UserPayload confPayload = TezUtils.createUserPayloadFromConf(mapperOnlyConf);
    OutputDescriptor legacyDescriptor =
        OutputDescriptor.create(MROutputLegacy.class.getName()).setUserPayload(confPayload);
    DataSinkDescriptor sinkDescriptor = DataSinkDescriptor.create(
        legacyDescriptor,
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName()),
        null);

    // Initialize the output against a mock context that carries the sink's payload.
    OutputContext context =
        createMockOutputContext(sinkDescriptor.getOutputDescriptor().getUserPayload());
    MROutputLegacy mrOutput = new MROutputLegacy(context, 2);
    mrOutput.initialize();

    // API selection
    assertEquals(false, mrOutput.useNewApi);
    // output format
    assertEquals(org.apache.hadoop.mapred.SequenceFileOutputFormat.class, mrOutput.oldOutputFormat.getClass());
    assertNull(mrOutput.newOutputFormat);
    // task attempt context
    assertEquals(NullWritable.class, mrOutput.oldApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, mrOutput.oldApiTaskAttemptContext.getOutputValueClass());
    assertNull(mrOutput.newApiTaskAttemptContext);
    // record writer
    assertNotNull(mrOutput.oldRecordWriter);
    assertNull(mrOutput.newRecordWriter);
    // committer
    assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, mrOutput.committer.getClass());
}
Also used : Path(org.apache.hadoop.fs.Path) UserPayload(org.apache.tez.dag.api.UserPayload) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) JobConf(org.apache.hadoop.mapred.JobConf) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) OutputContext(org.apache.tez.runtime.api.OutputContext) Test(org.junit.Test)

Example 23 with DataSinkDescriptor

Use of org.apache.tez.dag.api.DataSinkDescriptor in the Apache Tez project.

From the class TestHistoryParser, method runWordCount.

/**
 * Builds a two-vertex word-count DAG (tokenizer -> summation), submits it through a
 * {@link TezClient}, waits for it to complete and returns the DAG id as a string.
 *
 * <p>The client obtained from {@code getTezClient(withTimeline)} is always stopped before
 * this method returns, including when submission or the wait for completion throws.
 *
 * @param tokenizerProcessor class name passed to {@code ProcessorDescriptor.create} for the tokenizer vertex
 * @param summationProcessor class name passed to {@code ProcessorDescriptor.create} for the summation vertex
 * @param dagName name given to the created DAG
 * @param withTimeline forwarded unchanged to {@code getTezClient}; NOTE(review): presumably selects
 *     a timeline-enabled client - confirm against {@code getTezClient}
 * @return string form of the {@link TezDAGID} built from the client's application id and DAG index 1
 * @throws Exception if building, submitting or waiting on the DAG, or stopping the client, fails
 */
private String runWordCount(String tokenizerProcessor, String summationProcessor, String dagName, boolean withTimeline) throws Exception {
    // HDFS path; the timestamp suffix keeps successive runs from sharing an output location
    Path outputLoc = new Path("/tmp/outPath_" + System.currentTimeMillis());
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, TextInputFormat.class, inputLoc.toString()).build();
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf, TextOutputFormat.class, outputLoc.toString()).build();
    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(tokenizerProcessor)).addDataSource(INPUT, dataSource);
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()).build();
    Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(summationProcessor), 1).addDataSink(OUTPUT, dataSink);
    // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
    DAG dag = DAG.create(dagName);
    dag.addVertex(tokenizerVertex).addVertex(summationVertex).addEdge(Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
    TezClient tezClient = getTezClient(withTimeline);
    try {
        // Update Caller Context
        CallerContext callerContext = CallerContext.create("TezExamples", "Tez WordCount Example Job");
        ApplicationId appId = tezClient.getAppMasterApplicationId();
        if (appId == null) {
            // No application id available from the client: fall back to a fixed placeholder id
            // so the caller context can still be populated.
            appId = ApplicationId.newInstance(1001L, 1);
        }
        callerContext.setCallerIdAndType(appId.toString(), "TezApplication");
        dag.setCallerContext(callerContext);
        DAGClient client = tezClient.submitDAG(dag);
        client.waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
        // The id is computed before the client is stopped (finally runs after this expression).
        // NOTE(review): index 1 assumes this is the first DAG submitted by this client - confirm.
        TezDAGID tezDAGID = TezDAGID.getInstance(tezClient.getAppMasterApplicationId(), 1);
        return tezDAGID.toString();
    } finally {
        // Always release the client, even when submission or the wait fails.
        tezClient.stop();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) CallerContext(org.apache.tez.client.CallerContext) DAG(org.apache.tez.dag.api.DAG) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) TezClient(org.apache.tez.client.TezClient) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) TextOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) TezDAGID(org.apache.tez.dag.records.TezDAGID) DAGClient(org.apache.tez.dag.api.client.DAGClient) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)

Aggregations

DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor)23 Vertex (org.apache.tez.dag.api.Vertex)12 OutputContext (org.apache.tez.runtime.api.OutputContext)11 Test (org.junit.Test)10 Configuration (org.apache.hadoop.conf.Configuration)8 Path (org.apache.hadoop.fs.Path)7 DAG (org.apache.tez.dag.api.DAG)7 DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)7 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)7 UserPayload (org.apache.tez.dag.api.UserPayload)6 JobConf (org.apache.hadoop.mapred.JobConf)5 TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat)5 OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor)5 OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig)5 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)3 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)3 MergeJoinWork (org.apache.hadoop.hive.ql.plan.MergeJoinWork)3 ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork)3 StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext)3 StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory)3