Example 1 with DataSinkDescriptor

use of org.apache.tez.dag.api.DataSinkDescriptor in project hive by apache.

the class DagUtils method createVertex.

/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param work The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param fileSystem FS corresponding to scratchDir and LocalResources
 * @param ctx This query's context
 * @param hasChildren Whether this vertex has downstream vertices; if not, a data sink is added
 * @param tezWork The TezWork containing this BaseWork, used to look up parent edges
 * @param vertexType The type of vertex to create
 * @param localResources LocalResources needed by this vertex
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, FileSystem fileSystem,
        Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType,
        Map<String, LocalResource> localResources) throws Exception {
    Vertex v = null;
    // dispatch the call to the right method for the actual (sub-)type of BaseWork.
    if (work instanceof MapWork) {
        v = createVertex(conf, (MapWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);
    } else if (work instanceof ReduceWork) {
        v = createVertex(conf, (ReduceWork) work, fileSystem, scratchDir, ctx, localResources);
    } else if (work instanceof MergeJoinWork) {
        v = createVertex(conf, (MergeJoinWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);
        // set the VertexManagerPlugin if this is a cross product destination vertex
        List<String> crossProductSources = new ArrayList<>();
        for (BaseWork parentWork : tezWork.getParents(work)) {
            if (tezWork.getEdgeType(parentWork, work) == EdgeType.XPROD_EDGE) {
                crossProductSources.add(parentWork.getName());
            }
        }
        if (!crossProductSources.isEmpty()) {
            CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
            v.setVertexManagerPlugin(
                VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
                    .setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
        // parallelism shouldn't be set for a cartesian product vertex
        }
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    // initialize stats publisher if necessary
    if (work.isGatheringStats()) {
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
            statsPublisher = factory.getStatsPublisher();
            if (!statsPublisher.init(sCntxt)) {
                // init() creates the stats table if it does not exist
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                }
            }
        }
    }
    // final vertices need to have at least one output
    if (!hasChildren) {
        v.addDataSink("out_" + work.getName(),
            new DataSinkDescriptor(
                OutputDescriptor.create(MROutput.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(conf)),
                null, null));
    }
    return v;
}
Also used : StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) CartesianProductVertexManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezConfiguration(org.apache.tez.dag.api.TezConfiguration)
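
The example above builds the sink through the DataSinkDescriptor constructor, which is what the @SuppressWarnings("deprecation") on createVertex most likely covers: current Tez releases expose a static create factory taking the same arguments (output descriptor, optional committer descriptor, optional credentials). A minimal sketch of the factory form follows; the wrapper class and method names are illustrative assumptions, not part of DagUtils.

import java.io.IOException;

import org.apache.hadoop.mapred.JobConf;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.mapreduce.output.MROutput;

public class DataSinkFactorySketch {

    // Builds the same MROutput sink as the Hive code above, but via the
    // DataSinkDescriptor.create factory instead of the constructor.
    static DataSinkDescriptor createMROutputSink(JobConf conf) throws IOException {
        OutputDescriptor outputDesc = OutputDescriptor
            .create(MROutput.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(conf));
        // Committer descriptor and credentials are optional; the Hive code
        // passes null for both.
        return DataSinkDescriptor.create(outputDesc, null, null);
    }
}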

Example 2 with DataSinkDescriptor

use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

the class MultipleCommitsExample method createDAG.

private DAG createDAG(TezConfiguration tezConf, String v1OutputPathPrefix, int v1OutputNum,
        String v2OutputPathPrefix, int v2OutputNum, String uv12OutputPathPrefix, int uv12OutputNum,
        String v3OutputPathPrefix, int v3OutputNum, boolean commitOnVertexSuccess) throws IOException {
    DAG dag = DAG.create("multipleCommitsDAG");
    dag.setConf(TezConfiguration.TEZ_AM_COMMIT_ALL_OUTPUTS_ON_DAG_SUCCESS, String.valueOf(!commitOnVertexSuccess));
    Vertex v1 = Vertex.create("v1",
        ProcessorDescriptor.create(MultipleOutputProcessor.class.getName())
            .setUserPayload(new MultipleOutputProcessor.MultipleOutputProcessorConfig(
                V1OutputNamePrefix, v1OutputNum, UV12OutputNamePrefix, uv12OutputNum).toUserPayload()),
        2);
    Vertex v2 = Vertex.create("v2",
        ProcessorDescriptor.create(MultipleOutputProcessor.class.getName())
            .setUserPayload(new MultipleOutputProcessor.MultipleOutputProcessorConfig(
                V2OutputNamePrefix, v2OutputNum, UV12OutputNamePrefix, uv12OutputNum).toUserPayload()),
        2);
    // add data sinks for v1
    for (int i = 0; i < v1OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, v1OutputPathPrefix + "_" + i).build();
        v1.addDataSink(V1OutputNamePrefix + "_" + i, sink);
    }
    // add data sinks for v2
    for (int i = 0; i < v2OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, v2OutputPathPrefix + "_" + i).build();
        v2.addDataSink(V2OutputNamePrefix + "_" + i, sink);
    }
    // add data sinks for (v1,v2)
    VertexGroup uv12 = dag.createVertexGroup("uv12", v1, v2);
    for (int i = 0; i < uv12OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, uv12OutputPathPrefix + "_" + i).build();
        uv12.addDataSink(UV12OutputNamePrefix + "_" + i, sink);
    }
    Vertex v3 = Vertex.create("v3",
        ProcessorDescriptor.create(MultipleOutputProcessor.class.getName())
            .setUserPayload(new MultipleOutputProcessor.MultipleOutputProcessorConfig(
                V3OutputNamePrefix, v3OutputNum).toUserPayload()),
        2);
    // add data sinks for v3
    for (int i = 0; i < v3OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, v3OutputPathPrefix + "_" + i).build();
        v3.addDataSink(V3OutputNamePrefix + "_" + i, sink);
    }
    OrderedPartitionedKVEdgeConfig edgeConfig = OrderedPartitionedKVEdgeConfig
        .newBuilder(NullWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName())
        .setFromConfiguration(tezConf)
        .build();
    GroupInputEdge edge = GroupInputEdge.create(uv12, v3, edgeConfig.createDefaultEdgeProperty(),
        InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName()));
    dag.addVertex(v1).addVertex(v2).addVertex(v3).addEdge(edge);
    return dag;
}
Also used : OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) NullWritable(org.apache.hadoop.io.NullWritable) VertexGroup(org.apache.tez.dag.api.VertexGroup) TextOutputFormat(org.apache.hadoop.mapred.TextOutputFormat) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) GroupInputEdge(org.apache.tez.dag.api.GroupInputEdge)
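
A DAG assembled this way still has to be submitted through a TezClient before any of the sinks produce output. A minimal, hypothetical driver for the method above (the run method, client name, and error handling are assumptions, not part of MultipleCommitsExample):

import org.apache.tez.client.TezClient;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;

public class RunDagSketch {

    static DAGStatus run(TezConfiguration tezConf, DAG dag) throws Exception {
        TezClient tezClient = TezClient.create("MultipleCommitsExample", tezConf);
        tezClient.start();
        try {
            // Whether the sinks added above are committed per vertex or only on
            // DAG success is governed by TEZ_AM_COMMIT_ALL_OUTPUTS_ON_DAG_SUCCESS,
            // which createDAG derives from the commitOnVertexSuccess flag.
            DAGClient dagClient = tezClient.submitDAG(dag);
            return dagClient.waitForCompletion();
        } finally {
            tezClient.stop();
        }
    }
}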

Example 3 with DataSinkDescriptor

use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

the class TestMROutput method testOldAPI_WorkOutputPathOutputFormat.

// test using the WorkOutputPathOutputFormat - checks that the work output path is
// set while creating recordWriters
@Test(timeout = 5000)
public void testOldAPI_WorkOutputPathOutputFormat() throws Exception {
    String outputPath = "/tmp/output";
    Configuration conf = new Configuration();
    conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf, OldAPI_WorkOutputPathReadingOutputFormat.class, outputPath).build();
    OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
    MROutput output = new MROutput(outputContext, 2);
    output.initialize();
    assertEquals(false, output.isMapperOutput);
    assertEquals(false, output.useNewApi);
    assertEquals(OldAPI_WorkOutputPathReadingOutputFormat.class, output.oldOutputFormat.getClass());
    assertNull(output.newOutputFormat);
    assertNotNull(output.oldApiTaskAttemptContext);
    assertNull(output.newApiTaskAttemptContext);
    assertNotNull(output.oldRecordWriter);
    assertNull(output.newRecordWriter);
    assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, output.committer.getClass());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) OutputContext(org.apache.tez.runtime.api.OutputContext) Test(org.junit.Test)
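
This test hinges on createMockOutputContext handing MROutput the serialized configuration as its user payload. A plausible Mockito-based sketch of such a helper (the real one lives in TestMROutput and may stub more methods):

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.runtime.api.OutputContext;

public class MockOutputContextSketch {

    static OutputContext createMockOutputContext(UserPayload payload) {
        OutputContext outputContext = mock(OutputContext.class);
        // MROutput reads its OutputFormat, output path, and API flavor from this payload.
        when(outputContext.getUserPayload()).thenReturn(payload);
        when(outputContext.getDAGName()).thenReturn("mrOutputTest");
        when(outputContext.getTaskVertexIndex()).thenReturn(0);
        when(outputContext.getTaskIndex()).thenReturn(0);
        when(outputContext.getTaskAttemptNumber()).thenReturn(0);
        return outputContext;
    }
}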

Example 4 with DataSinkDescriptor

use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

the class TestMROutput method testOldAPI_TextOutputFormat.

@Test(timeout = 5000)
public void testOldAPI_TextOutputFormat() throws Exception {
    String outputPath = "/tmp/output";
    Configuration conf = new Configuration();
    conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf, org.apache.hadoop.mapred.TextOutputFormat.class, outputPath).build();
    OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
    MROutput output = new MROutput(outputContext, 2);
    output.initialize();
    assertEquals(false, output.isMapperOutput);
    assertEquals(false, output.useNewApi);
    assertEquals(org.apache.hadoop.mapred.TextOutputFormat.class, output.oldOutputFormat.getClass());
    assertNull(output.newOutputFormat);
    assertNotNull(output.oldApiTaskAttemptContext);
    assertNull(output.newApiTaskAttemptContext);
    assertNotNull(output.oldRecordWriter);
    assertNull(output.newRecordWriter);
    assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, output.committer.getClass());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TextOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) OutputContext(org.apache.tez.runtime.api.OutputContext) Test(org.junit.Test)
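
The user payload these tests pull off the sink's OutputDescriptor is, in these examples, just a serialized Configuration, so it can be round-tripped with TezUtils. A brief sketch (class and method names are illustrative):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.UserPayload;

public class PayloadRoundTripSketch {

    // Recovers the Configuration that MROutput.createConfigBuilder serialized
    // into the sink's OutputDescriptor.
    static Configuration configOf(DataSinkDescriptor dataSink) throws IOException {
        UserPayload payload = dataSink.getOutputDescriptor().getUserPayload();
        return TezUtils.createConfFromUserPayload(payload);
    }
}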

Example 5 with DataSinkDescriptor

use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.

the class TestMROutput method testNewAPI_WorkOutputPathOutputFormat.

// test using the WorkOutputPathOutputFormat - checks that the path from getDefaultWorkFile is
// used while creating recordWriters
@Test(timeout = 5000)
public void testNewAPI_WorkOutputPathOutputFormat() throws Exception {
    String outputPath = "/tmp/output";
    Configuration conf = new Configuration();
    conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf, NewAPI_WorkOutputPathReadingOutputFormat.class, outputPath).build();
    OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
    MROutput output = new MROutput(outputContext, 2);
    output.initialize();
    assertEquals(true, output.isMapperOutput);
    assertEquals(true, output.useNewApi);
    assertEquals(NewAPI_WorkOutputPathReadingOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertNotNull(output.newApiTaskAttemptContext);
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) OutputContext(org.apache.tez.runtime.api.OutputContext) Test(org.junit.Test)
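
Taken together, the last three tests show that MROutput selects the API generation from the OutputFormat's package rather than from any explicit flag: a class from org.apache.hadoop.mapred yields useNewApi == false, one from org.apache.hadoop.mapreduce yields useNewApi == true. A compact illustration (output paths are assumptions):

import org.apache.hadoop.conf.Configuration;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.mapreduce.output.MROutput;

public class ApiSelectionSketch {

    public static void main(String[] args) throws Exception {
        // Old (mapred) API: the resulting MROutput has useNewApi == false.
        DataSinkDescriptor oldApiSink = MROutput.createConfigBuilder(
            new Configuration(),
            org.apache.hadoop.mapred.TextOutputFormat.class,
            "/tmp/out-old").build();

        // New (mapreduce) API: the resulting MROutput has useNewApi == true.
        DataSinkDescriptor newApiSink = MROutput.createConfigBuilder(
            new Configuration(),
            org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class,
            "/tmp/out-new").build();
    }
}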

Aggregations

DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor): 23 usages
Vertex (org.apache.tez.dag.api.Vertex): 12 usages
OutputContext (org.apache.tez.runtime.api.OutputContext): 11 usages
Test (org.junit.Test): 10 usages
Configuration (org.apache.hadoop.conf.Configuration): 8 usages
Path (org.apache.hadoop.fs.Path): 7 usages
DAG (org.apache.tez.dag.api.DAG): 7 usages
DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor): 7 usages
TezConfiguration (org.apache.tez.dag.api.TezConfiguration): 7 usages
UserPayload (org.apache.tez.dag.api.UserPayload): 6 usages
JobConf (org.apache.hadoop.mapred.JobConf): 5 usages
TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat): 5 usages
OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor): 5 usages
OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig): 5 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 3 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 3 usages
MergeJoinWork (org.apache.hadoop.hive.ql.plan.MergeJoinWork): 3 usages
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 3 usages
StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext): 3 usages
StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory): 3 usages