
Example 11 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.

From the class SortMergeJoinExample, method createDag.

/**
 * v1 v2 <br>
 * &nbsp;\&nbsp;/ <br>
 * &nbsp;&nbsp;v3 <br>
 *
 * @param tezConf
 * @param inputPath1
 * @param inputPath2
 * @param outPath
 * @param numPartitions
 * @return dag
 * @throws IOException
 */
private DAG createDag(TezConfiguration tezConf, Path inputPath1, Path inputPath2, Path outPath, int numPartitions) throws IOException {
    DAG dag = DAG.create("SortMergeJoinExample");
    /**
     * This vertex represents the one side of the join. It reads text data using
     * the TextInputFormat. ForwardingProcessor simply forwards the data
     * downstream as is.
     */
    Vertex inputVertex1 = Vertex.create("input1",
        ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
        .addDataSource(inputFile,
            MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
                inputPath1.toUri().toString())
                .groupSplits(!isDisableSplitGrouping())
                .generateSplitsInAM(!isGenerateSplitInClient())
                .build());
    /**
     * The other vertex represents the other side of the join. It reads text
     * data using the TextInputFormat. ForwardingProcessor simply forwards the
     * data downstream as is.
     */
    Vertex inputVertex2 = Vertex.create("input2",
        ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
        .addDataSource(inputFile,
            MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
                inputPath2.toUri().toString())
                .groupSplits(!isDisableSplitGrouping())
                .generateSplitsInAM(!isGenerateSplitInClient())
                .build());
    /**
     * This vertex represents the join operation. It writes the join output as
     * text using the TextOutputFormat. The JoinProcessor is going to perform
     * the join of the two sorted outputs from inputVertex1 and inputVertex2. It
     * is load balanced across numPartitions.
     */
    Vertex joinVertex = Vertex.create(joiner,
        ProcessorDescriptor.create(SortMergeJoinProcessor.class.getName()), numPartitions)
        .setVertexManagerPlugin(ShuffleVertexManager.createConfigBuilder(tezConf)
            .setAutoReduceParallelism(true)
            .build())
        .addDataSink(joinOutput,
            MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class,
                outPath.toUri().toString()).build());
    /**
     * The output of inputVertex1 and inputVertex2 will be partitioned into
     * fragments with the same keys going to the same fragments using hash
     * partitioning. The data to be joined is the key itself and so the value is
     * null. And these outputs will be sorted before feeding them to
     * JoinProcessor. The number of fragments is initially inferred from the
     * number of tasks running in the join vertex because each task will be
     * handling one fragment.
     * Edge config options are derived from client-side tez-site.xml (recommended). Optionally
     * invoke setFromConfiguration to override these config options via commandline arguments.
     */
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), NullWritable.class.getName(),
            HashPartitioner.class.getName())
        .setFromConfiguration(tezConf)
        .build();
    /**
     * Connect the join vertex with inputVertex1 with the EdgeProperty created
     * from {@link OrderedPartitionedKVEdgeConfig} so that the output of
     * inputVertex1 is sorted before feeding it to JoinProcessor
     */
    Edge e1 = Edge.create(inputVertex1, joinVertex, edgeConf.createDefaultEdgeProperty());
    /**
     * Connect the join vertex with inputVertex2 with the EdgeProperty created
     * from {@link OrderedPartitionedKVEdgeConfig} so that the output of
     * inputVertex2 is sorted before feeding it to JoinProcessor
     */
    Edge e2 = Edge.create(inputVertex2, joinVertex, edgeConf.createDefaultEdgeProperty());
    dag.addVertex(inputVertex1).addVertex(inputVertex2).addVertex(joinVertex).addEdge(e1).addEdge(e2);
    return dag;
}
Also used: OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig), Vertex (org.apache.tez.dag.api.Vertex), Configuration (org.apache.hadoop.conf.Configuration), TezConfiguration (org.apache.tez.dag.api.TezConfiguration), HashPartitioner (org.apache.tez.runtime.library.partitioner.HashPartitioner), ForwardingProcessor (org.apache.tez.examples.HashJoinExample.ForwardingProcessor), Text (org.apache.hadoop.io.Text), DAG (org.apache.tez.dag.api.DAG), NullWritable (org.apache.hadoop.io.NullWritable), Edge (org.apache.tez.dag.api.Edge)
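
Example 11 builds and returns the DAG but does not show it being submitted. The following is a minimal driver sketch, not taken from the Tez source: the method name runExample, the paths, and the parallelism of 2 are placeholders, and the client handling mirrors the pattern used in Example 12 below.

// Hypothetical driver for the createDag example above (placeholder paths and parallelism).
private void runExample(TezConfiguration tezConf) throws Exception {
    TezClient tezClient = TezClient.create("SortMergeJoinExample", tezConf);
    tezClient.start();
    try {
        // Build the two-input join DAG defined above and submit it.
        DAG dag = createDag(tezConf, new Path("/tmp/in1"), new Path("/tmp/in2"),
            new Path("/tmp/out"), 2);
        DAGClient dagClient = tezClient.submitDAG(dag);
        // Block until the DAG finishes; the final state can be read via dagClient.getDAGStatus(null).
        dagClient.waitForCompletion();
    } finally {
        tezClient.stop();
    }
}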

Example 12 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.

From the class TestTezJobs, method testVertexFailuresMaxPercent.

@Test(timeout = 60000)
public void testVertexFailuresMaxPercent() throws TezException, InterruptedException, IOException {
    TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
    tezConf.set(TezConfiguration.TEZ_VERTEX_FAILURES_MAXPERCENT, "50.0f");
    tezConf.setInt(TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, 1);
    TezClient tezClient = TezClient.create("TestVertexFailuresMaxPercent", tezConf);
    tezClient.start();
    try {
        DAG dag = DAG.create("TestVertexFailuresMaxPercent");
        Vertex vertex1 = Vertex.create("Parent", ProcessorDescriptor.create(FailingAttemptProcessor.class.getName()), 2);
        Vertex vertex2 = Vertex.create("Child", ProcessorDescriptor.create(FailingAttemptProcessor.class.getName()), 2);
        OrderedPartitionedKVEdgeConfig edgeConfig = OrderedPartitionedKVEdgeConfig
            .newBuilder(Text.class.getName(), IntWritable.class.getName(),
                HashPartitioner.class.getName())
            .setFromConfiguration(tezConf)
            .build();
        dag.addVertex(vertex1).addVertex(vertex2)
            .addEdge(Edge.create(vertex1, vertex2, edgeConfig.createDefaultEdgeProperty()));
        DAGClient dagClient = tezClient.submitDAG(dag);
        dagClient.waitForCompletion();
        Assert.assertEquals(DAGStatus.State.SUCCEEDED, dagClient.getDAGStatus(null).getState());
    } finally {
        tezClient.stop();
    }
}
Also used: OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig), Vertex (org.apache.tez.dag.api.Vertex), HashPartitioner (org.apache.tez.runtime.library.partitioner.HashPartitioner), DAGClient (org.apache.tez.dag.api.client.DAGClient), Text (org.apache.hadoop.io.Text), MultiAttemptDAG (org.apache.tez.test.dag.MultiAttemptDAG), DAG (org.apache.tez.dag.api.DAG), IntWritable (org.apache.hadoop.io.IntWritable), TezConfiguration (org.apache.tez.dag.api.TezConfiguration), TezClient (org.apache.tez.client.TezClient), Test (org.junit.Test)

Example 13 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.

From the class TestHistoryParser, method runWordCount.

private String runWordCount(String tokenizerProcessor, String summationProcessor, String dagName, boolean withTimeline) throws Exception {
    // HDFS path
    Path outputLoc = new Path("/tmp/outPath_" + System.currentTimeMillis());
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf, TextInputFormat.class, inputLoc.toString()).build();
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf, TextOutputFormat.class, outputLoc.toString()).build();
    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(tokenizerProcessor)).addDataSource(INPUT, dataSource);
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(),
            HashPartitioner.class.getName())
        .build();
    Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(summationProcessor), 1).addDataSink(OUTPUT, dataSink);
    // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
    DAG dag = DAG.create(dagName);
    dag.addVertex(tokenizerVertex).addVertex(summationVertex)
        .addEdge(Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
    TezClient tezClient = getTezClient(withTimeline);
    // Update Caller Context
    CallerContext callerContext = CallerContext.create("TezExamples", "Tez WordCount Example Job");
    ApplicationId appId = tezClient.getAppMasterApplicationId();
    if (appId == null) {
        appId = ApplicationId.newInstance(1001l, 1);
    }
    callerContext.setCallerIdAndType(appId.toString(), "TezApplication");
    dag.setCallerContext(callerContext);
    DAGClient client = tezClient.submitDAG(dag);
    client.waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
    TezDAGID tezDAGID = TezDAGID.getInstance(tezClient.getAppMasterApplicationId(), 1);
    if (tezClient != null) {
        tezClient.stop();
    }
    return tezDAGID.toString();
}
Also used: Path (org.apache.hadoop.fs.Path), OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig), Vertex (org.apache.tez.dag.api.Vertex), CallerContext (org.apache.tez.client.CallerContext), DAG (org.apache.tez.dag.api.DAG), DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor), TezClient (org.apache.tez.client.TezClient), TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat), TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat), TezDAGID (org.apache.tez.dag.records.TezDAGID), DAGClient (org.apache.tez.dag.api.client.DAGClient), ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId), DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
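
All three examples construct the edge the same way. As a condensed sketch of that recurring pattern (dag, tezConf, producerVertex, and consumerVertex are placeholder names for the objects built in the examples above):

// Recurring OrderedPartitionedKVEdgeConfig pattern from the examples above (placeholder names).
OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
    .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
    // Optional: pick up client-side configuration overrides (tez-site.xml or command-line settings).
    .setFromConfiguration(tezConf)
    .build();
// The resulting EdgeProperty gives the producer a partitioned, sorted output and the consumer
// a merged, ordered input, with scatter-gather data movement between them.
dag.addEdge(Edge.create(producerVertex, consumerVertex, edgeConf.createDefaultEdgeProperty()));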

Aggregations

OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) — 13
DAG (org.apache.tez.dag.api.DAG) — 11
Vertex (org.apache.tez.dag.api.Vertex) — 11
TezConfiguration (org.apache.tez.dag.api.TezConfiguration) — 10
Configuration (org.apache.hadoop.conf.Configuration) — 8
Text (org.apache.hadoop.io.Text) — 7
DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor) — 6
HashPartitioner (org.apache.tez.runtime.library.partitioner.HashPartitioner) — 6
IntWritable (org.apache.hadoop.io.IntWritable) — 5
DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor) — 5
TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat) — 4
ArrayList (java.util.ArrayList) — 3
NullWritable (org.apache.hadoop.io.NullWritable) — 3
TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) — 3
Edge (org.apache.tez.dag.api.Edge) — 3
VisibleForTesting (com.google.common.annotations.VisibleForTesting) — 2
HashMap (java.util.HashMap) — 2
Path (org.apache.hadoop.fs.Path) — 2
EdgeType (org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType) — 2
DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer) — 2