Search in sources :

Example 6 with CartesianProductConfig

Use of org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig in the Apache Tez project.

From the class CartesianProduct, method createDAG.

/**
 * Assembles the "CartesianProduct" DAG: three {@code TokenProcessor} source vertices
 * (one per input path) feeding a single {@code JoinProcessor} vertex that writes to
 * {@code outputPath}.
 *
 * <p>The first two sources are connected to the join vertex through custom edges managed
 * by {@code CartesianProductEdgeManager}; the third source is connected through a
 * broadcast edge. The join vertex uses {@code CartesianProductVertexManager}, configured
 * with the same user payload as the edge manager.
 *
 * @param tezConf       configuration copied for each input/output and used to serialize
 *                      the cartesian product config into a user payload
 * @param inputPath1    text input read by the first source vertex
 * @param inputPath2    text input read by the second source vertex
 * @param inputPath3    text input read by the third (broadcast) source vertex
 * @param outputPath    destination written by the join vertex as text output
 * @param isPartitioned when {@code true}, the cartesian product config is built from a
 *                      source-name to {@code numPartition} map and the custom edges use
 *                      {@code UnorderedPartitionedKVEdgeConfig} with {@code CustomPartitioner};
 *                      otherwise the config is built from the list of source names and the
 *                      custom edges use {@code UnorderedKVEdgeConfig}
 * @return the fully wired DAG
 * @throws IOException declared by the original signature; propagated from the calls made here
 */
private DAG createDAG(TezConfiguration tezConf, String inputPath1, String inputPath2, String inputPath3, String outputPath, boolean isPartitioned) throws IOException {
    // Source vertices are built in the same order as their inputs are listed.
    Vertex firstSource = createTokenSourceVertex(tezConf, VERTEX1, inputPath1);
    Vertex secondSource = createTokenSourceVertex(tezConf, VERTEX2, inputPath2);
    Vertex broadcastSource = createTokenSourceVertex(tezConf, VERTEX3, inputPath3);

    // Describe the cartesian product over cpSources, with or without partition counts.
    final CartesianProductConfig cpConfig;
    if (!isPartitioned) {
        cpConfig = new CartesianProductConfig(Arrays.asList(cpSources));
    } else {
        Map<String, Integer> partitionsBySource = new HashMap<>();
        for (String sourceName : cpSources) {
            partitionsBySource.put(sourceName, numPartition);
        }
        cpConfig = new CartesianProductConfig(partitionsBySource);
    }
    // The same payload is handed to both the vertex manager and the edge manager.
    UserPayload cpPayload = cpConfig.toUserPayload(tezConf);

    // Join vertex: consumes all three sources and writes the text output.
    Vertex joinVertex = Vertex.create(VERTEX4, ProcessorDescriptor.create(JoinProcessor.class.getName()));
    joinVertex.addDataSink(
        OUTPUT,
        MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath).build());
    VertexManagerPluginDescriptor cpVertexManager =
        VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cpPayload);
    joinVertex.setVertexManagerPlugin(cpVertexManager);

    EdgeManagerPluginDescriptor cpEdgeManager =
        EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
    cpEdgeManager.setUserPayload(cpPayload);

    // Every edge in this DAG carries Text keys and IntWritable values.
    String keyClassName = Text.class.getName();
    String valueClassName = IntWritable.class.getName();

    final EdgeProperty cpEdge;
    if (!isPartitioned) {
        cpEdge = UnorderedKVEdgeConfig.newBuilder(keyClassName, valueClassName)
            .build()
            .createDefaultCustomEdgeProperty(cpEdgeManager);
    } else {
        cpEdge = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClassName, valueClassName, CustomPartitioner.class.getName())
            .build()
            .createDefaultCustomEdgeProperty(cpEdgeManager);
    }

    EdgeProperty broadcastEdge = UnorderedKVEdgeConfig.newBuilder(keyClassName, valueClassName)
        .build()
        .createDefaultBroadcastEdgeProperty();

    return DAG.create("CartesianProduct")
        .addVertex(firstSource)
        .addVertex(secondSource)
        .addVertex(broadcastSource)
        .addVertex(joinVertex)
        .addEdge(Edge.create(firstSource, joinVertex, cpEdge))
        .addEdge(Edge.create(secondSource, joinVertex, cpEdge))
        .addEdge(Edge.create(broadcastSource, joinVertex, broadcastEdge));
}

/**
 * Creates a vertex named {@code vertexName} that runs {@code TokenProcessor} and reads
 * {@code inputPath} as text through an {@code MRInput} data source registered under
 * {@code INPUT}.
 *
 * @param tezConf    configuration copied into the input's own {@code Configuration}
 * @param vertexName name given to the new vertex
 * @param inputPath  path handed to the text input
 * @return the new vertex with its data source attached
 * @throws IOException declared so that any checked I/O failure from the calls made here
 *                     propagates to the caller unchanged
 */
private Vertex createTokenSourceVertex(TezConfiguration tezConf, String vertexName, String inputPath) throws IOException {
    Vertex sourceVertex = Vertex.create(vertexName, ProcessorDescriptor.create(TokenProcessor.class.getName()));
    // turn off groupSplit so that each input file incurs one task
    sourceVertex.addDataSource(
        INPUT,
        MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
            .groupSplits(false)
            .build());
    return sourceVertex;
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) UserPayload(org.apache.tez.dag.api.UserPayload) HashMap(java.util.HashMap) CartesianProductVertexManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) UnorderedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) CartesianProductEdgeManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductEdgeManager) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) UnorderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig)

Aggregations

CartesianProductConfig (org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) 6, TezConfiguration (org.apache.tez.dag.api.TezConfiguration) 5, Vertex (org.apache.tez.dag.api.Vertex) 5, CartesianProductVertexManager (org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) 5, EdgeManagerPluginDescriptor (org.apache.tez.dag.api.EdgeManagerPluginDescriptor) 4, ArrayList (java.util.ArrayList) 3, BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork) 3, DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor) 3, EdgeProperty (org.apache.tez.dag.api.EdgeProperty) 3, UserPayload (org.apache.tez.dag.api.UserPayload) 3, CartesianProductEdgeManager (org.apache.tez.runtime.library.cartesianproduct.CartesianProductEdgeManager) 3, UnorderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig) 3, Configuration (org.apache.hadoop.conf.Configuration) 2, HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 2, MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 2, MergeJoinWork (org.apache.hadoop.hive.ql.plan.MergeJoinWork) 2, ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork) 2, StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext) 2, StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory) 2, StatsPublisher (org.apache.hadoop.hive.ql.stats.StatsPublisher) 2