Search in sources :

Example 26 with EdgeProperty

use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.

the class OrderedPartitionedKVEdgeConfig method createDefaultEdgeProperty.

/**
 * Convenience factory for the {@link org.apache.tez.dag.api.EdgeProperty} that matches the
 * typical usage of this edge.
 * <p>
 * The property is built with {@code DataMovementType.SCATTER_GATHER},
 * {@code EdgeProperty.DataSourceType.PERSISTED} and
 * {@code EdgeProperty.SchedulingType.SEQUENTIAL}, using the output/input class names and
 * payloads supplied by this configuration.
 *
 * @return an {@link org.apache.tez.dag.api.EdgeProperty} instance
 */
public EdgeProperty createDefaultEdgeProperty() {
    // Output side first, then input side, mirroring the argument order of EdgeProperty.create.
    OutputDescriptor outputDescriptor = OutputDescriptor.create(getOutputClassName())
        .setUserPayload(getOutputPayload());
    InputDescriptor inputDescriptor = InputDescriptor.create(getInputClassName())
        .setUserPayload(getInputPayload());

    EdgeProperty defaultProperty = EdgeProperty.create(
        EdgeProperty.DataMovementType.SCATTER_GATHER,
        EdgeProperty.DataSourceType.PERSISTED,
        EdgeProperty.SchedulingType.SEQUENTIAL,
        outputDescriptor,
        inputDescriptor);

    // Hand the new property to Utils together with this config so its history text gets set.
    Utils.setEdgePropertyHistoryText(this, defaultProperty);
    return defaultProperty;
}
Also used : EdgeProperty(org.apache.tez.dag.api.EdgeProperty)

Example 27 with EdgeProperty

use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.

the class OrderedPartitionedKVEdgeConfig method createDefaultCustomEdgeProperty.

/**
 * Convenience factory that builds an edge descriptor around a caller-supplied
 * EdgeManagerDescriptor.
 * <p>
 * The resulting property uses {@code EdgeProperty.DataSourceType.PERSISTED} and
 * {@code EdgeProperty.SchedulingType.SEQUENTIAL}, together with the output/input class names
 * and payloads supplied by this configuration.
 *
 * @param edgeManagerDescriptor the custom edge specification; must not be null
 * @return an {@link org.apache.tez.dag.api.EdgeProperty} instance
 */
public EdgeProperty createDefaultCustomEdgeProperty(EdgeManagerPluginDescriptor edgeManagerDescriptor) {
    // Reject a missing descriptor before anything else is constructed.
    Preconditions.checkNotNull(edgeManagerDescriptor, "EdgeManagerDescriptor cannot be null");

    OutputDescriptor outputDescriptor = OutputDescriptor.create(getOutputClassName())
        .setUserPayload(getOutputPayload());
    InputDescriptor inputDescriptor = InputDescriptor.create(getInputClassName())
        .setUserPayload(getInputPayload());

    EdgeProperty customProperty = EdgeProperty.create(
        edgeManagerDescriptor,
        EdgeProperty.DataSourceType.PERSISTED,
        EdgeProperty.SchedulingType.SEQUENTIAL,
        outputDescriptor,
        inputDescriptor);

    // Hand the new property to Utils together with this config so its history text gets set.
    Utils.setEdgePropertyHistoryText(this, customProperty);
    return customProperty;
}
Also used : EdgeProperty(org.apache.tez.dag.api.EdgeProperty)

Example 28 with EdgeProperty

use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.

the class CartesianProductVertexManager method initialize.

/**
 * Parses the cartesian product config from the user payload, verifies that it is consistent
 * with the incoming edges of this vertex in the DAG, and then creates and initializes the
 * concrete vertex manager (partitioned or fair, depending on the config).
 * <p>
 * Consistency rules enforced here:
 * <ul>
 *   <li>a source vertex uses {@code CartesianProductEdgeManager} if and only if the vertex (or
 *       its vertex group) is listed in the config;</li>
 *   <li>a CUSTOM edge must come from a vertex (or group) listed in the config, and every other
 *       edge must be BROADCAST;</li>
 *   <li>every source named in the config (expanded to its member vertices when it names a
 *       vertex group) is a source vertex in the DAG and uses
 *       {@code CartesianProductEdgeManager}.</li>
 * </ul>
 *
 * @throws IllegalArgumentException if any of the consistency rules above is violated
 * @throws Exception if the payload cannot be parsed or the delegate fails to initialize
 */
@Override
public void initialize() throws Exception {
    CartesianProductConfigProto config = CartesianProductConfigProto.parseFrom(ByteString.copyFrom(getContext().getUserPayload().getPayload()));
    final String cpEdgeManagerClassName = CartesianProductEdgeManager.class.getName();
    // check whether DAG and config are consistent
    Map<String, EdgeProperty> edgePropertyMap = getContext().getInputVertexEdgeProperties();
    Set<String> sourceVerticesDAG = edgePropertyMap.keySet();
    Set<String> sourceVerticesConfig = new HashSet<>(config.getSourcesList());
    Map<String, List<String>> vertexGroups = getContext().getInputVertexGroups();
    // Reverse index: member vertex -> name of the group it belongs to. If a vertex appears in
    // several groups, only the last group seen is kept.
    Map<String, String> vertexToGroup = new HashMap<>();
    for (Map.Entry<String, List<String>> group : vertexGroups.entrySet()) {
        for (String vertex : group.getValue()) {
            vertexToGroup.put(vertex, group.getKey());
        }
    }
    // Pass 1: walk the DAG's source vertices and check each one against the config.
    for (Map.Entry<String, EdgeProperty> entry : edgePropertyMap.entrySet()) {
        String vertex = entry.getKey();
        String group = vertexToGroup.get(vertex);
        EdgeProperty edgeProperty = entry.getValue();
        EdgeManagerPluginDescriptor empDescriptor = edgeProperty.getEdgeManagerDescriptor();
        boolean inConfig = sourceVerticesConfig.contains(vertex) || sourceVerticesConfig.contains(group);
        if (empDescriptor != null && empDescriptor.getClassName().equals(cpEdgeManagerClassName)) {
            Preconditions.checkArgument(inConfig, vertex + " has CartesianProductEdgeManager but isn't in " + "CartesianProductVertexManagerConfig");
        } else {
            Preconditions.checkArgument(!inConfig, vertex + " has no CartesianProductEdgeManager but is in " + "CartesianProductVertexManagerConfig");
        }
        if (edgeProperty.getDataMovementType() == CUSTOM) {
            Preconditions.checkArgument(inConfig, "Only broadcast and cartesian product edges are allowed in cartesian product vertex");
        } else {
            Preconditions.checkArgument(edgeProperty.getDataMovementType() == BROADCAST, "Only broadcast and cartesian product edges are allowed in cartesian product vertex");
        }
    }
    // Pass 2: walk the config's sources (expanding vertex groups) and check each against the DAG.
    for (String src : sourceVerticesConfig) {
        List<String> vertices = vertexGroups.containsKey(src) ? vertexGroups.get(src) : Collections.singletonList(src);
        for (String v : vertices) {
            Preconditions.checkArgument(sourceVerticesDAG.contains(v), v + " is in CartesianProductVertexManagerConfig but not a source vertex in DAG");
            // The descriptor may be null for an edge without a custom edge manager; treat that as
            // "no CartesianProductEdgeManager" rather than failing with a NullPointerException.
            EdgeManagerPluginDescriptor descriptor = edgePropertyMap.get(v).getEdgeManagerDescriptor();
            Preconditions.checkArgument(descriptor != null && descriptor.getClassName().equals(cpEdgeManagerClassName), v + " is in CartesianProductVertexManagerConfig and a source vertex, but has no " + "CartesianProductEdgeManager");
        }
    }
    // Delegate the real work to the implementation selected by the config.
    vertexManagerReal = config.getIsPartitioned() ? new CartesianProductVertexManagerPartitioned(getContext()) : new FairCartesianProductVertexManager(getContext());
    vertexManagerReal.initialize(config);
}
Also used : HashMap(java.util.HashMap) ByteString(com.google.protobuf.ByteString) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) List(java.util.List) Map(java.util.Map) HashSet(java.util.HashSet)

Example 29 with EdgeProperty

use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.

the class TestHistoryEventsProtoConversion method testVertexReconfigureDoneEvent.

/**
 * Round-trips a {@code VertexConfigurationDoneEvent} through {@code testProtoConversion} and
 * checks that the vertex id, task count, location hint, root input spec updates and source edge
 * properties (one standard edge, one custom edge with a payload) survive the conversion.
 */
private void testVertexReconfigureDoneEvent() throws Exception {
    final long reconfigureDoneTime = 100;
    final int numTasks = 2;

    VertexLocationHint locationHint = VertexLocationHint.create(new ArrayList<TaskLocationHint>());

    // Root input spec updates: one applied to all tasks, one specified per task.
    InputSpecUpdate bulkUpdate = InputSpecUpdate.createAllTaskInputSpecUpdate(2);
    InputSpecUpdate perTaskUpdate = InputSpecUpdate.createPerTaskInputSpecUpdate(Lists.newArrayList(1, 2, 3));
    Map<String, InputSpecUpdate> inputSpecUpdates = new HashMap<>();
    inputSpecUpdates.put("input1", bulkUpdate);
    inputSpecUpdates.put("input2", perTaskUpdate);

    // Source edges: "foo" is a standard edge, "foo1" is a custom edge carrying a user payload.
    EdgeProperty standardEdge = EdgeProperty.create(
        DataMovementType.SCATTER_GATHER,
        DataSourceType.PERSISTED,
        SchedulingType.SEQUENTIAL,
        OutputDescriptor.create("Out1"),
        InputDescriptor.create("in1"));
    UserPayload customPayload = UserPayload.create(ByteBuffer.wrap("payload".getBytes()), 100);
    EdgeProperty customEdge = EdgeProperty.create(
        EdgeManagerPluginDescriptor.create("bar1").setUserPayload(customPayload),
        DataSourceType.PERSISTED,
        SchedulingType.SEQUENTIAL,
        OutputDescriptor.create("Out1"),
        InputDescriptor.create("in1"));
    Map<String, EdgeProperty> sourceEdges = new HashMap<>();
    sourceEdges.put("foo", standardEdge);
    sourceEdges.put("foo1", customEdge);

    TezVertexID vertexId = TezVertexID.getInstance(
        TezDAGID.getInstance(ApplicationId.newInstance(0, 1), 1), 111);
    VertexConfigurationDoneEvent event = new VertexConfigurationDoneEvent(
        vertexId, reconfigureDoneTime, numTasks, locationHint, sourceEdges, inputSpecUpdates, true);
    Assert.assertEquals(numTasks, event.getNumTasks());
    Assert.assertEquals(reconfigureDoneTime, event.getReconfigureDoneTime());

    VertexConfigurationDoneEvent restored = (VertexConfigurationDoneEvent) testProtoConversion(event);
    Assert.assertEquals(event.getVertexID(), restored.getVertexID());
    Assert.assertEquals(event.getNumTasks(), restored.getNumTasks());
    Assert.assertEquals(event.isSetParallelismCalled(), restored.isSetParallelismCalled());

    // vertexLocationHint
    Assert.assertEquals(event.getVertexLocationHint(), restored.getVertexLocationHint());

    // rootInputSpec
    Assert.assertEquals(event.getRootInputSpecUpdates().size(), restored.getRootInputSpecUpdates().size());
    InputSpecUpdate restoredBulk = restored.getRootInputSpecUpdates().get("input1");
    InputSpecUpdate restoredPerTask = restored.getRootInputSpecUpdates().get("input2");
    Assert.assertEquals(bulkUpdate.isForAllWorkUnits(), restoredBulk.isForAllWorkUnits());
    Assert.assertEquals(bulkUpdate.getAllNumPhysicalInputs(), restoredBulk.getAllNumPhysicalInputs());
    Assert.assertEquals(perTaskUpdate.isForAllWorkUnits(), restoredPerTask.isForAllWorkUnits());
    Assert.assertEquals(perTaskUpdate.getAllNumPhysicalInputs(), restoredPerTask.getAllNumPhysicalInputs());

    // sourceEdgeManager
    Assert.assertEquals(event.getSourceEdgeProperties().size(), restored.getSourceEdgeProperties().size());

    // Standard edge: movement type is preserved and no edge manager descriptor is present.
    EdgeProperty restoredStandard = restored.getSourceEdgeProperties().get("foo");
    Assert.assertEquals(event.getSourceEdgeProperties().get("foo").getDataMovementType(), restoredStandard.getDataMovementType());
    Assert.assertNull(restoredStandard.getEdgeManagerDescriptor());

    // Custom edge: movement type plus the payload's version and bytes are preserved.
    EdgeProperty originalCustom = event.getSourceEdgeProperties().get("foo1");
    EdgeProperty restoredCustom = restored.getSourceEdgeProperties().get("foo1");
    Assert.assertEquals(originalCustom.getDataMovementType(), restoredCustom.getDataMovementType());
    UserPayload originalPayload = originalCustom.getEdgeManagerDescriptor().getUserPayload();
    UserPayload restoredPayload = restoredCustom.getEdgeManagerDescriptor().getUserPayload();
    Assert.assertEquals(originalPayload.getVersion(), restoredPayload.getVersion());
    Assert.assertArrayEquals(originalPayload.deepCopyAsArray(), restoredPayload.deepCopyAsArray());

    logEvents(event, restored);
}
Also used : TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) HashMap(java.util.HashMap) InputSpecUpdate(org.apache.tez.runtime.api.InputSpecUpdate) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint)

Example 30 with EdgeProperty

use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.

the class HashJoinExample method createDag.

/**
 * Assembles the hash-join DAG: two file-reading vertices (hash side and streaming side) feeding
 * a single join vertex that writes the joined output.
 *
 * @param tezConf       base configuration, also used to override edge config options
 * @param streamPath    input path for the streaming side
 * @param hashPath      input path for the hash side
 * @param outPath       output path for the join result
 * @param numPartitions parallelism of the join vertex
 * @param doBroadcast   when true the hash side is broadcast to every join task; otherwise it is
 *                      hash-partitioned the same way as the streaming side
 * @return the fully wired DAG
 * @throws IOException declared by this method's signature
 */
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath, int numPartitions, boolean doBroadcast) throws IOException {
    DAG dag = DAG.create("HashJoinExample" + (doBroadcast ? "-WithBroadcast" : ""));

    // Hash side: the input that gets accumulated into a hash table so the other side can be
    // joined against it. Text is read with TextInputFormat and ForwardingProcessor passes the
    // records downstream unchanged.
    Vertex hashFileVertex = Vertex
        .create(hashSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
        .addDataSource(inputFile, MRInput
            .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, hashPath.toUri().toString())
            .groupSplits(!isDisableSplitGrouping())
            .generateSplitsInAM(!isGenerateSplitInClient())
            .build());

    // Streaming side: the input that is streamed and joined against the accumulated hash table.
    // Same reader and processor as the hash side, only the path differs.
    Vertex streamFileVertex = Vertex
        .create(streamingSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
        .addDataSource(inputFile, MRInput
            .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, streamPath.toUri().toString())
            .groupSplits(!isDisableSplitGrouping())
            .generateSplitsInAM(!isGenerateSplitInClient())
            .build());

    // Join vertex: HashJoinProcessor joins the two sides and the result is written as text via
    // TextOutputFormat. The work is load balanced across numPartitions tasks.
    Vertex joinVertex = Vertex
        .create(joiner, ProcessorDescriptor.create(HashJoinProcessor.class.getName()), numPartitions)
        .addDataSink(joinOutput, MROutput
            .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outPath.toUri().toString())
            .build());

    // The streaming side is hash-partitioned so equal keys land in the same fragment. The key is
    // the data to join, hence the null value type. The fragment count is initially inferred
    // from the join vertex's task count (one fragment per task). setFromConfiguration is
    // optional and lets command line parameters override the config options.
    UnorderedPartitionedKVEdgeConfig partitionedConf = UnorderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName())
        .setFromConfiguration(tezConf)
        .build();

    // Wire the streaming side into the join vertex.
    Edge streamToJoin = Edge.create(streamFileVertex, joinVertex, partitionedConf.createDefaultEdgeProperty());

    final EdgeProperty hashSideEdgeProperty;
    if (!doBroadcast) {
        // Partition the hash side with the same hash partitioning as the streaming side, so
        // keys with the same hash value from both inputs reach the same fragment and the task
        // handling that fragment can join them.
        hashSideEdgeProperty = partitionedConf.createDefaultEdgeProperty();
    } else {
        // Useful when the hash side is small: broadcast all of it to every fragment of the
        // stream side. That avoids re-partitioning the stream side to match the hash side's
        // partitioning and saves costly network transfer (in this example the stream side is
        // still partitioned in both cases, for brevity of code). Each join task joins its
        // fragment of keys against all keys of the hash side. An unpartitioned edge carries the
        // complete hash-side output to every fragment; again the key is the data, so the value
        // is null. setFromConfiguration is optional and lets command line parameters override
        // the config options.
        UnorderedKVEdgeConfig broadcastConf = UnorderedKVEdgeConfig
            .newBuilder(Text.class.getName(), NullWritable.class.getName())
            .setFromConfiguration(tezConf)
            .build();
        hashSideEdgeProperty = broadcastConf.createDefaultBroadcastEdgeProperty();
    }

    // Wire the hash side into the join vertex, which now has two upstream vertices.
    Edge hashToJoin = Edge.create(hashFileVertex, joinVertex, hashSideEdgeProperty);

    // Register all vertices and edges with the DAG.
    dag.addVertex(streamFileVertex)
        .addVertex(hashFileVertex)
        .addVertex(joinVertex)
        .addEdge(streamToJoin)
        .addEdge(hashToJoin);
    return dag;
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) UnorderedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) UnorderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) NullWritable(org.apache.hadoop.io.NullWritable) Edge(org.apache.tez.dag.api.Edge)

Aggregations

EdgeProperty (org.apache.tez.dag.api.EdgeProperty)62 Test (org.junit.Test)31 HashMap (java.util.HashMap)28 ByteString (com.google.protobuf.ByteString)19 VertexStateUpdate (org.apache.tez.dag.api.event.VertexStateUpdate)19 EdgeManagerPluginDescriptor (org.apache.tez.dag.api.EdgeManagerPluginDescriptor)16 VertexManagerPluginContext (org.apache.tez.dag.api.VertexManagerPluginContext)15 VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint)14 Configuration (org.apache.hadoop.conf.Configuration)13 Map (java.util.Map)9 EdgeManagerForTest (org.apache.tez.test.EdgeManagerForTest)7 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)6 StateChangeNotifierForTest (org.apache.tez.dag.app.dag.TestStateChangeNotifier.StateChangeNotifierForTest)6 Vertex (org.apache.tez.dag.app.dag.Vertex)6 UserPayload (org.apache.tez.dag.api.UserPayload)5 TaskAttemptIdentifier (org.apache.tez.runtime.api.TaskAttemptIdentifier)5 GraceShuffleVertexManagerForTest (org.apache.tez.test.GraceShuffleVertexManagerForTest)5 VertexManagerPluginForTest (org.apache.tez.test.VertexManagerPluginForTest)5 TezUncheckedException (org.apache.tez.dag.api.TezUncheckedException)4 Vertex (org.apache.tez.dag.api.Vertex)4