use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.
the class OrderedPartitionedKVEdgeConfig method createDefaultEdgeProperty.
/**
* This is a convenience method for the typical usage of this edge, and creates an instance of
* {@link org.apache.tez.dag.api.EdgeProperty} which is likely to be used.
* In this case: DataMovementType.SCATTER_GATHER, EdgeProperty.DataSourceType.PERSISTED,
* EdgeProperty.SchedulingType.SEQUENTIAL.
*
* @return an {@link org.apache.tez.dag.api.EdgeProperty} instance
*/
public EdgeProperty createDefaultEdgeProperty() {
  EdgeProperty edgeProperty = EdgeProperty.create(
      EdgeProperty.DataMovementType.SCATTER_GATHER,
      EdgeProperty.DataSourceType.PERSISTED,
      EdgeProperty.SchedulingType.SEQUENTIAL,
      OutputDescriptor.create(getOutputClassName()).setUserPayload(getOutputPayload()),
      InputDescriptor.create(getInputClassName()).setUserPayload(getInputPayload()));
  Utils.setEdgePropertyHistoryText(this, edgeProperty);
  return edgeProperty;
}
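A minimal usage sketch of this method: build the edge configuration, then wire two vertices with the default edge property. The vertex names (vertexA, vertexB) and the key/value/partitioner classes are placeholders, not part of the snippet above.

// Sketch: Text/IntWritable/HashPartitioner stand in for the application's own types.
OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
    .newBuilder(Text.class.getName(), IntWritable.class.getName(),
        HashPartitioner.class.getName())
    .build();
// Connect vertexA to vertexB with a scatter-gather edge and add it to the DAG.
dag.addEdge(Edge.create(vertexA, vertexB, edgeConf.createDefaultEdgeProperty()));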
use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.
the class OrderedPartitionedKVEdgeConfig method createDefaultCustomEdgeProperty.
/**
* This is a convenience method for creating an Edge descriptor based on the specified
* EdgeManagerDescriptor.
*
* @param edgeManagerDescriptor the custom edge specification
* @return an {@link org.apache.tez.dag.api.EdgeProperty} instance
*/
public EdgeProperty createDefaultCustomEdgeProperty(
    EdgeManagerPluginDescriptor edgeManagerDescriptor) {
  Preconditions.checkNotNull(edgeManagerDescriptor, "EdgeManagerDescriptor cannot be null");
  EdgeProperty edgeProperty = EdgeProperty.create(
      edgeManagerDescriptor,
      EdgeProperty.DataSourceType.PERSISTED,
      EdgeProperty.SchedulingType.SEQUENTIAL,
      OutputDescriptor.create(getOutputClassName()).setUserPayload(getOutputPayload()),
      InputDescriptor.create(getInputClassName()).setUserPayload(getInputPayload()));
  Utils.setEdgePropertyHistoryText(this, edgeProperty);
  return edgeProperty;
}
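A hedged usage sketch, assuming an edgeConf built as in the previous example. MyEdgeManager is a hypothetical EdgeManagerPlugin implementation, not a Tez class.

// Sketch: route data with custom edge-manager logic instead of the default routing.
EdgeManagerPluginDescriptor customManager =
    EdgeManagerPluginDescriptor.create(MyEdgeManager.class.getName());
EdgeProperty customEdge = edgeConf.createDefaultCustomEdgeProperty(customManager);
dag.addEdge(Edge.create(sourceVertex, targetVertex, customEdge));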
use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.
the class CartesianProductVertexManager method initialize.
@Override
public void initialize() throws Exception {
  CartesianProductConfigProto config = CartesianProductConfigProto.parseFrom(
      ByteString.copyFrom(getContext().getUserPayload().getPayload()));
  // check whether the DAG and the config are consistent
  Map<String, EdgeProperty> edgePropertyMap = getContext().getInputVertexEdgeProperties();
  Set<String> sourceVerticesDAG = edgePropertyMap.keySet();
  Set<String> sourceVerticesConfig = new HashSet<>(config.getSourcesList());
  Map<String, List<String>> vertexGroups = getContext().getInputVertexGroups();
  Map<String, String> vertexToGroup = new HashMap<>();
  for (Map.Entry<String, List<String>> group : vertexGroups.entrySet()) {
    for (String vertex : group.getValue()) {
      vertexToGroup.put(vertex, group.getKey());
    }
  }
  for (Map.Entry<String, EdgeProperty> entry : edgePropertyMap.entrySet()) {
    String vertex = entry.getKey();
    String group = vertexToGroup.get(vertex);
    EdgeProperty edgeProperty = entry.getValue();
    EdgeManagerPluginDescriptor empDescriptor = edgeProperty.getEdgeManagerDescriptor();
    if (empDescriptor != null
        && empDescriptor.getClassName().equals(CartesianProductEdgeManager.class.getName())) {
      Preconditions.checkArgument(
          sourceVerticesConfig.contains(vertex) || sourceVerticesConfig.contains(group),
          vertex + " has CartesianProductEdgeManager but isn't in "
              + "CartesianProductVertexManagerConfig");
    } else {
      Preconditions.checkArgument(
          !sourceVerticesConfig.contains(vertex) && !sourceVerticesConfig.contains(group),
          vertex + " has no CartesianProductEdgeManager but is in "
              + "CartesianProductVertexManagerConfig");
    }
    if (edgeProperty.getDataMovementType() == CUSTOM) {
      Preconditions.checkArgument(
          sourceVerticesConfig.contains(vertex) || sourceVerticesConfig.contains(group),
          "Only broadcast and cartesian product edges are allowed in cartesian product vertex");
    } else {
      Preconditions.checkArgument(edgeProperty.getDataMovementType() == BROADCAST,
          "Only broadcast and cartesian product edges are allowed in cartesian product vertex");
    }
  }
  for (String src : sourceVerticesConfig) {
    List<String> vertices =
        vertexGroups.containsKey(src) ? vertexGroups.get(src) : Collections.singletonList(src);
    for (String v : vertices) {
      Preconditions.checkArgument(sourceVerticesDAG.contains(v),
          v + " is in CartesianProductVertexManagerConfig but not a source vertex in DAG");
      Preconditions.checkArgument(
          edgePropertyMap.get(v).getEdgeManagerDescriptor().getClassName()
              .equals(CartesianProductEdgeManager.class.getName()),
          v + " is in CartesianProductVertexManagerConfig and a source vertex, but has no "
              + "CartesianProductEdgeManager");
    }
  }
  vertexManagerReal = config.getIsPartitioned()
      ? new CartesianProductVertexManagerPartitioned(getContext())
      : new FairCartesianProductVertexManager(getContext());
  vertexManagerReal.initialize(config);
}
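To make the validation above concrete, here is a hedged sketch of an edge pair this check accepts: a source participating in the cartesian product uses a CartesianProductEdgeManager custom edge and must be listed in the config's sources, while any other input must be a plain broadcast edge and stay out of that list. The vertex names and the Output1/Input1 descriptor class names are illustrative.

// "src1" participates in the cartesian product; it must appear in
// CartesianProductConfigProto.getSourcesList().
EdgeProperty productEdge = EdgeProperty.create(
    EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName()),
    EdgeProperty.DataSourceType.PERSISTED, EdgeProperty.SchedulingType.SEQUENTIAL,
    OutputDescriptor.create("Output1"), InputDescriptor.create("Input1"));
// "dim" only broadcasts side data; it must NOT appear in the sources list.
EdgeProperty broadcastEdge = EdgeProperty.create(
    EdgeProperty.DataMovementType.BROADCAST,
    EdgeProperty.DataSourceType.PERSISTED, EdgeProperty.SchedulingType.SEQUENTIAL,
    OutputDescriptor.create("Output1"), InputDescriptor.create("Input1"));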
use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.
the class TestHistoryEventsProtoConversion method testVertexReconfigureDoneEvent.
private void testVertexReconfigureDoneEvent() throws Exception {
  VertexLocationHint vertexLocationHint =
      VertexLocationHint.create(new ArrayList<TaskLocationHint>());
  InputSpecUpdate rootInputSpecUpdateBulk = InputSpecUpdate.createAllTaskInputSpecUpdate(2);
  InputSpecUpdate rootInputSpecUpdatePerTask =
      InputSpecUpdate.createPerTaskInputSpecUpdate(Lists.newArrayList(1, 2, 3));
  Map<String, InputSpecUpdate> rootInputSpecUpdates = new HashMap<String, InputSpecUpdate>();
  rootInputSpecUpdates.put("input1", rootInputSpecUpdateBulk);
  rootInputSpecUpdates.put("input2", rootInputSpecUpdatePerTask);
  Map<String, EdgeProperty> sourceEdgeManagers = new HashMap<String, EdgeProperty>();
  // add standard and custom edge
  sourceEdgeManagers.put("foo",
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL, OutputDescriptor.create("Out1"),
          InputDescriptor.create("in1")));
  sourceEdgeManagers.put("foo1",
      EdgeProperty.create(
          EdgeManagerPluginDescriptor.create("bar1").setUserPayload(
              UserPayload.create(ByteBuffer.wrap(new String("payload").getBytes()), 100)),
          DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
          OutputDescriptor.create("Out1"), InputDescriptor.create("in1")));
  final long reconfigureDoneTime = 100;
  final int numTasks = 2;
  VertexConfigurationDoneEvent event = new VertexConfigurationDoneEvent(
      TezVertexID.getInstance(TezDAGID.getInstance(ApplicationId.newInstance(0, 1), 1), 111),
      reconfigureDoneTime, numTasks, vertexLocationHint, sourceEdgeManagers,
      rootInputSpecUpdates, true);
  Assert.assertEquals(numTasks, event.getNumTasks());
  Assert.assertEquals(reconfigureDoneTime, event.getReconfigureDoneTime());
  VertexConfigurationDoneEvent deserializedEvent =
      (VertexConfigurationDoneEvent) testProtoConversion(event);
  Assert.assertEquals(event.getVertexID(), deserializedEvent.getVertexID());
  Assert.assertEquals(event.getNumTasks(), deserializedEvent.getNumTasks());
  Assert.assertEquals(event.isSetParallelismCalled(),
      deserializedEvent.isSetParallelismCalled());
  // vertexLocationHint
  Assert.assertEquals(event.getVertexLocationHint(), deserializedEvent.getVertexLocationHint());
  // rootInputSpec
  Assert.assertEquals(event.getRootInputSpecUpdates().size(),
      deserializedEvent.getRootInputSpecUpdates().size());
  InputSpecUpdate deserializedBulk = deserializedEvent.getRootInputSpecUpdates().get("input1");
  InputSpecUpdate deserializedPerTask = deserializedEvent.getRootInputSpecUpdates().get("input2");
  Assert.assertEquals(rootInputSpecUpdateBulk.isForAllWorkUnits(),
      deserializedBulk.isForAllWorkUnits());
  Assert.assertEquals(rootInputSpecUpdateBulk.getAllNumPhysicalInputs(),
      deserializedBulk.getAllNumPhysicalInputs());
  Assert.assertEquals(rootInputSpecUpdatePerTask.isForAllWorkUnits(),
      deserializedPerTask.isForAllWorkUnits());
  Assert.assertEquals(rootInputSpecUpdatePerTask.getAllNumPhysicalInputs(),
      deserializedPerTask.getAllNumPhysicalInputs());
  // sourceEdgeManager
  Assert.assertEquals(event.getSourceEdgeProperties().size(),
      deserializedEvent.getSourceEdgeProperties().size());
  Assert.assertEquals(event.getSourceEdgeProperties().get("foo").getDataMovementType(),
      deserializedEvent.getSourceEdgeProperties().get("foo").getDataMovementType());
  Assert.assertNull(deserializedEvent.getSourceEdgeProperties().get("foo")
      .getEdgeManagerDescriptor());
  Assert.assertEquals(event.getSourceEdgeProperties().get("foo1").getDataMovementType(),
      deserializedEvent.getSourceEdgeProperties().get("foo1").getDataMovementType());
  Assert.assertEquals(
      event.getSourceEdgeProperties().get("foo1").getEdgeManagerDescriptor()
          .getUserPayload().getVersion(),
      deserializedEvent.getSourceEdgeProperties().get("foo1").getEdgeManagerDescriptor()
          .getUserPayload().getVersion());
  Assert.assertArrayEquals(
      event.getSourceEdgeProperties().get("foo1").getEdgeManagerDescriptor()
          .getUserPayload().deepCopyAsArray(),
      deserializedEvent.getSourceEdgeProperties().get("foo1").getEdgeManagerDescriptor()
          .getUserPayload().deepCopyAsArray());
  logEvents(event, deserializedEvent);
}
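The assertions above hinge on the two EdgeProperty.create overloads: the overload taking a DataMovementType leaves the edge manager descriptor null (the "foo" assertNull), while the overload taking an EdgeManagerPluginDescriptor produces a custom edge. A minimal sketch of that distinction; the descriptor names are placeholders as in the test:

EdgeProperty standard = EdgeProperty.create(DataMovementType.SCATTER_GATHER,
    DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
    OutputDescriptor.create("Out1"), InputDescriptor.create("in1"));
// standard.getEdgeManagerDescriptor() is null, matching the "foo" assertions above.
EdgeProperty custom = EdgeProperty.create(EdgeManagerPluginDescriptor.create("bar1"),
    DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
    OutputDescriptor.create("Out1"), InputDescriptor.create("in1"));
// custom carries the descriptor; its data movement type is expected to be CUSTOM.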
use of org.apache.tez.dag.api.EdgeProperty in project tez by apache.
the class HashJoinExample method createDag.
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath,
    int numPartitions, boolean doBroadcast) throws IOException {
  DAG dag = DAG.create("HashJoinExample" + (doBroadcast ? "-WithBroadcast" : ""));
  /**
   * This vertex represents the side of the join that will be accumulated in a
   * hash table in order to join it against the other side. It reads text data
   * using the TextInputFormat. ForwardingProcessor simply forwards the data
   * downstream as is.
   */
  Vertex hashFileVertex = Vertex.create(hashSide,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
                  hashPath.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  /**
   * This vertex represents the side of the data that will be streamed and
   * joined against the other side that has been accumulated into a hash
   * table. It reads text data using the TextInputFormat. ForwardingProcessor
   * simply forwards the data downstream as is.
   */
  Vertex streamFileVertex = Vertex.create(streamingSide,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
                  streamPath.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());
  /**
   * This vertex represents the join operation. It writes the join output as
   * text using the TextOutputFormat. The JoinProcessor is going to perform
   * the join of the streaming side and the hash side. It is load balanced
   * across numPartitions.
   */
  Vertex joinVertex = Vertex.create(joiner,
      ProcessorDescriptor.create(HashJoinProcessor.class.getName()), numPartitions)
      .addDataSink(joinOutput,
          MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class,
              outPath.toUri().toString()).build());
  /**
   * The streamed side will be partitioned into fragments, with the same keys
   * going to the same fragments using hash partitioning. The data to be
   * joined is the key itself, so the value is null. The number of
   * fragments is initially inferred from the number of tasks running in the
   * join vertex because each task will be handling one fragment. The
   * setFromConfiguration call is optional and allows overriding the config
   * options with command line parameters.
   */
  UnorderedPartitionedKVEdgeConfig streamConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();
  /**
   * Connect the join vertex with the stream side.
   */
  Edge e1 = Edge.create(streamFileVertex, joinVertex, streamConf.createDefaultEdgeProperty());
  EdgeProperty hashSideEdgeProperty = null;
  if (doBroadcast) {
    /**
     * This option can be used when the hash side is small. We can broadcast
     * the entire data to all fragments of the stream side. This avoids
     * re-partitioning the fragments of the stream side to match the
     * partitioning scheme of the hash side and avoids costly network data
     * transfer. However, in this example the stream side is being partitioned
     * in both cases for brevity of code. The join task can perform the join
     * of its fragment of keys with all the keys of the hash side. An
     * unpartitioned edge is used to transfer the complete output of the hash
     * side, broadcast to all fragments of the streamed side. Again, since the
     * data is the key, the value is null. The setFromConfiguration call is
     * optional and allows overriding the config options with command line
     * parameters.
     */
    UnorderedKVEdgeConfig broadcastConf = UnorderedKVEdgeConfig
        .newBuilder(Text.class.getName(), NullWritable.class.getName())
        .setFromConfiguration(tezConf)
        .build();
    hashSideEdgeProperty = broadcastConf.createDefaultBroadcastEdgeProperty();
  } else {
    /**
     * The hash side is also partitioned into fragments, with the same key
     * going to the same fragment using hash partitioning. This way all
     * keys with the same hash value will go to the same fragment from both
     * sides, so the join task handling that fragment can join both data set
     * fragments.
     */
    hashSideEdgeProperty = streamConf.createDefaultEdgeProperty();
  }
  /**
   * Connect the join vertex to the hash side. The join vertex is connected
   * with two upstream vertices that provide it with inputs.
   */
  Edge e2 = Edge.create(hashFileVertex, joinVertex, hashSideEdgeProperty);
  /**
   * Connect everything up by adding them to the DAG.
   */
  dag.addVertex(streamFileVertex).addVertex(hashFileVertex).addVertex(joinVertex)
      .addEdge(e1).addEdge(e2);
  return dag;
}
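For completeness, a hedged sketch of how a DAG built this way is typically submitted, using the standard Tez client pattern (not part of the original snippet; error handling omitted):

// Sketch, assuming a configured TezConfiguration and the argument values used above.
TezClient tezClient = TezClient.create("HashJoinExample", tezConf);
tezClient.start();
try {
  DAGClient dagClient = tezClient.submitDAG(
      createDag(tezConf, streamPath, hashPath, outPath, numPartitions, doBroadcast));
  DAGStatus status = dagClient.waitForCompletion();
  System.out.println("DAG finished with state: " + status.getState());
} finally {
  tezClient.stop();
}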