use of org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig in project hive by apache.
the class DagUtils method createEdgeProperty.
/**
 * Helper function to create an edge property from an edge type.
 *
 * <p>Calls {@code MRHelpers.translateMRConfToTez} on {@code conf}, reads the runtime key class,
 * the runtime value class and {@code mapred.partitioner.class} from it, and then builds the
 * edge configuration selected by {@code edgeProp.getEdgeType()}.
 *
 * @param edgeProp supplies the edge type and, for {@code CUSTOM_EDGE}, the number of buckets
 * @param conf configuration the edge settings are read from
 * @return the edge property for the requested edge type; any type without its own case is
 *         handled like {@code SIMPLE_EDGE}
 * @throws IOException if one of the underlying calls reports an I/O failure
 */
private EdgeProperty createEdgeProperty(TezEdgeProperty edgeProp, Configuration conf) throws IOException {
  MRHelpers.translateMRConfToTez(conf);
  String keyClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
  String valClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
  String partitionerClassName = conf.get("mapred.partitioner.class");
  // Every edge kind below uses the same key/value serialization class.
  String serializationClass = TezBytesWritableSerialization.class.getName();
  Map<String, String> partitionerConf;
  EdgeType edgeType = edgeProp.getEdgeType();
  switch (edgeType) {
    case BROADCAST_EDGE:
      UnorderedKVEdgeConfig et1Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      return et1Conf.createDefaultBroadcastEdgeProperty();
    case CUSTOM_EDGE:
      // Only enforced when the JVM runs with assertions enabled.
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      UnorderedPartitionedKVEdgeConfig et2Conf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      EdgeManagerPluginDescriptor edgeDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
      CustomEdgeConfiguration edgeConf = new CustomEdgeConfiguration(edgeProp.getNumBuckets(), null);
      DataOutputBuffer dob = new DataOutputBuffer();
      edgeConf.write(dob);
      // getData() hands back the whole backing array; only the first getLength() bytes were
      // actually written, so limit the payload to that prefix instead of shipping the slack.
      edgeDesc.setUserPayload(UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength())));
      return et2Conf.createDefaultCustomEdgeProperty(edgeDesc);
    case CUSTOM_SIMPLE_EDGE:
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      UnorderedPartitionedKVEdgeConfig et3Conf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      return et3Conf.createDefaultEdgeProperty();
    case SIMPLE_EDGE:
    default:
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      // The ordered edge additionally registers a comparator for the key serialization.
      OrderedPartitionedKVEdgeConfig et4Conf = OrderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, TezBytesComparator.class.getName(), null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      return et4Conf.createDefaultEdgeProperty();
  }
}
use of org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig in project tez by apache.
the class HashJoinExample method createDag.
/**
 * Assembles the hash-join DAG: two text-reading source vertices (stream side and hash side)
 * feeding a single join vertex that writes text output.
 *
 * @param tezConf base configuration copied into every input/output and used to override edge settings
 * @param streamPath location of the data that is streamed through the join
 * @param hashPath location of the data that is accumulated into the hash table
 * @param outPath location the join result is written to
 * @param numPartitions parallelism requested for the join vertex
 * @param doBroadcast when true the hash side reaches the join vertex over a broadcast edge,
 *                    otherwise over the same partitioned edge configuration as the stream side
 * @return the fully wired DAG
 * @throws IOException if one of the underlying calls reports an I/O failure
 */
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath, int numPartitions, boolean doBroadcast) throws IOException {
  DAG dag = DAG.create("HashJoinExample" + (doBroadcast ? "-WithBroadcast" : ""));

  // Hash side: the input that gets accumulated into a hash table so the other side can be
  // joined against it. Text is read through TextInputFormat and ForwardingProcessor passes
  // it downstream unchanged.
  Vertex hashVertex = Vertex
      .create(hashSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, hashPath.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());

  // Stream side: the input that is streamed and probed against the accumulated hash table.
  // Same reader and same pass-through processor as the hash side.
  Vertex streamVertex = Vertex
      .create(streamingSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName()))
      .addDataSource(inputFile,
          MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, streamPath.toUri().toString())
              .groupSplits(!isDisableSplitGrouping())
              .generateSplitsInAM(!isGenerateSplitInClient())
              .build());

  // Join vertex: HashJoinProcessor joins the two sides and the result is written as text via
  // TextOutputFormat. The work is spread over numPartitions tasks.
  Vertex joinerVertex = Vertex
      .create(joiner, ProcessorDescriptor.create(HashJoinProcessor.class.getName()), numPartitions)
      .addDataSink(joinOutput,
          MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outPath.toUri().toString())
              .build());

  // Hash-partitioned edge configuration: equal keys land in the same fragment. The key is
  // the data itself, hence the NullWritable value. The fragment count is initially inferred
  // from the number of join tasks, one fragment per task. setFromConfiguration is optional
  // and lets command line parameters override the defaults.
  UnorderedPartitionedKVEdgeConfig partitionedConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Stream side -> join vertex.
  Edge streamEdge = Edge.create(streamVertex, joinerVertex, partitionedConf.createDefaultEdgeProperty());

  // Hash side edge, two flavours:
  //  - broadcast: useful when the hash side is small. Its complete, unpartitioned output is
  //    sent to every join task, so each task joins its fragment of stream keys against all
  //    hash-side keys. (The stream side is still partitioned in both cases, purely to keep
  //    this example short.) Again the key is the data, so the value is null, and
  //    setFromConfiguration is optional.
  //  - partitioned: the hash side is hash-partitioned exactly like the stream side, so keys
  //    with the same hash meet in the same fragment and can be joined there.
  EdgeProperty hashEdgeProperty = doBroadcast
      ? UnorderedKVEdgeConfig
          .newBuilder(Text.class.getName(), NullWritable.class.getName())
          .setFromConfiguration(tezConf)
          .build()
          .createDefaultBroadcastEdgeProperty()
      : partitionedConf.createDefaultEdgeProperty();

  // Hash side -> join vertex; the join vertex now has both of its upstream inputs.
  Edge hashEdge = Edge.create(hashVertex, joinerVertex, hashEdgeProperty);

  // Register vertices and edges with the DAG.
  dag.addVertex(streamVertex)
      .addVertex(hashVertex)
      .addVertex(joinerVertex)
      .addEdge(streamEdge)
      .addEdge(hashEdge);
  return dag;
}
use of org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig in project tez by apache.
the class CartesianProduct method createDAG.
/**
 * Builds the "CrossProduct" DAG: two token-producing vertices backed by fake inputs feed a
 * join vertex over custom edges managed by {@code CartesianProductEdgeManager}.
 *
 * @param tezConf configuration used to serialize the cartesian product config into a payload
 * @return the wired DAG
 * @throws IOException if one of the underlying calls reports an I/O failure
 */
private DAG createDAG(TezConfiguration tezConf) throws IOException {
  // One shared data source descriptor, attached to both producer vertices.
  DataSourceDescriptor fakeSource = DataSourceDescriptor.create(
      InputDescriptor.create(FakeInput.class.getName()),
      InputInitializerDescriptor.create(FakeInputInitializer.class.getName()),
      null);

  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v1.addDataSource(INPUT, fakeSource);

  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT, fakeSource);

  DataSinkDescriptor fakeSink = DataSinkDescriptor.create(
      OutputDescriptor.create(FakeOutput.class.getName()),
      OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName()),
      null);

  // The same serialized config is handed to both the vertex manager and the edge manager.
  UserPayload cpPayload = new CartesianProductConfig(Arrays.asList(sourceVertices)).toUserPayload(tezConf);

  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v3.addDataSink(OUTPUT, fakeSink);
  VertexManagerPluginDescriptor cpVertexManager =
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cpPayload);
  v3.setVertexManagerPlugin(cpVertexManager);

  EdgeManagerPluginDescriptor cpEdgeManager =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  cpEdgeManager.setUserPayload(cpPayload);

  // A single edge property instance is reused for both incoming edges of v3.
  EdgeProperty cpEdgeProperty = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), RoundRobinPartitioner.class.getName())
      .build()
      .createDefaultCustomEdgeProperty(cpEdgeManager);

  Edge v1ToV3 = Edge.create(v1, v3, cpEdgeProperty);
  Edge v2ToV3 = Edge.create(v2, v3, cpEdgeProperty);

  return DAG.create("CrossProduct")
      .addVertex(v1)
      .addVertex(v2)
      .addVertex(v3)
      .addEdge(v1ToV3)
      .addEdge(v2ToV3);
}
use of org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig in project hive by apache.
the class DagUtils method createEdgeProperty.
/**
 * Helper function to create an edge property from an edge type.
 *
 * <p>Calls {@code MRHelpers.translateMRConfToTez} on {@code conf}, reads the runtime key class,
 * the runtime value class and {@code mapred.partitioner.class} from it, and then builds the
 * edge configuration selected by {@code edgeProp.getEdgeType()}.
 *
 * @param w not referenced anywhere in this method body
 * @param edgeProp supplies the edge type, the number of buckets ({@code CUSTOM_EDGE}) and the
 *                 optional buffer size ({@code CUSTOM_SIMPLE_EDGE})
 * @param conf configuration the edge settings are read from
 * @param work the work whose parents are inspected for {@code XPROD_EDGE}
 * @param tezWork provides the parents of {@code work} and the edge types between them
 * @return the edge property for the requested edge type; any type without its own case is
 *         handled like {@code SIMPLE_EDGE}
 * @throws IOException if one of the underlying calls reports an I/O failure
 */
private EdgeProperty createEdgeProperty(Vertex w, TezEdgeProperty edgeProp, Configuration conf, BaseWork work, TezWork tezWork) throws IOException {
  MRHelpers.translateMRConfToTez(conf);
  String keyClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
  String valClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
  String partitionerClassName = conf.get("mapred.partitioner.class");
  // Every edge kind below uses the same key/value serialization class.
  String serializationClass = TezBytesWritableSerialization.class.getName();
  Map<String, String> partitionerConf;
  EdgeType edgeType = edgeProp.getEdgeType();
  switch (edgeType) {
    case BROADCAST_EDGE:
      UnorderedKVEdgeConfig et1Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      return et1Conf.createDefaultBroadcastEdgeProperty();
    case CUSTOM_EDGE:
      // Only enforced when the JVM runs with assertions enabled.
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      UnorderedPartitionedKVEdgeConfig et2Conf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      EdgeManagerPluginDescriptor edgeDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
      CustomEdgeConfiguration edgeConf = new CustomEdgeConfiguration(edgeProp.getNumBuckets(), null);
      DataOutputBuffer dob = new DataOutputBuffer();
      edgeConf.write(dob);
      // getData() hands back the whole backing array; only the first getLength() bytes were
      // actually written, so limit the payload to that prefix instead of shipping the slack.
      edgeDesc.setUserPayload(UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength())));
      return et2Conf.createDefaultCustomEdgeProperty(edgeDesc);
    case CUSTOM_SIMPLE_EDGE:
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      UnorderedPartitionedKVEdgeConfig.Builder et3Conf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null);
      // The unordered output buffer size is only overridden when the edge carries one.
      if (edgeProp.getBufferSize() != null) {
        et3Conf.setAdditionalConfiguration(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, edgeProp.getBufferSize().toString());
      }
      return et3Conf.build().createDefaultEdgeProperty();
    case ONE_TO_ONE_EDGE:
      UnorderedKVEdgeConfig et4Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      return et4Conf.createDefaultOneToOneEdgeProperty();
    case XPROD_EDGE:
      EdgeManagerPluginDescriptor edgeManagerDescriptor = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
      // Collect the names of all parents that are connected to this work by an XPROD edge.
      List<String> crossProductSources = new ArrayList<>();
      for (BaseWork parentWork : tezWork.getParents(work)) {
        if (EdgeType.XPROD_EDGE == tezWork.getEdgeType(parentWork, work)) {
          crossProductSources.add(parentWork.getName());
        }
      }
      CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
      edgeManagerDescriptor.setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf)));
      UnorderedPartitionedKVEdgeConfig cpEdgeConf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, ValueHashPartitioner.class.getName())
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      return cpEdgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
    case SIMPLE_EDGE:
      // fallthrough
    default:
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      // The ordered edge additionally registers a comparator for the key serialization.
      OrderedPartitionedKVEdgeConfig et5Conf = OrderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(serializationClass, TezBytesComparator.class.getName(), null)
          .setValueSerializationClass(serializationClass, null)
          .build();
      return et5Conf.createDefaultEdgeProperty();
  }
}
use of org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig in project tez by apache.
the class CartesianProduct method createDAG.
/**
 * Builds the "CartesianProduct" DAG: three text-reading token vertices feed one join vertex.
 * The first two are connected through cartesian-product edges, the third through a
 * broadcast edge.
 *
 * @param tezConf base configuration copied into every input/output and used to serialize the
 *                cartesian product config
 * @param inputPath1 input location for the first source vertex
 * @param inputPath2 input location for the second source vertex
 * @param inputPath3 input location for the third (broadcast) source vertex
 * @param outputPath location the join vertex writes to
 * @param isPartitioned selects the partitioned flavour of the cartesian product config and edge
 * @return the wired DAG
 * @throws IOException if one of the underlying calls reports an I/O failure
 */
private DAG createDAG(TezConfiguration tezConf, String inputPath1, String inputPath2, String inputPath3, String outputPath, boolean isPartitioned) throws IOException {
  // Split grouping is switched off on every source so that each input file incurs one task.
  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v1.addDataSource(INPUT,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1)
          .groupSplits(false)
          .build());

  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2)
          .groupSplits(false)
          .build());

  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v3.addDataSource(INPUT,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath3)
          .groupSplits(false)
          .build());

  // Unpartitioned: the config is built from the list of source names.
  // Partitioned: every source name is mapped to numPartition.
  CartesianProductConfig cpConfig;
  if (!isPartitioned) {
    cpConfig = new CartesianProductConfig(Arrays.asList(cpSources));
  } else {
    Map<String, Integer> partitionsBySource = new HashMap<>();
    for (String sourceName : cpSources) {
      partitionsBySource.put(sourceName, numPartition);
    }
    cpConfig = new CartesianProductConfig(partitionsBySource);
  }
  // The same serialized config goes to the vertex manager and to the edge manager.
  UserPayload cpPayload = cpConfig.toUserPayload(tezConf);

  Vertex v4 = Vertex.create(VERTEX4, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v4.addDataSink(OUTPUT,
      MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
          .build());
  v4.setVertexManagerPlugin(
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
          .setUserPayload(cpPayload));

  EdgeManagerPluginDescriptor cpEdgeManager =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  cpEdgeManager.setUserPayload(cpPayload);

  // Partitioned case uses a partitioned edge config with CustomPartitioner; otherwise a
  // plain unordered KV edge config. Both are wrapped around the same edge manager.
  EdgeProperty cpEdgeProperty = isPartitioned
      ? UnorderedPartitionedKVEdgeConfig
          .newBuilder(Text.class.getName(), IntWritable.class.getName(), CustomPartitioner.class.getName())
          .build()
          .createDefaultCustomEdgeProperty(cpEdgeManager)
      : UnorderedKVEdgeConfig
          .newBuilder(Text.class.getName(), IntWritable.class.getName())
          .build()
          .createDefaultCustomEdgeProperty(cpEdgeManager);

  EdgeProperty broadcastEdgeProperty = UnorderedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName())
      .build()
      .createDefaultBroadcastEdgeProperty();

  return DAG.create("CartesianProduct")
      .addVertex(v1)
      .addVertex(v2)
      .addVertex(v3)
      .addVertex(v4)
      .addEdge(Edge.create(v1, v4, cpEdgeProperty))
      .addEdge(Edge.create(v2, v4, cpEdgeProperty))
      .addEdge(Edge.create(v3, v4, broadcastEdgeProperty));
}
Aggregations