use of org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig in project hive by apache.
the class DagUtils method createEdgeProperty.
/*
* Helper function to create an edge property from an edge type.
*/
private EdgeProperty createEdgeProperty(TezEdgeProperty edgeProp, Configuration conf) throws IOException {
MRHelpers.translateMRConfToTez(conf);
String keyClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
String valClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
String partitionerClassName = conf.get("mapred.partitioner.class");
Map<String, String> partitionerConf;
EdgeType edgeType = edgeProp.getEdgeType();
switch(edgeType) {
case BROADCAST_EDGE:
UnorderedKVEdgeConfig et1Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
return et1Conf.createDefaultBroadcastEdgeProperty();
case CUSTOM_EDGE:
assert partitionerClassName != null;
partitionerConf = createPartitionerConf(partitionerClassName, conf);
UnorderedPartitionedKVEdgeConfig et2Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
EdgeManagerPluginDescriptor edgeDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
CustomEdgeConfiguration edgeConf = new CustomEdgeConfiguration(edgeProp.getNumBuckets(), null);
DataOutputBuffer dob = new DataOutputBuffer();
edgeConf.write(dob);
byte[] userPayload = dob.getData();
edgeDesc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
return et2Conf.createDefaultCustomEdgeProperty(edgeDesc);
case CUSTOM_SIMPLE_EDGE:
assert partitionerClassName != null;
partitionerConf = createPartitionerConf(partitionerClassName, conf);
UnorderedPartitionedKVEdgeConfig et3Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
return et3Conf.createDefaultEdgeProperty();
case SIMPLE_EDGE:
default:
assert partitionerClassName != null;
partitionerConf = createPartitionerConf(partitionerClassName, conf);
OrderedPartitionedKVEdgeConfig et4Conf = OrderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), TezBytesComparator.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
return et4Conf.createDefaultEdgeProperty();
}
}
use of org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig in project tez by apache.
the class BroadcastLoadGen method createDAG.
private DAG createDAG(int numGenTasks, int totalSourceDataSize, int numFetcherTasks) {
int bytesPerSource = totalSourceDataSize / numGenTasks;
LOG.info("DataPerSourceTask(bytes)=" + bytesPerSource);
ByteBuffer payload = ByteBuffer.allocate(4);
payload.putInt(0, bytesPerSource);
Vertex broadcastVertex = Vertex.create("DataGen", ProcessorDescriptor.create(InputGenProcessor.class.getName()).setUserPayload(UserPayload.create(payload)), numGenTasks);
Vertex fetchVertex = Vertex.create("FetchVertex", ProcessorDescriptor.create(InputFetchProcessor.class.getName()), numFetcherTasks);
UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig.newBuilder(NullWritable.class.getName(), IntWritable.class.getName()).setCompression(false, null, null).build();
DAG dag = DAG.create("BroadcastLoadGen");
dag.addVertex(broadcastVertex).addVertex(fetchVertex).addEdge(Edge.create(broadcastVertex, fetchVertex, edgeConf.createDefaultBroadcastEdgeProperty()));
return dag;
}
use of org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig in project tez by apache.
the class FilterLinesByWordOneToOne method run.
@Override
public int run(String[] otherArgs) throws Exception {
boolean generateSplitsInClient = false;
SplitsInClientOptionParser splitCmdLineParser = new SplitsInClientOptionParser();
try {
generateSplitsInClient = splitCmdLineParser.parse(otherArgs, false);
otherArgs = splitCmdLineParser.getRemainingArgs();
} catch (ParseException e1) {
System.err.println("Invalid options");
printUsage();
return 2;
}
if (otherArgs.length != 3) {
printUsage();
return 2;
}
String inputPath = otherArgs[0];
String outputPath = otherArgs[1];
String filterWord = otherArgs[2];
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
if (fs.exists(new Path(outputPath))) {
System.err.println("Output directory : " + outputPath + " already exists");
return 2;
}
TezConfiguration tezConf = new TezConfiguration(conf);
fs.getWorkingDirectory();
Path stagingDir = new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString());
tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDir.toString());
TezClientUtils.ensureStagingDirExists(tezConf, stagingDir);
String jarPath = ClassUtil.findContainingJar(FilterLinesByWordOneToOne.class);
if (jarPath == null) {
throw new TezUncheckedException("Could not find any jar containing" + FilterLinesByWordOneToOne.class.getName() + " in the classpath");
}
Path remoteJarPath = fs.makeQualified(new Path(stagingDir, "dag_job.jar"));
fs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
FileStatus remoteJarStatus = fs.getFileStatus(remoteJarPath);
Map<String, LocalResource> commonLocalResources = new TreeMap<String, LocalResource>();
LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, remoteJarStatus.getLen(), remoteJarStatus.getModificationTime());
commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);
TezClient tezSession = TezClient.create("FilterLinesByWordSession", tezConf, commonLocalResources, null);
// Why do I need to start the TezSession.
tezSession.start();
Configuration stage1Conf = new JobConf(conf);
stage1Conf.set(FILTER_PARAM_NAME, filterWord);
Configuration stage2Conf = new JobConf(conf);
stage2Conf.set(FileOutputFormat.OUTDIR, outputPath);
stage2Conf.setBoolean("mapred.mapper.new-api", false);
UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
// Setup stage1 Vertex
Vertex stage1Vertex = Vertex.create("stage1", ProcessorDescriptor.create(FilterByWordInputProcessor.class.getName()).setUserPayload(stage1Payload)).addTaskLocalFiles(commonLocalResources);
DataSourceDescriptor dsd;
if (generateSplitsInClient) {
// TODO TEZ-1406. Dont' use MRInputLegacy
stage1Conf.set(FileInputFormat.INPUT_DIR, inputPath);
stage1Conf.setBoolean("mapred.mapper.new-api", false);
dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, stagingDir, true);
} else {
dsd = MRInputLegacy.createConfigBuilder(stage1Conf, TextInputFormat.class, inputPath).groupSplits(false).build();
}
stage1Vertex.addDataSource("MRInput", dsd);
// Setup stage2 Vertex
Vertex stage2Vertex = Vertex.create("stage2", ProcessorDescriptor.create(FilterByWordOutputProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)), dsd.getNumberOfShards());
stage2Vertex.addTaskLocalFiles(commonLocalResources);
// Configure the Output for stage2
stage2Vertex.addDataSink("MROutput", DataSinkDescriptor.create(OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)), OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null));
UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), TextLongPair.class.getName()).setFromConfiguration(tezConf).build();
DAG dag = DAG.create("FilterLinesByWord");
Edge edge = Edge.create(stage1Vertex, stage2Vertex, edgeConf.createDefaultOneToOneEdgeProperty());
dag.addVertex(stage1Vertex).addVertex(stage2Vertex).addEdge(edge);
LOG.info("Submitting DAG to Tez Session");
DAGClient dagClient = tezSession.submitDAG(dag);
LOG.info("Submitted DAG to Tez Session");
DAGStatus dagStatus = null;
String[] vNames = { "stage1", "stage2" };
try {
while (true) {
dagStatus = dagClient.getDAGStatus(null);
if (dagStatus.getState() == DAGStatus.State.RUNNING || dagStatus.getState() == DAGStatus.State.SUCCEEDED || dagStatus.getState() == DAGStatus.State.FAILED || dagStatus.getState() == DAGStatus.State.KILLED || dagStatus.getState() == DAGStatus.State.ERROR) {
break;
}
try {
Thread.sleep(500);
} catch (InterruptedException e) {
// continue;
}
}
while (dagStatus.getState() == DAGStatus.State.RUNNING) {
try {
ExampleDriver.printDAGStatus(dagClient, vNames);
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
// continue;
}
dagStatus = dagClient.getDAGStatus(null);
} catch (TezException e) {
LOG.error("Failed to get application progress. Exiting");
return -1;
}
}
} finally {
fs.delete(stagingDir, true);
tezSession.stop();
}
ExampleDriver.printDAGStatus(dagClient, vNames);
LOG.info("Application completed. " + "FinalState=" + dagStatus.getState());
return dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1;
}
use of org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig in project tez by apache.
the class HashJoinExample method createDag.
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath, int numPartitions, boolean doBroadcast) throws IOException {
DAG dag = DAG.create("HashJoinExample" + (doBroadcast ? "-WithBroadcast" : ""));
/**
* This vertex represents the side of the join that will be accumulated in a
* hash table in order to join it against the other side. It reads text data
* using the TextInputFormat. ForwardingProcessor simply forwards the data
* downstream as is.
*/
Vertex hashFileVertex = Vertex.create(hashSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName())).addDataSource(inputFile, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, hashPath.toUri().toString()).groupSplits(!isDisableSplitGrouping()).generateSplitsInAM(!isGenerateSplitInClient()).build());
/**
* This vertex represents that side of the data that will be streamed and
* joined against the other side that has been accumulated into a hash
* table. It reads text data using the TextInputFormat. ForwardingProcessor
* simply forwards the data downstream as is.
*/
Vertex streamFileVertex = Vertex.create(streamingSide, ProcessorDescriptor.create(ForwardingProcessor.class.getName())).addDataSource(inputFile, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, streamPath.toUri().toString()).groupSplits(!isDisableSplitGrouping()).generateSplitsInAM(!isGenerateSplitInClient()).build());
/**
* This vertex represents the join operation. It writes the join output as
* text using the TextOutputFormat. The JoinProcessor is going to perform
* the join of the streaming side and the hash side. It is load balanced
* across numPartitions
*/
Vertex joinVertex = Vertex.create(joiner, ProcessorDescriptor.create(HashJoinProcessor.class.getName()), numPartitions).addDataSink(joinOutput, MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outPath.toUri().toString()).build());
/**
* The streamed side will be partitioned into fragments with the same keys
* going to the same fragments using hash partitioning. The data to be
* joined is the key itself and so the value is null. The number of
* fragments is initially inferred from the number of tasks running in the
* join vertex because each task will be handling one fragment. The
* setFromConfiguration call is optional and allows overriding the config
* options with command line parameters.
*/
UnorderedPartitionedKVEdgeConfig streamConf = UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(tezConf).build();
/**
* Connect the join vertex with the stream side
*/
Edge e1 = Edge.create(streamFileVertex, joinVertex, streamConf.createDefaultEdgeProperty());
EdgeProperty hashSideEdgeProperty = null;
if (doBroadcast) {
/**
* This option can be used when the hash side is small. We can broadcast
* the entire data to all fragments of the stream side. This avoids
* re-partitioning the fragments of the stream side to match the
* partitioning scheme of the hash side and avoids costly network data
* transfer. However, in this example the stream side is being partitioned
* in both cases for brevity of code. The join task can perform the join
* of its fragment of keys with all the keys of the hash side. Using an
* unpartitioned edge to transfer the complete output of the hash side to
* be broadcasted to all fragments of the streamed side. Again, since the
* data is the key, the value is null. The setFromConfiguration call is
* optional and allows overriding the config options with command line
* parameters.
*/
UnorderedKVEdgeConfig broadcastConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), NullWritable.class.getName()).setFromConfiguration(tezConf).build();
hashSideEdgeProperty = broadcastConf.createDefaultBroadcastEdgeProperty();
} else {
/**
* The hash side is also being partitioned into fragments with the same
* key going to the same fragment using hash partitioning. This way all
* keys with the same hash value will go to the same fragment from both
* sides. Thus the join task handling that fragment can join both data set
* fragments.
*/
hashSideEdgeProperty = streamConf.createDefaultEdgeProperty();
}
/**
* Connect the join vertex to the hash side. The join vertex is connected
* with 2 upstream vertices that provide it with inputs
*/
Edge e2 = Edge.create(hashFileVertex, joinVertex, hashSideEdgeProperty);
/**
* Connect everything up by adding them to the DAG
*/
dag.addVertex(streamFileVertex).addVertex(hashFileVertex).addVertex(joinVertex).addEdge(e1).addEdge(e2);
return dag;
}
use of org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig in project hive by apache.
the class DagUtils method createEdgeProperty.
/*
* Helper function to create an edge property from an edge type.
*/
private EdgeProperty createEdgeProperty(Vertex w, TezEdgeProperty edgeProp, Configuration conf, BaseWork work, TezWork tezWork) throws IOException {
MRHelpers.translateMRConfToTez(conf);
String keyClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
String valClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
String partitionerClassName = conf.get("mapred.partitioner.class");
Map<String, String> partitionerConf;
EdgeType edgeType = edgeProp.getEdgeType();
switch(edgeType) {
case BROADCAST_EDGE:
UnorderedKVEdgeConfig et1Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
return et1Conf.createDefaultBroadcastEdgeProperty();
case CUSTOM_EDGE:
assert partitionerClassName != null;
partitionerConf = createPartitionerConf(partitionerClassName, conf);
UnorderedPartitionedKVEdgeConfig et2Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
EdgeManagerPluginDescriptor edgeDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
CustomEdgeConfiguration edgeConf = new CustomEdgeConfiguration(edgeProp.getNumBuckets(), null);
DataOutputBuffer dob = new DataOutputBuffer();
edgeConf.write(dob);
byte[] userPayload = dob.getData();
edgeDesc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
return et2Conf.createDefaultCustomEdgeProperty(edgeDesc);
case CUSTOM_SIMPLE_EDGE:
assert partitionerClassName != null;
partitionerConf = createPartitionerConf(partitionerClassName, conf);
UnorderedPartitionedKVEdgeConfig.Builder et3Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null);
if (edgeProp.getBufferSize() != null) {
et3Conf.setAdditionalConfiguration(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, edgeProp.getBufferSize().toString());
}
return et3Conf.build().createDefaultEdgeProperty();
case ONE_TO_ONE_EDGE:
UnorderedKVEdgeConfig et4Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
return et4Conf.createDefaultOneToOneEdgeProperty();
case XPROD_EDGE:
EdgeManagerPluginDescriptor edgeManagerDescriptor = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
List<String> crossProductSources = new ArrayList<>();
for (BaseWork parentWork : tezWork.getParents(work)) {
if (EdgeType.XPROD_EDGE == tezWork.getEdgeType(parentWork, work)) {
crossProductSources.add(parentWork.getName());
}
}
CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
edgeManagerDescriptor.setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf)));
UnorderedPartitionedKVEdgeConfig cpEdgeConf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, ValueHashPartitioner.class.getName()).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
return cpEdgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
case SIMPLE_EDGE:
// fallthrough
default:
assert partitionerClassName != null;
partitionerConf = createPartitionerConf(partitionerClassName, conf);
OrderedPartitionedKVEdgeConfig et5Conf = OrderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), TezBytesComparator.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
return et5Conf.createDefaultEdgeProperty();
}
}
Aggregations