Use of org.apache.tez.mapreduce.input.MRInput in project tez by apache.
The class UnionExample, method createDAG.
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");
  int numMaps = -1;

  // Data source: MRInput over the old-API TextInputFormat; splits are generated on the client.
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();

  // Three map vertices share the same data source; -1 lets the input splits set the parallelism.
  Vertex mapVertex1 = Vertex.create("map1",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex2 = Vertex.create("map2",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex3 = Vertex.create("map3",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex checkerVertex = Vertex.create("checker",
      ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);

  // Data sinks on the checker vertex, written via MROutput.
  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);
  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
      TextOutputFormat.class, outputPath + "-all-parts").build();
  checkerVertex.addDataSink("all-parts", od2);

  // The vertex group unions map1 and map2 and gets its own data sink.
  Configuration partsConf = new Configuration(tezConf);
  DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
      TextOutputFormat.class, outputPath + "-parts").build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);

  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(
      Text.class.getName(), IntWritable.class.getName(),
      HashPartitioner.class.getName()).build();
  dag.addVertex(mapVertex1).addVertex(mapVertex2).addVertex(mapVertex3)
      .addVertex(checkerVertex)
      .addEdge(Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(GroupInputEdge.create(unionVertex, checkerVertex,
          edgeConf.createDefaultEdgeProperty(),
          InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;
}
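createDAG only assembles the DAG; it still has to be submitted through a Tez session. Below is a minimal sketch of that submission step, assuming the same tezConf passed to createDAG and the usual Tez client classes (TezClient, DAGClient, DAGStatus); the variable names and error handling are illustrative and not part of UnionExample itself.

// Hedged sketch: submit the DAG built by createDAG() via a Tez session.
// Assumes imports from org.apache.tez.client and org.apache.tez.dag.api.client.
TezClient tezClient = TezClient.create("UnionExampleSession", tezConf);
tezClient.start();
try {
  tezClient.waitTillReady();
  DAGClient dagClient = tezClient.submitDAG(dag);
  DAGStatus dagStatus = dagClient.waitForCompletionWithStatusUpdates(null);
  if (dagStatus.getState() != DAGStatus.State.SUCCEEDED) {
    System.err.println("DAG did not succeed, diagnostics: " + dagStatus.getDiagnostics());
  }
} finally {
  tezClient.stop();
}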
Use of org.apache.tez.mapreduce.input.MRInput in project tez by apache.
The class FilterByWordInputProcessor, method run.
@Override
public void run(Map<String, LogicalInput> _inputs,
    Map<String, LogicalOutput> _outputs) throws Exception {
  this.inputs = _inputs;
  this.outputs = _outputs;
  this.progressHelper = new ProgressHelper(this.inputs, getContext(),
      this.getClass().getSimpleName());

  if (_inputs.size() != 1) {
    throw new IllegalStateException(
        "FilterByWordInputProcessor processor can only work with a single input");
  }
  if (_outputs.size() != 1) {
    throw new IllegalStateException(
        "FilterByWordInputProcessor processor can only work with a single output");
  }

  for (LogicalInput input : _inputs.values()) {
    input.start();
  }
  for (LogicalOutput output : _outputs.values()) {
    output.start();
  }

  LogicalInput li = _inputs.values().iterator().next();
  if (!(li instanceof MRInput)) {
    throw new IllegalStateException(
        "FilterByWordInputProcessor processor can only work with MRInput");
  }
  LogicalOutput lo = _outputs.values().iterator().next();
  if (!(lo instanceof UnorderedKVOutput)) {
    throw new IllegalStateException(
        "FilterByWordInputProcessor processor can only work with OnFileUnorderedKVOutput");
  }

  progressHelper.scheduleProgressTaskService(0, 100);
  MRInputLegacy mrInput = (MRInputLegacy) li;
  mrInput.init();
  UnorderedKVOutput kvOutput = (UnorderedKVOutput) lo;

  // MRInput may report configuration updates (e.g. the input file name) after init().
  Configuration updatedConf = mrInput.getConfigUpdates();
  Text srcFile = new Text();
  srcFile.set("UNKNOWN_FILENAME_IN_PROCESSOR");
  if (updatedConf != null) {
    String fileName = updatedConf.get(MRJobConfig.MAP_INPUT_FILE);
    if (fileName != null) {
      LOG.info("Processing file: " + fileName);
      srcFile.set(fileName);
    }
  }

  // Read key/value pairs from MRInput and forward lines containing the filter word.
  KeyValueReader kvReader = mrInput.getReader();
  KeyValueWriter kvWriter = kvOutput.getWriter();
  while (kvReader.next()) {
    Object key = kvReader.getCurrentKey();
    Object val = kvReader.getCurrentValue();
    Text valText = (Text) val;
    String readVal = valText.toString();
    if (readVal.contains(filterWord)) {
      LongWritable lineNum = (LongWritable) key;
      TextLongPair outVal = new TextLongPair(srcFile, lineNum);
      kvWriter.write(valText, outVal);
    }
  }
}
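For context, a processor like this receives its MRInput because the data source is attached when the vertex is defined. The sketch below shows the generic wiring pattern only, assuming a TezConfiguration named tezConf and an input path string; note that this processor casts its input to MRInputLegacy, so the real FilterLinesByWord driver configures the legacy input variant and also passes the filter word through the processor's user payload (both details omitted here).

// Hedged sketch: attach an MRInput data source to a vertex running this processor.
// tezConf and inputPath are assumed to exist in the surrounding driver code.
DataSourceDescriptor dataSource = MRInput.createConfigBuilder(
    new Configuration(tezConf), TextInputFormat.class, inputPath).build();
Vertex filterVertex = Vertex.create("FilterVertex",
    ProcessorDescriptor.create(FilterByWordInputProcessor.class.getName()))
    .addDataSource("MRInput", dataSource);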