Search in sources :

Example 1 with MRInput

use of org.apache.tez.mapreduce.input.MRInput in project tez by apache.

In class UnionExample, method createDAG:

/**
 * Builds the "UnionExample" DAG.
 *
 * <p>Three vertices ("map1", "map2", "map3") run {@code TokenProcessor} over one shared MRInput
 * data source configured for {@code inputPath}. "map1" and "map2" are combined into the vertex
 * group "union", which gets a "parts" data sink at {@code outputPath + "-parts"} and feeds the
 * "checker" vertex through a group input edge. "map3" feeds "checker" through a regular edge.
 * "checker" runs {@code UnionProcessor} and gets two data sinks: "union" at {@code outputPath}
 * and "all-parts" at {@code outputPath + "-all-parts"}.
 *
 * @param fs not used by this method
 * @param tezConf base configuration; copied for every input/output configuration built here
 * @param localResources not used by this method
 * @param stagingDir not used by this method
 * @param inputPath value stored under {@code FileInputFormat.INPUT_DIR} for the data source
 * @param outputPath base output location for the three data sinks
 * @return the assembled DAG
 * @throws IOException declared by the signature; presumably raised while building the
 *     input/output descriptors (not visible in this method)
 */
private DAG createDAG(FileSystem fs, TezConfiguration tezConf, Map<String, LocalResource> localResources, Path stagingDir, String inputPath, String outputPath) throws IOException {
    final DAG unionDag = DAG.create("UnionExample");
    // -1 is passed as the task count of every map vertex; presumably this leaves the
    // parallelism to be decided at runtime -- confirm against the Vertex.create contract.
    final int mapParallelism = -1;
    final String tokenProcessorName = TokenProcessor.class.getName();

    // One old-API (mapred) text input, shared by all three map vertices.
    final Configuration sourceConf = new Configuration(tezConf);
    sourceConf.setBoolean("mapred.mapper.new-api", false);
    sourceConf.set("mapred.input.format.class", TextInputFormat.class.getName());
    sourceConf.set(FileInputFormat.INPUT_DIR, inputPath);
    final DataSourceDescriptor sharedSource = MRInput.createConfigBuilder(sourceConf, null)
            .generateSplitsInAM(false)
            .build();

    final Vertex firstMap = Vertex
            .create("map1", ProcessorDescriptor.create(tokenProcessorName), mapParallelism)
            .addDataSource("MRInput", sharedSource);
    final Vertex secondMap = Vertex
            .create("map2", ProcessorDescriptor.create(tokenProcessorName), mapParallelism)
            .addDataSource("MRInput", sharedSource);
    final Vertex thirdMap = Vertex
            .create("map3", ProcessorDescriptor.create(tokenProcessorName), mapParallelism)
            .addDataSource("MRInput", sharedSource);
    final Vertex checker = Vertex.create("checker", ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);

    // "union" sink on the checker vertex: old-API text output written to outputPath.
    final Configuration unionSinkConf = new Configuration(tezConf);
    unionSinkConf.setBoolean("mapred.reducer.new-api", false);
    unionSinkConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
    unionSinkConf.set(FileOutputFormat.OUTDIR, outputPath);
    checker.addDataSink("union", MROutput.createConfigBuilder(unionSinkConf, null).build());

    // "all-parts" sink on the checker vertex.
    checker.addDataSink(
            "all-parts",
            MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath + "-all-parts").build());

    // "parts" sink attached to the group made of map1 and map2.
    final DataSinkDescriptor partsSink =
            MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath + "-parts").build();
    final VertexGroup mapUnion = unionDag.createVertexGroup("union", firstMap, secondMap);
    mapUnion.addDataSink("parts", partsSink);

    // Both edges into the checker use the same Text -> IntWritable, hash-partitioned configuration.
    final OrderedPartitionedKVEdgeConfig shuffleConf = OrderedPartitionedKVEdgeConfig
            .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
            .build();
    unionDag.addVertex(firstMap)
            .addVertex(secondMap)
            .addVertex(thirdMap)
            .addVertex(checker)
            .addEdge(Edge.create(thirdMap, checker, shuffleConf.createDefaultEdgeProperty()))
            .addEdge(GroupInputEdge.create(
                    mapUnion,
                    checker,
                    shuffleConf.createDefaultEdgeProperty(),
                    InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
    return unionDag;
}
Also used : MRInput(org.apache.tez.mapreduce.input.MRInput) OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) DAG(org.apache.tez.dag.api.DAG) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) VertexGroup(org.apache.tez.dag.api.VertexGroup) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) TextOutputFormat(org.apache.hadoop.mapred.TextOutputFormat) ConcatenatedMergedKeyValuesInput(org.apache.tez.runtime.library.input.ConcatenatedMergedKeyValuesInput) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)

Example 2 with MRInput

use of org.apache.tez.mapreduce.input.MRInput in project tez by apache.

In class FilterByWordInputProcessor, method run:

/**
 * Reads every record from the single input and forwards each record whose value contains
 * {@code filterWord} to the single output.
 *
 * <p>For every matching record the output key is the input value (a {@link Text}) and the
 * output value is a {@link TextLongPair} built from the source file name and the input key
 * (a {@link LongWritable}). The source file name is taken from
 * {@code MRJobConfig.MAP_INPUT_FILE} in the input's configuration updates; when that is not
 * available the placeholder {@code "UNKNOWN_FILENAME_IN_PROCESSOR"} is used.
 *
 * @param _inputs logical inputs of this processor; must hold exactly one {@link MRInputLegacy}
 * @param _outputs logical outputs of this processor; must hold exactly one
 *     {@link UnorderedKVOutput}
 * @throws IllegalStateException if there is not exactly one input and one output, or if either
 *     is not of the required type
 * @throws Exception propagated from starting the inputs/outputs, initialising the input, or
 *     reading/writing records
 */
@Override
public void run(Map<String, LogicalInput> _inputs, Map<String, LogicalOutput> _outputs) throws Exception {
    this.inputs = _inputs;
    this.outputs = _outputs;
    this.progressHelper = new ProgressHelper(this.inputs, getContext(), this.getClass().getSimpleName());
    if (_inputs.size() != 1) {
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single input");
    }
    if (_outputs.size() != 1) {
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single output");
    }
    for (LogicalInput input : _inputs.values()) {
        input.start();
    }
    for (LogicalOutput output : _outputs.values()) {
        output.start();
    }
    LogicalInput li = _inputs.values().iterator().next();
    // The guard must test the type that is actually cast to below. Checking only for MRInput
    // let a non-legacy MRInput through and failed later with a ClassCastException.
    if (!(li instanceof MRInputLegacy)) {
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with MRInputLegacy");
    }
    LogicalOutput lo = _outputs.values().iterator().next();
    if (!(lo instanceof UnorderedKVOutput)) {
        // The message names the class that is actually checked.
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with UnorderedKVOutput");
    }
    progressHelper.scheduleProgressTaskService(0, 100);
    MRInputLegacy mrInput = (MRInputLegacy) li;
    mrInput.init();
    UnorderedKVOutput kvOutput = (UnorderedKVOutput) lo;

    // Resolve the name of the file being processed, falling back to a placeholder.
    Configuration updatedConf = mrInput.getConfigUpdates();
    Text srcFile = new Text();
    srcFile.set("UNKNOWN_FILENAME_IN_PROCESSOR");
    if (updatedConf != null) {
        String fileName = updatedConf.get(MRJobConfig.MAP_INPUT_FILE);
        if (fileName != null) {
            LOG.info("Processing file: " + fileName);
            srcFile.set(fileName);
        }
    }

    KeyValueReader kvReader = mrInput.getReader();
    KeyValueWriter kvWriter = kvOutput.getWriter();
    while (kvReader.next()) {
        Object key = kvReader.getCurrentKey();
        Object val = kvReader.getCurrentValue();
        Text valText = (Text) val;
        String readVal = valText.toString();
        if (readVal.contains(filterWord)) {
            LongWritable lineNum = (LongWritable) key;
            // The same srcFile instance is shared by every emitted pair; it is not modified
            // inside this loop.
            TextLongPair outVal = new TextLongPair(srcFile, lineNum);
            kvWriter.write(valText, outVal);
        }
    }
}
Also used : MRInput(org.apache.tez.mapreduce.input.MRInput) ProgressHelper(org.apache.tez.common.ProgressHelper) Configuration(org.apache.hadoop.conf.Configuration) TextLongPair(org.apache.tez.mapreduce.examples.FilterLinesByWord.TextLongPair) LogicalOutput(org.apache.tez.runtime.api.LogicalOutput) KeyValueReader(org.apache.tez.runtime.library.api.KeyValueReader) Text(org.apache.hadoop.io.Text) KeyValueWriter(org.apache.tez.runtime.library.api.KeyValueWriter) UnorderedKVOutput(org.apache.tez.runtime.library.output.UnorderedKVOutput) LogicalInput(org.apache.tez.runtime.api.LogicalInput) LongWritable(org.apache.hadoop.io.LongWritable) MRInputLegacy(org.apache.tez.mapreduce.input.MRInputLegacy)

Aggregations

Configuration (org.apache.hadoop.conf.Configuration)2 MRInput (org.apache.tez.mapreduce.input.MRInput)2 LongWritable (org.apache.hadoop.io.LongWritable)1 Text (org.apache.hadoop.io.Text)1 TextInputFormat (org.apache.hadoop.mapred.TextInputFormat)1 TextOutputFormat (org.apache.hadoop.mapred.TextOutputFormat)1 ProgressHelper (org.apache.tez.common.ProgressHelper)1 DAG (org.apache.tez.dag.api.DAG)1 DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor)1 DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)1 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)1 Vertex (org.apache.tez.dag.api.Vertex)1 VertexGroup (org.apache.tez.dag.api.VertexGroup)1 TextLongPair (org.apache.tez.mapreduce.examples.FilterLinesByWord.TextLongPair)1 MRInputLegacy (org.apache.tez.mapreduce.input.MRInputLegacy)1 LogicalInput (org.apache.tez.runtime.api.LogicalInput)1 LogicalOutput (org.apache.tez.runtime.api.LogicalOutput)1 KeyValueReader (org.apache.tez.runtime.library.api.KeyValueReader)1 KeyValueWriter (org.apache.tez.runtime.library.api.KeyValueWriter)1 OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig)1