Use of org.apache.hadoop.mapreduce.lib.input.TextInputFormat in project flink by apache.
From the class WordCountMapreduceITCase, method internalRun.
private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input;
    if (isTestDeprecatedAPI) {
        input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath);
    } else {
        input = env.createInput(HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    // split up the lines in pairs (2-tuples) containing: (word, 1)
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);

    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {
        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });

    // Set up the Hadoop Output Format
    Job job = Job.getInstance();
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
    job.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
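Both ITCase variants rely on a Tokenizer flat-map function defined elsewhere in the test class. A minimal sketch of the variant used here (it consumes the already-converted String lines), assuming the standard Flink WordCount pattern of emitting a (word, 1) pair per token; the class name comes from the snippet above, but the body is an illustrative assumption, not necessarily the project's exact source:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Sketch of the Tokenizer used above (assumed behavior): split each line into
// lowercase words and emit a (word, 1) pair for every non-empty token.
public class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        for (String token : value.toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}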
Use of org.apache.hadoop.mapreduce.lib.input.TextInputFormat in project tez by apache.
From the class WordCount, method createDAG.
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath, int numPartitions) throws IOException {
    // Create the descriptor that describes the input data to Tez. MRInput is used to read text
    // data from the given input path, with TextInputFormat as the record reader.
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
        .groupSplits(!isDisableSplitGrouping())
        .generateSplitsInAM(!isGenerateSplitInClient())
        .build();

    // Create a descriptor that describes the output data to Tez. MROutput is used to write text
    // data to the given output path, with TextOutputFormat as the record writer.
    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath).build();

    // Create a vertex that reads the data from the data source and tokenizes it using the
    // TokenProcessor. The number of tasks that will do the work for this vertex will be decided
    // using the information provided by the data source descriptor.
    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(TokenProcessor.class.getName())).addDataSource(INPUT, dataSource);

    // Create the edge that represents the movement and semantics of data between the producer
    // Tokenizer vertex and the consumer Summation vertex. To perform the summation in parallel,
    // the tokenized data is partitioned by word so that a given word always goes to the same
    // partition and the counts for each word are grouped together. To achieve this we use an
    // edge that contains an input/output pair handling the partitioning and grouping of
    // key-value data. The helper OrderedPartitionedKVEdgeConfig creates such an edge:
    // internally, it sets up matching Tez inputs and outputs that perform this logic. We
    // specify the key, value and partitioner types. Here the key type is Text (the word), the
    // value type is IntWritable (the count), and we use a hash-based partitioner. The helper is
    // optional; the edge could also be configured by setting up the input, output, etc.
    // individually. The setFromConfiguration call is optional as well and allows overriding the
    // config options with command line parameters.
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
        .setFromConfiguration(tezConf)
        .build();

    // Create a vertex that reads the tokenized data and calculates the sum using the SumProcessor.
    // The number of tasks that do the work of this vertex depends on the number of partitions used
    // to distribute the sum processing. In this case, it has been made configurable via the
    // numPartitions parameter.
    Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions).addDataSink(OUTPUT, dataSink);

    // There is no need to add the jar containing this class, as it is assumed to be part of the
    // Tez jars. Otherwise we would have to add the jars for this code as local files to the vertices.

    // Create the DAG, add the vertices, and connect the producer and consumer vertices via the edge.
    DAG dag = DAG.create("WordCount");
    dag.addVertex(tokenizerVertex)
        .addVertex(summationVertex)
        .addEdge(Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
    return dag;
}
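createDAG only builds the DAG; the caller still has to submit it. A minimal sketch of the typical submission path via Tez's TezClient API, with tezConf and dag assumed to come from the method above (the client setup shown here is an illustration, not code from this class):

import org.apache.tez.client.TezClient;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;

// Sketch: submit the DAG built above and wait for it to finish.
TezClient tezClient = TezClient.create("WordCount", tezConf);
tezClient.start();
try {
    DAGClient dagClient = tezClient.submitDAG(dag);
    // block until the DAG completes, then check the final state
    DAGStatus status = dagClient.waitForCompletion();
    if (status.getState() != DAGStatus.State.SUCCEEDED) {
        System.err.println("WordCount failed, diagnostics: " + status.getDiagnostics());
    }
} finally {
    tezClient.stop();
}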
Use of org.apache.hadoop.mapreduce.lib.input.TextInputFormat in project flink by apache.
From the class WordCount, method main.
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert the String back to Writable "Text" for use with the Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up the Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // set the separator for both the new and the deprecated key
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
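The HadoopDatatypeMapper referenced above converts the (String, Integer) result tuples back into Hadoop writables, as the inline comment states. A minimal sketch of such a mapper; the class name comes from the snippet, but the body is an illustrative assumption:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Sketch of HadoopDatatypeMapper (assumed behavior): wrap the (String, Integer)
// result in Hadoop's (Text, IntWritable) so HadoopOutputFormat can serialize it.
public class HadoopDatatypeMapper implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) throws Exception {
        return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
    }
}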
Use of org.apache.hadoop.mapreduce.lib.input.TextInputFormat in project flink by apache.
From the class WordCountMapreduceITCase, method internalRun.
private void internalRun() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input;
    input = env.createInput(HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    // split up the lines in pairs (2-tuples) containing: (word, 1)
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);

    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {
        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });

    // Set up the Hadoop Output Format
    Job job = Job.getInstance();
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
    job.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
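Unlike the first ITCase variant, this overload always goes through HadoopInputs.readHadoopFile, which is essentially a convenience wrapper: it creates a Job, registers the input path on it, and wraps the mapreduce TextInputFormat in a Flink HadoopInputFormat. The equivalent explicit construction, mirroring the standalone WordCount main above (env and textPath reused from the snippet), would look roughly like:

// Sketch: explicit equivalent of HadoopInputs.readHadoopFile for this input.
Job job = Job.getInstance();
HadoopInputFormat<LongWritable, Text> hadoopIF =
    new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
TextInputFormat.addInputPath(job, new Path(textPath));
DataSet<Tuple2<LongWritable, Text>> input = env.createInput(hadoopIF);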