Usage of org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat in the Apache Flink project.
Example: the main method of the WordCount class.
/**
 * Runs a word-count batch job whose input is read through Hadoop's
 * {@code TextInputFormat} and whose result is written through Hadoop's
 * {@code TextOutputFormat}, both wrapped for use with the Flink DataSet API.
 *
 * @param args args[0] = input path, args[1] = result path
 * @throws Exception if the Flink job cannot be set up or executed
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String input = args[0];
    final String output = args[1];
    final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
    // Wrap Hadoop's TextInputFormat so Flink can consume (byte offset, line) pairs from it.
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> source =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(input));
    DataSet<Tuple2<LongWritable, Text>> lines = environment.createInput(source);
    // Split each line into words, converting Writable "Text" to plain String on the way.
    DataSet<Tuple2<String, Integer>> tokens = lines.flatMap(new Tokenizer());
    // Aggregate the per-word counts.
    DataSet<Tuple2<String, Integer>> counts = tokens.groupBy(0).aggregate(Aggregations.SUM, 1);
    // Convert String back to Writable "Text" so the Hadoop output format can handle it.
    DataSet<Tuple2<Text, IntWritable>> writableCounts = counts.map(new HadoopDatatypeMapper());
    // Wrap Hadoop's TextOutputFormat as the sink.
    HadoopOutputFormat<Text, IntWritable> sink =
            new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // Set the key/value separator under both the current and the legacy Hadoop property name.
    sink.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    sink.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(output));
    // Emit the result and run the job.
    writableCounts.output(sink);
    environment.execute("Word Count");
}
Usage of org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat in the Apache Gora project.
Example: the createDataSource method (data-store class variant) of the GoraFlinkEngine class.
/**
 * Creates a Flink {@code DataSource} that reads key/value pairs from a Gora
 * data store instantiated from the given store class.
 *
 * <p>Requires the engine's key/value classes ({@code classKeyIn} /
 * {@code classValueIn}) to have been set beforehand.
 *
 * @param env            Flink execution environment the source is registered with
 * @param conf           Hadoop configuration used to build the MapReduce {@code Job}
 * @param dataStoreClass concrete Gora data-store class to instantiate
 * @return a data source emitting {@code Tuple2<KeyIn, ValueIn>} records
 * @throws IOException if the underlying job or data store cannot be set up
 */
public DataSource<Tuple2<KeyIn, ValueIn>> createDataSource(ExecutionEnvironment env, Configuration conf, Class<? extends DataStore<KeyIn, ValueIn>> dataStoreClass) throws IOException {
    Preconditions.checkNotNull(classKeyIn);
    Preconditions.checkNotNull(classValueIn);
    final Job job = Job.getInstance(conf);
    final DataStore<KeyIn, ValueIn> store =
            DataStoreFactory.getDataStore(dataStoreClass, classKeyIn, classValueIn, job.getConfiguration());
    // Register a query over the whole store on the job; the boolean flag is passed
    // through to GoraInputFormat.setInput — presumably "reuse objects"; confirm in Gora docs.
    GoraInputFormat.setInput(job, store.newQuery(), true);
    // Bridge the Hadoop-side GoraInputFormat into Flink's input-format API.
    final HadoopInputFormat<KeyIn, ValueIn> goraSource =
            new HadoopInputFormat<>(new GoraInputFormat<>(), classKeyIn, classValueIn, job);
    return env.createInput(goraSource);
}
Usage of org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat in the Apache Gora project.
Example: the createDataSource method (data-store instance variant) of the GoraFlinkEngine class.
/**
 * Creates a Flink {@code DataSource} that reads key/value pairs from an
 * already-constructed Gora data store.
 *
 * <p>Requires the engine's key/value classes ({@code classKeyIn} /
 * {@code classValueIn}) to have been set beforehand.
 *
 * @param env       Flink execution environment the source is registered with
 * @param conf      Hadoop configuration used to build the MapReduce {@code Job}
 * @param dataStore Gora data store to read from
 * @return a data source emitting {@code Tuple2<KeyIn, ValueIn>} records
 * @throws IOException if the underlying job cannot be set up
 */
public DataSource<Tuple2<KeyIn, ValueIn>> createDataSource(ExecutionEnvironment env, Configuration conf, DataStore<KeyIn, ValueIn> dataStore) throws IOException {
    Preconditions.checkNotNull(classKeyIn);
    Preconditions.checkNotNull(classValueIn);
    final Job job = Job.getInstance(conf);
    // Register a query over the whole store on the job; the boolean flag is passed
    // through to GoraInputFormat.setInput — presumably "reuse objects"; confirm in Gora docs.
    GoraInputFormat.setInput(job, dataStore.newQuery(), true);
    // Bridge the Hadoop-side GoraInputFormat into Flink's input-format API.
    final HadoopInputFormat<KeyIn, ValueIn> goraSource =
            new HadoopInputFormat<>(new GoraInputFormat<>(), classKeyIn, classValueIn, job);
    return env.createInput(goraSource);
}
Aggregations