Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in project flink by apache.
From the class WordCountMapreduceITCase, method internalRun:
private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input;
    if (isTestDeprecatedAPI) {
        // deprecated API: read directly through the ExecutionEnvironment
        input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath);
    } else {
        // current API: wrap the Hadoop input format via HadoopInputs
        input = env.createInput(HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    // split up the lines into (word, 1) pairs, group by the word, and sum the counts
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);

    // convert back to Hadoop writables for the output format
    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {
        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });

    // Set up Hadoop Output Format
    Job job = Job.getInstance();
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
    job.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
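The Tokenizer used above is defined elsewhere in the test and not shown in this excerpt. A minimal stand-in, assuming the usual word-count splitting (lowercase, split on non-word characters); the class body is an illustrative sketch, not the actual Flink test code:

    // Hypothetical stand-in for the Tokenizer referenced above: emits a
    // (word, 1) pair for every word in the incoming line. Assumes
    // org.apache.flink.api.common.functions.FlatMapFunction and
    // org.apache.flink.util.Collector are imported.
    public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            for (String token : value.toLowerCase().split("\\W+")) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<String, Integer>(token, 1));
                }
            }
        }
    }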
Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in project flink by apache.
From the class WordCount, method main:
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format; set the separator under both the current
    // key and the deprecated one, so either configuration naming picks it up
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
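HadoopDatatypeMapper is referenced above but not shown in this excerpt. A plausible sketch, assuming it simply wraps the Java-typed result tuples back into Hadoop writables for TextOutputFormat; the class body is an assumption, not the actual Flink example code:

    // Hypothetical sketch of the HadoopDatatypeMapper used above: converts
    // (String, Integer) tuples into (Text, IntWritable) tuples.
    public static final class HadoopDatatypeMapper
            implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
        @Override
        public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
        }
    }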
Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in project flink by apache.
From the class WordCountMapreduceITCase, method internalRun:
private void internalRun() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input =
            env.createInput(HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    // split up the lines into (word, 1) pairs, group by the word, and sum the counts
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);

    // convert back to Hadoop writables for the output format
    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {
        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });

    // Set up Hadoop Output Format
    Job job = Job.getInstance();
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
    job.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
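Both variants of internalRun read from textPath and write to resultPath, fields the excerpts never define. A hypothetical JUnit-style setup under which the methods above could run; the field names match the snippets, but the TemporaryFolder rule and setup method are assumptions, not the actual Flink test harness:

    // Hypothetical scaffolding for the textPath / resultPath fields used above.
    @Rule
    public final org.junit.rules.TemporaryFolder tempFolder = new org.junit.rules.TemporaryFolder();

    private String textPath;
    private String resultPath;

    @Before
    public void createPaths() throws Exception {
        java.io.File input = tempFolder.newFile("input.txt");
        java.nio.file.Files.write(input.toPath(),
                "to be or not to be".getBytes(java.nio.charset.StandardCharsets.UTF_8));
        textPath = input.toURI().toString();
        // the output directory must not exist yet; TextOutputFormat creates it
        resultPath = new java.io.File(tempFolder.getRoot(), "result").toURI().toString();
    }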
Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in project gora by apache.
From the class GoraFlinkEngine, method createDataSink:
public OutputFormat<Tuple2<KeyOut, ValueOut>> createDataSink(Configuration conf, DataStore<KeyOut, ValueOut> dataStore) throws IOException {
    Preconditions.checkNotNull(classKeyOut);
    Preconditions.checkNotNull(classValueOut);
    Job job = Job.getInstance(conf);
    GoraOutputFormat.setOutput(job, dataStore, true);
    HadoopOutputFormat<KeyOut, ValueOut> wrappedGoraOutput = new HadoopOutputFormat<>(new GoraOutputFormat<>(), job);
    // Temp fix to prevent a NullPointerException on the Flink side: the wrapped
    // Hadoop format expects an output directory to be configured, so point it
    // at a throwaway temp directory.
    Path tempPath = Files.createTempDirectory("temp");
    job.getConfiguration().set("mapred.output.dir", tempPath.toAbsolutePath().toString());
    return wrappedGoraOutput;
}
Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in project gora by apache.
From the class GoraFlinkEngine, method createDataSink (overload taking a DataStore class):
public OutputFormat<Tuple2<KeyOut, ValueOut>> createDataSink(Configuration conf, Class<? extends DataStore<KeyOut, ValueOut>> dataStoreClass) throws IOException {
    Preconditions.checkNotNull(classKeyOut);
    Preconditions.checkNotNull(classValueOut);
    Job job = Job.getInstance(conf);
    DataStore<KeyOut, ValueOut> dataStore = DataStoreFactory.getDataStore(dataStoreClass, classKeyOut, classValueOut, job.getConfiguration());
    GoraOutputFormat.setOutput(job, dataStore, true);
    HadoopOutputFormat<KeyOut, ValueOut> wrappedGoraOutput = new HadoopOutputFormat<>(new GoraOutputFormat<>(), job);
    // Temp fix to prevent a NullPointerException on the Flink side: the wrapped
    // Hadoop format expects an output directory to be configured, so point it
    // at a throwaway temp directory.
    Path tempPath = Files.createTempDirectory("temp");
    job.getConfiguration().set("mapred.output.dir", tempPath.toAbsolutePath().toString());
    return wrappedGoraOutput;
}
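Neither overload shows how the returned sink is consumed. A minimal sketch of wiring it into a Flink batch job; the writeToGora helper, the Person type (standing in for any Gora persistent bean), and the caller-supplied engine, configuration, and store are all illustrative assumptions, not Gora API shown in this excerpt:

    // Hypothetical helper showing how the sink built above might be used.
    // 'engine', 'conf', and 'store' are supplied by the surrounding
    // application code; 'records' is the DataSet to persist.
    static void writeToGora(GoraFlinkEngine<String, Person, String, Person> engine,
            Configuration conf,
            DataStore<String, Person> store,
            DataSet<Tuple2<String, Person>> records) throws Exception {
        // Hand every (key, value) tuple to the wrapped GoraOutputFormat.
        records.output(engine.createDataSink(conf, store));
        // Trigger execution of the lazily built Flink plan.
        records.getExecutionEnvironment().execute("Write records to Gora");
    }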