Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.
The class HadoopMapredCompatWordCount, method main.
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));
    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                .groupBy(0)
                .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(
                        new Counter(), new Counter()));
    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));
    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
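The snippet wraps two classes that are not shown: Tokenizer, a Hadoop mapred-API Mapper, and Counter, a mapred-API Reducer passed twice so that it serves as both combiner and reducer. A minimal sketch of what they could look like, assuming the standard WordCount logic (the actual implementations live next to HadoopMapredCompatWordCount in the Flink sources):

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Assumed: splits each line into lower-cased words and emits (word, 1).
final class Tokenizer implements Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        for (String token : value.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Text(token), new LongWritable(1L));
            }
        }
    }

    @Override
    public void configure(JobConf conf) {}

    @Override
    public void close() throws IOException {}
}

// Assumed: sums the counts collected for each word.
final class Counter implements Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    public void reduce(Text key, Iterator<LongWritable> values,
            OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        long sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        out.collect(key, new LongWritable(sum));
    }

    @Override
    public void configure(JobConf conf) {}

    @Override
    public void close() throws IOException {}
}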
Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.
The class WordCount, method main.
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));
    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());
    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);
    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());
    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // Set the separator under both the new (mapreduce.*) and the old (mapred.*)
    // key, so the setting takes effect regardless of the Hadoop version in use
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));
    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
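In this variant the tokenizer is a native Flink function rather than a wrapped Hadoop mapper, and HadoopDatatypeMapper converts the results back to Writables for the Hadoop output format. A sketch of plausible implementations, inferred from how they are used above (names from the snippet, bodies assumed):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

// Assumed: drops the LongWritable offset key, splits the Text line into
// lower-cased words, and emits (word, 1) as plain Java types.
final class Tokenizer implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {

    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}

// Assumed: wraps the summed (String, Integer) pairs in Hadoop Writables.
final class HadoopDatatypeMapper implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {

    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
    }
}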
Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.
The class HBaseWriteExample, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // get input data
    DataSet<String> text = getTextDataSet(env);
    // split up the lines in pairs (2-tuples) containing: (word,1), then count them
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
    // emit result
    Job job = Job.getInstance();
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, outputTableName);
    // TODO is "mapred.output.dir" really useful?
    job.getConfiguration().set("mapred.output.dir", HBaseFlinkTestConstants.TMP_DIR);
    counts.map(new RichMapFunction<Tuple2<String, Integer>, Tuple2<Text, Mutation>>() {

        private transient Tuple2<Text, Mutation> reuse;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            reuse = new Tuple2<Text, Mutation>();
        }

        @Override
        public Tuple2<Text, Mutation> map(Tuple2<String, Integer> t) throws Exception {
            reuse.f0 = new Text(t.f0);
            Put put = new Put(t.f0.getBytes(ConfigConstants.DEFAULT_CHARSET));
            put.add(HBaseFlinkTestConstants.CF_SOME, HBaseFlinkTestConstants.Q_SOME, Bytes.toBytes(t.f1));
            reuse.f1 = put;
            return reuse;
        }
    }).output(new HadoopOutputFormat<Text, Mutation>(new TableOutputFormat<Text>(), job));
    // execute program
    env.execute("WordCount (HBase sink) Example");
}
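Two details worth noting: the mapper keeps a single transient Tuple2 and reuses it across map() calls, a common allocation-saving pattern when each record is handed straight on to the sink, and the HBase column coordinates come from a constants class that is not shown. Its assumed shape, with placeholder values only:

import org.apache.hadoop.hbase.util.Bytes;

// Assumed: field names are taken from the snippet above; the values are illustrative.
public class HBaseFlinkTestConstants {
    public static final byte[] CF_SOME = Bytes.toBytes("someCf");
    public static final byte[] Q_SOME = Bytes.toBytes("someQual");
    public static final String TMP_DIR = "/tmp/flink-hbase-test";
}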
Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.
The class HadoopMapFunctionITCase, method testDataDuplicatingMapper.
@Test
public void testDataDuplicatingMapper() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
    DataSet<Tuple2<IntWritable, Text>> duplicatingFlatMapDs =
            ds.flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new DuplicatingMapper()));
    String resultPath = tempFolder.newFile().toURI().toString();
    duplicatingFlatMapDs.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
    env.execute();
    String expected =
            "(1,Hi)\n" + "(1,HI)\n" + "(2,Hello)\n" + "(2,HELLO)\n"
            + "(3,Hello world)\n" + "(3,HELLO WORLD)\n"
            + "(4,Hello world, how are you?)\n" + "(4,HELLO WORLD, HOW ARE YOU?)\n"
            + "(5,I am fine.)\n" + "(5,I AM FINE.)\n"
            + "(6,Luke Skywalker)\n" + "(6,LUKE SKYWALKER)\n"
            + "(7,Comment#1)\n" + "(7,COMMENT#1)\n" + "(8,Comment#2)\n" + "(8,COMMENT#2)\n"
            + "(9,Comment#3)\n" + "(9,COMMENT#3)\n" + "(10,Comment#4)\n" + "(10,COMMENT#4)\n"
            + "(11,Comment#5)\n" + "(11,COMMENT#5)\n" + "(12,Comment#6)\n" + "(12,COMMENT#6)\n"
            + "(13,Comment#7)\n" + "(13,COMMENT#7)\n" + "(14,Comment#8)\n" + "(14,COMMENT#8)\n"
            + "(15,Comment#9)\n" + "(15,COMMENT#9)\n" + "(16,Comment#10)\n" + "(16,COMMENT#10)\n"
            + "(17,Comment#11)\n" + "(17,COMMENT#11)\n" + "(18,Comment#12)\n" + "(18,COMMENT#12)\n"
            + "(19,Comment#13)\n" + "(19,COMMENT#13)\n" + "(20,Comment#14)\n" + "(20,COMMENT#14)\n"
            + "(21,Comment#15)\n" + "(21,COMMENT#15)\n";
    compareResultsByLinesInMemory(expected, resultPath);
}
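The expected output implies what DuplicatingMapper does: every input record is forwarded once unchanged and once with its value upper-cased. A sketch under that assumption, using the mapred Mapper interface that HadoopMapFunction wraps:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

class DuplicatingMapper implements Mapper<IntWritable, Text, IntWritable, Text> {

    @Override
    public void map(IntWritable key, Text value,
            OutputCollector<IntWritable, Text> out, Reporter reporter) throws IOException {
        // Emit each record twice: as-is and upper-cased, producing the
        // (n,text)/(n,TEXT) pairs the test checks for.
        out.collect(key, value);
        out.collect(key, new Text(value.toString().toUpperCase()));
    }

    @Override
    public void configure(JobConf conf) {}

    @Override
    public void close() throws IOException {}
}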
Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.
The class HadoopMapFunctionITCase, method testConfigurableMapper.
@Test
public void testConfigurableMapper() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    JobConf conf = new JobConf();
    conf.set("my.filterPrefix", "Hello");
    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
    DataSet<Tuple2<IntWritable, Text>> hellos =
            ds.flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new ConfigurableMapper(), conf));
    String resultPath = tempFolder.newFile().toURI().toString();
    hellos.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
    env.execute();
    String expected = "(2,Hello)\n" + "(3,Hello world)\n" + "(4,Hello world, how are you?)\n";
    compareResultsByLinesInMemory(expected, resultPath);
}
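The JobConf handed to HadoopMapFunction carries my.filterPrefix, and the expected output keeps only values starting with "Hello", so ConfigurableMapper plausibly reads the prefix in configure() and filters on it. A sketch under that assumption:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

class ConfigurableMapper implements Mapper<IntWritable, Text, IntWritable, Text> {

    private String filterPrefix;

    @Override
    public void configure(JobConf conf) {
        // HadoopMapFunction passes the JobConf given at construction time here.
        filterPrefix = conf.get("my.filterPrefix");
    }

    @Override
    public void map(IntWritable key, Text value,
            OutputCollector<IntWritable, Text> out, Reporter reporter) throws IOException {
        // Forward only values that start with the configured prefix.
        if (value.toString().startsWith(filterPrefix)) {
            out.collect(key, value);
        }
    }

    @Override
    public void close() throws IOException {}
}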