Example 31 with ExecutionEnvironment

Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.

From class HadoopMapredCompatWordCount, method main.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));
    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                    .groupBy(0)
                    .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(
                            new Counter(), new Counter()));
    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));
    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
Also used: Path (org.apache.hadoop.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Text (org.apache.hadoop.io.Text), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), HadoopInputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf)
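
The snippet references a Tokenizer mapper and a Counter reducer/combiner that are not shown. A minimal sketch of what they could look like, assuming plain classic-API (org.apache.hadoop.mapred) implementations that split each line on non-word characters and sum the per-word counts; the class names come from the snippet, but the bodies are assumptions:

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Assumed mapper: emits (word, 1) for every token in the input line.
public static final class Tokenizer implements Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        // normalize the line and split it on non-word characters
        for (String token : value.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Text(token), new LongWritable(1L));
            }
        }
    }

    @Override
    public void configure(JobConf parameters) { /* nothing to configure */ }

    @Override
    public void close() throws IOException { /* nothing to close */ }
}

// Assumed reducer, passed twice above so it serves as both combiner and reducer: sums the counts per word.
public static final class Counter implements Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        long sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        out.collect(key, new LongWritable(sum));
    }

    @Override
    public void configure(JobConf parameters) { /* nothing to configure */ }

    @Override
    public void close() throws IOException { /* nothing to close */ }
}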

Example 32 with ExecutionEnvironment

Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.

From class WordCount, method main.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));
    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());
    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);
    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());
    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    // also set the value under the deprecated "mapred" key, since this test may run against either config name
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));
    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
Also used: Path (org.apache.hadoop.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Text (org.apache.hadoop.io.Text), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat), TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat), HadoopInputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), LongWritable (org.apache.hadoop.io.LongWritable), Job (org.apache.hadoop.mapreduce.Job), IntWritable (org.apache.hadoop.io.IntWritable)
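
In this variant, Tokenizer is a Flink FlatMapFunction (not a Hadoop Mapper), and HadoopDatatypeMapper converts the result back to Writable types; neither class is shown above. A plausible sketch under those assumptions:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

// Assumed tokenizer: turns (offset, line) input records into (word, 1) pairs.
public static final class Tokenizer implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {

    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}

// Assumed converter: wraps the plain Java types back into Hadoop Writables for the output format.
public static final class HadoopDatatypeMapper implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {

    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
    }
}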

Example 33 with ExecutionEnvironment

Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.

From class HBaseWriteExample, method main.

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // get input data
    DataSet<String> text = getTextDataSet(env);
    // split up the lines in pairs (2-tuples) containing: (word, 1), then group by the word and sum
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
    // emit result
    Job job = Job.getInstance();
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, outputTableName);
    // TODO is "mapred.output.dir" really useful?
    job.getConfiguration().set("mapred.output.dir", HBaseFlinkTestConstants.TMP_DIR);
    counts.map(new RichMapFunction<Tuple2<String, Integer>, Tuple2<Text, Mutation>>() {

        private transient Tuple2<Text, Mutation> reuse;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            reuse = new Tuple2<Text, Mutation>();
        }

        @Override
        public Tuple2<Text, Mutation> map(Tuple2<String, Integer> t) throws Exception {
            reuse.f0 = new Text(t.f0);
            Put put = new Put(t.f0.getBytes(ConfigConstants.DEFAULT_CHARSET));
            put.add(HBaseFlinkTestConstants.CF_SOME, HBaseFlinkTestConstants.Q_SOME, Bytes.toBytes(t.f1));
            reuse.f1 = put;
            return reuse;
        }
    }).output(new HadoopOutputFormat<Text, Mutation>(new TableOutputFormat<Text>(), job));
    // execute program
    env.execute("WordCount (HBase sink) Example");
}
Also used: ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Configuration (org.apache.flink.configuration.Configuration), Text (org.apache.hadoop.io.Text), Put (org.apache.hadoop.hbase.client.Put), TableOutputFormat (org.apache.hadoop.hbase.mapreduce.TableOutputFormat), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), RichMapFunction (org.apache.flink.api.common.functions.RichMapFunction), Mutation (org.apache.hadoop.hbase.client.Mutation), Job (org.apache.hadoop.mapreduce.Job)
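
The parseParameters and getTextDataSet helpers (and the outputTableName field) are omitted from the snippet. A minimal sketch of how they might be wired up, assuming an optional input path plus a built-in fallback; the field defaults and usage string are illustrative, not taken from the Flink source:

// Illustrative fields: an optional text file path and the HBase table to write to.
private static String textPath = null;
private static String outputTableName = "wordcount-table"; // hypothetical default

private static boolean parseParameters(String[] args) {
    if (args.length == 2) {
        textPath = args[0];
        outputTableName = args[1];
        return true;
    } else if (args.length == 0) {
        return true; // no args: fall back to built-in sample data and the default table
    }
    System.err.println("Usage: HBaseWriteExample <text path> <output table>");
    return false;
}

private static DataSet<String> getTextDataSet(ExecutionEnvironment env) {
    if (textPath != null) {
        return env.readTextFile(textPath); // read lines from the given file
    }
    return env.fromElements("To be or not to be", "that is the question"); // small built-in sample
}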

Example 34 with ExecutionEnvironment

Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.

From class HadoopMapFunctionITCase, method testDataDuplicatingMapper.

@Test
public void testDataDuplicatingMapper() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
    DataSet<Tuple2<IntWritable, Text>> duplicatingFlatMapDs = ds.flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new DuplicatingMapper()));
    String resultPath = tempFolder.newFile().toURI().toString();
    duplicatingFlatMapDs.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
    env.execute();
    String expected = "(1,Hi)\n" + "(1,HI)\n"
            + "(2,Hello)\n" + "(2,HELLO)\n"
            + "(3,Hello world)\n" + "(3,HELLO WORLD)\n"
            + "(4,Hello world, how are you?)\n" + "(4,HELLO WORLD, HOW ARE YOU?)\n"
            + "(5,I am fine.)\n" + "(5,I AM FINE.)\n"
            + "(6,Luke Skywalker)\n" + "(6,LUKE SKYWALKER)\n"
            + "(7,Comment#1)\n" + "(7,COMMENT#1)\n" + "(8,Comment#2)\n" + "(8,COMMENT#2)\n"
            + "(9,Comment#3)\n" + "(9,COMMENT#3)\n" + "(10,Comment#4)\n" + "(10,COMMENT#4)\n"
            + "(11,Comment#5)\n" + "(11,COMMENT#5)\n" + "(12,Comment#6)\n" + "(12,COMMENT#6)\n"
            + "(13,Comment#7)\n" + "(13,COMMENT#7)\n" + "(14,Comment#8)\n" + "(14,COMMENT#8)\n"
            + "(15,Comment#9)\n" + "(15,COMMENT#9)\n" + "(16,Comment#10)\n" + "(16,COMMENT#10)\n"
            + "(17,Comment#11)\n" + "(17,COMMENT#11)\n" + "(18,Comment#12)\n" + "(18,COMMENT#12)\n"
            + "(19,Comment#13)\n" + "(19,COMMENT#13)\n" + "(20,Comment#14)\n" + "(20,COMMENT#14)\n"
            + "(21,Comment#15)\n" + "(21,COMMENT#15)\n";
    compareResultsByLinesInMemory(expected, resultPath);
}
Also used: ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), Text (org.apache.hadoop.io.Text), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
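
The DuplicatingMapper is not shown, but the expected output (each record once verbatim and once upper-cased) pins down its behavior. A sketch against the classic mapred interface:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Mapper that forwards every record unchanged and emits it a second time in upper case.
public static class DuplicatingMapper implements Mapper<IntWritable, Text, IntWritable, Text> {

    @Override
    public void map(IntWritable key, Text value, OutputCollector<IntWritable, Text> out, Reporter reporter) throws IOException {
        out.collect(key, value);
        out.collect(key, new Text(value.toString().toUpperCase()));
    }

    @Override
    public void configure(JobConf parameters) { /* unused */ }

    @Override
    public void close() throws IOException { /* unused */ }
}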

Example 35 with ExecutionEnvironment

Use of org.apache.flink.api.java.ExecutionEnvironment in project flink by apache.

From class HadoopMapFunctionITCase, method testConfigurableMapper.

@Test
public void testConfigurableMapper() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    JobConf conf = new JobConf();
    conf.set("my.filterPrefix", "Hello");
    DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
    DataSet<Tuple2<IntWritable, Text>> hellos = ds.flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new ConfigurableMapper(), conf));
    String resultPath = tempFolder.newFile().toURI().toString();
    hellos.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
    env.execute();
    String expected = "(2,Hello)\n" + "(3,Hello world)\n" + "(4,Hello world, how are you?)\n";
    compareResultsByLinesInMemory(expected, resultPath);
}
Also used: ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), Text (org.apache.hadoop.io.Text), JobConf (org.apache.hadoop.mapred.JobConf), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
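
The point of this test is that HadoopMapFunction passes the given JobConf to the wrapped mapper's configure() method. ConfigurableMapper is not shown; based on the expected output, a sketch that keeps only records whose text starts with the configured prefix could look like this:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Mapper that reads its filter prefix from the JobConf and drops non-matching records.
public static class ConfigurableMapper implements Mapper<IntWritable, Text, IntWritable, Text> {

    private String filterPrefix;

    @Override
    public void configure(JobConf parameters) {
        // "my.filterPrefix" is the key set on the JobConf in the test above
        filterPrefix = parameters.get("my.filterPrefix");
    }

    @Override
    public void map(IntWritable key, Text value, OutputCollector<IntWritable, Text> out, Reporter reporter) throws IOException {
        if (value.toString().startsWith(filterPrefix)) {
            out.collect(key, value);
        }
    }

    @Override
    public void close() throws IOException { /* unused */ }
}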

Aggregations

ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 1247
Test (org.junit.Test): 1090
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 374
Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 264
Plan (org.apache.flink.api.common.Plan): 238
Tuple5 (org.apache.flink.api.java.tuple.Tuple5): 236
OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan): 199
SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode): 139
InvalidProgramException (org.apache.flink.api.common.InvalidProgramException): 138
Vertex (org.apache.flink.graph.Vertex): 93
SingleInputPlanNode (org.apache.flink.optimizer.plan.SingleInputPlanNode): 73
Edge (org.apache.flink.graph.Edge): 70
DualInputPlanNode (org.apache.flink.optimizer.plan.DualInputPlanNode): 66
ArrayList (java.util.ArrayList): 57
Tuple1 (org.apache.flink.api.java.tuple.Tuple1): 49
SourcePlanNode (org.apache.flink.optimizer.plan.SourcePlanNode): 44
DiscardingOutputFormat (org.apache.flink.api.java.io.DiscardingOutputFormat): 39
BatchTableEnvironment (org.apache.flink.table.api.java.BatchTableEnvironment): 38
FieldSet (org.apache.flink.api.common.operators.util.FieldSet): 37
JobGraphGenerator (org.apache.flink.optimizer.plantranslate.JobGraphGenerator): 35