Search in sources:

Example 1 with HadoopOutputFormat

Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in the Apache Flink project.

From class WordCountMapreduceITCase, method internalRun:

private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<LongWritable, Text>> input;
    if (isTestDeprecatedAPI) {
        // deprecated API: read the file directly through the ExecutionEnvironment
        input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath);
    } else {
        // current API: wrap the Hadoop input format via HadoopInputs (as in Example 3 below)
        input = env.createInput(HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }
    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {

        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });
    // split up the lines into pairs (2-tuples) of (word, 1), then group by the word and sum the counts
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });
    // Set up Hadoop Output Format
    Job job = Job.getInstance();
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
    // "mapred.textoutputformat.separator" is the deprecated property name; Example 2 also sets the current one
    job.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(resultPath));
    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
Also used: Path (org.apache.hadoop.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Text (org.apache.hadoop.io.Text), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat), TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), LongWritable (org.apache.hadoop.io.LongWritable), Job (org.apache.hadoop.mapreduce.Job), Tokenizer (org.apache.flink.test.testfunctions.Tokenizer)
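
Example 1 (and Example 3 below) reference a Tokenizer (org.apache.flink.test.testfunctions.Tokenizer) whose source is not shown on this page. From its use above, a FlatMapFunction from String to (word, 1) pairs, a minimal sketch could look like the following; the splitting logic in the real class may differ:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Hypothetical reconstruction of Tokenizer: emits a (word, 1) pair
// for every token found in the input line.
public final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        // normalize to lower case and split on non-word characters
        for (String token : value.toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}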

Example 2 with HadoopOutputFormat

Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in the Apache Flink project.

From class WordCount, method main:

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));
    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());
    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);
    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());
    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", // set the value for both, since this test
    " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));
    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
Also used: Path (org.apache.hadoop.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Text (org.apache.hadoop.io.Text), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat), TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat), HadoopInputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), LongWritable (org.apache.hadoop.io.LongWritable), Job (org.apache.hadoop.mapreduce.Job), IntWritable (org.apache.hadoop.io.IntWritable)
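
HadoopDatatypeMapper is likewise referenced but not shown above. Given how it is used, a map from Tuple2<String, Integer> back to Hadoop writables, a plausible sketch is below; note that the Tokenizer in this example, unlike the one in Example 1, consumes Tuple2<LongWritable, Text> records directly. The real class may differ in detail:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical reconstruction of HadoopDatatypeMapper: converts Flink's
// (String, Integer) pairs into Hadoop's (Text, IntWritable) writables.
public final class HadoopDatatypeMapper
        implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {

    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        return new Tuple2<>(new Text(value.f0), new IntWritable(value.f1));
    }
}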

Example 3 with HadoopOutputFormat

Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in the Apache Flink project.

From class WordCountMapreduceITCase, method internalRun (a variant of Example 1 that keeps only the non-deprecated HadoopInputs path):

private void internalRun() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<LongWritable, Text>> input;
    input = env.createInput(HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {

        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });
    // split up the lines into pairs (2-tuples) of (word, 1), then group by the word and sum the counts
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });
    // Set up Hadoop Output Format
    Job job = Job.getInstance();
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
    // "mapred.textoutputformat.separator" is the deprecated property name; Example 2 also sets the current one
    job.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(resultPath));
    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
Also used: Path (org.apache.hadoop.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Text (org.apache.hadoop.io.Text), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat), TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), LongWritable (org.apache.hadoop.io.LongWritable), Job (org.apache.hadoop.mapreduce.Job)
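
The three WordCount examples share the same output-side pattern: create a Hadoop Job to carry the configuration, wrap the Hadoop OutputFormat in Flink's HadoopOutputFormat, set the output path on the Job, then attach the sink and execute. A minimal self-contained sketch of just that pattern (the class and method names here are illustrative, not from the examples above):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public final class HadoopSinkSketch {

    static void writeViaHadoop(DataSet<Tuple2<Text, LongWritable>> words, String resultPath) throws Exception {
        // 1. a Hadoop Job object carries the Hadoop-side configuration
        Job job = Job.getInstance();
        // 2. wrap the Hadoop OutputFormat so Flink can drive it
        HadoopOutputFormat<Text, LongWritable> out =
                new HadoopOutputFormat<>(new TextOutputFormat<Text, LongWritable>(), job);
        // 3. the output location is configured on the Hadoop side
        TextOutputFormat.setOutputPath(job, new Path(resultPath));
        // 4. attach the sink and run the Flink job
        words.output(out);
        words.getExecutionEnvironment().execute("Hadoop Compat WordCount");
    }
}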

Example 4 with HadoopOutputFormat

Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in the Apache Gora project.

From class GoraFlinkEngine, method createDataSink (overload taking a DataStore instance):

public OutputFormat<Tuple2<KeyOut, ValueOut>> createDataSink(Configuration conf, DataStore<KeyOut, ValueOut> dataStore) throws IOException {
    Preconditions.checkNotNull(classKeyOut);
    Preconditions.checkNotNull(classValueOut);
    Job job = Job.getInstance(conf);
    GoraOutputFormat.setOutput(job, dataStore, true);
    HadoopOutputFormat<KeyOut, ValueOut> wrappedGoraOutput = new HadoopOutputFormat<>(new GoraOutputFormat<>(), job);
    // Temp fix to prevent NullPointerException from Flink side.
    Path tempPath = Files.createTempDirectory("temp");
    job.getConfiguration().set("mapred.output.dir", tempPath.toAbsolutePath().toString());
    return wrappedGoraOutput;
}
Also used: Path (java.nio.file.Path), Job (org.apache.hadoop.mapreduce.Job), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat)

Example 5 with HadoopOutputFormat

Use of org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat in the Apache Gora project.

From class GoraFlinkEngine, method createDataSink (overload taking a DataStore class):

public OutputFormat<Tuple2<KeyOut, ValueOut>> createDataSink(Configuration conf, Class<? extends DataStore<KeyOut, ValueOut>> dataStoreClass) throws IOException {
    Preconditions.checkNotNull(classKeyOut);
    Preconditions.checkNotNull(classValueOut);
    Job job = Job.getInstance(conf);
    DataStore<KeyOut, ValueOut> dataStore = DataStoreFactory.getDataStore(dataStoreClass, classKeyOut, classValueOut, job.getConfiguration());
    GoraOutputFormat.setOutput(job, dataStore, true);
    HadoopOutputFormat<KeyOut, ValueOut> wrappedGoraOutput = new HadoopOutputFormat<>(new GoraOutputFormat<>(), job);
    // Temp fix to prevent NullPointerException from Flink side.
    Path tempPath = Files.createTempDirectory("temp");
    job.getConfiguration().set("mapred.output.dir", tempPath.toAbsolutePath().toString());
    return wrappedGoraOutput;
}
Also used: Path (java.nio.file.Path), Job (org.apache.hadoop.mapreduce.Job), HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat)
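
For orientation, a usage sketch for these Gora sinks follows. Everything here beyond the createDataSink call itself is an assumption made for illustration: the engine constructor, the type parameters, the Pageview persistent class, and the variable names are not taken from the Gora sources above.

// Hypothetical wiring; GoraFlinkEngine construction is assumed to set the
// classKeyOut / classValueOut fields that the Preconditions checks require.
Configuration hadoopConf = new Configuration();              // org.apache.hadoop.conf.Configuration
GoraFlinkEngine<String, Pageview> engine =
        new GoraFlinkEngine<>(String.class, Pageview.class); // assumed constructor shape
DataStore<String, Pageview> store =
        DataStoreFactory.getDataStore(String.class, Pageview.class, hadoopConf);
OutputFormat<Tuple2<String, Pageview>> sink = engine.createDataSink(hadoopConf, store);
flinkDataSet.output(sink);                                   // attach as a Flink sink, then execute as usual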

Aggregations

HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat): 5 uses
Job (org.apache.hadoop.mapreduce.Job): 5 uses
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 3 uses
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 3 uses
Path (org.apache.hadoop.fs.Path): 3 uses
LongWritable (org.apache.hadoop.io.LongWritable): 3 uses
Text (org.apache.hadoop.io.Text): 3 uses
TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat): 3 uses
Path (java.nio.file.Path): 2 uses
HadoopInputFormat (org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat): 1 use
Tokenizer (org.apache.flink.test.testfunctions.Tokenizer): 1 use
IntWritable (org.apache.hadoop.io.IntWritable): 1 use