Example 6 with Function2

use of org.apache.spark.api.java.function.Function2 in project learning-spark by databricks.

the class PerKeyAvg method main.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "PerKeyAvg", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("coffee", 1));
    input.add(new Tuple2<>("coffee", 2));
    input.add(new Tuple2<>("pandas", 3));
    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    Function<Integer, AvgCount> createAcc = new Function<Integer, AvgCount>() {

        @Override
        public AvgCount call(Integer x) {
            return new AvgCount(x, 1);
        }
    };
    Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, Integer x) {
            a.total_ += x;
            a.num_ += 1;
            return a;
        }
    };
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
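    // Note: "initial" below is declared but never used; combineByKey builds
    // its per-key accumulators with createAcc instead.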
    AvgCount initial = new AvgCount(0, 0);
    JavaPairRDD<String, AvgCount> avgCounts = rdd.combineByKey(createAcc, addAndCount, combine);
    Map<String, AvgCount> countMap = avgCounts.collectAsMap();
    for (Entry<String, AvgCount> entry : countMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue().avg());
    }
}
Also used : ArrayList(java.util.ArrayList) Function2(org.apache.spark.api.java.function.Function2) Function(org.apache.spark.api.java.function.Function) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
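
The example assumes an AvgCount helper class defined elsewhere in the learning-spark project. A minimal sketch of what that class must provide for the code above to compile (field and method names are taken from the usages above; implementing Serializable is needed because Spark serializes these accumulators when shipping work to executors):

class AvgCount implements java.io.Serializable {

    public int total_;
    public int num_;

    public AvgCount(int total, int num) {
        total_ = total;
        num_ = num;
    }

    // Average of all values folded into this accumulator.
    public float avg() {
        return total_ / (float) num_;
    }
}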

Example 7 with Function2

use of org.apache.spark.api.java.function.Function2 in project learning-spark by databricks.

the class BasicAvgMapPartitions method run.

public void run(String master) {
    JavaSparkContext sc = new JavaSparkContext(master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    FlatMapFunction<Iterator<Integer>, AvgCount> setup = new FlatMapFunction<Iterator<Integer>, AvgCount>() {

        @Override
        public Iterable<AvgCount> call(Iterator<Integer> input) {
            AvgCount a = new AvgCount(0, 0);
            while (input.hasNext()) {
                a.total_ += input.next();
                a.num_ += 1;
            }
            ArrayList<AvgCount> ret = new ArrayList<AvgCount>();
            ret.add(a);
            return ret;
        }
    };
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
    AvgCount result = rdd.mapPartitions(setup).reduce(combine);
    System.out.println(result.avg());
}
Also used : FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Function2(org.apache.spark.api.java.function.Function2)
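
This example targets the Spark 1.x Java API, where FlatMapFunction.call returns an Iterable. From Spark 2.0 on the same method returns an Iterator, so under Spark 2.x the setup function would look roughly like this (a sketch, assuming the same AvgCount class and a java.util.Collections import):

    FlatMapFunction<Iterator<Integer>, AvgCount> setup = new FlatMapFunction<Iterator<Integer>, AvgCount>() {

        @Override
        public Iterator<AvgCount> call(Iterator<Integer> input) {
            AvgCount a = new AvgCount(0, 0);
            while (input.hasNext()) {
                a.total_ += input.next();
                a.num_ += 1;
            }
            // One partial sum per partition, returned as a single-element iterator.
            return Collections.singletonList(a).iterator();
        }
    };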

Example 8 with Function2

use of org.apache.spark.api.java.function.Function2 in project java_study by aloyschen.

the class RDD method reduce.

/*
 * Spark reduce example.
 */
public void reduce() {
    JavaSparkContext sc = getSc();
    sc.setLogLevel("ERROR");
    JavaRDD<Integer> reduce_rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    Integer sum = reduce_rdd.reduce((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);
    System.out.println("the sum is " + sum);
}
Also used : Arrays(java.util.Arrays) Function2(org.apache.spark.api.java.function.Function2) Serializable(scala.Serializable) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction)
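
Because JavaRDD.reduce already expects a Function2<T, T, T>, the explicit cast on the lambda above is redundant. On Java 8+ the same call can be written more compactly, for example:

    // Equivalent, without the explicit Function2 cast.
    Integer sum = reduce_rdd.reduce((v1, v2) -> v1 + v2);

    // Or with a method reference.
    Integer sumRef = reduce_rdd.reduce(Integer::sum);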

Example 9 with Function2

use of org.apache.spark.api.java.function.Function2 in project java_study by aloyschen.

the class RDD method combine_key.

public void combine_key() {
    JavaSparkContext sc = getSc();
    sc.setLogLevel("ERROR");
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("Java", 1));
    input.add(new Tuple2<>("C++", 1));
    input.add(new Tuple2<>("Java", 2));
    input.add(new Tuple2<>("Java", 1));
    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    Function<Integer, AvgCount> createAvg = (Function<Integer, AvgCount>) x -> new AvgCount(x, 1);
    Function2<AvgCount, Integer, AvgCount> addAndCount = (Function2<AvgCount, Integer, AvgCount>) (a, x) -> {
        a.total_ += x;
        a.count_ += 1;
        return a;
    };
    Function2<AvgCount, AvgCount, AvgCount> combine = (Function2<AvgCount, AvgCount, AvgCount>) (a, b) -> {
        a.count_ += b.count_;
        a.total_ += b.total_;
        return a;
    };
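    // Note: "initial" below is declared but never used; combineByKey builds
    // its per-key accumulators with createAvg instead.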
    AvgCount initial = new AvgCount(0, 0);
    JavaPairRDD<String, AvgCount> avgCounts = rdd.combineByKey(createAvg, addAndCount, combine);
    Map<String, AvgCount> countMap = avgCounts.collectAsMap();
    for (Map.Entry<String, AvgCount> entry : countMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue().avg());
    }
}
Also used : ArrayList(java.util.ArrayList) Function2(org.apache.spark.api.java.function.Function2) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Map(java.util.Map)
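
To keep the averages distributed instead of printing them on the driver, the per-key accumulators can be converted with mapValues (a sketch, assuming this project's AvgCount.avg() returns a numeric average as used above):

    // Turn each per-key accumulator into its plain average; the data stays in the RDD.
    JavaPairRDD<String, Double> averages = avgCounts.mapValues(ac -> (double) ac.avg());
    averages.collectAsMap().forEach((k, v) -> System.out.println(k + ":" + v));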

Example 10 with Function2

use of org.apache.spark.api.java.function.Function2 in project cdap by caskdata.

the class WordCount method main.

public static void main(String[] args) throws Exception {
    String inputFile = args[0];
    String outputFile = args[1];
    // Create a Java Spark Context.
    SparkConf conf = new SparkConf().setAppName("wordCount");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load our input data, assuming each line is one word
    JavaRDD<String> words = sc.textFile(inputFile);
    // Transform each word into a (word, count) pair, sum the counts per word,
    // and format each result as a line of text.
    JavaRDD<String> counts = words.mapToPair(new PairFunction<String, String, Integer>() {

        @Override
        public Tuple2<String, Integer> call(String x) {
            return new Tuple2<>(x, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {

        @Override
        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    }).map(new Function<Tuple2<String, Integer>, String>() {

        @Override
        public String call(Tuple2<String, Integer> input) throws Exception {
            return input._1() + " " + input._2();
        }
    });
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile(outputFile);
}
Also used : Function2(org.apache.spark.api.java.function.Function2) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
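
As the comment in the code notes, the input is assumed to hold one word per line. For free-form text, each line would first be split into words, e.g. (a sketch assuming the Spark 2.x flatMap signature, where the function returns an Iterator; on Spark 1.x it returns an Iterable instead):

    // Split each line on whitespace so multi-word lines are counted correctly.
    JavaRDD<String> words = sc.textFile(inputFile)
            .flatMap(line -> Arrays.asList(line.split("\\s+")).iterator());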

Aggregations

Function2 (org.apache.spark.api.java.function.Function2) 13
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 10
Tuple2 (scala.Tuple2) 7
ArrayList (java.util.ArrayList) 6
SparkConf (org.apache.spark.SparkConf) 5
FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction) 5
Function (org.apache.spark.api.java.function.Function) 4
PairFunction (org.apache.spark.api.java.function.PairFunction) 4
IOException (java.io.IOException) 3
Map (java.util.Map) 3
ESDriver (org.apache.sdap.mudrod.driver.ESDriver) 3
JavaRDD (org.apache.spark.api.java.JavaRDD) 3
Arrays (java.util.Arrays) 2
Iterator (java.util.Iterator) 2
List (java.util.List) 2
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD) 2
TxRunnable (co.cask.cdap.api.TxRunnable) 1
DatasetContext (co.cask.cdap.api.data.DatasetContext) 1
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord) 1
Schema (co.cask.cdap.api.data.schema.Schema) 1