Use of org.apache.spark.api.java.function.Function2 in project learning-spark by databricks.
The class PerKeyAvg, method main.
public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "PerKeyAvg", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("coffee", 1));
    input.add(new Tuple2<>("coffee", 2));
    input.add(new Tuple2<>("pandas", 3));
    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    // Turn the first value seen for a key into an accumulator.
    Function<Integer, AvgCount> createAcc = new Function<Integer, AvgCount>() {
        @Override
        public AvgCount call(Integer x) {
            return new AvgCount(x, 1);
        }
    };
    // Fold another value for the same key into an existing accumulator.
    Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {
        @Override
        public AvgCount call(AvgCount a, Integer x) {
            a.total_ += x;
            a.num_ += 1;
            return a;
        }
    };
    // Merge accumulators computed on different partitions.
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
    // Not actually used by combineByKey below.
    AvgCount initial = new AvgCount(0, 0);
    JavaPairRDD<String, AvgCount> avgCounts = rdd.combineByKey(createAcc, addAndCount, combine);
    Map<String, AvgCount> countMap = avgCounts.collectAsMap();
    for (Entry<String, AvgCount> entry : countMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue().avg());
    }
}
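The AvgCount accumulator used above (and in the mapPartitions example that follows) is not part of the snippet. A minimal sketch of what such a class looks like, with the fields and method inferred from the calls above rather than copied from the project source:

// Hypothetical AvgCount accumulator assumed by PerKeyAvg; field and method names
// are inferred from the snippet above, not taken from the project source.
public static class AvgCount implements java.io.Serializable {
    public int total_;
    public int num_;
    public AvgCount(int total, int num) {
        total_ = total;
        num_ = num;
    }
    public float avg() {
        return total_ / (float) num_;
    }
}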
Use of org.apache.spark.api.java.function.Function2 in project learning-spark by databricks.
The class BasicAvgMapPartitions, method run.
public void run(String master) {
    JavaSparkContext sc = new JavaSparkContext(master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    // Compute one partial AvgCount per partition.
    FlatMapFunction<Iterator<Integer>, AvgCount> setup = new FlatMapFunction<Iterator<Integer>, AvgCount>() {
        @Override
        public Iterable<AvgCount> call(Iterator<Integer> input) {
            AvgCount a = new AvgCount(0, 0);
            while (input.hasNext()) {
                a.total_ += input.next();
                a.num_ += 1;
            }
            ArrayList<AvgCount> ret = new ArrayList<>();
            ret.add(a);
            return ret;
        }
    };
    // Merge the per-partition accumulators into a single result.
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
    AvgCount result = rdd.mapPartitions(setup).reduce(combine);
    System.out.println(result.avg());
}
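The FlatMapFunction above uses the Spark 1.x signature, where call returns an Iterable. From Spark 2.0 onward, call must return an Iterator instead, so the per-partition function would need to be adapted. A minimal sketch under that newer API (the setup2x name is just a placeholder):

// Spark 2.x+ variant of the per-partition function: call(...) returns an Iterator.
FlatMapFunction<Iterator<Integer>, AvgCount> setup2x = new FlatMapFunction<Iterator<Integer>, AvgCount>() {
    @Override
    public Iterator<AvgCount> call(Iterator<Integer> input) {
        AvgCount a = new AvgCount(0, 0);
        while (input.hasNext()) {
            a.total_ += input.next();
            a.num_ += 1;
        }
        return java.util.Collections.singletonList(a).iterator();
    }
};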
Use of org.apache.spark.api.java.function.Function2 in project java_study by aloyschen.
The class RDD, method reduce.
/*
 * spark reduce example
 */
public void reduce() {
    JavaSparkContext sc = getSc();
    sc.setLogLevel("ERROR");
    JavaRDD<Integer> reduce_rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    // Sum all elements; the cast just pins the lambda to Spark's Function2 interface.
    Integer sum = reduce_rdd.reduce((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);
    System.out.println("the sum is " + sum);
}
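The getSc() helper is defined elsewhere in the RDD class and is not shown here. A minimal sketch of what such a helper might look like, using a placeholder app name and master URL rather than the project's actual settings:

// Hypothetical helper assumed by reduce() and combine_key(): builds a JavaSparkContext.
// The app name and master URL below are placeholders, not the project's real configuration.
private JavaSparkContext getSc() {
    SparkConf conf = new SparkConf().setAppName("java_study_rdd").setMaster("local[*]");
    return new JavaSparkContext(conf);
}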
Use of org.apache.spark.api.java.function.Function2 in project java_study by aloyschen.
The class RDD, method combine_key.
public void combine_key() {
    JavaSparkContext sc = getSc();
    sc.setLogLevel("ERROR");
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("Java", 1));
    input.add(new Tuple2<>("C++", 1));
    input.add(new Tuple2<>("Java", 2));
    input.add(new Tuple2<>("Java", 1));
    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    // Create an accumulator from the first value seen for a key.
    Function<Integer, AvgCount> creatAvg = (Function<Integer, AvgCount>) x -> new AvgCount(x, 1);
    // Fold another value for the same key into an existing accumulator.
    Function2<AvgCount, Integer, AvgCount> addAndCount = (Function2<AvgCount, Integer, AvgCount>) (a, x) -> {
        a.total_ += x;
        a.count_ += 1;
        return a;
    };
    // Merge accumulators built on different partitions.
    Function2<AvgCount, AvgCount, AvgCount> combine = (Function2<AvgCount, AvgCount, AvgCount>) (a, b) -> {
        a.count_ += b.count_;
        a.total_ += b.total_;
        return a;
    };
    // Not actually used by combineByKey below.
    AvgCount initial = new AvgCount(0, 0);
    JavaPairRDD<String, AvgCount> avgCounts = rdd.combineByKey(creatAvg, addAndCount, combine);
    Map<String, AvgCount> countMap = avgCounts.collectAsMap();
    for (Map.Entry<String, AvgCount> entry : countMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue().avg());
    }
}
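Note that this project's accumulator uses a count_ field rather than the num_ field of the learning-spark AvgCount. Also, the explicit interface casts in front of the lambdas are not required, since the declared variable types already fix the target functional interfaces. A minimal equivalent with the casts dropped (same names, same logic):

// Equivalent declarations without the explicit casts; the variable types
// already tell the compiler which Spark functional interface each lambda implements.
Function<Integer, AvgCount> creatAvg = x -> new AvgCount(x, 1);
Function2<AvgCount, Integer, AvgCount> addAndCount = (a, x) -> {
    a.total_ += x;
    a.count_ += 1;
    return a;
};
Function2<AvgCount, AvgCount, AvgCount> combine = (a, b) -> {
    a.count_ += b.count_;
    a.total_ += b.total_;
    return a;
};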
Use of org.apache.spark.api.java.function.Function2 in project cdap by caskdata.
The class WordCount, method main.
public static void main(String[] args) throws Exception {
    String inputFile = args[0];
    String outputFile = args[1];
    // Create a Java Spark Context.
    SparkConf conf = new SparkConf().setAppName("wordCount");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load our input data, assuming each line is one word.
    JavaRDD<String> words = sc.textFile(inputFile);
    // Transform into word and count.
    JavaRDD<String> counts = words.mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String x) {
            return new Tuple2<>(x, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    }).map(new Function<Tuple2<String, Integer>, String>() {
        @Override
        public String call(Tuple2<String, Integer> input) throws Exception {
            return input._1() + " " + input._2();
        }
    });
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile(outputFile);
}
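The anonymous-class style above is verbose; the same pipeline is often written with Java 8 lambdas. A sketch of the equivalent chain, assuming the same words RDD and outputFile (the pairs and sums variable names are placeholders, not the project's code):

// Lambda form of the same transformation chain; behavior matches the anonymous classes above.
JavaPairRDD<String, Integer> pairs = words.mapToPair(x -> new Tuple2<>(x, 1));
JavaPairRDD<String, Integer> sums = pairs.reduceByKey((x, y) -> x + y);
JavaRDD<String> lines = sums.map(t -> t._1() + " " + t._2());
lines.saveAsTextFile(outputFile);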