use of org.apache.spark.api.java.function.Function2 in project cdap by caskdata.
the class SparkLogParser method run.
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> runtimeArguments = sec.getRuntimeArguments();
  String inputFileSet = runtimeArguments.get("input");
  final String outputTable = runtimeArguments.get("output");
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
  final JavaPairRDD<String, String> aggregated = input
    .mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {
      @Override
      public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
        return SparkAppUsingGetDataset.parse(input._2());
      }
    })
    .reduceByKey(new Function2<LogStats, LogStats, LogStats>() {
      @Override
      public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
        return stats1.aggregate(stats2);
      }
    })
    .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {
      @Override
      public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
        final Gson gson = new Gson();
        return Lists.newArrayList(Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {
          @Override
          public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
            return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
          }
        }));
      }
    });
  // Collect all data to the driver and write to the dataset directly. That is the intent of this test.
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      KeyValueTable kvTable = context.getDataset(outputTable);
      for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
        kvTable.write(entry.getKey(), entry.getValue());
      }
    }
  });
}
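Here Function2 serves as the combiner for reduceByKey: it merges two LogStats values that share the same LogKey before the results are converted to JSON and written to the output KeyValueTable. Below is a minimal standalone sketch of the same reduceByKey pattern, using an illustrative word-count style pair RDD and a local master instead of CDAP's LogKey/LogStats types (class name and data are assumptions, not from the CDAP test):

import java.util.Arrays;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import scala.Tuple2;

public class ReduceByKeySketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("reduceByKey-sketch").setMaster("local[*]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      // Illustrative data standing in for the parsed (LogKey, LogStats) pairs.
      JavaPairRDD<String, Integer> pairs = jsc.parallelizePairs(Arrays.asList(
          new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));
      // The Function2 merges two values that share a key, just as
      // stats1.aggregate(stats2) merges two LogStats in the snippet above.
      JavaPairRDD<String, Integer> summed =
          pairs.reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);
      Map<String, Integer> result = summed.collectAsMap();
      System.out.println(result); // {a=4, b=2}
    }
  }
}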
use of org.apache.spark.api.java.function.Function2 in project cdap by caskdata.
the class ClassicSparkProgram method main.
public static void main(String[] args) throws Exception {
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  sparkConf.set("spark.kryo.registrator", MyKryoRegistrator.class.getName());
  Schema schema = Schema.recordOf("record",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("id", Schema.of(Schema.Type.INT)));
  List<StructuredRecord> records = new ArrayList<>();
  for (int i = 1; i <= 10; i++) {
    records.add(StructuredRecord.builder(schema).set("name", "Name" + i).set("id", i).build());
  }
  // This tests serialization of StructuredRecord as well as the use of a custom Kryo serializer.
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  int result = jsc.parallelize(records)
    .mapToPair(new PairFunction<StructuredRecord, MyInt, StructuredRecord>() {
      @Override
      public Tuple2<MyInt, StructuredRecord> call(StructuredRecord record) throws Exception {
        return new Tuple2<>(new MyInt((Integer) record.get("id")), record);
      }
    })
    .map(new Function<Tuple2<MyInt, StructuredRecord>, MyInt>() {
      @Override
      public MyInt call(Tuple2<MyInt, StructuredRecord> tuple) throws Exception {
        return tuple._1;
      }
    })
    .reduce(new Function2<MyInt, MyInt, MyInt>() {
      @Override
      public MyInt call(MyInt v1, MyInt v2) throws Exception {
        return new MyInt(v1.toInt() + v2.toInt());
      }
    })
    .toInt();
  if (result != 55) {
    throw new Exception("Expected result to be 55");
  }
}
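The reduce step is where Function2 appears: it pairwise-combines the MyInt values extracted from the tuples, so the expected result is 1 + 2 + ... + 10 = 55. Below is a minimal sketch of that reduce in isolation, with plain Integers and a local master instead of the MyInt wrapper and the Kryo setup (class name and configuration are illustrative assumptions):

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class ReduceSketch {
  public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("reduce-sketch").setMaster("local[*]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      List<Integer> ids = new ArrayList<>();
      for (int i = 1; i <= 10; i++) {
        ids.add(i);
      }
      // reduce applies the Function2 pairwise across all elements of the RDD.
      int result = jsc.parallelize(ids)
          .reduce((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);
      if (result != 55) {
        throw new Exception("Expected result to be 55");
      }
      System.out.println(result); // 55
    }
  }
}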
use of org.apache.spark.api.java.function.Function2 in project incubator-sdap-mudrod by apache.
the class CrawlerDetection method checkByRateInParallel.
void checkByRateInParallel() throws InterruptedException, IOException {
  JavaRDD<String> userRDD = getUserRDD(this.httpType);
  LOG.info("Original User count: {}", userRDD.count());
  int userCount = 0;
  userCount = userRDD.mapPartitions((FlatMapFunction<Iterator<String>, Integer>) iterator -> {
    ESDriver tmpES = new ESDriver(props);
    tmpES.createBulkProcessor();
    List<Integer> realUserNums = new ArrayList<>();
    while (iterator.hasNext()) {
      String s = iterator.next();
      Integer realUser = checkByRate(tmpES, s);
      realUserNums.add(realUser);
    }
    tmpES.destroyBulkProcessor();
    tmpES.close();
    return realUserNums.iterator();
  }).reduce((Function2<Integer, Integer, Integer>) (a, b) -> a + b);
  LOG.info("User count: {}", Integer.toString(userCount));
}
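The pattern here is per-partition setup and teardown: mapPartitions opens one ESDriver per partition rather than one per record, emits one count per user, and the Function2 lambda sums the partial results. A standalone sketch of the same shape without the Elasticsearch dependency follows (assuming Spark 2.x, where FlatMapFunction.call returns an Iterator; class name and data are illustrative):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;

public class MapPartitionsReduceSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("mapPartitions-sketch").setMaster("local[*]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      List<String> users = Arrays.asList("u1", "u2", "u3", "u4");
      int count = jsc.parallelize(users, 2)
          .mapPartitions((FlatMapFunction<Iterator<String>, Integer>) iterator -> {
            // Per-partition setup would go here (the real code opens an ESDriver).
            List<Integer> results = new ArrayList<>();
            while (iterator.hasNext()) {
              iterator.next();
              results.add(1); // stand-in for checkByRate(tmpES, s)
            }
            // Per-partition teardown would go here (destroy bulk processor, close).
            return results.iterator();
          })
          .reduce((Function2<Integer, Integer, Integer>) (a, b) -> a + b);
      System.out.println(count); // 4
    }
  }
}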
use of org.apache.spark.api.java.function.Function2 in project incubator-sdap-mudrod by apache.
the class SessionGenerator method genSessionByRefererInParallel.
public void genSessionByRefererInParallel(int timeThres) throws InterruptedException, IOException {
  JavaRDD<String> userRDD = getUserRDD(this.cleanupType);
  int sessionCount = 0;
  sessionCount = userRDD.mapPartitions(new FlatMapFunction<Iterator<String>, Integer>() {

    private static final long serialVersionUID = 1L;

    @Override
    public Iterator<Integer> call(Iterator<String> arg0) throws Exception {
      ESDriver tmpES = new ESDriver(props);
      tmpES.createBulkProcessor();
      List<Integer> sessionNums = new ArrayList<>();
      while (arg0.hasNext()) {
        String s = arg0.next();
        Integer sessionNum = genSessionByReferer(tmpES, s, timeThres);
        sessionNums.add(sessionNum);
      }
      tmpES.destroyBulkProcessor();
      tmpES.close();
      return sessionNums.iterator();
    }
  }).reduce(new Function2<Integer, Integer, Integer>() {

    private static final long serialVersionUID = 1L;

    @Override
    public Integer call(Integer a, Integer b) {
      return a + b;
    }
  });
  LOG.info("Initial Session count: {}", Integer.toString(sessionCount));
}
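Since Function2 has a single abstract method, the anonymous classes above (and their serialVersionUID fields) can be replaced by lambdas with identical behavior, as the CrawlerDetection example does. A small self-contained sketch of just that substitution, with an illustrative class name, data, and a local master:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Function2LambdaSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("function2-lambda-sketch").setMaster("local[*]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaRDD<Integer> sessionNums = jsc.parallelize(Arrays.asList(3, 1, 4));

      // Anonymous inner class, as in genSessionByRefererInParallel above.
      int viaAnonymousClass = sessionNums.reduce(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer a, Integer b) {
          return a + b;
        }
      });

      // Equivalent lambda; Function2 is a functional interface, so no
      // explicit subclass (or serialVersionUID) is required.
      int viaLambda = sessionNums.reduce((a, b) -> a + b);

      System.out.println(viaAnonymousClass + " == " + viaLambda); // 8 == 8
    }
  }
}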
use of org.apache.spark.api.java.function.Function2 in project java_study by aloyschen.
the class RDD method fold_ep.
/*
 * Spark fold example.
 */
public void fold_ep() {
  JavaSparkContext sc = getSc();
  sc.setLogLevel("ERROR");
  JavaRDD<Integer> fold_ep = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  Integer sum = fold_ep.fold(0, (Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);
  System.out.println("The sum is " + sum);
}
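fold behaves like reduce but takes a zero value that Spark applies once per partition and again when merging the partition results, so the zero must be a neutral element for the operation (0 for addition here). A minimal sketch contrasting the two, with the same data, an illustrative class name, and a local master:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FoldSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("fold-sketch").setMaster("local[*]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 2);
      // With 0 as the neutral element, the result is 15 no matter how many
      // partitions contribute an extra copy of the zero value.
      Integer viaFold = numbers.fold(0, (v1, v2) -> v1 + v2);
      Integer viaReduce = numbers.reduce((v1, v2) -> v1 + v2);
      System.out.println(viaFold + " == " + viaReduce); // 15 == 15
    }
  }
}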