Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class LogAnalyzerTotal, method processAccessLogs.
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    // Calculate statistics based on the content size, and update the static variables to track this.
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
            if (stats != null) {
                runningCount.getAndAdd(stats._1());
                runningSum.getAndAdd(stats._2());
                runningMin.set(Math.min(runningMin.get(), stats._3()));
                runningMax.set(Math.max(runningMax.get(), stats._4()));
            }
            return null;
        }
    });
    // A DStream of response code counts.
    JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
            public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
                return Functions.responseCodeCount(rdd);
            }
        }).updateStateByKey(new Functions.ComputeRunningSum());
    responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {
        public Void call(JavaPairRDD<Integer, Long> rdd) {
            currentResponseCodeCounts = rdd.take(100);
            return null;
        }
    });
    // A DStream of IP address counts.
    JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
            public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
                return Functions.ipAddressCount(rdd);
            }
        });
    JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
    // A DStream of IP address counts without transform...
    JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
    JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
    // ...and joining it with the transfer amount.
    JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
    JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
    JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
    // Save our DStream of IP address request counts.
    JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(
        new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {
            public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
                return new Tuple2(new Text(e._1()), new LongWritable(e._2()));
            }
        });
    class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
    }
    writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
    // All IPs with more than 10 requests.
    JavaDStream<String> ipAddressDStream = ipCumDStream.transform(
        new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {
            public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
                return Functions.filterIPAddress(rdd);
            }
        });
    ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {
        public Void call(JavaRDD<String> rdd) {
            List<String> currentIPAddresses = rdd.take(100);
            return null;
        }
    });
    // A DStream of endpoint to count.
    JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
            public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
                return Functions.endpointCount(rdd);
            }
        }).updateStateByKey(new Functions.ComputeRunningSum());
    Object ordering = Ordering.natural();
    final Comparator<Long> cmp = (Comparator<Long>) ordering;
    endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {
        public Void call(JavaPairRDD<String, Long> rdd) {
            currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
            return null;
        }
    });
}
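The stateful totals above all go through Functions.ComputeRunningSum, which is not shown here. As a rough guide only, a running-sum update function for updateStateByKey looks something like the sketch below; it assumes the Spark 1.x Java streaming API (which wraps the state in Guava's Optional), and the class name ComputeRunningSumSketch is illustrative, not the project's actual helper.

import java.util.List;
import com.google.common.base.Optional;
import org.apache.spark.api.java.function.Function2;

// Sketch of a running-sum state update function for updateStateByKey (assumed shape,
// not the learning-spark implementation).
public class ComputeRunningSumSketch
        implements Function2<List<Long>, Optional<Long>, Optional<Long>> {
    @Override
    public Optional<Long> call(List<Long> newCounts, Optional<Long> currentTotal) {
        long total = currentTotal.or(0L); // previous total, or 0 the first time the key is seen
        for (Long count : newCounts) {
            total += count;               // add the counts from the current batch
        }
        return Optional.of(total);        // returned value becomes the new state for this key
    }
}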
Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class LogAnalyzerWindowed, method processAccessLogs.
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaDStream<String> ip = accessLogsDStream.map(new Function<ApacheAccessLog, String>() {
        public String call(ApacheAccessLog entry) {
            return entry.getIpAddress();
        }
    });
    // reduceByWindow
    JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {
        public Long call(ApacheAccessLog entry) {
            return 1L;
        }
    }).reduceByWindow(new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCountRBW.print();
    // reduceByKeyAndWindow
    JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(
        new PairFunction<ApacheAccessLog, String, Long>() {
            public Tuple2<String, Long> call(ApacheAccessLog entry) {
                return new Tuple2(entry.getIpAddress(), 1L);
            }
        });
    JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
        // Adding elements in the new slice
        new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
                return v1 + v2;
            }
        },
        // Removing elements from the oldest slice
        new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
                return v1 - v2;
            }
        },
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    ipCountDStream.print();
    // Use countByWindow
    JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCount.print();
    ipAddressRequestCount.print();
    // Use a transform for the response code count.
    JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
            public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
                return Functions.responseCodeCount(logs);
            }
        });
    windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> contentSizeStats = Functions.contentSizeStats(accessLogs);
            List<Tuple2<Integer, Long>> responseCodeToCount = Functions.responseCodeCount(accessLogs).take(100);
            JavaPairRDD<String, Long> ipAddressCounts = Functions.ipAddressCount(accessLogs);
            List<String> ip = Functions.filterIPAddress(ipAddressCounts).take(100);
            Object ordering = Ordering.natural();
            Comparator<Long> cmp = (Comparator<Long>) ordering;
            List<Tuple2<String, Long>> topEndpoints = Functions.endpointCount(accessLogs).top(10,
                new Functions.ValueComparator<String, Long>(cmp));
            logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, ip, topEndpoints);
            return null;
        }
    });
}
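For comparison, the reduceByKeyAndWindow step can be written more compactly with Java 8 lambdas on Spark releases that accept them. The helper below is only a sketch: the method name windowedIpCounts and the explicit Duration parameters are illustrative stand-ins for Flags.getInstance().getWindowLength()/getSlideInterval(). Note that the inverse-reduce overload used here (and above) requires checkpointing to be enabled on the streaming context.

// Sketch: windowed per-IP request counts with lambdas (illustrative helper, not from the project).
static JavaPairDStream<String, Long> windowedIpCounts(JavaDStream<ApacheAccessLog> accessLogsDStream,
                                                      Duration windowLength, Duration slideInterval) {
    return accessLogsDStream
        .mapToPair(entry -> new Tuple2<>(entry.getIpAddress(), 1L))
        .reduceByKeyAndWindow(
            (v1, v2) -> v1 + v2,   // add counts for batches entering the window
            (v1, v2) -> v1 - v2,   // subtract counts for batches leaving the window
            windowLength, slideInterval);
}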
Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class KeyValueMapFilter, method main.
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage: KeyValueMapFilter sparkMaster inputFile");
    }
    String master = args[0];
    String inputFile = args[1];
    JavaSparkContext sc = new JavaSparkContext(master, "KeyValueMapFilter",
        System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> input = sc.textFile(inputFile);
    // Key each line by its first word, keeping the whole line as the value.
    PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String x) {
            return new Tuple2(x.split(" ")[0], x);
        }
    };
    // Keep only pairs whose value (the full line) is shorter than 20 characters.
    Function<Tuple2<String, String>, Boolean> longWordFilter = new Function<Tuple2<String, String>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, String> input) {
            return (input._2().length() < 20);
        }
    };
    JavaPairRDD<String, String> rdd = input.mapToPair(keyData);
    JavaPairRDD<String, String> result = rdd.filter(longWordFilter);
    Map<String, String> resultMap = result.collectAsMap();
    for (Entry<String, String> entry : resultMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue());
    }
}
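On a Spark release that accepts Java 8 lambdas, the same PairFunction and filter can be written inline. The fragment below is a sketch of that equivalent, reusing the input RDD from the example above; the variable names keyed and shortLines are illustrative.

// Sketch: the same key/filter pipeline written with lambdas.
JavaPairRDD<String, String> keyed = input.mapToPair(x -> new Tuple2<>(x.split(" ")[0], x));
JavaPairRDD<String, String> shortLines = keyed.filter(pair -> pair._2().length() < 20);
shortLines.collectAsMap().forEach((k, v) -> System.out.println(k + ":" + v));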
Use of org.apache.spark.api.java.function.PairFunction in project geode by apache.
The class RDDSaveJavaDemo, method main.
public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: RDDSaveJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("RDDSaveJavaDemo");
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<String> data = new ArrayList<String>();
    data.add("abcdefg");
    data.add("abcdefgh");
    data.add("abcdefghi");
    JavaRDD<String> rdd = sc.parallelize(data);
    GeodeConnectionConf connConf = GeodeConnectionConf.apply(conf);
    // Pair each string with its length before writing it to the Geode region.
    PairFunction<String, String, Integer> func = new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) throws Exception {
            return new Tuple2<String, Integer>(s, s.length());
        }
    };
    javaFunctions(rdd).saveToGeode("str_int_region", func, connConf);
    sc.stop();
}
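With Java 8 lambdas (assuming a Spark version that accepts them), only the PairFunction definition changes; the saveToGeode call from the example stays the same. A minimal sketch:

// Sketch: pair each string with its length, written as a lambda.
PairFunction<String, String, Integer> func = s -> new Tuple2<>(s, s.length());
javaFunctions(rdd).saveToGeode("str_int_region", func, connConf);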
Use of org.apache.spark.api.java.function.PairFunction in project incubator-systemml by apache.
The class ApplyTfCSVSPARK, method runSparkJob.
/**
* Apply transformation metadata and generate the result in CSV format, as a
* JavaRDD of Strings.
*
* @param sec spark execution context
* @param inputRDD input rdd
* @param tfMtdPath transform metadata path
* @param spec transform specification as json string
* @param tmpPath temporary file path
* @param prop csv file format properties
* @param numCols number of columns
* @param headerLine header line
* @return JavaPairRDD of (Long, String) pairs
* @throws IOException if IOException occurs
* @throws ClassNotFoundException if ClassNotFoundException occurs
* @throws InterruptedException if InterruptedException occurs
* @throws IllegalArgumentException if IllegalArgumentException occurs
* @throws JSONException if JSONException occurs
*/
public static JavaPairRDD<Long, String> runSparkJob(SparkExecutionContext sec, JavaRDD<Tuple2<LongWritable, Text>> inputRDD, String tfMtdPath, String spec, String tmpPath, CSVFileFormatProperties prop, int numCols, String headerLine) throws IOException, ClassNotFoundException, InterruptedException, IllegalArgumentException, JSONException {
    // Load transformation metadata and broadcast it
    String[] naStrings = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject jspec = new JSONObject(spec);
    TfUtils _tfmapper = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), naStrings, jspec, numCols, tfMtdPath, null, tmpPath);
    _tfmapper.loadTfMetadata();
    Broadcast<TfUtils> bcast_tf = sec.getSparkContext().broadcast(_tfmapper);
    /*
     * Construct transformation metadata (map-side) -- the logic is similar
     * to GTFMTDMapper
     *
     * Note: The result of mapPartitionsWithIndex is cached so that the
     * transformed data is not redundantly computed multiple times
     */
    JavaPairRDD<Long, String> applyRDD = inputRDD
        .mapPartitionsWithIndex(new ApplyTfCSVMap(bcast_tf), true)
        .mapToPair(new PairFunction<String, Long, String>() {
            private static final long serialVersionUID = 3868143093999082931L;

            @Override
            public Tuple2<Long, String> call(String t) throws Exception {
                return new Tuple2<Long, String>(new Long(1), t);
            }
        }).cache();
    /*
     * An action to force execution of apply()
     *
     * We need to trigger the execution of this RDD so as to ensure the
     * creation of a few metadata files (headers, dummycoded information,
     * etc.), which are referenced in the caller function.
     */
    applyRDD.count();
    return applyRDD;
}
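The anonymous PairFunction here only tags every transformed row with the constant key 1 before caching. On a Spark version that accepts Java 8 lambdas, the same keying step could be sketched as below; this is an illustrative rewrite, not the project's code.

// Sketch: key each transformed CSV row with the constant 1L, then cache.
JavaPairRDD<Long, String> applyRDD = inputRDD
    .mapPartitionsWithIndex(new ApplyTfCSVMap(bcast_tf), true)
    .mapToPair(t -> new Tuple2<>(1L, t))
    .cache();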