use of org.apache.hadoop.mapred.SequenceFileOutputFormat in project learning-spark by databricks.
the class LogAnalyzerTotal method processAccessLogs.
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
// Calculate statistics based on the content size, and update the static variables to track this.
accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
if (stats != null) {
runningCount.getAndAdd(stats._1());
runningSum.getAndAdd(stats._2());
runningMin.set(Math.min(runningMin.get(), stats._3()));
runningMax.set(Math.max(runningMax.get(), stats._4()));
}
return null;
}
});
// A DStream of Resonse Code Counts;
JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
return Functions.responseCodeCount(rdd);
}
}).updateStateByKey(new Functions.ComputeRunningSum());
responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {
public Void call(JavaPairRDD<Integer, Long> rdd) {
currentResponseCodeCounts = rdd.take(100);
return null;
}
});
// A DStream of ipAddressCounts.
JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
return Functions.ipAddressCount(rdd);
}
});
JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
// A DStream of ipAddressCounts without transform
JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
// and joining it with the transfer amount
JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
// Save our dstream of ip address request counts
JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {
public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
return new Tuple2(new Text(e._1()), new LongWritable(e._2()));
}
});
class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
}
;
writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
// All ips more than 10
JavaDStream<String> ipAddressDStream = ipCumDStream.transform(new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {
public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
return Functions.filterIPAddress(rdd);
}
});
ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {
public Void call(JavaRDD<String> rdd) {
List<String> currentIPAddresses = rdd.take(100);
return null;
}
});
// A DStream of endpoint to count.
JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
return Functions.endpointCount(rdd);
}
}).updateStateByKey(new Functions.ComputeRunningSum());
Object ordering = Ordering.natural();
final Comparator<Long> cmp = (Comparator<Long>) ordering;
endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {
public Void call(JavaPairRDD<String, Long> rdd) {
currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
return null;
}
});
}
Aggregations