Search in sources:

Example 1 with JavaRDD

Use of org.apache.spark.api.java.JavaRDD in project learning-spark by databricks.

The class LogAnalyzerAppMain, method main.

public static void main(String[] args) throws IOException {
    Flags.setFromCommandLineArgs(THE_OPTIONS, args);
    // Set up the Spark configuration.
    SparkConf conf = new SparkConf().setAppName("A Databricks Reference Application: Logs Analysis with Spark");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Flags.getInstance().getSlideInterval());
    // Checkpointing must be enabled to use the updateStateByKey function & windowed operations.
    jssc.checkpoint(Flags.getInstance().getCheckpointDirectory());
    // This method monitors a directory for new files to read in for streaming.
    JavaDStream<String> logData = jssc.textFileStream(Flags.getInstance().getLogsDirectory());
    JavaDStream<ApacheAccessLog> accessLogsDStream = logData.map(new Functions.ParseFromLogLine()).cache();
    final LogAnalyzerTotal logAnalyzerTotal = new LogAnalyzerTotal();
    final LogAnalyzerWindowed logAnalyzerWindowed = new LogAnalyzerWindowed();
    // Process the DStream which gathers stats for all of time.
    logAnalyzerTotal.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);
    // Calculate statistics for the last time interval.
    logAnalyzerWindowed.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);
    // Render the output each time there is a new RDD in the accessLogsDStream.
    final Renderer renderer = new Renderer();
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {

        public Void call(JavaRDD<ApacheAccessLog> rdd) {
            // Call this to output the stats.
            try {
                renderer.render(logAnalyzerTotal.getLogStatistics(), logAnalyzerWindowed.getLogStatistics());
            } catch (Exception e) {
                // Log render failures rather than silently swallowing them.
                e.printStackTrace();
            }
            return null;
        }
    });
    // Start the streaming computation.
    jssc.start();
    // Wait for the computation to terminate
    jssc.awaitTermination();
}
Also used: IOException (java.io.IOException), JavaRDD (org.apache.spark.api.java.JavaRDD), JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext), SparkConf (org.apache.spark.SparkConf)
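
The Function<JavaRDD<T>, Void> overload of foreachRDD used above belongs to the Spark 1.x API; it was deprecated when the VoidFunction overload arrived in Spark 1.6 and removed in 2.0. A minimal sketch of the same rendering hook, assuming Spark 1.6+ and Java 8:

// Same rendering hook as above, written against the VoidFunction overload of foreachRDD.
accessLogsDStream.foreachRDD(rdd -> {
    try {
        renderer.render(logAnalyzerTotal.getLogStatistics(),
                        logAnalyzerWindowed.getLogStatistics());
    } catch (Exception e) {
        // Log render failures rather than silently swallowing them.
        e.printStackTrace();
    }
});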

Example 2 with JavaRDD

Use of org.apache.spark.api.java.JavaRDD in project learning-spark by databricks.

The class LogAnalyzerTotal, method processAccessLogs.

public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    // Calculate statistics based on the content size, and update the static variables to track this.
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {

        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
            if (stats != null) {
                runningCount.getAndAdd(stats._1());
                runningSum.getAndAdd(stats._2());
                runningMin.set(Math.min(runningMin.get(), stats._3()));
                runningMax.set(Math.max(runningMax.get(), stats._4()));
            }
            return null;
        }
    });
    // A DStream of response code counts.
    JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {

        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.responseCodeCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {

        public Void call(JavaPairRDD<Integer, Long> rdd) {
            currentResponseCodeCounts = rdd.take(100);
            return null;
        }
    });
    // A DStream of ipAddressCounts.
    JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {

        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.ipAddressCount(rdd);
        }
    });
    JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
    // The same IP address counts computed without transform, using mapToPair directly.
    JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
    JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
    // Compute the bytes transferred per IP address, and join it with the request counts.
    JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
    JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
    JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
    // Save our DStream of IP address request counts.
    JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {

        public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
            return new Tuple2<>(new Text(e._1()), new LongWritable(e._2()));
        }
    });
    class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
    }
    writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
    // All IP addresses with more than 10 requests.
    JavaDStream<String> ipAddressDStream = ipCumDStream.transform(new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {

        public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
            return Functions.filterIPAddress(rdd);
        }
    });
    ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {

        public Void call(JavaRDD<String> rdd) {
            // Assign to the enclosing field; declaring a new local here would discard the result.
            currentIPAddresses = rdd.take(100);
            return null;
        }
    });
    // A DStream of endpoint to count.
    JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {

        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.endpointCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    // Cast through Object to coerce Ordering.natural() into a Comparator<Long>.
    Object ordering = Ordering.natural();
    final Comparator<Long> cmp = (Comparator<Long>) ordering;
    endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {

        public Void call(JavaPairRDD<String, Long> rdd) {
            currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
            return null;
        }
    });
}
Also used: SequenceFileOutputFormat (org.apache.hadoop.mapred.SequenceFileOutputFormat), Comparator (java.util.Comparator), VoidFunction (org.apache.spark.api.java.function.VoidFunction), Function (org.apache.spark.api.java.function.Function), PairFunction (org.apache.spark.api.java.function.PairFunction), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), List (java.util.List), LongWritable (org.apache.hadoop.io.LongWritable), Text (org.apache.hadoop.io.Text), JavaRDD (org.apache.spark.api.java.JavaRDD), Tuple4 (scala.Tuple4), Tuple2 (scala.Tuple2), AtomicLong (java.util.concurrent.atomic.AtomicLong)
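
Functions.ComputeRunningSum, passed to updateStateByKey twice above, is not shown in this result. A plausible sketch of it, assuming the Spark 1.x Java API (which used Guava's Optional for per-key state): the update function folds each batch's new values for a key into the running total carried in the state.

import com.google.common.base.Optional;
import org.apache.spark.api.java.function.Function2;
import java.util.List;

// Hypothetical sketch of the ComputeRunningSum referenced above: an
// updateStateByKey function that adds each batch's values for a key to the
// running total carried in the per-key state.
public static class ComputeRunningSum
        implements Function2<List<Long>, Optional<Long>, Optional<Long>> {
    public Optional<Long> call(List<Long> nums, Optional<Long> current) {
        long sum = current.or(0L);  // start from the existing state, or 0
        for (long num : nums) {
            sum += num;
        }
        return Optional.of(sum);
    }
}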

Example 3 with JavaRDD

Use of org.apache.spark.api.java.JavaRDD in project learning-spark by databricks.

The class LogAnalyzerWindowed, method processAccessLogs.

public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaDStream<String> ip = accessLogsDStream.map(new Function<ApacheAccessLog, String>() {

        public String call(ApacheAccessLog entry) {
            return entry.getIpAddress();
        }
    });
    // reduceByWindow
    JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {

        public Long call(ApacheAccessLog entry) {
            return 1L;
        }
    }).reduceByWindow(new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCountRBW.print();
    // reduceByKeyAndWindow
    JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(new PairFunction<ApacheAccessLog, String, Long>() {

        public Tuple2<String, Long> call(ApacheAccessLog entry) {
            return new Tuple2<>(entry.getIpAddress(), 1L);
        }
    });
    JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(// Adding elements in the new slice
    new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, // Removing elements from the oldest slice
    new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    ipCountDStream.print();
    // Use countByWindow
    JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCount.print();
    ipAddressRequestCount.print();
    // Use a transform for the response code count.
    JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {

        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
            return Functions.responseCodeCount(logs);
        }
    });
    windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {

        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> contentSizeStats = Functions.contentSizeStats(accessLogs);
            List<Tuple2<Integer, Long>> responseCodeToCount = Functions.responseCodeCount(accessLogs).take(100);
            JavaPairRDD<String, Long> ipAddressCounts = Functions.ipAddressCount(accessLogs);
            List<String> ipAddresses = Functions.filterIPAddress(ipAddressCounts).take(100);
            Object ordering = Ordering.natural();
            Comparator<Long> cmp = (Comparator<Long>) ordering;
            List<Tuple2<String, Long>> topEndpoints = Functions.endpointCount(accessLogs).top(10, new Functions.ValueComparator<String, Long>(cmp));
            logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, ipAddresses, topEndpoints);
            return null;
        }
    });
}
Also used: Comparator (java.util.Comparator), Function (org.apache.spark.api.java.function.Function), PairFunction (org.apache.spark.api.java.function.PairFunction), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), List (java.util.List), JavaRDD (org.apache.spark.api.java.JavaRDD), Tuple4 (scala.Tuple4), Tuple2 (scala.Tuple2)
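
The two Function2 arguments to reduceByKeyAndWindow above are what make the window incremental: the first folds in counts from the batch entering the window, the second retracts counts from the batch leaving it, so Spark never recomputes the whole window (this is also why Example 1 enables checkpointing). A minimal sketch of the same call, assuming Java 8 lambdas:

// Incremental windowed count of requests per IP, written with lambdas.
JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
        (v1, v2) -> v1 + v2,  // add counts from the batch entering the window
        (v1, v2) -> v1 - v2,  // subtract counts from the batch leaving the window
        Flags.getInstance().getWindowLength(),
        Flags.getInstance().getSlideInterval());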

Example 4 with JavaRDD

Use of org.apache.spark.api.java.JavaRDD in project camel by apache.

The class DataFrameSparkProducer, method collectResults.

// Helpers
protected void collectResults(Exchange exchange, Object result) {
    if (result instanceof JavaRDD) {
        JavaRDD rddResults = (JavaRDD) result;
        if (getEndpoint().isCollect()) {
            exchange.getIn().setBody(rddResults.collect());
        } else {
            exchange.getIn().setBody(result);
            exchange.getIn().setHeader(SPARK_RDD_HEADER, result);
        }
    } else {
        exchange.getIn().setBody(result);
    }
}
Also used: JavaRDD (org.apache.spark.api.java.JavaRDD)

Example 5 with JavaRDD

Use of org.apache.spark.api.java.JavaRDD in project camel by apache.

The class RddSparkProducer, method collectResults.

// Helpers
protected void collectResults(Exchange exchange, Object result) {
    if (result instanceof JavaRDD) {
        JavaRDD rddResults = (JavaRDD) result;
        if (getEndpoint().isCollect()) {
            exchange.getIn().setBody(rddResults.collect());
        } else {
            exchange.getIn().setBody(result);
            exchange.getIn().setHeader(SPARK_RDD_HEADER, result);
        }
    } else {
        exchange.getIn().setBody(result);
    }
}
Also used: JavaRDD (org.apache.spark.api.java.JavaRDD)
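
For context, a minimal usage sketch for these producers, assuming camel-spark's rdd endpoint; the registry bean names #myRdd and #myRddCallback are hypothetical. The producer hands the callback's result to collectResults above: a JavaRDD result is materialized with collect() when the endpoint's collect option is enabled, otherwise the RDD itself travels in the message body and in the SPARK_RDD_HEADER header.

import org.apache.camel.builder.RouteBuilder;

// Hypothetical route: each message to direct:count triggers the Spark job
// bound to #myRdd/#myRddCallback, and collectResults places the outcome in
// the exchange body.
public class RddRoute extends RouteBuilder {
    @Override
    public void configure() {
        from("direct:count")
                .to("spark:rdd?rdd=#myRdd&rddCallback=#myRddCallback");
    }
}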

Aggregations

JavaRDD (org.apache.spark.api.java.JavaRDD): 56
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 32
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 24
List (java.util.List): 22
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 20
Collectors (java.util.stream.Collectors): 19
Tuple2 (scala.Tuple2): 19
Argument (org.broadinstitute.barclay.argparser.Argument): 17
Broadcast (org.apache.spark.broadcast.Broadcast): 15
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 15
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 14
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 14
UserException (org.broadinstitute.hellbender.exceptions.UserException): 14
CommandLineProgramProperties (org.broadinstitute.barclay.argparser.CommandLineProgramProperties): 13
GATKSparkTool (org.broadinstitute.hellbender.engine.spark.GATKSparkTool): 13
IOException (java.io.IOException): 12
Serializable (java.io.Serializable): 12
IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils): 12
java.util (java.util): 11
ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource): 11