Example 1 with Tuple4

Use of scala.Tuple4 in project learning-spark by databricks.

From the class Functions, method contentSizeStats:

@Nullable
public static final Tuple4<Long, Long, Long, Long> contentSizeStats(JavaRDD<ApacheAccessLog> accessLogRDD) {
    JavaDoubleRDD contentSizes = accessLogRDD.mapToDouble(new GetContentSize()).cache();
    long count = contentSizes.count();
    if (count == 0) {
        return null;
    }
    // Guava's Ordering implements Comparator, so the natural ordering can be used directly.
    final Comparator<Double> cmp = Ordering.<Double>natural();
    return new Tuple4<>(count, contentSizes.reduce(new SumReducer()).longValue(), contentSizes.min(cmp).longValue(), contentSizes.max(cmp).longValue());
}
Also used: Tuple4(scala.Tuple4), JavaDoubleRDD(org.apache.spark.api.java.JavaDoubleRDD), Comparator(java.util.Comparator), Nullable(javax.annotation.Nullable)
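
For orientation, here is a minimal sketch of driving contentSizeStats from a batch job. The JavaSparkContext sc, the input path, and the ApacheAccessLog.parseFromLogLine factory are assumptions for illustration, not part of the quoted source.

JavaRDD<ApacheAccessLog> accessLogs = sc.textFile("access.log").map(new Function<String, ApacheAccessLog>() {

    public ApacheAccessLog call(String line) {
        return ApacheAccessLog.parseFromLogLine(line);
    }
});
Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
if (stats != null) {
    // The fields are positional: count, sum, min, and max of the response content sizes.
    System.out.printf("count=%d sum=%d min=%d max=%d%n", stats._1(), stats._2(), stats._3(), stats._4());
}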

Example 2 with Tuple4

Use of scala.Tuple4 in project learning-spark by databricks.

From the class LogAnalyzerTotal, method processAccessLogs:

public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    // Calculate statistics based on the content size, and update the static variables to track this.
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {

        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
            if (stats != null) {
                runningCount.getAndAdd(stats._1());
                runningSum.getAndAdd(stats._2());
                runningMin.set(Math.min(runningMin.get(), stats._3()));
                runningMax.set(Math.max(runningMax.get(), stats._4()));
            }
            return null;
        }
    });
    // A DStream of response code counts.
    JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {

        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.responseCodeCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {

        public Void call(JavaPairRDD<Integer, Long> rdd) {
            currentResponseCodeCounts = rdd.take(100);
            return null;
        }
    });
    // A DStream of ipAddressCounts.
    JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {

        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.ipAddressCount(rdd);
        }
    });
    JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
    // A DStream of ipAddressCounts without transform
    JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
    JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
    // Compute the bytes transferred per IP and join with the request counts.
    JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
    JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
    JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
    // Save our dstream of ip address request counts
    JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {

        public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
            return new Tuple2<>(new Text(e._1()), new LongWritable(e._2()));
        }
    });
    class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
    }
    writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
    // All ips more than 10
    JavaDStream<String> ipAddressDStream = ipCumDStream.transform(new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {

        public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
            return Functions.filterIPAddress(rdd);
        }
    });
    ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {

        public Void call(JavaRDD<String> rdd) {
            // NOTE: this is a dead store; the sibling foreachRDD above publishes its result to a field instead.
            List<String> currentIPAddresses = rdd.take(100);
            return null;
        }
    });
    // A DStream of endpoint to count.
    JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {

        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.endpointCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    // As in contentSizeStats: Ordering implements Comparator, so use the natural ordering directly.
    final Comparator<Long> cmp = Ordering.<Long>natural();
    endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {

        public Void call(JavaPairRDD<String, Long> rdd) {
            currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
            return null;
        }
    });
}
Also used: SequenceFileOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat), Comparator(java.util.Comparator), VoidFunction(org.apache.spark.api.java.function.VoidFunction), Function(org.apache.spark.api.java.function.Function), PairFunction(org.apache.spark.api.java.function.PairFunction), JavaPairRDD(org.apache.spark.api.java.JavaPairRDD), List(java.util.List), LongWritable(org.apache.hadoop.io.LongWritable), Text(org.apache.hadoop.io.Text), JavaRDD(org.apache.spark.api.java.JavaRDD), Tuple4(scala.Tuple4), Tuple2(scala.Tuple2), AtomicLong(java.util.concurrent.atomic.AtomicLong)
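
Several of the streams above are folded with Functions.ComputeRunningSum via updateStateByKey. Its body is not shown in this listing; a plausible sketch, assuming the Spark 1.x Java streaming API (which passes the per-key state as Guava's com.google.common.base.Optional), looks like this:

class ComputeRunningSum implements Function2<List<Long>, Optional<Long>, Optional<Long>> {

    public Optional<Long> call(List<Long> newValues, Optional<Long> currentSum) {
        // Start from the previous state (or zero) and add every value seen in this batch.
        long sum = currentSum.or(0L);
        for (long v : newValues) {
            sum += v;
        }
        return Optional.of(sum);
    }
}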

Example 3 with Tuple4

Use of scala.Tuple4 in project learning-spark by databricks.

From the class LogAnalyzerWindowed, method processAccessLogs:

public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaDStream<String> ip = accessLogsDStream.map(new Function<ApacheAccessLog, String>() {

        public String call(ApacheAccessLog entry) {
            return entry.getIpAddress();
        }
    });
    // reduceByWindow
    JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {

        public Long call(ApacheAccessLog entry) {
            return 1L;
        }
    }).reduceByWindow(new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCountRBW.print();
    // reduceByKeyAndWindow
    JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(new PairFunction<ApacheAccessLog, String, Long>() {

        public Tuple2<String, Long> call(ApacheAccessLog entry) {
            return new Tuple2<>(entry.getIpAddress(), 1L);
        }
    });
    JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
    // Adding elements in the new slice
    new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, // Removing elements from the oldest slice
    new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    ipCountDStream.print();
    // Use countByWindow
    JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCount.print();
    ipAddressRequestCount.print();
    // use a transform for the response code count
    JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {

        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
            return Functions.responseCodeCount(logs);
        }
    });
    windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {

        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> contentSizeStats = Functions.contentSizeStats(accessLogs);
            List<Tuple2<Integer, Long>> responseCodeToCount = Functions.responseCodeCount(accessLogs).take(100);
            JavaPairRDD<String, Long> ipAddressCounts = Functions.ipAddressCount(accessLogs);
            List<String> ip = Functions.filterIPAddress(ipAddressCounts).take(100);
            // Ordering implements Comparator; take the natural ordering on Long directly.
            Comparator<Long> cmp = Ordering.<Long>natural();
            List<Tuple2<String, Long>> topEndpoints = Functions.endpointCount(accessLogs).top(10, new Functions.ValueComparator<String, Long>(cmp));
            logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, ip, topEndpoints);
            return null;
        }
    });
}
Also used: Comparator(java.util.Comparator), Function(org.apache.spark.api.java.function.Function), PairFunction(org.apache.spark.api.java.function.PairFunction), JavaPairRDD(org.apache.spark.api.java.JavaPairRDD), List(java.util.List), JavaRDD(org.apache.spark.api.java.JavaRDD), Tuple4(scala.Tuple4), Tuple2(scala.Tuple2)
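
Both log analyzers rank endpoints with Functions.ValueComparator via takeOrdered and top. Its definition is not part of this listing; a sketch of the likely shape, comparing key/value pairs by their value component and implementing java.io.Serializable so Spark can ship it to executors:

class ValueComparator<K, V> implements Comparator<Tuple2<K, V>>, Serializable {

    private final Comparator<V> cmp;

    public ValueComparator(Comparator<V> cmp) {
        this.cmp = cmp;
    }

    // Order pairs purely by their value, delegating to the wrapped comparator.
    public int compare(Tuple2<K, V> a, Tuple2<K, V> b) {
        return cmp.compare(a._2(), b._2());
    }
}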

Example 4 with Tuple4

Use of scala.Tuple4 in project gatk by broadinstitute.

From the class SVDiscoveryTestDataProvider, method forSimpleInversionWithHomology:

/**
     * The following four tests all target the same inversion, checking that implementations identify
     * the breakpoints correctly when given different representations of the evidence.
     * The inversion under study is
     *
     * '+' strand representation: G....100....G|ACACA|C....100....C               A....100....A|TGTGT|T....100....T
     *
     * 100 bases of 'G' form the left flank before the homology (forward-strand representation) |ACACA|; the region starting
     * with 100 bases of 'C', ending with 100 bases of 'A', and possibly (given the homology uncertainty) the homology |TGTGT|, is inverted.
     * 100 bases of 'T' form the right flanking region.
     *
     * Returns a list of four Tuple4's covering the left-flanking evidence in '+'/'-' strand representation and the right-flanking side.
     */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleInversionWithHomology(final ByteArrayOutputStream outputStream) throws IOException {
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();
    final byte[] leftLeftPlus = makeDummySequence(100, (byte) 'G');
    final byte[] leftLeftMinus = makeDummySequence(100, (byte) 'C');
    final byte[] leftRightPlus = makeDummySequence(100, (byte) 'C');
    final byte[] leftRightMinus = makeDummySequence(100, (byte) 'G');
    final byte[] rightLeftPlus = makeDummySequence(100, (byte) 'A');
    final byte[] rightLeftMinus = makeDummySequence(100, (byte) 'T');
    final byte[] rightRightPlus = makeDummySequence(100, (byte) 'T');
    final byte[] rightRightMinus = makeDummySequence(100, (byte) 'A');
    final byte[] leftHomology = "ACACA".getBytes();
    final byte[] rightHomology = "TGTGT".getBytes();
    {
        // left flanking evidence '+'/'-' strand representation
        outputStream.reset();
        outputStream.write(leftLeftPlus);
        outputStream.write(leftHomology);
        outputStream.write(rightLeftMinus);
        byte[] contigSeq = outputStream.toByteArray();
        AlignedAssembly.AlignmentInterval region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 101, 205), 1, 105, TextCigarCodec.decode("105M100S"), true, 60, 0);
        AlignedAssembly.AlignmentInterval region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 501, 605), 101, 205, TextCigarCodec.decode("100S105M"), false, 60, 0);
        final NovelAdjacencyReferenceLocations breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, new ArrayList<>(), "asm000001:tig00001"), contigSeq);
        result.add(new Tuple4<>(region1, region2, breakpoints, "asm000001:tig00001"));
        outputStream.reset();
        outputStream.write(rightLeftPlus);
        outputStream.write(rightHomology);
        outputStream.write(leftLeftMinus);
        contigSeq = outputStream.toByteArray();
        region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 501, 605), 1, 105, TextCigarCodec.decode("105M100S"), true, 60, 0);
        region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 101, 205), 101, 205, TextCigarCodec.decode("100S105M"), false, 60, 0);
        final NovelAdjacencyReferenceLocations breakpointsDetectedFromReverseStrand = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, new ArrayList<>(), "asm000001:tig00001"), contigSeq);
        result.add(new Tuple4<>(region1, region2, breakpointsDetectedFromReverseStrand, "asm000001:tig00001"));
    }
    {
        // right flanking evidence '+'/'-' strand representation
        outputStream.reset();
        outputStream.write(leftRightMinus);
        outputStream.write(rightHomology);
        outputStream.write(rightRightPlus);
        byte[] contigSeq = outputStream.toByteArray();
        AlignedAssembly.AlignmentInterval region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 201, 305), 1, 105, TextCigarCodec.decode("105M100S"), false, 60, 0);
        AlignedAssembly.AlignmentInterval region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 601, 705), 101, 205, TextCigarCodec.decode("100S105M"), true, 60, 0);
        final NovelAdjacencyReferenceLocations breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, new ArrayList<>(), "asm000001:tig00001"), contigSeq);
        result.add(new Tuple4<>(region1, region2, breakpoints, "asm000001:tig00001"));
        outputStream.reset();
        outputStream.write(rightRightMinus);
        outputStream.write(leftHomology);
        outputStream.write(leftRightPlus);
        contigSeq = outputStream.toByteArray();
        region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 601, 705), 1, 105, TextCigarCodec.decode("105M100S"), false, 60, 0);
        region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 201, 305), 101, 205, TextCigarCodec.decode("100S105M"), true, 60, 0);
        final NovelAdjacencyReferenceLocations breakpointsDetectedFromReverseStrand = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, new ArrayList<>(), "asm000001:tig00001"), contigSeq);
        result.add(new Tuple4<>(region1, region2, breakpointsDetectedFromReverseStrand, "asm000001:tig00001"));
    }
    return result;
}
Also used: Tuple4(scala.Tuple4), ArrayList(java.util.ArrayList), SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)
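
The flanking sequences above come from a makeDummySequence helper that is not shown in this listing; presumably it just fills a buffer with one repeated base, along these lines (java.util.Arrays assumed):

// Hypothetical reconstruction of the helper: a run of `length` copies of `base`.
private static byte[] makeDummySequence(final int length, final byte base) {
    final byte[] seq = new byte[length];
    Arrays.fill(seq, base);
    return seq;
}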

Example 5 with Tuple4

Use of scala.Tuple4 in project gatk by broadinstitute.

From the class SVDiscoveryTestDataProvider, method forSimpleTandemDuplicationExpansionWithNovelInsertion:

/**
     * System.out.println(new String(reference.getReferenceBases(dummyOptions, new SimpleInterval("21", 25297100, 25297300)).getBases()));
     * leftFlank:  chr21:25297101-25297163
     * repeat:     chr21:25297164-25297252
     * rightFlank: chr21:25297253-25297300
     * GTTAGTAGATATTCTAGCTGACTCAGTTCAGTGTTGCTATGATTAAACAAGAGTGAGTTCCCT
     * AAAAGTAAATGTTATAAGAAATCTTAAGTATTATTTTCTTATGTTTCTAGCCTAATAAAGTGCTTTTATTAAAGCACTTTATTTAAAGG
     * CATTATTGATATTTCATTATGTTCAACAGATGGAGTTAATGTGAATGT
     *
     * insertedSequenceForwardStrandRep: CTCTCTCTCT
     *
     * Returns a list of two entries, for the positive ('+') and reverse ('-') strand representations.
     */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleTandemDuplicationExpansionWithNovelInsertion(final ByteArrayOutputStream outputStream) throws IOException {
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();
    // simple tandem duplication expansion with novel insertion, '+' strand representation
    // left reference flank: 63 bases
    final byte[] leftRefFlank = "GTTAGTAGATATTCTAGCTGACTCAGTTCAGTGTTGCTATGATTAAACAAGAGTGAGTTCCCT".getBytes();
    // right reference flank: 48 bases
    final byte[] rightRefFlank = "CATTATTGATATTTCATTATGTTCAACAGATGGAGTTAATGTGAATGT".getBytes();
    // inserted sequence: 10 bases
    final byte[] insertedSeq = "CTCTCTCTCT".getBytes();
    // duplicated unit: 89 bases
    final byte[] dup = "AAAAGTAAATGTTATAAGAAATCTTAAGTATTATTTTCTTATGTTTCTAGCCTAATAAAGTGCTTTTATTAAAGCACTTTATTTAAAGG".getBytes();
    outputStream.reset();
    outputStream.write(leftRefFlank);
    outputStream.write(dup);
    outputStream.write(insertedSeq);
    outputStream.write(dup);
    outputStream.write(rightRefFlank);
    byte[] contigSeq = outputStream.toByteArray();
    AlignedAssembly.AlignmentInterval region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297101, 25297252), 1, 152, TextCigarCodec.decode("152M147S"), true, 60, 0);
    AlignedAssembly.AlignmentInterval region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297164, 25297300), 163, 299, TextCigarCodec.decode("162S137M"), true, 60, 0);
    final NovelAdjacencyReferenceLocations breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeq);
    result.add(new Tuple4<>(region1, region2, breakpoints, "asm000001:tig00001"));
    // simple tandem duplication expansion with novel insertion '-' strand representation
    SequenceUtil.reverseComplement(leftRefFlank);
    SequenceUtil.reverseComplement(rightRefFlank);
    SequenceUtil.reverseComplement(insertedSeq);
    SequenceUtil.reverseComplement(dup);
    outputStream.reset();
    outputStream.write(rightRefFlank);
    outputStream.write(dup);
    outputStream.write(insertedSeq);
    outputStream.write(dup);
    outputStream.write(leftRefFlank);
    contigSeq = outputStream.toByteArray();
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297164, 25297300), 1, 137, TextCigarCodec.decode("137M162S"), false, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297101, 25297252), 148, 299, TextCigarCodec.decode("147S152M"), false, 60, 0);
    final NovelAdjacencyReferenceLocations breakpointsDetectedFromReverseStrand = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeq);
    result.add(new Tuple4<>(region1, region2, breakpointsDetectedFromReverseStrand, "asm000001:tig00001"));
    return result;
}
Also used: Tuple4(scala.Tuple4), ArrayList(java.util.ArrayList), SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)
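
One detail worth calling out: htsjdk's SequenceUtil.reverseComplement mutates its byte-array argument in place, which is why the '-' strand block above can reuse leftRefFlank, rightRefFlank, insertedSeq, and dup directly after the four calls. A tiny demonstration:

byte[] seq = "GATTACA".getBytes();
SequenceUtil.reverseComplement(seq); // reverses and complements in place
System.out.println(new String(seq)); // prints TGTAATC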

Aggregations

Tuple4 (scala.Tuple4): 14 uses
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 11 uses
ArrayList (java.util.ArrayList): 9 uses
Comparator (java.util.Comparator): 3 uses
List (java.util.List): 2 uses
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 2 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 2 uses
Function (org.apache.spark.api.java.function.Function): 2 uses
PairFunction (org.apache.spark.api.java.function.PairFunction): 2 uses
Tuple2 (scala.Tuple2): 2 uses
AtomicLong (java.util.concurrent.atomic.AtomicLong): 1 use
Nullable (javax.annotation.Nullable): 1 use
LongWritable (org.apache.hadoop.io.LongWritable): 1 use
Text (org.apache.hadoop.io.Text): 1 use
SequenceFileOutputFormat (org.apache.hadoop.mapred.SequenceFileOutputFormat): 1 use
JavaDoubleRDD (org.apache.spark.api.java.JavaDoubleRDD): 1 use
VoidFunction (org.apache.spark.api.java.function.VoidFunction): 1 use