use of scala.Tuple4 in project learning-spark by databricks.
the class Functions method contentSizeStats.
/**
 * Computes summary statistics over the content sizes of the given access logs.
 *
 * <p>Every log entry is mapped to a double with {@code GetContentSize}. The mapped RDD is cached
 * because four separate actions (count, reduce, min, max) traverse it, and the cache is released
 * again before this method returns.
 *
 * @param accessLogRDD the access logs to summarise
 * @return a tuple of (count, {@code SumReducer} reduction, minimum, maximum), with the last three
 *         narrowed from {@code Double} to {@code long}; {@code null} when the RDD has no elements
 */
@Nullable
public static final Tuple4<Long, Long, Long, Long> contentSizeStats(JavaRDD<ApacheAccessLog> accessLogRDD) {
    JavaDoubleRDD contentSizes = accessLogRDD.mapToDouble(new GetContentSize()).cache();
    try {
        long count = contentSizes.count();
        if (count == 0) {
            // reduce/min/max are skipped for an empty RDD; callers must handle the null.
            return null;
        }
        // The detour through Object sidesteps the generic bound on Ordering.natural().
        Object ordering = Ordering.natural();
        @SuppressWarnings("unchecked")
        final Comparator<Double> cmp = (Comparator<Double>) ordering;
        long sum = contentSizes.reduce(new SumReducer()).longValue();
        long min = contentSizes.min(cmp).longValue();
        long max = contentSizes.max(cmp).longValue();
        return new Tuple4<Long, Long, Long, Long>(count, sum, min, max);
    } finally {
        // This method may run once per streaming batch; release the cached blocks eagerly
        // instead of leaving them for later eviction.
        contentSizes.unpersist();
    }
}
use of scala.Tuple4 in project learning-spark by databricks.
the class LogAnalyzerTotal method processAccessLogs.
/**
 * Registers the cumulative ("total") analyses on the access-log stream.
 *
 * <p>The method wires up, in order: running content-size statistics, cumulative response-code
 * counts, cumulative and per-batch IP address counts, per-IP transferred bytes joined with request
 * counts, a Hadoop sequence-file dump of the per-entry IP pairs, the filtered IP address list and
 * the cumulative endpoint counts.
 *
 * @param outDir            passed to {@code saveAsHadoopFiles} as the output prefix (suffix is "pandas")
 * @param accessLogsDStream the stream of parsed access-log entries
 */
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    // Calculate statistics based on the content size, and fold them into the running totals.
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        @Override
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
            // A null result means there is nothing to add for this batch.
            if (stats != null) {
                runningCount.getAndAdd(stats._1());
                runningSum.getAndAdd(stats._2());
                // NOTE(review): get-then-set is not an atomic update; fine only if a single
                // thread ever writes these values - confirm.
                runningMin.set(Math.min(runningMin.get(), stats._3()));
                runningMax.set(Math.max(runningMax.get(), stats._4()));
            }
            return null;
        }
    });
    // A DStream of response code counts, accumulated across batches.
    JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
        @Override
        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.responseCodeCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {
        @Override
        public Void call(JavaPairRDD<Integer, Long> rdd) {
            currentResponseCodeCounts = rdd.take(100);
            return null;
        }
    });
    // A DStream of ipAddressCounts.
    JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
        @Override
        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.ipAddressCount(rdd);
        }
    });
    JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
    // A DStream of ipAddressCounts without transform
    JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
    JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
    // and joining it with the transfer amount
    JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
    JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
    // The joined stream is only built here; nothing in this method consumes it.
    JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
    // Save our dstream of ip address request counts
    // NOTE(review): this writes ipDStream (the un-reduced pairs), not ipCountsDStream - confirm intent.
    JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {
        @Override
        public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
            return new Tuple2<Text, LongWritable>(new Text(e._1()), new LongWritable(e._2()));
        }
    });
    // Local subclass that pins the generic parameters so a typed Class object can be passed below.
    class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
    }
    writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
    // All ips more than 10
    JavaDStream<String> ipAddressDStream = ipCumDStream.transform(new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {
        @Override
        public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
            return Functions.filterIPAddress(rdd);
        }
    });
    ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {
        @Override
        public Void call(JavaRDD<String> rdd) {
            // NOTE(review): this declares a local, so the result is discarded after the call.
            // The sibling callbacks assign to fields (currentResponseCodeCounts,
            // currentTopEndpoints); presumably a currentIPAddresses field was intended - confirm.
            List<String> currentIPAddresses = rdd.take(100);
            return null;
        }
    });
    // A DStream of endpoint to count.
    JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
        @Override
        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.endpointCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    // The detour through Object sidesteps the generic bound on Ordering.natural().
    Object ordering = Ordering.natural();
    @SuppressWarnings("unchecked")
    final Comparator<Long> cmp = (Comparator<Long>) ordering;
    endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {
        @Override
        public Void call(JavaPairRDD<String, Long> rdd) {
            // NOTE(review): takeOrdered returns the first elements under the comparator; whether
            // that yields the most-hit endpoints depends on ValueComparator's ordering - confirm.
            currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
            return null;
        }
    });
}
use of scala.Tuple4 in project learning-spark by databricks.
the class LogAnalyzerWindowed method processAccessLogs.
/**
 * Registers the windowed analyses on the access-log stream.
 *
 * <p>Demonstrates several equivalent windowing operators (reduceByWindow, reduceByKeyAndWindow,
 * countByWindow, countByValueAndWindow), each using the window length and slide interval from
 * {@code Flags}. It then recomputes the full set of log statistics for every window and stores
 * them in {@code logStatistics}.
 *
 * @param outDir            not used by this method
 * @param accessLogsDStream the stream of parsed access-log entries
 */
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaDStream<String> ip = accessLogsDStream.map(new Function<ApacheAccessLog, String>() {
        @Override
        public String call(ApacheAccessLog entry) {
            return entry.getIpAddress();
        }
    });
    // reduceByWindow: map every entry to 1 and keep a running sum over the window.
    JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {
        @Override
        public Long call(ApacheAccessLog entry) {
            return 1L;
        }
    }).reduceByWindow(new Function2<Long, Long, Long>() {
        @Override
        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, new Function2<Long, Long, Long>() {
        @Override
        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCountRBW.print();
    // reduceByKeyAndWindow: the same idea, keyed by IP address.
    JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(new PairFunction<ApacheAccessLog, String, Long>() {
        @Override
        public Tuple2<String, Long> call(ApacheAccessLog entry) {
            return new Tuple2<String, Long>(entry.getIpAddress(), 1L);
        }
    });
    JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
        // Adding elements in the new slice
        new Function2<Long, Long, Long>() {
            @Override
            public Long call(Long v1, Long v2) {
                return v1 + v2;
            }
        },
        // Removing elements from the oldest slice
        new Function2<Long, Long, Long>() {
            @Override
            public Long call(Long v1, Long v2) {
                return v1 - v2;
            }
        }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    ipCountDStream.print();
    // Use countByWindow
    JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCount.print();
    ipAddressRequestCount.print();
    // use a transform for the response code count (built only; nothing below consumes it)
    JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
        @Override
        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
            return Functions.responseCodeCount(logs);
        }
    });
    // Recompute the full statistics snapshot for every window.
    windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        @Override
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            // NOTE(review): contentSizeStats is @Nullable and returns null when the RDD is empty;
            // the value is handed to LogStatistics unchecked - confirm LogStatistics tolerates null.
            Tuple4<Long, Long, Long, Long> contentSizeStats = Functions.contentSizeStats(accessLogs);
            List<Tuple2<Integer, Long>> responseCodeToCount = Functions.responseCodeCount(accessLogs).take(100);
            JavaPairRDD<String, Long> ipAddressCounts = Functions.ipAddressCount(accessLogs);
            // Named differently from the enclosing method's "ip" DStream to avoid shadowing it.
            List<String> ipAddresses = Functions.filterIPAddress(ipAddressCounts).take(100);
            // The detour through Object sidesteps the generic bound on Ordering.natural().
            Object ordering = Ordering.natural();
            @SuppressWarnings("unchecked")
            Comparator<Long> cmp = (Comparator<Long>) ordering;
            List<Tuple2<String, Long>> topEndpoints = Functions.endpointCount(accessLogs).top(10, new Functions.ValueComparator<String, Long>(cmp));
            logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, ipAddresses, topEndpoints);
            return null;
        }
    });
}
use of scala.Tuple4 in project gatk by broadinstitute.
the class SVDiscoveryTestDataProvider method forSimpleInversionWithHomology.
/**
 * Builds test data for a single inversion, described by four different pieces of evidence, to
 * check that breakpoint identification agrees across representations.
 *
 * The inversion, in '+' strand representation, is
 *
 *   G....100....G|ACACA|C....100....C               A....100....A|TGTGT|T....100....T
 *
 * The 100 'G' bases are the left flank in front of the homology |ACACA|. The region that starts
 * with 100 'C' bases and ends with 100 'A' bases is inverted, possibly together with the homology
 * |TGTGT| (homology uncertainty). The 100 'T' bases are the right flank.
 *
 * @param outputStream scratch stream used to assemble each contig; it is reset before every use
 * @return a list of four Tuple4's (first alignment, second alignment, breakpoint locations, contig
 *         name): left-flank evidence in '+' and '-' strand representation, followed by right-flank
 *         evidence in '+' and '-' strand representation
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleInversionWithHomology(final ByteArrayOutputStream outputStream) throws IOException {
    final String contigName = "asm000001:tig00001";
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();

    // Building blocks: each 100-base run in forward ("Fwd") and complementary ("Rev") form.
    final byte[] upstreamFlankFwd = makeDummySequence(100, (byte) 'G');
    final byte[] upstreamFlankRev = makeDummySequence(100, (byte) 'C');
    final byte[] invertedHeadFwd = makeDummySequence(100, (byte) 'C');
    final byte[] invertedHeadRev = makeDummySequence(100, (byte) 'G');
    final byte[] invertedTailFwd = makeDummySequence(100, (byte) 'A');
    final byte[] invertedTailRev = makeDummySequence(100, (byte) 'T');
    final byte[] downstreamFlankFwd = makeDummySequence(100, (byte) 'T');
    final byte[] downstreamFlankRev = makeDummySequence(100, (byte) 'A');
    final byte[] homologyAtLeft = "ACACA".getBytes();
    final byte[] homologyAtRight = "TGTGT".getBytes();

    // Left flanking evidence, '+' strand representation.
    outputStream.reset();
    for (final byte[] piece : new byte[][] {upstreamFlankFwd, homologyAtLeft, invertedTailRev}) {
        outputStream.write(piece);
    }
    final byte[] leftPlusContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval leftPlusFirst = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 101, 205), 1, 105, TextCigarCodec.decode("105M100S"), true, 60, 0);
    final AlignedAssembly.AlignmentInterval leftPlusSecond = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 501, 605), 101, 205, TextCigarCodec.decode("100S105M"), false, 60, 0);
    final NovelAdjacencyReferenceLocations leftPlusBreakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(leftPlusFirst, leftPlusSecond, new ArrayList<>(), contigName), leftPlusContig);
    result.add(new Tuple4<>(leftPlusFirst, leftPlusSecond, leftPlusBreakpoints, contigName));

    // Left flanking evidence, '-' strand representation.
    outputStream.reset();
    for (final byte[] piece : new byte[][] {invertedTailFwd, homologyAtRight, upstreamFlankRev}) {
        outputStream.write(piece);
    }
    final byte[] leftMinusContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval leftMinusFirst = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 501, 605), 1, 105, TextCigarCodec.decode("105M100S"), true, 60, 0);
    final AlignedAssembly.AlignmentInterval leftMinusSecond = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 101, 205), 101, 205, TextCigarCodec.decode("100S105M"), false, 60, 0);
    final NovelAdjacencyReferenceLocations leftMinusBreakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(leftMinusFirst, leftMinusSecond, new ArrayList<>(), contigName), leftMinusContig);
    result.add(new Tuple4<>(leftMinusFirst, leftMinusSecond, leftMinusBreakpoints, contigName));

    // Right flanking evidence, '+' strand representation.
    outputStream.reset();
    for (final byte[] piece : new byte[][] {invertedHeadRev, homologyAtRight, downstreamFlankFwd}) {
        outputStream.write(piece);
    }
    final byte[] rightPlusContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval rightPlusFirst = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 201, 305), 1, 105, TextCigarCodec.decode("105M100S"), false, 60, 0);
    final AlignedAssembly.AlignmentInterval rightPlusSecond = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 601, 705), 101, 205, TextCigarCodec.decode("100S105M"), true, 60, 0);
    final NovelAdjacencyReferenceLocations rightPlusBreakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(rightPlusFirst, rightPlusSecond, new ArrayList<>(), contigName), rightPlusContig);
    result.add(new Tuple4<>(rightPlusFirst, rightPlusSecond, rightPlusBreakpoints, contigName));

    // Right flanking evidence, '-' strand representation.
    outputStream.reset();
    for (final byte[] piece : new byte[][] {downstreamFlankRev, homologyAtLeft, invertedHeadFwd}) {
        outputStream.write(piece);
    }
    final byte[] rightMinusContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval rightMinusFirst = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 601, 705), 1, 105, TextCigarCodec.decode("105M100S"), false, 60, 0);
    final AlignedAssembly.AlignmentInterval rightMinusSecond = new AlignedAssembly.AlignmentInterval(new SimpleInterval("20", 201, 305), 101, 205, TextCigarCodec.decode("100S105M"), true, 60, 0);
    final NovelAdjacencyReferenceLocations rightMinusBreakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(rightMinusFirst, rightMinusSecond, new ArrayList<>(), contigName), rightMinusContig);
    result.add(new Tuple4<>(rightMinusFirst, rightMinusSecond, rightMinusBreakpoints, contigName));

    return result;
}
use of scala.Tuple4 in project gatk by broadinstitute.
the class SVDiscoveryTestDataProvider method forSimpleTandemDuplicationExpansionWithNovelInsertion.
/**
 * Builds test data for a simple tandem duplication expansion that carries a novel insertion
 * between the two copies of the duplicated unit.
 *
 * The reference bases used below can be printed with
 * System.out.println(new String(reference.getReferenceBases(dummyOptions, new SimpleInterval("21", 25297100, 25297300)).getBases()));
 *
 * leftFlank:  chr21:25297101-25297163
 * repeat:     chr21:25297164-25297252
 * rightFlank: chr21:25297253-25297300
 *
 * GTTAGTAGATATTCTAGCTGACTCAGTTCAGTGTTGCTATGATTAAACAAGAGTGAGTTCCCT
 * AAAAGTAAATGTTATAAGAAATCTTAAGTATTATTTTCTTATGTTTCTAGCCTAATAAAGTGCTTTTATTAAAGCACTTTATTTAAAGG
 * CATTATTGATATTTCATTATGTTCAACAGATGGAGTTAATGTGAATGT
 *
 * insertedSequenceForwardStrandRep: CTCTCTCTCT
 *
 * @param outputStream scratch stream used to assemble each contig; it is reset before every use
 * @return a list of two entries (first alignment, second alignment, breakpoint locations, contig
 *         name), for the '+' strand and then the '-' strand representation
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleTandemDuplicationExpansionWithNovelInsertion(final ByteArrayOutputStream outputStream) throws IOException {
    final String contigName = "asm000001:tig00001";
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();

    // Sequence pieces; the trailing number is the length in bases.
    final byte[] flankBefore = "GTTAGTAGATATTCTAGCTGACTCAGTTCAGTGTTGCTATGATTAAACAAGAGTGAGTTCCCT".getBytes(); // 63
    final byte[] flankAfter = "CATTATTGATATTTCATTATGTTCAACAGATGGAGTTAATGTGAATGT".getBytes(); // 48
    final byte[] novelInsertion = "CTCTCTCTCT".getBytes(); // 10
    final byte[] repeatUnit = "AAAAGTAAATGTTATAAGAAATCTTAAGTATTATTTTCTTATGTTTCTAGCCTAATAAAGTGCTTTTATTAAAGCACTTTATTTAAAGG".getBytes(); // 89

    // '+' strand representation: flank, repeat, insertion, repeat, flank.
    outputStream.reset();
    for (final byte[] piece : new byte[][] {flankBefore, repeatUnit, novelInsertion, repeatUnit, flankAfter}) {
        outputStream.write(piece);
    }
    final byte[] plusContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval plusFirst = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297101, 25297252), 1, 152, TextCigarCodec.decode("152M147S"), true, 60, 0);
    final AlignedAssembly.AlignmentInterval plusSecond = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297164, 25297300), 163, 299, TextCigarCodec.decode("162S137M"), true, 60, 0);
    final NovelAdjacencyReferenceLocations plusBreakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(plusFirst, plusSecond, Collections.emptyList(), contigName), plusContig);
    result.add(new Tuple4<>(plusFirst, plusSecond, plusBreakpoints, contigName));

    // '-' strand representation: reverse-complement every piece (the calls below modify the
    // arrays in place, since their return values are not used), then emit them in mirrored order.
    SequenceUtil.reverseComplement(flankBefore);
    SequenceUtil.reverseComplement(flankAfter);
    SequenceUtil.reverseComplement(novelInsertion);
    SequenceUtil.reverseComplement(repeatUnit);
    outputStream.reset();
    for (final byte[] piece : new byte[][] {flankAfter, repeatUnit, novelInsertion, repeatUnit, flankBefore}) {
        outputStream.write(piece);
    }
    final byte[] minusContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval minusFirst = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297164, 25297300), 1, 137, TextCigarCodec.decode("137M162S"), false, 60, 0);
    final AlignedAssembly.AlignmentInterval minusSecond = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 25297101, 25297252), 148, 299, TextCigarCodec.decode("147S152M"), false, 60, 0);
    final NovelAdjacencyReferenceLocations minusBreakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(minusFirst, minusSecond, Collections.emptyList(), contigName), minusContig);
    result.add(new Tuple4<>(minusFirst, minusSecond, minusBreakpoints, contigName));

    return result;
}
Aggregations