use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
the class SortReadFileSpark method runTool.
/**
 * Writes the input reads to {@code outputFile} with the header marked as coordinate-sorted.
 *
 * When {@code shardedOutput} is set, the reads are explicitly sorted here with a
 * {@link ReadCoordinateComparator}; otherwise the reads are handed to {@code writeReads} as-is
 * and sorting is left to it.
 *
 * @param ctx the Spark context passed on to {@code writeReads}
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    final int numReducers = getRecommendedNumReducers();
    // Put the reducer count inside the message rather than appending it to an unformatted template.
    logger.info("Using " + numReducers + " reducers");

    final SAMFileHeader readsHeader = getHeaderForReads();
    final ReadCoordinateComparator comparator = new ReadCoordinateComparator(readsHeader);

    final JavaRDD<GATKRead> sortedReads;
    if (shardedOutput) {
        // Key each read by itself so that sortByKey can order the RDD with the coordinate comparator.
        sortedReads = reads
                .mapToPair(read -> new Tuple2<>(read, null))
                .sortByKey(comparator, true, numReducers)
                .keys();
    } else {
        // sorting is done by writeReads below
        sortedReads = reads;
    }

    // Mark the header as coordinate-sorted before the reads are written.
    readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    writeReads(ctx, outputFile, sortedReads);
}
use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
the class QualityYieldMetricsCollectorSpark method collectMetrics.
/**
 * Aggregates quality-yield metrics over the supplied reads and adds the finished
 * result to {@code metricsFile}.
 *
 * @param filteredReads The reads to be analyzed for this collector.
 * @param samHeader The SAMFileHeader associated with the reads in the input RDD
 *                  (not referenced by this implementation).
 */
@Override
public void collectMetrics(final JavaRDD<GATKRead> filteredReads, final SAMFileHeader samHeader) {
    // Starting value for the aggregation, configured from the collector arguments.
    final QualityYieldMetrics zeroValue =
            new QualityYieldMetrics().setUseOriginalQualities(args.useOriginalQualities);

    // Fold each read into an accumulator, then merge the accumulators.
    final QualityYieldMetrics aggregated = filteredReads.aggregate(
            zeroValue,
            (accumulator, read) -> accumulator.addRead(read),
            (left, right) -> left.combine(right));

    final QualityYieldMetrics metrics = aggregated.finish();
    metricsFile.addMetric(metrics);
}
use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
the class AddContextDataToReadSpark method addUsingOverlapsPartitioning.
/**
 * Pairs every read with its {@link ReadContextData} (reference bases plus overlapping variants),
 * using overlaps partitioning to avoid a shuffle.
 *
 * The reference source and an {@link IntervalsSkipList} built from the collected variants are both
 * broadcast; each shard of reads then looks up its own padded reference bases and the variants
 * overlapping each read.
 *
 * @param ctx the Spark context
 * @param mappedReads the coordinate-sorted reads
 * @param referenceSource the reference source
 * @param variants the coordinate-sorted variants
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param shardSize the maximum size of each shard, in bases
 * @param shardPadding amount of extra context around each shard, in bases
 * @return a RDD of read-context pairs, in coordinate-sorted order
 */
private static JavaPairRDD<GATKRead, ReadContextData> addUsingOverlapsPartitioning(final JavaSparkContext ctx, final JavaRDD<GATKRead> mappedReads, final ReferenceMultiSource referenceSource, final JavaRDD<GATKVariant> variants, final SAMSequenceDictionary sequenceDictionary, final int shardSize, final int shardPadding) {
    // use unpadded shards (padding is only needed for reference bases)
    final List<ShardBoundary> unpaddedShards = IntervalUtils.getAllIntervalsForReference(sequenceDictionary)
            .stream()
            .flatMap(refInterval -> Shard.divideIntervalIntoShards(refInterval, shardSize, 0, sequenceDictionary).stream())
            .collect(Collectors.toList());

    final Broadcast<ReferenceMultiSource> referenceBroadcast = ctx.broadcast(referenceSource);

    // All variants are collected and indexed, then broadcast.
    final IntervalsSkipList<GATKVariant> collectedVariants = new IntervalsSkipList<>(variants.collect());
    final Broadcast<IntervalsSkipList<GATKVariant>> variantsBroadcast = ctx.broadcast(collectedVariants);

    final int maxLocatableSize = Math.min(shardSize, shardPadding);
    final JavaRDD<Shard<GATKRead>> readShards = SparkSharder.shard(ctx, mappedReads, GATKRead.class, sequenceDictionary, unpaddedShards, maxLocatableSize);

    return readShards.flatMapToPair(new PairFlatMapFunction<Shard<GATKRead>, GATKRead, ReadContextData>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Tuple2<GATKRead, ReadContextData>> call(Shard<GATKRead> shard) throws Exception {
            // get reference bases for this shard (padded)
            final SimpleInterval shardWithPadding = shard.getInterval().expandWithinContig(shardPadding, sequenceDictionary);
            final ReferenceBases paddedBases = referenceBroadcast.getValue().getReferenceBases(null, shardWithPadding);
            final IntervalsSkipList<GATKVariant> variantLookup = variantsBroadcast.getValue();

            final Iterator<Tuple2<GATKRead, ReadContextData>> readsWithContext = Iterators.transform(shard.iterator(), new Function<GATKRead, Tuple2<GATKRead, ReadContextData>>() {
                @Nullable
                @Override
                public Tuple2<GATKRead, ReadContextData> apply(@Nullable GATKRead r) {
                    final List<GATKVariant> overlappingVariants;
                    if (!SimpleInterval.isValid(r.getContig(), r.getStart(), r.getEnd())) {
                        // Sometimes we have reads that do not form valid intervals (reads that do not
                        // consume any ref bases, eg CIGAR 61S90I). In those cases, we'll just say
                        // that nothing overlaps the read.
                        overlappingVariants = Collections.emptyList();
                    } else {
                        overlappingVariants = variantLookup.getOverlapping(new SimpleInterval(r));
                    }
                    final ReadContextData contextData = new ReadContextData(paddedBases, overlappingVariants);
                    return new Tuple2<>(r, contextData);
                }
            });

            // only include reads that start in the shard
            return Iterators.filter(readsWithContext,
                    pair -> pair._1().getStart() >= shard.getStart() && pair._1().getStart() <= shard.getEnd());
        }
    });
}
use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
the class IntervalWalkerSpark method getIntervals.
/**
 * Builds a {@link JavaRDD} with one {@code IntervalWalkerContext} per interval, created from the
 * reads sharded by interval together with the (optional) broadcast reference and features.
 *
 * @param ctx the Spark context used for sharding and broadcasting
 * @return all intervals as a {@link JavaRDD}.
 */
public JavaRDD<IntervalWalkerContext> getIntervals(JavaSparkContext ctx) {
    final SAMSequenceDictionary dictionary = getBestAvailableSequenceDictionary();

    // don't shard the intervals themselves, since we want each interval to be processed by a single task
    final List<ShardBoundary> boundaries = getIntervals()
            .stream()
            .map(interval -> new ShardBoundary(interval, interval))
            .collect(Collectors.toList());

    final JavaRDD<Shard<GATKRead>> readsByInterval =
            SparkSharder.shard(ctx, getReads(), GATKRead.class, dictionary, boundaries, Integer.MAX_VALUE, shuffle);

    // The reference is broadcast only when one is available; otherwise null is passed on.
    Broadcast<ReferenceMultiSource> referenceBroadcast = null;
    if (hasReference()) {
        referenceBroadcast = ctx.broadcast(getReference());
    }

    // Likewise for the feature manager.
    Broadcast<FeatureManager> featuresBroadcast = null;
    if (features != null) {
        featuresBroadcast = ctx.broadcast(features);
    }

    return readsByInterval.map(getIntervalsFunction(referenceBroadcast, featuresBroadcast, dictionary, intervalShardPadding));
}
use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
the class CompareDuplicatesSpark method runTool.
/**
 * Compares the duplicate marking of two BAM inputs.
 *
 * The reads of both inputs are counted (a mismatch in the read count is fatal; a mismatch in the
 * duplicate count is only reported), keyed by their MarkDuplicates fragment key, cogrouped, and
 * each group is classified as a {@code MatchType} by {@code getDupes}. The per-type counts are
 * then optionally printed and/or turned into an error.
 *
 * @param ctx the Spark context used to load the second input and to broadcast the header
 * @throws UserException if the inputs have different read counts, if any group is classified as
 *         {@code SIZE_UNEQUAL} or {@code READ_MISMATCH}, or if {@code throwOnDiff} is set and any
 *         type other than {@code EQUAL} was seen
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> firstReads = filteredReads(getReads(), readArguments.getReadFilesNames().get(0));
    final ReadsSparkSource readsSource2 = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());
    final JavaRDD<GATKRead> secondReads = filteredReads(readsSource2.getParallelReads(input2, null, getIntervals(), bamPartitionSplitSize), input2);

    // Start by verifying that we have same number of reads and duplicates in each BAM.
    final long firstBamSize = firstReads.count();
    final long secondBamSize = secondReads.count();
    if (firstBamSize != secondBamSize) {
        throw new UserException("input bams have different numbers of mapped reads: " + firstBamSize + "," + secondBamSize);
    }
    System.out.println("processing bams with " + firstBamSize + " mapped reads");

    final long firstDupesCount = firstReads.filter(GATKRead::isDuplicate).count();
    final long secondDupesCount = secondReads.filter(GATKRead::isDuplicate).count();
    if (firstDupesCount != secondDupesCount) {
        // A differing duplicate total is reported but does not abort the comparison.
        System.out.println("BAMs have different number of total duplicates: " + firstDupesCount + "," + secondDupesCount);
    }
    System.out.println("first and second: " + firstDupesCount + "," + secondDupesCount);

    final Broadcast<SAMFileHeader> bHeader = ctx.broadcast(getHeaderForReads());

    // Group the reads of each BAM by MarkDuplicates key, then pair up the reads for each BAM.
    final JavaPairRDD<String, GATKRead> firstKeyed = firstReads.mapToPair(read -> new Tuple2<>(ReadsKey.keyForFragment(bHeader.getValue(), read), read));
    final JavaPairRDD<String, GATKRead> secondKeyed = secondReads.mapToPair(read -> new Tuple2<>(ReadsKey.keyForFragment(bHeader.getValue(), read), read));
    final JavaPairRDD<String, Tuple2<Iterable<GATKRead>, Iterable<GATKRead>>> cogroup = firstKeyed.cogroup(secondKeyed, getRecommendedNumReducers());

    // Produces an RDD of MatchTypes, e.g., EQUAL, DIFFERENT_REPRESENTATIVE_READ, etc. per MarkDuplicates key,
    // which is approximately start position x strand.
    final JavaRDD<MatchType> tagged = cogroup.map(v1 -> {
        SAMFileHeader header = bHeader.getValue();
        Iterable<GATKRead> iFirstReads = v1._2()._1();
        Iterable<GATKRead> iSecondReads = v1._2()._2();
        return getDupes(iFirstReads, iSecondReads, header);
    });

    // TODO: We should also produce examples of reads that don't match to make debugging easier (#1263).
    final Map<MatchType, Integer> tagCountMap = tagged.mapToPair(v1 -> new Tuple2<>(v1, 1)).reduceByKey((v1, v2) -> v1 + v2).collectAsMap();

    if (tagCountMap.get(MatchType.SIZE_UNEQUAL) != null) {
        throw new UserException("The number of reads by the MarkDuplicates key were unequal, indicating that the BAMs are not the same");
    }
    if (tagCountMap.get(MatchType.READ_MISMATCH) != null) {
        throw new UserException("The reads grouped by the MarkDuplicates key were not the same, indicating that the BAMs are not the same");
    }

    if (printSummary) {
        System.out.println("##############################");
        // Iterate the enum constants directly so the summary is always printed in declaration
        // order instead of an order that depends on hash-based set iteration.
        for (final MatchType type : MatchType.values()) {
            System.out.println(type + ": " + tagCountMap.getOrDefault(type, 0));
        }
    }

    if (throwOnDiff) {
        for (final MatchType type : MatchType.values()) {
            final Integer count = tagCountMap.get(type);
            // Any observed type other than EQUAL counts as a difference.
            if (type != MatchType.EQUAL && count != null) {
                throw new UserException("found difference between the two BAMs: " + type + " with count " + count);
            }
        }
    }
}
Aggregations