Use of org.broadinstitute.hellbender.tools.spark.utils.MapPartitioner in project gatk by broadinstitute.
From the class FindBreakpointEvidenceSpark, method getKmerIntervals.
/** find kmers for each interval */
@VisibleForTesting
static Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> getKmerIntervals(
        final Params params,
        final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
        final int nIntervals,
        final Set<SVKmer> kmerKillSet,
        final JavaRDD<GATKRead> reads,
        final Locations locations) {
    final Broadcast<Set<SVKmer>> broadcastKmerKillSet = ctx.broadcast(kmerKillSet);
    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNameAndIntervalsMultiMap =
            ctx.broadcast(qNamesMultiMap);

    // given a set of template names with interval IDs and a kill set of ubiquitous kmers,
    // produce a set of interesting kmers for each interval ID
    final int kmersPerPartitionGuess = params.cleanerKmersPerPartitionGuess;
    final int minKmers = params.cleanerMinKmerCount;
    final int maxKmers = params.cleanerMaxKmerCount;
    final int maxIntervals = params.cleanerMaxIntervals;
    final int kSize = params.kSize;
    final int maxDUSTScore = params.maxDUSTScore;
    final List<KmerAndInterval> kmerIntervals =
        reads
            .mapPartitionsToPair(readItr ->
                    new MapPartitioner<>(readItr,
                            new QNameKmerizer(broadcastQNameAndIntervalsMultiMap.value(),
                                    broadcastKmerKillSet.value(), kSize, maxDUSTScore)).iterator(), false)
            .reduceByKey(Integer::sum)
            .mapPartitions(itr ->
                    new KmerCleaner(itr, kmersPerPartitionGuess, minKmers, maxKmers, maxIntervals).iterator())
            .collect();
    broadcastQNameAndIntervalsMultiMap.destroy();
    broadcastKmerKillSet.destroy();

    // count the kmers collected for each interval ID
    final int[] intervalKmerCounts = new int[nIntervals];
    for (final KmerAndInterval kmerAndInterval : kmerIntervals) {
        intervalKmerCounts[kmerAndInterval.getIntervalId()] += 1;
    }

    // mark intervals that produced too few kmers, and record an excuse for each of them
    final Set<Integer> intervalsToKill = new HashSet<>();
    final List<AlignedAssemblyOrExcuse> intervalDispositions = new ArrayList<>();
    for (int idx = 0; idx != nIntervals; ++idx) {
        if (intervalKmerCounts[idx] < params.minKmersPerInterval) {
            intervalsToKill.add(idx);
            intervalDispositions.add(new AlignedAssemblyOrExcuse(idx, "FASTQ not written -- too few kmers"));
        }
    }

    // drop template names and kmers that belong to the killed intervals
    qNamesMultiMap.removeIf(qNameAndInterval -> intervalsToKill.contains(qNameAndInterval.getIntervalId()));
    final List<KmerAndInterval> filteredKmerIntervals = kmerIntervals.stream()
            .filter(kmerAndInterval -> !intervalsToKill.contains(kmerAndInterval.getIntervalId()))
            .collect(SVUtils.arrayListCollector(kmerIntervals.size()));
    // record the kmers with their interval IDs
    if (locations.kmerFile != null) {
        try (final OutputStreamWriter writer =
                     new OutputStreamWriter(new BufferedOutputStream(BucketUtils.createFile(locations.kmerFile)))) {
            for (final KmerAndInterval kmerAndInterval : filteredKmerIntervals) {
                writer.write(kmerAndInterval.toString(kSize) + " " + kmerAndInterval.getIntervalId() + "\n");
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't write kmer intervals file " + locations.kmerFile, ioe);
        }
    }

    return new Tuple2<>(intervalDispositions, filteredKmerIntervals);
}
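
MapPartitioner appears in this method only inside the mapPartitionsToPair call, where it adapts a per-element function (QNameKmerizer) into a single flattened iterator over a whole Spark partition. The sketch below shows that same pattern in isolation. It is a minimal, hypothetical example, assuming only what the call above suggests: that MapPartitioner's constructor takes the partition's input iterator plus a function mapping each element to an iterator of outputs, and that the resulting object exposes iterator() over the concatenated results. The WordSplitter class and splitWords method are illustrative stand-ins, not part of GATK.

import java.util.Arrays;
import java.util.Iterator;
import java.util.function.Function;

import org.apache.spark.api.java.JavaRDD;
import org.broadinstitute.hellbender.tools.spark.utils.MapPartitioner;

public class MapPartitionerSketch {
    // Hypothetical per-element function: maps one input line to an iterator of words.
    // In getKmerIntervals, QNameKmerizer plays this role, turning one GATKRead into an
    // iterator of (kmer, interval) pairs.
    static final class WordSplitter implements Function<String, Iterator<String>> {
        @Override
        public Iterator<String> apply( final String line ) {
            return Arrays.asList(line.split("\\s+")).iterator();
        }
    }

    // Hypothetical driver code: MapPartitioner wraps the partition's iterator and the
    // per-element function, presenting the concatenated outputs as one iterator, which is
    // the shape the mapPartitions lambda must return.
    static JavaRDD<String> splitWords( final JavaRDD<String> lines ) {
        return lines.mapPartitions(lineItr ->
                new MapPartitioner<>(lineItr, new WordSplitter()).iterator());
    }
}

As in getKmerIntervals, where QNameKmerizer is built inside the lambda from broadcast values, the per-element function is constructed inside the lambda, so only serializable handles (here nothing; there, the Broadcast objects) are captured in the task closure.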