Use of org.broadinstitute.hellbender.tools.spark.utils.HopscotchMap in the project gatk by broadinstitute:
the method processRefRDD of the class FindBadGenomicKmersSpark.
/**
 * Map/reduce over an RDD of reference sequences to find overly frequent kmers.
 * Stage 1 kmerizes each partition into a local {@code <kmer, count>} tally, stage 2 shuffles the
 * tallies by kmer and sums them, and kmers whose total count exceeds {@code maxKmerFreq} are
 * collected back in the driver.
 */
@VisibleForTesting
static List<SVKmer> processRefRDD(final int kSize, final int maxDUSTScore, final int maxKmerFreq, final JavaRDD<byte[]> refRDD) {
    final int nPartitions = refRDD.getNumPartitions();
    // Presize the per-partition maps and result list from the expected records per partition.
    final int hashSize = 2 * REF_RECORDS_PER_PARTITION;
    final int arrayCap = REF_RECORDS_PER_PARTITION / 100;
    return refRDD
            .mapPartitions(sequenceItr -> {
                // Local tally of canonical kmers that pass the DUST low-complexity filter.
                final HopscotchMap<SVKmer, Integer, KmerAndCount> counts = new HopscotchMap<>(hashSize);
                sequenceItr.forEachRemaining(sequence ->
                        SVDUSTFilteredKmerizer.stream(sequence, kSize, maxDUSTScore, new SVKmerLong())
                                .map(kmer -> kmer.canonical(kSize))
                                .forEach(kmer -> {
                                    final KmerAndCount kmerAndCount = counts.find(kmer);
                                    if (kmerAndCount != null) kmerAndCount.bumpCount();
                                    else counts.add(new KmerAndCount((SVKmerLong) kmer));
                                }));
                return counts.iterator();
            })
            // Route every partial count for the same kmer to the same partition.
            .mapToPair(kmerAndCount -> new Tuple2<>(kmerAndCount.getKey(), kmerAndCount.getValue()))
            .partitionBy(new HashPartitioner(nPartitions))
            .mapPartitions(pairItr -> {
                // Sum the partial counts for each kmer assigned to this partition.
                final HopscotchMap<SVKmer, Integer, KmerAndCount> totals = new HopscotchMap<>(hashSize);
                pairItr.forEachRemaining(pair -> {
                    final KmerAndCount kmerAndCount = totals.find(pair._1());
                    if (kmerAndCount != null) kmerAndCount.bumpCount(pair._2());
                    else totals.add(new KmerAndCount((SVKmerLong) pair._1(), pair._2()));
                });
                // Keep only the kmers whose genome-wide count exceeds the frequency cutoff.
                final List<SVKmer> highFreqKmers = new ArrayList<>(arrayCap);
                for (final KmerAndCount kmerAndCount : totals) {
                    if (kmerAndCount.grabCount() > maxKmerFreq) highFreqKmers.add(kmerAndCount.getKey());
                }
                return highFreqKmers.iterator();
            })
            .collect();
}
Aggregations