Search in sources :

Example 1 with MapPartitioner

use of org.broadinstitute.hellbender.tools.spark.utils.MapPartitioner in project gatk by broadinstitute.

the class FindBreakpointEvidenceSpark method getKmerIntervals.

/**
 * Find kmers for each interval.
 *
 * <p>Given a set of template names with interval IDs and a kill set of ubiquitous kmers,
 * produces a set of interesting kmers for each interval ID. Intervals that end up with fewer
 * than {@code params.minKmersPerInterval} kmers are dropped: an excuse is recorded for each,
 * their kmers are filtered out of the result, and their entries are removed from
 * {@code qNamesMultiMap}.
 *
 * <p>Side effects: {@code qNamesMultiMap} is modified in place, and when
 * {@code locations.kmerFile} is non-null the surviving kmers are written to that file, one
 * {@code "<kmer> <intervalId>"} pair per line.
 *
 * @param params         tool parameters; the kmer-cleaner settings, {@code kSize},
 *                       {@code maxDUSTScore} and {@code minKmersPerInterval} are read here
 * @param ctx            Spark context used to broadcast the kill set and the qname multimap
 * @param qNamesMultiMap template names with their interval IDs; entries belonging to
 *                       discarded intervals are removed from this map
 * @param nIntervals     number of intervals; interval IDs are used as indices into an array
 *                       of this length, so they are presumably in {@code [0, nIntervals)}
 * @param kmerKillSet    kmers to be ignored
 * @param reads          the reads to kmerize
 * @param locations      output locations; only {@code kmerFile} is consulted here
 * @return a pair of (excuses for the intervals that were discarded, kmers of the intervals
 *         that were kept)
 * @throws GATKException if the kmer intervals file cannot be written
 */
@VisibleForTesting
static Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> getKmerIntervals(final Params params, final JavaSparkContext ctx, final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap, final int nIntervals, final Set<SVKmer> kmerKillSet, final JavaRDD<GATKRead> reads, final Locations locations) {
    final Broadcast<Set<SVKmer>> broadcastKmerKillSet = ctx.broadcast(kmerKillSet);
    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNameAndIntervalsMultiMap = ctx.broadcast(qNamesMultiMap);
    // given a set of template names with interval IDs and a kill set of ubiquitous kmers,
    // produce a set of interesting kmers for each interval ID
    // (the settings are copied into locals so the lambdas below capture plain ints rather than params)
    final int kmersPerPartitionGuess = params.cleanerKmersPerPartitionGuess;
    final int minKmers = params.cleanerMinKmerCount;
    final int maxKmers = params.cleanerMaxKmerCount;
    final int maxIntervals = params.cleanerMaxIntervals;
    final int kSize = params.kSize;
    final int maxDUSTScore = params.maxDUSTScore;
    final List<KmerAndInterval> kmerIntervals;
    try {
        kmerIntervals = reads
                .mapPartitionsToPair(readItr -> new MapPartitioner<>(readItr, new QNameKmerizer(broadcastQNameAndIntervalsMultiMap.value(), broadcastKmerKillSet.value(), kSize, maxDUSTScore)).iterator(), false)
                .reduceByKey(Integer::sum)
                .mapPartitions(itr -> new KmerCleaner(itr, kmersPerPartitionGuess, minKmers, maxKmers, maxIntervals).iterator())
                .collect();
    } finally {
        // release the broadcasts even when the Spark job fails, so they are not leaked
        broadcastQNameAndIntervalsMultiMap.destroy();
        broadcastKmerKillSet.destroy();
    }

    // count the kmers that were found for each interval
    final int[] intervalKmerCounts = new int[nIntervals];
    for (final KmerAndInterval kmerAndInterval : kmerIntervals) {
        intervalKmerCounts[kmerAndInterval.getIntervalId()] += 1;
    }

    // intervals with too few kmers are dropped, and an excuse is recorded for each of them
    final Set<Integer> intervalsToKill = new HashSet<>();
    final List<AlignedAssemblyOrExcuse> intervalDispositions = new ArrayList<>();
    for (int idx = 0; idx != nIntervals; ++idx) {
        if (intervalKmerCounts[idx] < params.minKmersPerInterval) {
            intervalsToKill.add(idx);
            intervalDispositions.add(new AlignedAssemblyOrExcuse(idx, "FASTQ not written -- too few kmers"));
        }
    }

    // note: this mutates the caller's multimap
    qNamesMultiMap.removeIf(qNameAndInterval -> intervalsToKill.contains(qNameAndInterval.getIntervalId()));
    final List<KmerAndInterval> filteredKmerIntervals = kmerIntervals.stream().filter(kmerAndInterval -> !intervalsToKill.contains(kmerAndInterval.getIntervalId())).collect(SVUtils.arrayListCollector(kmerIntervals.size()));

    // record the kmers with their interval IDs
    if (locations.kmerFile != null) {
        // the charset is given explicitly so the file content does not depend on the platform default
        try (final OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(BucketUtils.createFile(locations.kmerFile)), java.nio.charset.StandardCharsets.UTF_8)) {
            for (final KmerAndInterval kmerAndInterval : filteredKmerIntervals) {
                writer.write(kmerAndInterval.toString(kSize) + " " + kmerAndInterval.getIntervalId() + "\n");
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't write kmer intervals file " + locations.kmerFile, ioe);
        }
    }
    return new Tuple2<>(intervalDispositions, filteredKmerIntervals);
}
Also used : BwaMemIndexSingleton(org.broadinstitute.hellbender.utils.bwa.BwaMemIndexSingleton) IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) ArgumentCollection(org.broadinstitute.barclay.argparser.ArgumentCollection) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) BwaMemAlignment(org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment) Function(java.util.function.Function) BwaMemAligner(org.broadinstitute.hellbender.utils.bwa.BwaMemAligner) HopscotchUniqueMultiMap(org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap) FermiLiteAssembly(org.broadinstitute.hellbender.utils.fermi.FermiLiteAssembly) FermiLiteAssembler(org.broadinstitute.hellbender.utils.fermi.FermiLiteAssembler) BucketUtils(org.broadinstitute.hellbender.utils.gcs.BucketUtils) JavaRDD(org.apache.spark.api.java.JavaRDD) Broadcast(org.apache.spark.broadcast.Broadcast) HashPartitioner(org.apache.spark.HashPartitioner) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) StructuralVariationSparkProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariationSparkProgramGroup) FindBreakpointEvidenceSparkArgumentCollection(org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.FindBreakpointEvidenceSparkArgumentCollection) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) java.io(java.io) Utils(org.broadinstitute.hellbender.utils.Utils) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
htsjdk.samtools(htsjdk.samtools) MapPartitioner(org.broadinstitute.hellbender.tools.spark.utils.MapPartitioner) LogManager(org.apache.logging.log4j.LogManager) HopscotchUniqueMultiMap(org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap) Tuple2(scala.Tuple2) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 htsjdk.samtools (htsjdk.samtools)1 java.io (java.io)1 java.util (java.util)1 Function (java.util.function.Function)1 Collectors (java.util.stream.Collectors)1 IntStream (java.util.stream.IntStream)1 LogManager (org.apache.logging.log4j.LogManager)1 Logger (org.apache.logging.log4j.Logger)1 HashPartitioner (org.apache.spark.HashPartitioner)1 JavaRDD (org.apache.spark.api.java.JavaRDD)1 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)1 Broadcast (org.apache.spark.broadcast.Broadcast)1 Argument (org.broadinstitute.barclay.argparser.Argument)1 ArgumentCollection (org.broadinstitute.barclay.argparser.ArgumentCollection)1 CommandLineProgramProperties (org.broadinstitute.barclay.argparser.CommandLineProgramProperties)1 StandardArgumentDefinitions (org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions)1 StructuralVariationSparkProgramGroup (org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariationSparkProgramGroup)1 GATKSparkTool (org.broadinstitute.hellbender.engine.spark.GATKSparkTool)1 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)1