Use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
The class LocusWalkerSpark, method getAlignments.
/**
* Loads alignments and the corresponding reference and features into a {@link JavaRDD} for the intervals specified.
*
* If no intervals were specified, returns all the alignments.
*
* @return all alignments as a {@link JavaRDD}, bounded by intervals if specified.
*/
public JavaRDD<LocusWalkerContext> getAlignments(JavaSparkContext ctx) {
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    List<SimpleInterval> intervals = hasIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    final List<ShardBoundary> intervalShards = intervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, readShardPadding, sequenceDictionary).stream())
            .collect(Collectors.toList());
    int maxLocatableSize = Math.min(readShardSize, readShardPadding);
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShards, maxLocatableSize, shuffle);
    Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
    return shardedReads.flatMap(getAlignmentsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, getHeaderForReads(), getDownsamplingInfo()));
}
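To see the shape of this shard-then-flatMap pattern outside the GATK engine, here is a minimal, self-contained sketch. It assumes a local Spark master and the Spark 2.x Java API (iterator-returning flatMap), uses lists of integer positions as stand-ins for Shard<GATKRead> and strings as stand-ins for LocusWalkerContext, and the class name LocusShardSketch and all values are hypothetical.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class LocusShardSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("locus-shard-sketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            // each inner list plays the role of one Shard<GATKRead>: read start positions in an interval
            JavaRDD<List<Integer>> shardedReads = ctx.parallelize(Arrays.asList(
                    Arrays.asList(100, 105, 110),
                    Arrays.asList(200, 201)));

            // broadcast the read-only lookup data once, as getAlignments does for the reference
            // and features; a null broadcast would mean "no reference supplied"
            Broadcast<String> bReferenceSource = ctx.broadcast("reference-source");

            // expand each shard into one per-locus "context", mirroring the
            // Shard<GATKRead> -> LocusWalkerContext flatMap in the original code
            JavaRDD<String> locusContexts = shardedReads.flatMap(shard -> {
                String ref = bReferenceSource == null ? "none" : bReferenceSource.value();
                List<String> contexts = new ArrayList<>();
                for (int pos : shard) {
                    contexts.add("locus=" + pos + " ref=" + ref);
                }
                return contexts.iterator();
            });

            locusContexts.collect().forEach(System.out::println);
        }
    }
}

As in getAlignments, the broadcast keeps the potentially large read-only data out of each task closure, and a null broadcast is how the absence of a reference or feature source is represented.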
Use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
The class VariantWalkerSpark, method getVariants.
/**
* Loads variants and the corresponding reads, reference and features into a {@link JavaRDD} for the intervals specified.
* For the current implementation, the reads context will always be empty.
*
* If no intervals were specified, returns all the variants.
*
* @return all variants as a {@link JavaRDD}, bounded by intervals if specified.
*/
public JavaRDD<VariantWalkerContext> getVariants(JavaSparkContext ctx) {
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    List<SimpleInterval> intervals = hasIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    // use unpadded shards (padding is only needed for reference bases)
    final List<ShardBoundary> intervalShards = intervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, variantShardSize, 0, sequenceDictionary).stream())
            .collect(Collectors.toList());
    JavaRDD<VariantContext> variants = variantsSource.getParallelVariantContexts(drivingVariantFile, getIntervals());
    VariantFilter variantFilter = makeVariantFilter();
    variants = variants.filter(variantFilter::test);
    JavaRDD<Shard<VariantContext>> shardedVariants = SparkSharder.shard(ctx, variants, VariantContext.class, sequenceDictionary, intervalShards, variantShardSize, shuffle);
    Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
    return shardedVariants.flatMap(getVariantsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, variantShardPadding));
}
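The notable choices above are that the variant shards are created without padding and that the variant filter runs before sharding; the padding (variantShardPadding) is only applied downstream when reference bases are fetched around each variant. A minimal sketch of that filter-then-pad shape, assuming a local Spark master, with integer start positions standing in for VariantContext records (the class name VariantWindowSketch and the numbers are hypothetical):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class VariantWindowSketch {
    public static void main(String[] args) {
        final int padding = 100; // stand-in for variantShardPadding
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("variant-window-sketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            JavaRDD<Integer> variantStarts = ctx.parallelize(Arrays.asList(150, 3_000, 7_250, 42));

            // the predicate must be serializable, just like variantFilter::test in getVariants
            JavaRDD<Integer> passing = variantStarts.filter(start -> start > 100);

            // derive the padded reference window per variant, only after filtering
            List<String> windows = passing
                    .map(start -> "[" + Math.max(1, start - padding) + ", " + (start + padding) + "]")
                    .collect();

            windows.forEach(System.out::println);
        }
    }
}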
Use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
The class FindBadGenomicKmersSpark, method processRefRDD.
/**
* Do a map/reduce on an RDD of genomic sequences:
* Kmerize, map each kmer to a pair <kmer,1>, reduce by summing values by key, filter out pairs <kmer,N> where
* N <= maxKmerFreq, and collect the high-frequency kmers back in the driver.
*/
@VisibleForTesting
static List<SVKmer> processRefRDD(final int kSize, final int maxDUSTScore, final int maxKmerFreq,
                                  final JavaRDD<byte[]> refRDD) {
    final int nPartitions = refRDD.getNumPartitions();
    final int hashSize = 2 * REF_RECORDS_PER_PARTITION;
    final int arrayCap = REF_RECORDS_PER_PARTITION / 100;
    return refRDD.mapPartitions(seqItr -> {
        final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts = new HopscotchMap<>(hashSize);
        while (seqItr.hasNext()) {
            final byte[] seq = seqItr.next();
            SVDUSTFilteredKmerizer.stream(seq, kSize, maxDUSTScore, new SVKmerLong())
                    .map(kmer -> kmer.canonical(kSize))
                    .forEach(kmer -> {
                        final KmerAndCount entry = kmerCounts.find(kmer);
                        if (entry == null)
                            kmerCounts.add(new KmerAndCount((SVKmerLong) kmer));
                        else
                            entry.bumpCount();
                    });
        }
        return kmerCounts.iterator();
    }).mapToPair(entry -> new Tuple2<>(entry.getKey(), entry.getValue()))
      .partitionBy(new HashPartitioner(nPartitions))
      .mapPartitions(pairItr -> {
        final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts = new HopscotchMap<>(hashSize);
        while (pairItr.hasNext()) {
            final Tuple2<SVKmer, Integer> pair = pairItr.next();
            final SVKmer kmer = pair._1();
            final int count = pair._2();
            KmerAndCount entry = kmerCounts.find(kmer);
            if (entry == null)
                kmerCounts.add(new KmerAndCount((SVKmerLong) kmer, count));
            else
                entry.bumpCount(count);
        }
        final List<SVKmer> highFreqKmers = new ArrayList<>(arrayCap);
        for (KmerAndCount kmerAndCount : kmerCounts) {
            if (kmerAndCount.grabCount() > maxKmerFreq)
                highFreqKmers.add(kmerAndCount.getKey());
        }
        return highFreqKmers.iterator();
    }).collect();
}
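Rather than doing a straight reduceByKey over every kmer occurrence, processRefRDD counts inside each partition first, shuffles only the per-partition partial counts, and merges them on the receiving side before filtering. A minimal sketch of that two-stage pattern, assuming a local Spark master and the Spark 2.x Java API, with short strings in place of SVKmers and a plain HashMap in place of HopscotchMap (the class name HighFreqItemSketch and the data are hypothetical):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class HighFreqItemSketch {
    public static void main(String[] args) {
        final int maxFreq = 2; // stand-in for maxKmerFreq
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("high-freq-sketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            JavaRDD<String> items = ctx.parallelize(
                    Arrays.asList("ACG", "ACG", "ACG", "CGT", "GTA", "ACG"), 2);
            final int nPartitions = items.getNumPartitions();

            // stage 1: count within each partition, without shuffling raw items
            JavaRDD<Tuple2<String, Integer>> partialCounts = items.mapPartitions(itr -> {
                Map<String, Integer> counts = new HashMap<>();
                itr.forEachRemaining(k -> counts.merge(k, 1, Integer::sum));
                List<Tuple2<String, Integer>> out = new ArrayList<>();
                counts.forEach((k, n) -> out.add(new Tuple2<>(k, n)));
                return out.iterator();
            });

            // stage 2: shuffle the partial counts so each key lands in exactly one partition
            JavaPairRDD<String, Integer> shuffled = partialCounts
                    .mapToPair(t -> new Tuple2<>(t._1(), t._2()))
                    .partitionBy(new HashPartitioner(nPartitions));

            // stage 3: merge partial counts and keep only the high-frequency items
            JavaRDD<String> highFreqRDD = shuffled.mapPartitions(pairItr -> {
                Map<String, Integer> counts = new HashMap<>();
                pairItr.forEachRemaining(p -> counts.merge(p._1(), p._2(), Integer::sum));
                List<String> keep = new ArrayList<>();
                counts.forEach((k, n) -> { if (n > maxFreq) keep.add(k); });
                return keep.iterator();
            });

            System.out.println(highFreqRDD.collect()); // expect [ACG]
        }
    }
}

The payoff of this shape is that each distinct item crosses the network at most once per source partition, rather than once per occurrence.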
Use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
The class FindBreakpointEvidenceSpark, method getKmerIntervals.
/** Find kmers for each interval. */
@VisibleForTesting
static Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> getKmerIntervals(
        final Params params,
        final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
        final int nIntervals,
        final Set<SVKmer> kmerKillSet,
        final JavaRDD<GATKRead> reads,
        final Locations locations) {
    final Broadcast<Set<SVKmer>> broadcastKmerKillSet = ctx.broadcast(kmerKillSet);
    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNameAndIntervalsMultiMap = ctx.broadcast(qNamesMultiMap);
    // given a set of template names with interval IDs and a kill set of ubiquitous kmers,
    // produce a set of interesting kmers for each interval ID
    final int kmersPerPartitionGuess = params.cleanerKmersPerPartitionGuess;
    final int minKmers = params.cleanerMinKmerCount;
    final int maxKmers = params.cleanerMaxKmerCount;
    final int maxIntervals = params.cleanerMaxIntervals;
    final int kSize = params.kSize;
    final int maxDUSTScore = params.maxDUSTScore;
    final List<KmerAndInterval> kmerIntervals =
            reads.mapPartitionsToPair(readItr ->
                            new MapPartitioner<>(readItr,
                                    new QNameKmerizer(broadcastQNameAndIntervalsMultiMap.value(),
                                            broadcastKmerKillSet.value(), kSize, maxDUSTScore)).iterator(), false)
                 .reduceByKey(Integer::sum)
                 .mapPartitions(itr ->
                            new KmerCleaner(itr, kmersPerPartitionGuess, minKmers, maxKmers, maxIntervals).iterator())
                 .collect();
    broadcastQNameAndIntervalsMultiMap.destroy();
    broadcastKmerKillSet.destroy();
    final int[] intervalKmerCounts = new int[nIntervals];
    for (final KmerAndInterval kmerAndInterval : kmerIntervals) {
        intervalKmerCounts[kmerAndInterval.getIntervalId()] += 1;
    }
    final Set<Integer> intervalsToKill = new HashSet<>();
    final List<AlignedAssemblyOrExcuse> intervalDispositions = new ArrayList<>();
    for (int idx = 0; idx != nIntervals; ++idx) {
        if (intervalKmerCounts[idx] < params.minKmersPerInterval) {
            intervalsToKill.add(idx);
            intervalDispositions.add(new AlignedAssemblyOrExcuse(idx, "FASTQ not written -- too few kmers"));
        }
    }
    qNamesMultiMap.removeIf(qNameAndInterval -> intervalsToKill.contains(qNameAndInterval.getIntervalId()));
    final List<KmerAndInterval> filteredKmerIntervals = kmerIntervals.stream()
            .filter(kmerAndInterval -> !intervalsToKill.contains(kmerAndInterval.getIntervalId()))
            .collect(SVUtils.arrayListCollector(kmerIntervals.size()));
    // record the kmers with their interval IDs
    if (locations.kmerFile != null) {
        try (final OutputStreamWriter writer =
                     new OutputStreamWriter(new BufferedOutputStream(BucketUtils.createFile(locations.kmerFile)))) {
            for (final KmerAndInterval kmerAndInterval : filteredKmerIntervals) {
                writer.write(kmerAndInterval.toString(kSize) + " " + kmerAndInterval.getIntervalId() + "\n");
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't write kmer intervals file " + locations.kmerFile, ioe);
        }
    }
    return new Tuple2<>(intervalDispositions, filteredKmerIntervals);
}
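One pattern worth isolating here is the broadcast lifecycle: the kill set and the qname multimap are broadcast to the executors, consumed inside the transformations, forced by collect(), and then explicitly destroyed once their results live in the driver. A minimal sketch of that lifecycle, assuming a local Spark master, with strings standing in for SVKmer values and a plain Set for the kill set (the class name BroadcastKillSetSketch and the data are hypothetical):

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class BroadcastKillSetSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("broadcast-sketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            JavaRDD<String> kmers = ctx.parallelize(Arrays.asList("ACGT", "AAAA", "CGTA", "TTTT"));

            // ubiquitous kmers that downstream steps should ignore
            Set<String> killSet = new HashSet<>(Arrays.asList("AAAA", "TTTT"));
            Broadcast<Set<String>> broadcastKillSet = ctx.broadcast(killSet);

            // executors read the broadcast value instead of having the set re-shipped per task
            List<String> kept = kmers
                    .filter(k -> !broadcastKillSet.value().contains(k))
                    .collect();

            // collect() has materialized everything we need, so the broadcast can be torn down,
            // mirroring broadcastKmerKillSet.destroy() in getKmerIntervals
            broadcastKillSet.destroy();

            System.out.println(kept); // expect [ACGT, CGTA]
        }
    }
}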
Use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.
The class MarkDuplicatesSparkUtils, method generateMetrics.
static JavaPairRDD<String, DuplicationMetrics> generateMetrics(final SAMFileHeader header, final JavaRDD<GATKRead> reads) {
    return reads.filter(read -> !read.isSecondaryAlignment() && !read.isSupplementaryAlignment())
            .mapToPair(read -> {
                final String library = LibraryIdGenerator.getLibraryName(header, read.getReadGroup());
                DuplicationMetrics metrics = new DuplicationMetrics();
                metrics.LIBRARY = library;
                if (read.isUnmapped()) {
                    ++metrics.UNMAPPED_READS;
                } else if (!read.isPaired() || read.mateIsUnmapped()) {
                    ++metrics.UNPAIRED_READS_EXAMINED;
                } else {
                    ++metrics.READ_PAIRS_EXAMINED;
                }
                if (read.isDuplicate()) {
                    if (!read.isPaired() || read.mateIsUnmapped()) {
                        ++metrics.UNPAIRED_READ_DUPLICATES;
                    } else {
                        ++metrics.READ_PAIR_DUPLICATES;
                    }
                }
                if (read.hasAttribute(OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME)) {
                    metrics.READ_PAIR_OPTICAL_DUPLICATES += read.getAttributeAsInteger(OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME);
                }
                return new Tuple2<>(library, metrics);
            }).foldByKey(new DuplicationMetrics(), (metricsSum, m) -> {
                if (metricsSum.LIBRARY == null) {
                    metricsSum.LIBRARY = m.LIBRARY;
                }
                // This should never happen, as we grouped by key using library as the key.
                if (!metricsSum.LIBRARY.equals(m.LIBRARY)) {
                    throw new GATKException("Two different libraries encountered while summing metrics: " + metricsSum.LIBRARY + " and " + m.LIBRARY);
                }
                metricsSum.UNMAPPED_READS += m.UNMAPPED_READS;
                metricsSum.UNPAIRED_READS_EXAMINED += m.UNPAIRED_READS_EXAMINED;
                metricsSum.READ_PAIRS_EXAMINED += m.READ_PAIRS_EXAMINED;
                metricsSum.UNPAIRED_READ_DUPLICATES += m.UNPAIRED_READ_DUPLICATES;
                metricsSum.READ_PAIR_DUPLICATES += m.READ_PAIR_DUPLICATES;
                metricsSum.READ_PAIR_OPTICAL_DUPLICATES += m.READ_PAIR_OPTICAL_DUPLICATES;
                return metricsSum;
            }).mapValues(metrics -> {
                DuplicationMetrics copy = metrics.copy();
                copy.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
                copy.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;
                copy.calculateDerivedMetrics();
                if (copy.ESTIMATED_LIBRARY_SIZE == null) {
                    copy.ESTIMATED_LIBRARY_SIZE = 0L;
                }
                return copy;
            });
}
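The overall shape here is: key each read's partial metrics by library, combine the partials per key with foldByKey, then finalize the folded value with mapValues (halving pair counts and deriving the summary fields). A minimal sketch of that keyed-aggregation shape, assuming a local Spark master, with (library, isDuplicate) pairs standing in for GATKRead and an integer tally standing in for DuplicationMetrics (the class name PerLibraryTallySketch and the data are hypothetical):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PerLibraryTallySketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("tally-sketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            // (library, isDuplicate) pairs standing in for reads
            List<Tuple2<String, Boolean>> reads = Arrays.asList(
                    new Tuple2<>("libA", true), new Tuple2<>("libA", false),
                    new Tuple2<>("libB", true), new Tuple2<>("libA", true));

            JavaPairRDD<String, Integer> dupTallies = ctx.parallelizePairs(reads)
                    // one partial tally per read: 1 if it is a duplicate, else 0
                    .mapValues(isDup -> isDup ? 1 : 0)
                    // sum the partial tallies per library, starting from a zero value
                    .foldByKey(0, Integer::sum);

            // finalize per key; the original halves pair counts and derives metrics here,
            // this sketch just labels the result
            JavaPairRDD<String, String> report = dupTallies
                    .mapValues(n -> n + " duplicate read(s)");

            report.collect().forEach(t -> System.out.println(t._1() + ": " + t._2()));
        }
    }
}

Keying by library before aggregating is what lets the per-library DuplicationMetrics in the original be built without ever grouping whole reads in memory.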