Search in sources :

Example 26 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

In the class LocusWalkerSpark, the method getAlignmentsFunction.

/**
 * Builds the Spark function that converts each {@link Shard} of reads into per-locus contexts,
 * where every context bundles an alignment with the reference and feature views for its position.
 *
 * @param bReferenceSource broadcast of the reference source; when {@code null}, a {@code null} reference data source is used
 * @param bFeatureManager broadcast of the feature manager; when {@code null}, a {@code null} feature manager is used
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param header the reads header; its read groups supply the sample names
 * @param downsamplingInfo the downsampling settings passed to the locus iterator
 * @return a function that maps a {@link Shard} of reads to an iterator of {@link LocusWalkerContext}
 */
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager, SAMSequenceDictionary sequenceDictionary, SAMFileHeader header, LIBSDownsamplingInfo downsamplingInfo) {
    return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) readShard -> {
        final SimpleInterval shardInterval = readShard.getInterval();
        final SimpleInterval paddedShardInterval = readShard.getPaddedInterval();
        final Iterator<GATKRead> shardReads = readShard.iterator();

        // The reference bases are fetched once per shard, over the padded interval,
        // and wrapped in a ReferenceMemorySource.
        final ReferenceDataSource referenceSource;
        if (bReferenceSource == null) {
            referenceSource = null;
        } else {
            referenceSource = new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedShardInterval), sequenceDictionary);
        }

        final FeatureManager featureManager;
        if (bFeatureManager == null) {
            featureManager = null;
        } else {
            featureManager = bFeatureManager.getValue();
        }

        // Sample names are taken from the read groups declared in the header.
        final Set<String> sampleNames = header.getReadGroups()
                .stream()
                .map(SAMReadGroupRecord::getSample)
                .collect(Collectors.toSet());

        final LocusIteratorByState locusIterator = new LocusIteratorByState(shardReads, downsamplingInfo, false, sampleNames, header, true, false);

        // The overlap filter is given the shard's own interval, not the padded one.
        final IntervalOverlappingIterator<AlignmentContext> lociInShard = new IntervalOverlappingIterator<>(locusIterator, ImmutableList.of(shardInterval), sequenceDictionary);

        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(lociInShard, 0), false)
                .map(locus -> {
                    final SimpleInterval locusInterval = new SimpleInterval(locus);
                    final ReferenceContext referenceContext = new ReferenceContext(referenceSource, locusInterval);
                    final FeatureContext featureContext = new FeatureContext(featureManager, locusInterval);
                    return new LocusWalkerContext(locus, referenceContext, featureContext);
                })
                .iterator();
    };
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Broadcast(org.apache.spark.broadcast.Broadcast) java.util(java.util) IntervalOverlappingIterator(org.broadinstitute.hellbender.utils.iterators.IntervalOverlappingIterator) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LocusIteratorByState(org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ImmutableList(com.google.common.collect.ImmutableList) StreamSupport(java.util.stream.StreamSupport) LIBSDownsamplingInfo(org.broadinstitute.hellbender.utils.locusiterator.LIBSDownsamplingInfo) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) CommandLineException(org.broadinstitute.barclay.argparser.CommandLineException) IntervalOverlappingIterator(org.broadinstitute.hellbender.utils.iterators.IntervalOverlappingIterator) LocusIteratorByState(org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 27 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

In the class ReadWalkerSpark, the method getReadsFunction.

/**
 * Builds the Spark function that converts each {@link Shard} of reads into per-read contexts,
 * where every context bundles a read with the reference and feature views for the read's interval.
 *
 * @param bReferenceSource broadcast of the reference source; when {@code null}, a {@code null} reference data source is used
 * @param bFeatureManager broadcast of the feature manager; when {@code null}, a {@code null} feature manager is used
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param readShardPadding amount by which the shard interval is expanded (within its contig) before reference bases are fetched
 * @return a function that maps a {@link Shard} of reads to an iterator of {@link ReadWalkerContext}
 */
private static FlatMapFunction<Shard<GATKRead>, ReadWalkerContext> getReadsFunction(Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager, SAMSequenceDictionary sequenceDictionary, int readShardPadding) {
    return (FlatMapFunction<Shard<GATKRead>, ReadWalkerContext>) readShard -> {
        // The padded window is computed up front, even when no reference source was supplied.
        final SimpleInterval referenceWindow = readShard.getInterval().expandWithinContig(readShardPadding, sequenceDictionary);

        final ReferenceDataSource referenceSource;
        if (bReferenceSource == null) {
            referenceSource = null;
        } else {
            referenceSource = new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, referenceWindow), sequenceDictionary);
        }

        final FeatureManager featureManager;
        if (bFeatureManager == null) {
            featureManager = null;
        } else {
            featureManager = bFeatureManager.getValue();
        }

        return StreamSupport.stream(readShard.spliterator(), false)
                .map(read -> {
                    final SimpleInterval readSpan = getReadInterval(read);
                    final ReferenceContext referenceContext = new ReferenceContext(referenceSource, readSpan);
                    final FeatureContext featureContext = new FeatureContext(featureManager, readSpan);
                    return new ReadWalkerContext(read, referenceContext, featureContext);
                })
                .iterator();
    };
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Broadcast(org.apache.spark.broadcast.Broadcast) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) List(java.util.List) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) StreamSupport(java.util.stream.StreamSupport) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 28 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk-protected by broadinstitute.

In the class HaplotypeCallerSparkIntegrationTest, the method testReferenceMultiSourceIsSerializable.

/**
 * Round-trips a {@link ReferenceMultiSource} through Kryo, using the test Spark context's
 * configuration, to check that it can be serialized.
 */
@Test
public void testReferenceMultiSourceIsSerializable() {
    // No pipeline options are needed here; the cast selects the PipelineOptions constructor overload.
    final ReferenceMultiSource referenceSource = new ReferenceMultiSource(
            (PipelineOptions) null,
            BaseTest.b37_2bit_reference_20_21,
            ReferenceWindowFunctions.IDENTITY_FUNCTION);

    SparkTestUtils.roundTripInKryo(
            referenceSource,
            ReferenceMultiSource.class,
            SparkContextFactory.getTestSparkContext().getConf());
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) HaplotypeCallerIntegrationTest(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerIntegrationTest)

Example 29 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

In the class JoinReadsWithRefBasesSparkUnitTest, the method refBasesShuffleTest.

/**
 * Checks that {@code ShuffleJoinReadsWithRefBases.addBases} pairs every expected read with the
 * expected reference bases, using a mocked {@link ReferenceMultiSource}.
 *
 * @param reads the reads to join
 * @param kvReadRefBases the expected (read, reference bases) pairs
 * @param intervals the intervals for which the mocked reference source is stubbed
 * @throws IOException declared by the test signature; the mocked calls are not expected to throw it
 */
@Test(dataProvider = "bases", groups = "spark")
public void refBasesShuffleTest(List<GATKRead> reads, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<SimpleInterval> intervals) throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);

    // The mock is created as serializable and stubbed to return fake bases for each known interval.
    final ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    for (final SimpleInterval interval : intervals) {
        when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(interval))).thenReturn(FakeReferenceSource.bases(interval));
    }
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);

    final JavaPairRDD<GATKRead, ReferenceBases> rddResult = ShuffleJoinReadsWithRefBases.addBases(mockSource, rddReads);
    final Map<GATKRead, ReferenceBases> result = rddResult.collectAsMap();

    for (final KV<GATKRead, ReferenceBases> expected : kvReadRefBases) {
        final ReferenceBases actualBases = result.get(expected.getKey());
        Assert.assertNotNull(actualBases, "addBases produced no reference bases for an expected read");
        // TestNG's assertEquals takes (actual, expected); this order keeps failure messages accurate.
        Assert.assertEquals(actualBases, expected.getValue());
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 30 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

In the class JoinReadsWithRefBasesSparkUnitTest, the method refBasesBroadcastTest.

/**
 * Checks that {@code BroadcastJoinReadsWithRefBases.addBases} pairs every expected read with the
 * expected reference bases, using a mocked {@link ReferenceMultiSource}.
 *
 * @param reads the reads to join
 * @param kvReadRefBases the expected (read, reference bases) pairs
 * @param intervals the intervals for which the mocked reference source is stubbed
 * @throws IOException declared by the test signature; the mocked calls are not expected to throw it
 */
@Test(dataProvider = "bases", groups = "spark")
public void refBasesBroadcastTest(List<GATKRead> reads, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<SimpleInterval> intervals) throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);

    // The mock is created as serializable and stubbed to return fake bases for each known interval.
    final ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    for (final SimpleInterval interval : intervals) {
        when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(interval))).thenReturn(FakeReferenceSource.bases(interval));
    }
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);

    final JavaPairRDD<GATKRead, ReferenceBases> rddResult = BroadcastJoinReadsWithRefBases.addBases(mockSource, rddReads);
    final Map<GATKRead, ReferenceBases> result = rddResult.collectAsMap();

    for (final KV<GATKRead, ReferenceBases> expected : kvReadRefBases) {
        final ReferenceBases actualBases = result.get(expected.getKey());
        Assert.assertNotNull(actualBases, "addBases produced no reference bases for an expected read");
        // TestNG's assertEquals takes (actual, expected); this order keeps failure messages accurate.
        Assert.assertEquals(actualBases, expected.getValue());
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Aggregations

ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)30 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)18 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)17 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)15 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)12 Collectors (java.util.stream.Collectors)11 JavaRDD (org.apache.spark.api.java.JavaRDD)11 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)11 StreamSupport (java.util.stream.StreamSupport)10 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)10 Test (org.testng.annotations.Test)10 List (java.util.List)9 Broadcast (org.apache.spark.broadcast.Broadcast)9 Argument (org.broadinstitute.barclay.argparser.Argument)8 org.broadinstitute.hellbender.engine (org.broadinstitute.hellbender.engine)8 PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions)7 FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction)7 SAMFileHeader (htsjdk.samtools.SAMFileHeader)6 ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)6 Tuple2 (scala.Tuple2)5