Example 21 with ReferenceBases

Use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk-protected by broadinstitute.

The class HaplotypeCallerGenotypingEngine, method makeAnnotatedCall:

protected VariantContext makeAnnotatedCall(byte[] ref, SimpleInterval refLoc, FeatureContext tracker, SAMFileHeader header, VariantContext mergedVC, ReadLikelihoods<Allele> readAlleleLikelihoods, VariantContext call) {
    final SimpleInterval locus = new SimpleInterval(mergedVC.getContig(), mergedVC.getStart(), mergedVC.getEnd());
    final SimpleInterval refLocInterval = new SimpleInterval(refLoc);
    // Wrap the raw reference bytes in an in-memory data source so the annotation engine can query them.
    final ReferenceDataSource refData = new ReferenceMemorySource(new ReferenceBases(ref, refLocInterval), header.getSequenceDictionary());
    final ReferenceContext referenceContext = new ReferenceContext(refData, locus, refLocInterval);
    final VariantContext untrimmedResult = annotationEngine.annotateContext(call, tracker, referenceContext, readAlleleLikelihoods, a -> true);
    // Reverse-trim the alleles only if merging changed the allele count; otherwise return the annotated context as-is.
    return call.getAlleles().size() == mergedVC.getAlleles().size() ? untrimmedResult : GATKVariantContextUtils.reverseTrimAlleles(untrimmedResult);
}
Also used: ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases), ReferenceDataSource (org.broadinstitute.hellbender.engine.ReferenceDataSource), ReferenceMemorySource (org.broadinstitute.hellbender.engine.ReferenceMemorySource), ReferenceContext (org.broadinstitute.hellbender.engine.ReferenceContext), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)
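
The pattern above is reusable on its own: wrap a raw byte[] of reference bases in a ReferenceMemorySource so engine classes can query it like a real reference. Below is a minimal sketch; the contig, coordinates, and bases are invented, and ReferenceContext.getBases() is assumed to return the window's bases as in the GATK versions these examples come from.

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.engine.ReferenceMemorySource;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;

public class InMemoryReferenceSketch {
    public static void main(String[] args) {
        // A dictionary with a single contig "1"; the length is illustrative.
        final SAMSequenceDictionary dict = new SAMSequenceDictionary();
        dict.addSequence(new SAMSequenceRecord("1", 1000));
        // Bases covering positions 100-111 of contig "1" (hypothetical sequence).
        final SimpleInterval span = new SimpleInterval("1", 100, 111);
        final ReferenceBases bases = new ReferenceBases("ACGTACGTACGT".getBytes(), span);
        // An in-memory ReferenceDataSource backed by those bases, as in makeAnnotatedCall above.
        final ReferenceDataSource refData = new ReferenceMemorySource(bases, dict);
        // A ReferenceContext windowed on a sub-interval of the span.
        final SimpleInterval locus = new SimpleInterval("1", 104, 107);
        final ReferenceContext refContext = new ReferenceContext(refData, locus, span);
        // Expected to print the four bases at positions 104-107.
        System.out.println(new String(refContext.getBases()));
    }
}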

Example 22 with ReferenceBases

Use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

The class BAQUnitTest, method createData1:

@DataProvider(name = "data")
public Object[][] createData1() {
    List<BAQTest> params = new ArrayList<>();
    SAMSequenceDictionary dict = new SAMSequenceDictionary();
    dict.addSequence(new SAMSequenceRecord("1", Integer.MAX_VALUE));
    params.add(new BAQTest("GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG", "GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG", "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG", "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE"));
    params.add(new BAQTest("GCTTTTTCTCCTCCTG", "GCTTTTCCTCCTCCTG", "IIHGGGIHHIIHHIIH", "EI410..0HIIHHIIE"));
    final String refString1 = "AAATTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATAG";
    final ReferenceDataSource rds1 = new ReferenceMemorySource(new ReferenceBases(refString1.getBytes(), new SimpleInterval("1", 9999807, 10000032)), dict);
    // big and complex, also does a cap from 3 to 4!
    params.add(new BAQTest(-3, 9999810L, "49M1I126M1I20M1I25M", refString1, "TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA", ">IHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?B3BBC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@C<>5;<A5=A;>=64>???B>=6497<<;;<;>2?>BA@??A6<<A59", ">EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@%<>5;<A5=A;>=64>???B;86497<<;;<;>2?>BA@??A6<<A59", rds1));
    final String refString2 = "CCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCC";
    final ReferenceDataSource rds2 = new ReferenceMemorySource(new ReferenceBases(refString2.getBytes(), new SimpleInterval("1", 9999963, 10000004)), dict);
    // now changes
    params.add(new BAQTest(-3, 9999966L, "36M", refString2, "AGTAGCTGGGACTACAGGCACCCACCACCACGCCTG", "A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9", "A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9", rds2));
    final String refString3 = "CCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATA";
    final ReferenceDataSource rds3 = new ReferenceMemorySource(new ReferenceBases(refString3.getBytes(), new SimpleInterval("1", 9999990, 10000031)), dict);
    // raw base qualities are low -- but they shouldn't be capped
    params.add(new BAQTest(-3, 9999993L, "4=13X2=3X1=4X2=4X1=2X", refString3, "CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG", "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8", "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8", rds3));
    List<Object[]> params2 = new ArrayList<>();
    for (BAQTest x : params) params2.add(new Object[] { x });
    return params2.toArray(new Object[][] {});
}
Also used: ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases), ReferenceDataSource (org.broadinstitute.hellbender.engine.ReferenceDataSource), ReferenceMemorySource (org.broadinstitute.hellbender.engine.ReferenceMemorySource), ArrayList (java.util.ArrayList), SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval), SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary), DataProvider (org.testng.annotations.DataProvider)
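
The Object[][] returned above is TestNG's data-provider contract: each row becomes one invocation of a @Test method that names the provider. The following self-contained sketch shows the wiring; the class, test method, and data are hypothetical and not part of BAQUnitTest.

import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

public class DataProviderWiringSketch {

    // Each row of the returned array is one argument list for the test method.
    @DataProvider(name = "data")
    public Object[][] createData() {
        return new Object[][] { { "GCTGCTCC" }, { "GCTTTTTC" } };
    }

    // TestNG invokes this once per row supplied by the provider named "data".
    @Test(dataProvider = "data")
    public void testWithProvidedData(String sequence) {
        Assert.assertNotNull(sequence);
        Assert.assertFalse(sequence.isEmpty());
    }
}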

Example 23 with ReferenceBases

Use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

The class AddContextDataToReadSpark, method addUsingOverlapsPartitioning:

/**
     * Add context data ({@link ReadContextData}) to reads, using overlaps partitioning to avoid a shuffle.
     * @param ctx the Spark context
     * @param mappedReads the coordinate-sorted reads
     * @param referenceSource the reference source
     * @param variants the coordinate-sorted variants
     * @param sequenceDictionary the sequence dictionary for the reads
     * @param shardSize the maximum size of each shard, in bases
     * @param shardPadding amount of extra context around each shard, in bases
     * @return an RDD of read-context pairs, in coordinate-sorted order
     */
private static JavaPairRDD<GATKRead, ReadContextData> addUsingOverlapsPartitioning(final JavaSparkContext ctx, final JavaRDD<GATKRead> mappedReads, final ReferenceMultiSource referenceSource, final JavaRDD<GATKVariant> variants, final SAMSequenceDictionary sequenceDictionary, final int shardSize, final int shardPadding) {
    final List<SimpleInterval> intervals = IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    // use unpadded shards (padding is only needed for reference bases)
    final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardSize, 0, sequenceDictionary).stream()).collect(Collectors.toList());
    final Broadcast<ReferenceMultiSource> bReferenceSource = ctx.broadcast(referenceSource);
    final IntervalsSkipList<GATKVariant> variantSkipList = new IntervalsSkipList<>(variants.collect());
    final Broadcast<IntervalsSkipList<GATKVariant>> variantsBroadcast = ctx.broadcast(variantSkipList);
    int maxLocatableSize = Math.min(shardSize, shardPadding);
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, mappedReads, GATKRead.class, sequenceDictionary, intervalShards, maxLocatableSize);
    return shardedReads.flatMapToPair(new PairFlatMapFunction<Shard<GATKRead>, GATKRead, ReadContextData>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Tuple2<GATKRead, ReadContextData>> call(Shard<GATKRead> shard) throws Exception {
            // get reference bases for this shard (padded)
            SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(shardPadding, sequenceDictionary);
            ReferenceBases referenceBases = bReferenceSource.getValue().getReferenceBases(null, paddedInterval);
            final IntervalsSkipList<GATKVariant> intervalsSkipList = variantsBroadcast.getValue();
            Iterator<Tuple2<GATKRead, ReadContextData>> transform = Iterators.transform(shard.iterator(), new Function<GATKRead, Tuple2<GATKRead, ReadContextData>>() {

                @Nullable
                @Override
                public Tuple2<GATKRead, ReadContextData> apply(@Nullable GATKRead r) {
                    List<GATKVariant> overlappingVariants;
                    if (SimpleInterval.isValid(r.getContig(), r.getStart(), r.getEnd())) {
                        overlappingVariants = intervalsSkipList.getOverlapping(new SimpleInterval(r));
                    } else {
                        // Some reads do not form valid intervals (reads that do not consume any reference bases, e.g. CIGAR 61S90I).
                        // In those cases, treat the read as overlapping nothing.
                        overlappingVariants = Collections.emptyList();
                    }
                    return new Tuple2<>(r, new ReadContextData(referenceBases, overlappingVariants));
                }
            });
            // only include reads that start in the shard
            return Iterators.filter(transform, r -> r._1().getStart() >= shard.getStart() && r._1().getStart() <= shard.getEnd());
        }
    });
}
Also used: PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction), ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), Iterators (com.google.common.collect.Iterators), IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils), ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases), ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData), JavaRDD (org.apache.spark.api.java.JavaRDD), Nullable (javax.annotation.Nullable), Broadcast (org.apache.spark.broadcast.Broadcast), IntervalsSkipList (org.broadinstitute.hellbender.utils.collections.IntervalsSkipList), Function (com.google.common.base.Function), Iterator (java.util.Iterator), SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary), GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant), Tuple2 (scala.Tuple2), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval), Collectors (java.util.stream.Collectors), Shard (org.broadinstitute.hellbender.engine.Shard), List (java.util.List), UserException (org.broadinstitute.hellbender.exceptions.UserException), ShardBoundary (org.broadinstitute.hellbender.engine.ShardBoundary), Collections (java.util.Collections), ReadFilterLibrary (org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary)
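
The key idea in addUsingOverlapsPartitioning is to avoid a shuffle: the small side (the variants, held in an IntervalsSkipList) is broadcast to every executor, while the large side (the reads) is sharded, so each task probes the broadcast structure locally. Below is a stripped-down sketch of that broadcast-join pattern in plain Spark; integers stand in for reads and variants, and all names and data are hypothetical.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class BroadcastJoinSketch {
    public static void main(String[] args) {
        try (JavaSparkContext ctx = new JavaSparkContext("local[2]", "broadcast-join-sketch")) {
            // The "large" side: positions of reads (hypothetical data).
            final JavaRDD<Integer> reads = ctx.parallelize(Arrays.asList(5, 15, 25));
            // The "small" side is broadcast to every executor instead of shuffled.
            final Broadcast<List<Integer>> variants = ctx.broadcast(Arrays.asList(4, 5, 24, 25));
            // Pair each read with its "overlapping" variants (here: within 1 position)
            // by filtering the broadcast list locally; no shuffle is required.
            final JavaPairRDD<Integer, List<Integer>> joined = reads.mapToPair(pos -> {
                final List<Integer> overlapping = variants.value().stream()
                        .filter(v -> Math.abs(v - pos) <= 1)
                        .collect(Collectors.toList());
                return new Tuple2<>(pos, overlapping);
            });
            joined.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}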

Example 24 with ReferenceBases

Use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

The class JoinReadsWithRefBasesSparkUnitTest, method refBasesShuffleTest:

@Test(dataProvider = "bases", groups = "spark")
public void refBasesShuffleTest(List<GATKRead> reads, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<SimpleInterval> intervals) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    for (SimpleInterval i : intervals) {
        when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(i))).thenReturn(FakeReferenceSource.bases(i));
    }
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    JavaPairRDD<GATKRead, ReferenceBases> rddResult = ShuffleJoinReadsWithRefBases.addBases(mockSource, rddReads);
    Map<GATKRead, ReferenceBases> result = rddResult.collectAsMap();
    for (KV<GATKRead, ReferenceBases> kv : kvReadRefBases) {
        ReferenceBases referenceBases = result.get(kv.getKey());
        Assert.assertNotNull(referenceBases);
        Assert.assertEquals(kv.getValue(), referenceBases);
    }
}
Also used: GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases), ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource), PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
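
One detail worth noting: the mocked ReferenceMultiSource is created with withSettings().serializable() because it is captured in Spark closures and must survive serialization on its way to the executors. Below is a generic sketch of that Mockito pattern; BaseLookup is a hypothetical stand-in interface, not a GATK type.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.withSettings;

import java.io.Serializable;

public class SerializableMockSketch {

    // A hypothetical service interface standing in for ReferenceMultiSource.
    interface BaseLookup extends Serializable {
        String basesFor(String interval);
    }

    public static void main(String[] args) {
        // withSettings().serializable() lets the mock itself be serialized,
        // e.g. when a framework ships it to worker processes inside a closure.
        final BaseLookup mockLookup = mock(BaseLookup.class, withSettings().serializable());
        when(mockLookup.basesFor("1:100-110")).thenReturn("ACGTACGTACG");
        System.out.println(mockLookup.basesFor("1:100-110"));
    }
}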

Example 25 with ReferenceBases

Use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

The class JoinReadsWithRefBasesSparkUnitTest, method refBasesBroadcastTest:

@Test(dataProvider = "bases", groups = "spark")
public void refBasesBroadcastTest(List<GATKRead> reads, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<SimpleInterval> intervals) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    for (SimpleInterval i : intervals) {
        when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(i))).thenReturn(FakeReferenceSource.bases(i));
    }
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    JavaPairRDD<GATKRead, ReferenceBases> rddResult = BroadcastJoinReadsWithRefBases.addBases(mockSource, rddReads);
    Map<GATKRead, ReferenceBases> result = rddResult.collectAsMap();
    for (KV<GATKRead, ReferenceBases> kv : kvReadRefBases) {
        ReferenceBases referenceBases = result.get(kv.getKey());
        Assert.assertNotNull(referenceBases);
        Assert.assertEquals(kv.getValue(), referenceBases);
    }
}
Also used: GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases), ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource), PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
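
refBasesBroadcastTest is identical to refBasesShuffleTest except for the strategy under test. Judging by the class names, BroadcastJoinReadsWithRefBases ships the reference data to executors via a broadcast, while ShuffleJoinReadsWithRefBases pairs reads with their bases through a shuffle; both strategies are expected to produce the same read-to-bases mapping, which is why the two tests share a data provider and identical assertions.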

Aggregations

ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases): 29
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 24
Test (org.testng.annotations.Test): 15
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 14
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 10
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 6
ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource): 6
SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord): 5
ReferenceContext (org.broadinstitute.hellbender.engine.ReferenceContext): 5
ReferenceDataSource (org.broadinstitute.hellbender.engine.ReferenceDataSource): 5
ReferenceMemorySource (org.broadinstitute.hellbender.engine.ReferenceMemorySource): 5
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 4
ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData): 4
GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant): 4
PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions): 3
Allele (htsjdk.variant.variantcontext.Allele): 3
VariantContext (htsjdk.variant.variantcontext.VariantContext): 3
VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder): 3
ArrayList (java.util.ArrayList): 3
List (java.util.List): 3