Search in sources :

Example 16 with ReferenceBases

use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

the class ShuffleJoinReadsWithRefBases method addBases.

/**
 * Pairs every read in the input RDD with the reference bases covering that read's reference window.
 *
 * Reads are keyed by the ReferenceShard computed from their window (as given by the source's
 * reference window function) and grouped by shard. For each group, the reference is queried once
 * for the interval spanning all of the group's windows, and each read then receives the subset of
 * those bases that matches its own window.
 *
 * @param referenceDataflowSource The source of the reference sequence information
 * @param reads The reads for which to extract reference sequence information
 * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object
 */
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaRDD<GATKRead> reads) {
    // TODO: reimpl this method by calling out to the more complex version?
    final SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();

    // Key each read by the shard derived from its reference window, then gather the reads of each shard.
    final JavaPairRDD<ReferenceShard, Iterable<GATKRead>> readsByShard = reads
            .mapToPair(read -> new Tuple2<>(ReferenceShard.getShardNumberFromInterval(windowFunction.apply(read)), read))
            .groupByKey();

    return readsByShard.flatMapToPair(shardAndReads -> {
        final Iterable<GATKRead> shardReads = shardAndReads._2();

        // One reference lookup per group: fetch the bases for the interval spanning every read's window.
        final List<SimpleInterval> windows = Utils.stream(shardReads)
                .map(windowFunction::apply)
                .collect(Collectors.toList());
        final SimpleInterval spanningInterval = IntervalUtils.getSpanningInterval(windows);
        final ReferenceBases spanningBases = referenceDataflowSource.getReferenceBases(null, spanningInterval);

        // Cut the per-read window out of the shared bases; the list is fully built before it is returned.
        final List<Tuple2<GATKRead, ReferenceBases>> joined = Utils.stream(shardReads)
                .map(read -> new Tuple2<>(read, spanningBases.getSubset(windowFunction.apply(read))))
                .collect(Collectors.toList());
        return joined.iterator();
    });
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) List(java.util.List) Lists(com.google.common.collect.Lists) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Utils(org.broadinstitute.hellbender.utils.Utils) StreamSupport(java.util.stream.StreamSupport) SerializableFunction(org.broadinstitute.hellbender.utils.SerializableFunction) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) JavaRDD(org.apache.spark.api.java.JavaRDD) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Tuple2(scala.Tuple2) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 17 with ReferenceBases

use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

the class ReferenceTwoBitSource method getReferenceBases.

/**
 * Gets the reference bases spanning the requested interval. If the interval ends beyond the end of its
 * contig according to our reference source's dictionary, it will be truncated at the contig end.
 *
 * @param pipelineOptions pipeline options (may be null); this implementation does not read it
 * @param interval query interval
 * @return A ReferenceBases containing the reference bases spanning the requested interval, cropped at the
 *         contig end if necessary
 * @throws IOException as declared by the signature; propagated from the calls made here if one of them raises it
 */
@Override
public ReferenceBases getReferenceBases(PipelineOptions pipelineOptions, SimpleInterval interval) throws IOException {
    final SimpleInterval queryInterval = cropIntervalAtContigEnd(interval);
    final String bases = twoBitFile.extract(simpleIntervalToReferenceRegion(queryInterval));
    // Encode with an explicit charset so the resulting bytes do not depend on the JVM's platform default.
    // NOTE(review): the extracted bases are presumably plain ASCII nucleotide codes, for which UTF-8
    // yields one byte per base -- confirm against the two-bit reader.
    return new ReferenceBases(bases.getBytes(java.nio.charset.StandardCharsets.UTF_8), queryInterval);
}
Also used : ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 18 with ReferenceBases

use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

the class ReferenceAPISourceUnitTest method testOffContig.

@Test(groups = "cloud", expectedExceptions = UserException.ReferenceAPIReturnedUnexpectedNumberOfBytes.class)
public void testOffContig() throws Exception {
    // Query a window that begins shortly before the end of the contig and extends past it.
    final int contigLength = 249250621; // value the test uses as the b37 chr1 length
    final int basesPerPage = 300;
    final int queryStart = contigLength - 11;
    final int queryEnd = contigLength + 12;
    final SimpleInterval offContigWindow = new SimpleInterval("1", queryStart, queryEnd);
    // The expected exception is declared on @Test, so the returned bases are never inspected.
    queryReferenceAPI(ReferenceAPISource.HS37D5_REF_ID, offContigWindow, basesPerPage);
}
Also used : ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 19 with ReferenceBases

use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

the class ReferenceAPISourceUnitTest method testReferenceSourceMultiSmallPagesQuery.

@Test(groups = "cloud")
public void testReferenceSourceMultiSmallPagesQuery() {
    final int basesPerPage = 300;

    // 1001 bases: not a whole number of pages, so the last page fetched is a partial one.
    final SimpleInterval partialPageWindow = new SimpleInterval("1", 50000, 51000);
    final ReferenceBases partialPageResult = queryReferenceAPI(HS37D5_REF_ID, partialPageWindow, basesPerPage);

    // 900 bases: a whole number of pages, so the query ends exactly on a page boundary.
    final SimpleInterval wholePagesWindow = new SimpleInterval("1", 50025, 50924);
    final ReferenceBases wholePagesResult = queryReferenceAPI(HS37D5_REF_ID, wholePagesWindow, basesPerPage);

    Assert.assertNotNull(partialPageResult);
    Assert.assertNotNull(partialPageResult.getBases());
    Assert.assertNotNull(wholePagesResult);
    Assert.assertNotNull(wholePagesResult.getBases());

    // Interval ends are inclusive, so the expected length is (end - start + 1).
    Assert.assertEquals(partialPageResult.getBases().length, 1001, "Wrong number of bases returned");
    Assert.assertEquals(wholePagesResult.getBases().length, 900, "Wrong number of bases returned");

    // Both queries cover 50025-50902; the bases in that shared stretch must be identical.
    final ReferenceBases sharedFromPartial = partialPageResult.getSubset(new SimpleInterval("1", 50025, 50902));
    final ReferenceBases sharedFromWhole = wholePagesResult.getSubset(new SimpleInterval("1", 50025, 50902));
    Assert.assertEquals(sharedFromPartial.getBases(), sharedFromWhole.getBases(), "seam doesn't match (paging bug?)");
}
Also used : ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 20 with ReferenceBases

use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.

the class ReferenceAPISourceUnitTest method testReferenceSourceVaryingPageSizeQuery.

@Test(groups = "cloud")
public void testReferenceSourceVaryingPageSizeQuery() {
    final SimpleInterval window = new SimpleInterval("1", 50000, 50050);

    // Same window fetched twice: once without an explicit page size, once with a page size of 10.
    final ReferenceBases withoutPageSize = queryReferenceAPI(HS37D5_REF_ID, window);
    final ReferenceBases withPageSizeTen = queryReferenceAPI(HS37D5_REF_ID, window, 10);

    Assert.assertNotNull(withoutPageSize);
    Assert.assertNotNull(withoutPageSize.getBases());
    Assert.assertNotNull(withPageSizeTen);
    Assert.assertNotNull(withPageSizeTen.getBases());

    // How the result was paged must not affect the bases that come back.
    Assert.assertEquals(withoutPageSize.getBases(), withPageSizeTen.getBases(), "bases should match despite different paging size");
}
Also used : ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Aggregations

ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)29 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)24 Test (org.testng.annotations.Test)15 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)14 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)10 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)6 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)6 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)5 ReferenceContext (org.broadinstitute.hellbender.engine.ReferenceContext)5 ReferenceDataSource (org.broadinstitute.hellbender.engine.ReferenceDataSource)5 ReferenceMemorySource (org.broadinstitute.hellbender.engine.ReferenceMemorySource)5 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)4 ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData)4 GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant)4 PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions)3 Allele (htsjdk.variant.variantcontext.Allele)3 VariantContext (htsjdk.variant.variantcontext.VariantContext)3 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)3 ArrayList (java.util.ArrayList)3 List (java.util.List)3