use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class ShuffleJoinReadsWithRefBases method addBases.
/**
 * Pairs every read in an RDD with the reference bases covering that read's reference window.
 *
 * <p>Reads are keyed by the {@link ReferenceShard} derived from their window, grouped per shard,
 * and for each shard a single reference query is issued over the interval spanning all windows in
 * the group. Each read then receives the subset of those bases matching its own window.
 *
 * @param referenceDataflowSource The source of the reference sequence information; also supplies
 *                                the function that maps a read to its reference window
 * @param reads The reads for which to extract reference sequence information
 * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object
 */
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaRDD<GATKRead> reads) {
    // TODO: reimpl this method by calling out to the more complex version?
    final SerializableFunction<GATKRead, SimpleInterval> windowOf = referenceDataflowSource.getReferenceWindowFunction();

    // Key each read by the reference shard computed from its window.
    final JavaPairRDD<ReferenceShard, GATKRead> readsKeyedByShard = reads.mapToPair(
            read -> new Tuple2<>(ReferenceShard.getShardNumberFromInterval(windowOf.apply(read)), read));

    // Collect all reads that landed on the same shard.
    final JavaPairRDD<ReferenceShard, Iterable<GATKRead>> readsGroupedByShard = readsKeyedByShard.groupByKey();

    return readsGroupedByShard.flatMapToPair(shardAndReads -> {
        final List<Tuple2<GATKRead, ReferenceBases>> joined = Lists.newArrayList();
        final Iterable<GATKRead> shardReads = shardAndReads._2();

        // One reference query per shard: fetch the bases spanning every window in the group.
        final List<SimpleInterval> windows = Utils.stream(shardReads)
                .map(windowOf::apply)
                .collect(Collectors.toList());
        final SimpleInterval spanningInterval = IntervalUtils.getSpanningInterval(windows);
        // No pipeline options are supplied here (null is passed).
        final ReferenceBases spanningBases = referenceDataflowSource.getReferenceBases(null, spanningInterval);

        // Carve out each read's own window from the shared bases.
        for (final GATKRead read : shardReads) {
            joined.add(new Tuple2<>(read, spanningBases.getSubset(windowOf.apply(read))));
        }
        return joined.iterator();
    });
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class ReferenceTwoBitSource method getReferenceBases.
/**
 * Gets the reference bases spanning the requested interval. If the interval ends beyond the end of its
 * contig according to our reference source's dictionary, it will be truncated at the contig end.
 *
 * @param pipelineOptions pipeline options (may be null); not used by this implementation
 * @param interval query interval
 * @return A ReferenceBases containing the reference bases spanning the requested interval, cropped at the
 * contig end if necessary
 * @throws IOException declared by the overridden signature
 */
@Override
public ReferenceBases getReferenceBases(PipelineOptions pipelineOptions, SimpleInterval interval) throws IOException {
    final SimpleInterval queryInterval = cropIntervalAtContigEnd(interval);
    final String bases = twoBitFile.extract(simpleIntervalToReferenceRegion(queryInterval));
    // Encode with an explicit charset so the resulting bytes do not depend on the JVM's platform
    // default encoding. Base letters are presumably plain ASCII, for which UTF-8 yields one byte each.
    return new ReferenceBases(bases.getBytes(java.nio.charset.StandardCharsets.UTF_8), queryInterval);
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class ReferenceAPISourceUnitTest method testOffContig.
/**
 * Queries an interval that starts shortly before the end of b37 chromosome 1 and extends past it.
 * The test passes only if the query throws
 * {@code UserException.ReferenceAPIReturnedUnexpectedNumberOfBytes}.
 */
@Test(groups = "cloud", expectedExceptions = UserException.ReferenceAPIReturnedUnexpectedNumberOfBytes.class)
public void testOffContig() throws Exception {
    //Test the case of a query that starts before the end of the contig and runs off
    final int b37Chr1Len = 249250621;
    final int pageSize = 300;
    final int beforeEnd = 11;
    final int pastEnd = 12;
    final int start = b37Chr1Len - beforeEnd;
    final int end = b37Chr1Len + pastEnd;
    // The return value is deliberately discarded: only the expected exception matters here.
    queryReferenceAPI(ReferenceAPISource.HS37D5_REF_ID, new SimpleInterval("1", start, end), pageSize);
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class ReferenceAPISourceUnitTest method testReferenceSourceMultiSmallPagesQuery.
/**
 * Fetches two overlapping intervals using a small page size, one whose length is not a multiple of
 * the page size and one whose length is, then checks the returned lengths and that the region
 * common to both queries yields identical bases.
 */
@Test(groups = "cloud")
public void testReferenceSourceMultiSmallPagesQuery() {
    final int pageSize = 300;
    final SimpleInterval partialPageInterval = new SimpleInterval("1", 50000, 51000);
    final SimpleInterval wholePagesInterval = new SimpleInterval("1", 50025, 50924);

    // length is not a multiple of pageSize, so the last page fetched is a partial one
    final ReferenceBases partialPageBases = queryReferenceAPI(HS37D5_REF_ID, partialPageInterval, pageSize);
    // length is an exact multiple of pageSize, so the query ends on a page boundary
    final ReferenceBases wholePagesBases = queryReferenceAPI(HS37D5_REF_ID, wholePagesInterval, pageSize);

    Assert.assertNotNull(partialPageBases);
    Assert.assertNotNull(partialPageBases.getBases());
    Assert.assertNotNull(wholePagesBases);
    Assert.assertNotNull(wholePagesBases.getBases());

    // interval ends are inclusive, so the base count is (end - start + 1)
    Assert.assertEquals(partialPageBases.getBases().length, 1001, "Wrong number of bases returned");
    Assert.assertEquals(wholePagesBases.getBases().length, 900, "Wrong number of bases returned");

    // compare a stretch that both queries cover
    final ReferenceBases sharedFromPartial = partialPageBases.getSubset(new SimpleInterval("1", 50025, 50902));
    final ReferenceBases sharedFromWhole = wholePagesBases.getSubset(new SimpleInterval("1", 50025, 50902));
    Assert.assertEquals(sharedFromPartial.getBases(), sharedFromWhole.getBases(), "seam doesn't match (paging bug?)");
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class ReferenceAPISourceUnitTest method testReferenceSourceVaryingPageSizeQuery.
/**
 * Fetches the same interval twice, once without an explicit page size and once with a page size
 * of 10, and checks that both queries return the same bases.
 */
@Test(groups = "cloud")
public void testReferenceSourceVaryingPageSizeQuery() {
    final SimpleInterval queryInterval = new SimpleInterval("1", 50000, 50050);

    final ReferenceBases unpagedResult = queryReferenceAPI(HS37D5_REF_ID, queryInterval);
    final ReferenceBases smallPageResult = queryReferenceAPI(HS37D5_REF_ID, queryInterval, 10);

    Assert.assertNotNull(unpagedResult);
    Assert.assertNotNull(unpagedResult.getBases());
    Assert.assertNotNull(smallPageResult);
    Assert.assertNotNull(smallPageResult.getBases());

    // paging is an implementation detail of the fetch and must not alter the bases returned
    Assert.assertEquals(unpagedResult.getBases(), smallPageResult.getBases(), "bases should match despite different paging size");
}
Aggregations