use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk-protected by broadinstitute.
the class HaplotypeCallerGenotypingEngine method makeAnnotatedCall.
protected VariantContext makeAnnotatedCall(byte[] ref, SimpleInterval refLoc, FeatureContext tracker, SAMFileHeader header, VariantContext mergedVC, ReadLikelihoods<Allele> readAlleleLikelihoods, VariantContext call) {
final SimpleInterval locus = new SimpleInterval(mergedVC.getContig(), mergedVC.getStart(), mergedVC.getEnd());
final SimpleInterval refLocInterval = new SimpleInterval(refLoc);
final ReferenceDataSource refData = new ReferenceMemorySource(new ReferenceBases(ref, refLocInterval), header.getSequenceDictionary());
final ReferenceContext referenceContext = new ReferenceContext(refData, locus, refLocInterval);
final VariantContext untrimmedResult = annotationEngine.annotateContext(call, tracker, referenceContext, readAlleleLikelihoods, a -> true);
return call.getAlleles().size() == mergedVC.getAlleles().size() ? untrimmedResult : GATKVariantContextUtils.reverseTrimAlleles(untrimmedResult);
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class BAQUnitTest method createData1.
@DataProvider(name = "data")
public Object[][] createData1() {
List<BAQTest> params = new ArrayList<>();
SAMSequenceDictionary dict = new SAMSequenceDictionary();
dict.addSequence(new SAMSequenceRecord("1", Integer.MAX_VALUE));
params.add(new BAQTest("GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG", "GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG", "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG", "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE"));
params.add(new BAQTest("GCTTTTTCTCCTCCTG", "GCTTTTCCTCCTCCTG", "IIHGGGIHHIIHHIIH", "EI410..0HIIHHIIE"));
final String refString1 = "AAATTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATAG";
final ReferenceDataSource rds1 = new ReferenceMemorySource(new ReferenceBases(refString1.getBytes(), new SimpleInterval("1", 9999807, 10000032)), dict);
// big and complex, also does a cap from 3 to 4!
params.add(new BAQTest(-3, 9999810L, "49M1I126M1I20M1I25M", refString1, "TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA", ">IHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?B3BBC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@C<>5;<A5=A;>=64>???B>=6497<<;;<;>2?>BA@??A6<<A59", ">EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@%<>5;<A5=A;>=64>???B;86497<<;;<;>2?>BA@??A6<<A59", rds1));
final String refString2 = "CCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCC";
final ReferenceDataSource rds2 = new ReferenceMemorySource(new ReferenceBases(refString2.getBytes(), new SimpleInterval("1", 9999963, 10000004)), dict);
// now changes
params.add(new BAQTest(-3, 9999966L, "36M", refString2, "AGTAGCTGGGACTACAGGCACCCACCACCACGCCTG", "A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9", "A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9", rds2));
final String refString3 = "CCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATA";
final ReferenceDataSource rds3 = new ReferenceMemorySource(new ReferenceBases(refString3.getBytes(), new SimpleInterval("1", 9999990, 10000031)), dict);
// raw base qualities are low -- but they shouldn't be capped
params.add(new BAQTest(-3, 9999993L, "4=13X2=3X1=4X2=4X1=2X", refString3, "CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG", "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8", "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8", rds3));
List<Object[]> params2 = new ArrayList<>();
for (BAQTest x : params) params2.add(new Object[] { x });
return params2.toArray(new Object[][] {});
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class AddContextDataToReadSpark method addUsingOverlapsPartitioning.
/**
* Add context data ({@link ReadContextData}) to reads, using overlaps partitioning to avoid a shuffle.
* @param ctx the Spark context
* @param mappedReads the coordinate-sorted reads
* @param referenceSource the reference source
* @param variants the coordinate-sorted variants
* @param sequenceDictionary the sequence dictionary for the reads
* @param shardSize the maximum size of each shard, in bases
* @param shardPadding amount of extra context around each shard, in bases
* @return a RDD of read-context pairs, in coordinate-sorted order
*/
private static JavaPairRDD<GATKRead, ReadContextData> addUsingOverlapsPartitioning(final JavaSparkContext ctx, final JavaRDD<GATKRead> mappedReads, final ReferenceMultiSource referenceSource, final JavaRDD<GATKVariant> variants, final SAMSequenceDictionary sequenceDictionary, final int shardSize, final int shardPadding) {
final List<SimpleInterval> intervals = IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
// use unpadded shards (padding is only needed for reference bases)
final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardSize, 0, sequenceDictionary).stream()).collect(Collectors.toList());
final Broadcast<ReferenceMultiSource> bReferenceSource = ctx.broadcast(referenceSource);
final IntervalsSkipList<GATKVariant> variantSkipList = new IntervalsSkipList<>(variants.collect());
final Broadcast<IntervalsSkipList<GATKVariant>> variantsBroadcast = ctx.broadcast(variantSkipList);
int maxLocatableSize = Math.min(shardSize, shardPadding);
JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, mappedReads, GATKRead.class, sequenceDictionary, intervalShards, maxLocatableSize);
return shardedReads.flatMapToPair(new PairFlatMapFunction<Shard<GATKRead>, GATKRead, ReadContextData>() {
private static final long serialVersionUID = 1L;
@Override
public Iterator<Tuple2<GATKRead, ReadContextData>> call(Shard<GATKRead> shard) throws Exception {
// get reference bases for this shard (padded)
SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(shardPadding, sequenceDictionary);
ReferenceBases referenceBases = bReferenceSource.getValue().getReferenceBases(null, paddedInterval);
final IntervalsSkipList<GATKVariant> intervalsSkipList = variantsBroadcast.getValue();
Iterator<Tuple2<GATKRead, ReadContextData>> transform = Iterators.transform(shard.iterator(), new Function<GATKRead, Tuple2<GATKRead, ReadContextData>>() {
@Nullable
@Override
public Tuple2<GATKRead, ReadContextData> apply(@Nullable GATKRead r) {
List<GATKVariant> overlappingVariants;
if (SimpleInterval.isValid(r.getContig(), r.getStart(), r.getEnd())) {
overlappingVariants = intervalsSkipList.getOverlapping(new SimpleInterval(r));
} else {
//Sometimes we have reads that do not form valid intervals (reads that do not consume any ref bases, eg CIGAR 61S90I
//In those cases, we'll just say that nothing overlaps the read
overlappingVariants = Collections.emptyList();
}
return new Tuple2<>(r, new ReadContextData(referenceBases, overlappingVariants));
}
});
// only include reads that start in the shard
return Iterators.filter(transform, r -> r._1().getStart() >= shard.getStart() && r._1().getStart() <= shard.getEnd());
}
});
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class JoinReadsWithRefBasesSparkUnitTest method refBasesShuffleTest.
@Test(dataProvider = "bases", groups = "spark")
public void refBasesShuffleTest(List<GATKRead> reads, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<SimpleInterval> intervals) throws IOException {
JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
for (SimpleInterval i : intervals) {
when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(i))).thenReturn(FakeReferenceSource.bases(i));
}
when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
JavaPairRDD<GATKRead, ReferenceBases> rddResult = ShuffleJoinReadsWithRefBases.addBases(mockSource, rddReads);
Map<GATKRead, ReferenceBases> result = rddResult.collectAsMap();
for (KV<GATKRead, ReferenceBases> kv : kvReadRefBases) {
ReferenceBases referenceBases = result.get(kv.getKey());
Assert.assertNotNull(referenceBases);
Assert.assertEquals(kv.getValue(), referenceBases);
}
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class JoinReadsWithRefBasesSparkUnitTest method refBasesBroadcastTest.
@Test(dataProvider = "bases", groups = "spark")
public void refBasesBroadcastTest(List<GATKRead> reads, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<SimpleInterval> intervals) throws IOException {
JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
for (SimpleInterval i : intervals) {
when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(i))).thenReturn(FakeReferenceSource.bases(i));
}
when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
JavaPairRDD<GATKRead, ReferenceBases> rddResult = BroadcastJoinReadsWithRefBases.addBases(mockSource, rddReads);
Map<GATKRead, ReferenceBases> result = rddResult.collectAsMap();
for (KV<GATKRead, ReferenceBases> kv : kvReadRefBases) {
ReferenceBases referenceBases = result.get(kv.getKey());
Assert.assertNotNull(referenceBases);
Assert.assertEquals(kv.getValue(), referenceBases);
}
}
Aggregations