use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class AddContextDataToReadSparkOptimized method fillContext.
/**
* Given a shard that has reads and variants, query Google Genomics' Reference server and get reference info
* (including an extra margin on either side), and fill that and the correct variants into readContext.
*/
public static ContextShard fillContext(ReferenceMultiSource refSource, ContextShard shard) {
if (null == shard)
return null;
// use the function to make sure we get the exact correct amount of reference bases
int start = Integer.MAX_VALUE;
int end = Integer.MIN_VALUE;
SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction = refSource.getReferenceWindowFunction();
for (GATKRead r : shard.reads) {
SimpleInterval readRefs = referenceWindowFunction.apply(r);
start = Math.min(readRefs.getStart(), start);
end = Math.max(readRefs.getEnd(), end);
}
if (start == Integer.MAX_VALUE) {
// there are no reads in this shard, so we're going to remove it
return null;
}
SimpleInterval refInterval = new SimpleInterval(shard.interval.getContig(), start, end);
ReferenceBases refBases;
try {
refBases = refSource.getReferenceBases(null, refInterval);
} catch (IOException x) {
throw new GATKException("Unable to read the reference");
}
ArrayList<ReadContextData> readContext = new ArrayList<>();
for (GATKRead r : shard.reads) {
SimpleInterval readInterval = new SimpleInterval(r);
List<GATKVariant> variantsOverlappingThisRead = shard.variantsOverlapping(readInterval);
// we pass all the bases. That's better because this way it's just a shared
// pointer instead of being an array copy. Downstream processing is fine with having
// extra bases (it expects a few, actually).
readContext.add(new ReadContextData(refBases, variantsOverlappingThisRead));
}
return shard.withReadContext(readContext);
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class ShuffleJoinReadsWithRefBases method addBases.
/**
* Joins each read of an RDD<GATKRead, T> with key's corresponding reference sequence.
*
* @param referenceDataflowSource The source of the reference sequence information
* @param keyedByRead The read-keyed RDD for which to extract reference sequence information
* @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object and the value
*/
public static <T> JavaPairRDD<GATKRead, Tuple2<T, ReferenceBases>> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaPairRDD<GATKRead, T> keyedByRead) {
SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();
JavaPairRDD<ReferenceShard, Tuple2<GATKRead, T>> shardRead = keyedByRead.mapToPair(pair -> {
ReferenceShard shard = ReferenceShard.getShardNumberFromInterval(windowFunction.apply(pair._1()));
return new Tuple2<>(shard, pair);
});
JavaPairRDD<ReferenceShard, Iterable<Tuple2<GATKRead, T>>> shardiRead = shardRead.groupByKey();
return shardiRead.flatMapToPair(in -> {
List<Tuple2<GATKRead, Tuple2<T, ReferenceBases>>> out = Lists.newArrayList();
Iterable<Tuple2<GATKRead, T>> iReads = in._2();
final List<SimpleInterval> readWindows = Utils.stream(iReads).map(pair -> windowFunction.apply(pair._1())).collect(Collectors.toList());
SimpleInterval interval = IntervalUtils.getSpanningInterval(readWindows);
ReferenceBases bases = referenceDataflowSource.getReferenceBases(null, interval);
for (Tuple2<GATKRead, T> p : iReads) {
final ReferenceBases subset = bases.getSubset(windowFunction.apply(p._1()));
out.add(new Tuple2<>(p._1(), new Tuple2<>(p._2(), subset)));
}
return out.iterator();
});
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class BaseRecalibrationEngineUnitTest method testCalculateIsIndel.
@Test(dataProvider = "CalculateIsIndelData")
public void testCalculateIsIndel(final String cigar, final boolean negStrand, final EventType mode, final int[] expected) {
final GATKRead read = ArtificialReadUtils.createArtificialRead(TextCigarCodec.decode(cigar));
read.setIsReverseStrand(negStrand);
// Fake reference data, since the indel calculation does not use the reference at all.
final ReferenceDataSource refSource = new ReferenceMemorySource(new ReferenceBases(Utils.repeatBytes((byte) 'A', read.getEnd() - read.getStart() + 1), new SimpleInterval(read)), ArtificialReadUtils.createArtificialSamHeader().getSequenceDictionary());
int[] isSNP = new int[read.getLength()];
int[] isInsertion = new int[isSNP.length];
int[] isDeletion = new int[isSNP.length];
BaseRecalibrationEngine.calculateIsSNPOrIndel(read, refSource, isSNP, isInsertion, isDeletion);
final int[] actual = (mode == EventType.BASE_INSERTION ? isInsertion : isDeletion);
Assert.assertEquals(actual, expected, "calculateIsSNPOrIndel() failed with " + mode + " and cigar " + cigar + " Expected " + Arrays.toString(expected) + " but got " + Arrays.toString(actual));
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class AssemblyRegion method determineAssemblyRegionBounds.
/**
* Helper method for {@link #createFromReadShard} that uses the provided activity profile and locus iterator
* to generate a set of assembly regions covering the fully-padded span of the provided Shard. The returned
* assembly regions will not contain any reads.
*
* @param shard Shard to divide into assembly regions
* @param locusIterator Iterator over pileups to be fed to the AssemblyRegionEvaluator
* @param activityProfile Activity profile to generate the assembly regions
* @param readsHeader header for the reads
* @param referenceContext reference data overlapping the shard's extended span (including padding)
* @param features features overlapping the shard's extended span (including padding)
* @param evaluator AssemblyRegionEvaluator used to label each locus as either active or inactive
* @param minRegionSize minimum size for each assembly region
* @param maxRegionSize maximum size for each assembly region
* @param assemblyRegionPadding each assembly region will be padded by this amount on each side
* @return A list of AssemblyRegions covering
*/
private static List<AssemblyRegion> determineAssemblyRegionBounds(final Shard<GATKRead> shard, final LocusIteratorByState locusIterator, final ActivityProfile activityProfile, final SAMFileHeader readsHeader, final ReferenceContext referenceContext, final FeatureContext features, final AssemblyRegionEvaluator evaluator, final int minRegionSize, final int maxRegionSize, final int assemblyRegionPadding) {
// Use the provided activity profile to determine the bounds of each assembly region:
List<AssemblyRegion> assemblyRegions = new ArrayList<>();
locusIterator.forEachRemaining(pileup -> {
if (!activityProfile.isEmpty()) {
final boolean forceConversion = pileup.getLocation().getStart() != activityProfile.getEnd() + 1;
assemblyRegions.addAll(activityProfile.popReadyAssemblyRegions(assemblyRegionPadding, minRegionSize, maxRegionSize, forceConversion));
}
if (shard.getPaddedInterval().contains(pileup.getLocation())) {
final SimpleInterval pileupInterval = new SimpleInterval(pileup.getLocation());
final ReferenceBases refBase = new ReferenceBases(new byte[] { referenceContext.getBases()[pileup.getLocation().getStart() - referenceContext.getWindow().getStart()] }, pileupInterval);
final ReferenceContext pileupRefContext = new ReferenceContext(new ReferenceMemorySource(refBase, readsHeader.getSequenceDictionary()), pileupInterval);
final ActivityProfileState profile = evaluator.isActive(pileup, pileupRefContext, features);
activityProfile.add(profile);
}
});
assemblyRegions.addAll(activityProfile.popReadyAssemblyRegions(assemblyRegionPadding, minRegionSize, maxRegionSize, true));
return assemblyRegions;
}
use of org.broadinstitute.hellbender.utils.reference.ReferenceBases in project gatk by broadinstitute.
the class HaplotypeCallerGenotypingEngine method makeAnnotatedCall.
protected VariantContext makeAnnotatedCall(byte[] ref, SimpleInterval refLoc, FeatureContext tracker, SAMFileHeader header, VariantContext mergedVC, ReadLikelihoods<Allele> readAlleleLikelihoods, VariantContext call) {
final SimpleInterval locus = new SimpleInterval(mergedVC.getContig(), mergedVC.getStart(), mergedVC.getEnd());
final SimpleInterval refLocInterval = new SimpleInterval(refLoc);
final ReferenceDataSource refData = new ReferenceMemorySource(new ReferenceBases(ref, refLocInterval), header.getSequenceDictionary());
final ReferenceContext referenceContext = new ReferenceContext(refData, locus, refLocInterval);
final VariantContext untrimmedResult = annotationEngine.annotateContext(call, tracker, referenceContext, readAlleleLikelihoods, a -> true);
return call.getAlleles().size() == mergedVC.getAlleles().size() ? untrimmedResult : GATKVariantContextUtils.reverseTrimAlleles(untrimmedResult);
}
Aggregations