Search in sources :

Example 71 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk-protected by broadinstitute.

the class BaseQualityUnitTest method test.

@Test
public void test() {
    final SAMFileHeader SAM_HEADER = ArtificialReadUtils.createArtificialSamHeader(10, 0, 1000);
    final List<Allele> alleles = Arrays.asList(Allele.create((byte) 'A', true), Allele.create((byte) 'C', false));
    final AlleleList<Allele> alleleList = new IndexedAlleleList<>(alleles);
    // variant is a SNP at position 20
    final int chromosomeIndex = 5;
    final int variantSite = 20;
    final VariantContext vc = new VariantContextBuilder("source", Integer.toString(chromosomeIndex), variantSite, variantSite, alleles).make();
    final org.broadinstitute.hellbender.utils.genotyper.SampleList sampleList = new IndexedSampleList("SAMPLE");
    //7 reads of length = 3 -- the middle base is at the variant position
    final Map<String, List<GATKRead>> readMap = new LinkedHashMap<>();
    final List<GATKRead> reads = new ArrayList<>();
    final byte[] quals = new byte[] { 30, 30, 30, 30, 25, 25, 25 };
    for (int r = 0; r < quals.length; r++) {
        final GATKRead read = ArtificialReadUtils.createArtificialRead(SAM_HEADER, "RRR00" + r, chromosomeIndex, 19, "ACG".getBytes(), new byte[] { 4, quals[r], 55 }, "3M");
        read.setMappingQuality(60);
        reads.add(read);
    }
    readMap.put("SAMPLE", reads);
    final ReadLikelihoods<Allele> likelihoods = new ReadLikelihoods<>(sampleList, alleleList, readMap);
    //we will make the first four reads ref (median position = 2) and the last three alt (median position 10, hence
    // median distance from end = 1)
    final LikelihoodMatrix<Allele> matrix = likelihoods.sampleMatrix(0);
    // log likelihoods are initialized to 0, so we can "turn on" a read for a particular allele by setting the
    // (allele, read) entry to 10
    matrix.set(0, 0, 10);
    matrix.set(0, 1, 10);
    matrix.set(0, 2, 10);
    matrix.set(0, 3, 10);
    matrix.set(1, 4, 10);
    matrix.set(1, 5, 10);
    matrix.set(1, 6, 10);
    final BaseQuality bq = new BaseQuality();
    final GenotypeBuilder gb = new GenotypeBuilder(DUMMY_GENOTYPE);
    bq.annotate(null, vc, DUMMY_GENOTYPE, gb, likelihoods);
    final Genotype g = gb.make();
    final int[] medianRefAndAltQuals = (int[]) g.getExtendedAttribute(BaseQuality.KEY);
    Assert.assertEquals(medianRefAndAltQuals[0], 30);
    Assert.assertEquals(medianRefAndAltQuals[1], 25);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) org.broadinstitute.hellbender.utils.genotyper(org.broadinstitute.hellbender.utils.genotyper) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Test(org.testng.annotations.Test)

Example 72 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class IntervalWalkerSpark method getIntervalsFunction.

private static org.apache.spark.api.java.function.Function<Shard<GATKRead>, IntervalWalkerContext> getIntervalsFunction(Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager, SAMSequenceDictionary sequenceDictionary, int intervalShardPadding) {
    return (org.apache.spark.api.java.function.Function<Shard<GATKRead>, IntervalWalkerContext>) shard -> {
        SimpleInterval interval = shard.getInterval();
        SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(intervalShardPadding, sequenceDictionary);
        ReadsContext readsContext = new ReadsContext(new GATKDataSource<GATKRead>() {

            @Override
            public Iterator<GATKRead> iterator() {
                return shard.iterator();
            }

            @Override
            public Iterator<GATKRead> query(SimpleInterval interval) {
                return StreamSupport.stream(shard.spliterator(), false).filter(r -> IntervalUtils.overlaps(r, interval)).iterator();
            }
        }, shard.getInterval());
        ReferenceDataSource reference = bReferenceSource == null ? null : new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedInterval), sequenceDictionary);
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return new IntervalWalkerContext(interval, readsContext, new ReferenceContext(reference, interval), new FeatureContext(features, interval));
    };
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Iterator(java.util.Iterator) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 73 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class LocusWalkerSpark method getAlignments.

/**
     * Loads alignments and the corresponding reference and features into a {@link JavaRDD} for the intervals specified.
     *
     * If no intervals were specified, returns all the alignments.
     *
     * @return all alignments as a {@link JavaRDD}, bounded by intervals if specified.
     */
public JavaRDD<LocusWalkerContext> getAlignments(JavaSparkContext ctx) {
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    List<SimpleInterval> intervals = hasIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, readShardPadding, sequenceDictionary).stream()).collect(Collectors.toList());
    int maxLocatableSize = Math.min(readShardSize, readShardPadding);
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShards, maxLocatableSize, shuffle);
    Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
    return shardedReads.flatMap(getAlignmentsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, getHeaderForReads(), getDownsamplingInfo()));
}
Also used : Broadcast(org.apache.spark.broadcast.Broadcast) java.util(java.util) IntervalOverlappingIterator(org.broadinstitute.hellbender.utils.iterators.IntervalOverlappingIterator) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LocusIteratorByState(org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ImmutableList(com.google.common.collect.ImmutableList) StreamSupport(java.util.stream.StreamSupport) LIBSDownsamplingInfo(org.broadinstitute.hellbender.utils.locusiterator.LIBSDownsamplingInfo) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) CommandLineException(org.broadinstitute.barclay.argparser.CommandLineException) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 74 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class AssemblyBasedCallerUtils method realignReadsToTheirBestHaplotype.

/**
     * Returns a map with the original read as a key and the realigned read as the value.
     * <p>
     *     Missing keys or equivalent key and value pairs mean that the read was not realigned.
     * </p>
     * @return never {@code null}
     */
public static Map<GATKRead, GATKRead> realignReadsToTheirBestHaplotype(final ReadLikelihoods<Haplotype> originalReadLikelihoods, final Haplotype refHaplotype, final Locatable paddedReferenceLoc) {
    final Collection<ReadLikelihoods<Haplotype>.BestAllele<Haplotype>> bestAlleles = originalReadLikelihoods.bestAlleles();
    final Map<GATKRead, GATKRead> result = new HashMap<>(bestAlleles.size());
    for (final ReadLikelihoods<Haplotype>.BestAllele<Haplotype> bestAllele : bestAlleles) {
        final GATKRead originalRead = bestAllele.read;
        final Haplotype bestHaplotype = bestAllele.allele;
        final boolean isInformative = bestAllele.isInformative();
        final GATKRead realignedRead = AlignmentUtils.createReadAlignedToRef(originalRead, bestHaplotype, refHaplotype, paddedReferenceLoc.getStart(), isInformative);
        result.put(originalRead, realignedRead);
    }
    return result;
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Haplotype(org.broadinstitute.hellbender.utils.haplotype.Haplotype) ReadLikelihoods(org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods)

Example 75 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class AssemblyBasedCallerUtils method finalizeRegion.

public static void finalizeRegion(final AssemblyRegion region, final boolean errorCorrectReads, final boolean dontUseSoftClippedBases, final byte minTailQuality, final SAMFileHeader readsHeader, final SampleList samplesList) {
    if (region.isFinalized()) {
        return;
    }
    // Loop through the reads hard clipping the adaptor and low quality tails
    final List<GATKRead> readsToUse = new ArrayList<>(region.getReads().size());
    for (final GATKRead myRead : region.getReads()) {
        final byte minTailQualityToUse = errorCorrectReads ? HaplotypeCallerEngine.MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION : minTailQuality;
        GATKRead clippedRead = ReadClipper.hardClipLowQualEnds(myRead, minTailQualityToUse);
        // remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all
        // otherwie revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
        // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
        // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
        // TODO -- w.r.t. the reference.  What really needs to happen is that kmers that occur before the
        // TODO -- reference haplotype start must be removed
        clippedRead = dontUseSoftClippedBases || !ReadUtils.hasWellDefinedFragmentSize(clippedRead) ? ReadClipper.hardClipSoftClippedBases(clippedRead) : ReadClipper.revertSoftClippedBases(clippedRead);
        clippedRead = clippedRead.isUnmapped() ? clippedRead : ReadClipper.hardClipAdaptorSequence(clippedRead);
        if (!clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0) {
            clippedRead = ReadClipper.hardClipToRegion(clippedRead, region.getExtendedSpan().getStart(), region.getExtendedSpan().getEnd());
            if (region.readOverlapsRegion(clippedRead) && clippedRead.getLength() > 0) {
                readsToUse.add(clippedRead);
            }
        }
    }
    // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
    // final List<GATKRead> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
    // TODO: sort may be unnecessary here
    Collections.sort(readsToUse, new ReadCoordinateComparator(readsHeader));
    // handle overlapping read pairs from the same fragment
    cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader);
    region.clearReads();
    region.addAll(readsToUse);
    region.setFinalized(true);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadCoordinateComparator(org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator)

Aggregations

GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)457 Test (org.testng.annotations.Test)286 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)163 SAMFileHeader (htsjdk.samtools.SAMFileHeader)87 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)59 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)40 ArrayList (java.util.ArrayList)34 Collectors (java.util.stream.Collectors)34 List (java.util.List)30 Cigar (htsjdk.samtools.Cigar)29 File (java.io.File)28 java.util (java.util)28 DataProvider (org.testng.annotations.DataProvider)28 JavaRDD (org.apache.spark.api.java.JavaRDD)26 Haplotype (org.broadinstitute.hellbender.utils.haplotype.Haplotype)26 Assert (org.testng.Assert)25 ReadPileup (org.broadinstitute.hellbender.utils.pileup.ReadPileup)24 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)22 Argument (org.broadinstitute.barclay.argparser.Argument)18 UserException (org.broadinstitute.hellbender.exceptions.UserException)18