Search in sources :

Example 11 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class NormalizeFasta method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    Utils.validateArg(!INPUT.getAbsoluteFile().equals(OUTPUT.getAbsoluteFile()), "Input and output cannot be the same file.");
    try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(INPUT, TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE);
        final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT)) {
        ReferenceSequence seq = null;
        while ((seq = ref.nextSequence()) != null) {
            final String name = seq.getName();
            final byte[] bases = seq.getBases();
            try {
                out.write(">");
                out.write(name);
                out.newLine();
                if (bases.length == 0) {
                    logger.warn("Sequence " + name + " contains 0 bases.");
                } else {
                    for (int i = 0; i < bases.length; ++i) {
                        if (i > 0 && i % LINE_LENGTH == 0)
                            out.write("\n");
                        out.write(bases[i]);
                    }
                    out.write("\n");
                }
            } catch (IOException ioe) {
                throw new RuntimeIOException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);
            }
        }
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    return null;
}
Also used : RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) IOException(java.io.IOException) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) BufferedWriter(java.io.BufferedWriter)

Example 12 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class CachingIndexedFastaSequenceFile method getSubsequenceAt.

/**
     * Gets the subsequence of the contig in the range [start,stop]
     *
     * Uses the sequence cache if possible, or updates the cache to handle the request.  If the range
     * is larger than the cache itself, just loads the sequence directly, not changing the cache at all
     *
     * @param contig Contig whose subsequence to retrieve.
     * @param start inclusive, 1-based start of region.
     * @param stop inclusive, 1-based stop of region.
     * @return The partial reference sequence associated with this range.  If preserveCase is false, then
     *         all of the bases in the ReferenceSequence returned by this method will be upper cased.
     */
@Override
public ReferenceSequence getSubsequenceAt(final String contig, long start, final long stop) {
    final ReferenceSequence result;
    if ((stop - start) >= cacheSize) {
        cacheMisses++;
        result = super.getSubsequenceAt(contig, start, stop);
        if (!preserveCase)
            StringUtil.toUpperCase(result.getBases());
        if (!preserveIUPAC)
            BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1);
    } else {
        // todo -- potential optimization is to check if contig.name == contig, as this in general will be true
        SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
        if (contigInfo == null) {
            throw new UserException.MissingContigInSequenceDictionary(contig, super.getSequenceDictionary());
        }
        if (stop > contigInfo.getSequenceLength())
            throw new SAMException("Query asks for data past end of contig. Query contig " + contig + " start:" + start + " stop:" + stop + " contigLength:" + contigInfo.getSequenceLength());
        if (start < cache.start || stop > cache.stop || cache.seq == null || cache.seq.getContigIndex() != contigInfo.getSequenceIndex()) {
            cacheMisses++;
            cache.start = Math.max(start - cacheMissBackup, 0);
            cache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength());
            cache.seq = super.getSubsequenceAt(contig, cache.start, cache.stop);
            // convert all of the bases in the sequence to upper case if we aren't preserving cases
            if (!preserveCase)
                StringUtil.toUpperCase(cache.seq.getBases());
            if (!preserveIUPAC)
                BaseUtils.convertIUPACtoN(cache.seq.getBases(), true, cache.start == 0);
        } else {
            cacheHits++;
        }
        // at this point we determine where in the cache we want to extract the requested subsequence
        final int cacheOffsetStart = (int) (start - cache.start);
        final int cacheOffsetStop = (int) (stop - start + cacheOffsetStart + 1);
        try {
            result = new ReferenceSequence(cache.seq.getName(), cache.seq.getContigIndex(), Arrays.copyOfRange(cache.seq.getBases(), cacheOffsetStart, cacheOffsetStop));
        } catch (ArrayIndexOutOfBoundsException e) {
            throw new GATKException(String.format("BUG: bad array indexing.  Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d", cache.start, cache.stop, start, stop, cacheOffsetStart, cacheOffsetStop, cache.seq.getBases().length), e);
        }
    }
    // for debugging -- print out our efficiency if requested
    if (PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0)
        printEfficiency(Level.INFO);
    return result;
}
Also used : SAMException(htsjdk.samtools.SAMException) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) GATKException(org.broadinstitute.hellbender.exceptions.GATKException)

Example 13 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class AnnotateTargetsIntegrationTest method checkOutputGCContent.

private void checkOutputGCContent(final ReferenceFileSource reference, final SimpleInterval interval, final double observed) {
    final ReferenceSequence sequence = reference.queryAndPrefetch(interval);
    int total = 0;
    int gc = 0;
    for (final byte base : sequence.getBases()) {
        switch(Character.toLowerCase(base)) {
            case 'g':
            case 'c':
                gc++;
                total++;
                break;
            case 'a':
            case 't':
            case 'u':
                total++;
                break;
            default:
        }
    }
    final double expectedGCContent = total == 0 ? Double.NaN : ((double) gc / (double) total);
    if (Double.isNaN(expectedGCContent)) {
        Assert.assertTrue(Double.isNaN(observed));
    } else {
        Assert.assertEquals(observed, expectedGCContent, 0.0001);
    }
}
Also used : ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence)

Example 14 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class CachingIndexedFastaSequenceFileUnitTest method testSequential.

private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
    Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");
    final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
    SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
    for (int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE) {
        int start = i;
        int stop = start + querySize;
        if (stop <= contig.getSequenceLength()) {
            ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
            ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
            Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
            Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
            Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
        }
    }
    // asserts for efficiency.  We are going to make contig.length / STEP_SIZE queries
    // at each of range: start -> start + querySize against a cache with size of X.
    // we expect to hit the cache each time range falls within X.  We expect a hit
    // on the cache if range is within X.  Which should happen at least (X - query_size * 2) / STEP_SIZE
    // times.
    final int minExpectedHits = (int) Math.floor((Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
    caching.printEfficiency(Level.WARN);
    Assert.assertTrue(caching.getCacheHits() >= minExpectedHits, "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile)

Aggregations

ReferenceSequence (htsjdk.samtools.reference.ReferenceSequence)14 ReferenceSequenceFile (htsjdk.samtools.reference.ReferenceSequenceFile)5 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)3 SamReader (htsjdk.samtools.SamReader)3 ReferenceSequenceFileWalker (htsjdk.samtools.reference.ReferenceSequenceFileWalker)3 File (java.io.File)3 UserException (org.broadinstitute.hellbender.exceptions.UserException)3 ProgressLogger (org.broadinstitute.hellbender.utils.runtime.ProgressLogger)3 SAMRecord (htsjdk.samtools.SAMRecord)2 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)2 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)2 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)2 Test (org.testng.annotations.Test)2 SAMException (htsjdk.samtools.SAMException)1 SortOrder (htsjdk.samtools.SAMFileHeader.SortOrder)1 SamRecordFilter (htsjdk.samtools.filter.SamRecordFilter)1 SecondaryAlignmentFilter (htsjdk.samtools.filter.SecondaryAlignmentFilter)1 MetricsFile (htsjdk.samtools.metrics.MetricsFile)1