Search in sources :

Example 41 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class GenomeLocParser method createGenomeLocAtStop.

/**
     * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
     * @param loc The original loc
     * @param maxBasePairs The maximum number of basePairs
     * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig.
     */
public GenomeLoc createGenomeLocAtStop(final GenomeLoc loc, final int maxBasePairs) {
    if (GenomeLoc.isUnmapped(loc))
        return null;
    String contigName = loc.getContig();
    SAMSequenceRecord contig = contigInfo.getSequence(contigName);
    int contigIndex = contig.getSequenceIndex();
    int contigLength = contig.getSequenceLength();
    int start = loc.getStop() + 1;
    int stop = loc.getStop() + maxBasePairs;
    if (start > contigLength)
        return null;
    if (stop > contigLength)
        stop = contigLength;
    return createGenomeLoc(contigName, contigIndex, start, stop, true);
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord)

Example 42 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class GenomeLocSortedSet method createSetFromSequenceDictionary.

/**
     * create a list of genomic locations, given a reference sequence
     *
     * @param dict the sequence dictionary to create a collection from
     *
     * @return the GenomeLocSet of all references sequences as GenomeLoc's
     */
public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) {
    final GenomeLocParser parser = new GenomeLocParser(dict);
    final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser);
    for (final SAMSequenceRecord sequence : dict.getSequences()) {
        returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName()));
    }
    return returnSortedSet;
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord)

Example 43 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class ReferenceMemorySourceTest method init.

@BeforeTest
public void init() {
    List<SAMSequenceRecord> l = new ArrayList<>();
    l.add(new SAMSequenceRecord("chr1", 1000000));
    SAMSequenceDictionary seqDir = new SAMSequenceDictionary(l);
    memorySource = new ReferenceMemorySource(ref1, seqDir);
}
Also used : ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BeforeTest(org.testng.annotations.BeforeTest)

Example 44 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class ReorderSam method buildSequenceDictionaryMap.

/**
     * Constructs a mapping from read sequence records index -> new sequence dictionary index for use in
     * reordering the reference index and mate reference index in each read.  -1 means unmapped.
     */
private Map<Integer, Integer> buildSequenceDictionaryMap(final SAMSequenceDictionary refDict, final SAMSequenceDictionary readsDict) {
    Map<Integer, Integer> newOrder = new HashMap<>();
    logger.info("Reordering SAM/BAM file:");
    for (final SAMSequenceRecord refRec : refDict.getSequences()) {
        final SAMSequenceRecord readsRec = readsDict.getSequence(refRec.getSequenceName());
        if (readsRec != null) {
            if (refRec.getSequenceLength() != readsRec.getSequenceLength()) {
                String msg = String.format("Discordant contig lengths: read %s LN=%d, ref %s LN=%d", readsRec.getSequenceName(), readsRec.getSequenceLength(), refRec.getSequenceName(), refRec.getSequenceLength());
                if (ALLOW_CONTIG_LENGTH_DISCORDANCE) {
                    logger.warn(msg);
                } else {
                    throw new UserException(msg);
                }
            }
            logger.info(String.format("  Reordering read contig %s [index=%d] to => ref contig %s [index=%d]%n", readsRec.getSequenceName(), readsRec.getSequenceIndex(), refRec.getSequenceName(), refRec.getSequenceIndex()));
            newOrder.put(readsRec.getSequenceIndex(), refRec.getSequenceIndex());
        }
    }
    for (SAMSequenceRecord readsRec : readsDict.getSequences()) {
        if (!newOrder.containsKey(readsRec.getSequenceIndex())) {
            if (ALLOW_INCOMPLETE_DICT_CONCORDANCE)
                newOrder.put(readsRec.getSequenceIndex(), -1);
            else
                throw new UserException("New reference sequence does not contain a matching contig for " + readsRec.getSequenceName());
        }
    }
    return newOrder;
}
Also used : HashMap(java.util.HashMap) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) UserException(org.broadinstitute.hellbender.exceptions.UserException)

Example 45 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class CachingIndexedFastaSequenceFile method getSubsequenceAt.

/**
     * Gets the subsequence of the contig in the range [start,stop]
     *
     * Uses the sequence cache if possible, or updates the cache to handle the request.  If the range
     * is larger than the cache itself, just loads the sequence directly, not changing the cache at all
     *
     * @param contig Contig whose subsequence to retrieve.
     * @param start inclusive, 1-based start of region.
     * @param stop inclusive, 1-based stop of region.
     * @return The partial reference sequence associated with this range.  If preserveCase is false, then
     *         all of the bases in the ReferenceSequence returned by this method will be upper cased.
     */
@Override
public ReferenceSequence getSubsequenceAt(final String contig, long start, final long stop) {
    final ReferenceSequence result;
    if ((stop - start) >= cacheSize) {
        cacheMisses++;
        result = super.getSubsequenceAt(contig, start, stop);
        if (!preserveCase)
            StringUtil.toUpperCase(result.getBases());
        if (!preserveIUPAC)
            BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1);
    } else {
        // todo -- potential optimization is to check if contig.name == contig, as this in general will be true
        SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
        if (contigInfo == null) {
            throw new UserException.MissingContigInSequenceDictionary(contig, super.getSequenceDictionary());
        }
        if (stop > contigInfo.getSequenceLength())
            throw new SAMException("Query asks for data past end of contig. Query contig " + contig + " start:" + start + " stop:" + stop + " contigLength:" + contigInfo.getSequenceLength());
        if (start < cache.start || stop > cache.stop || cache.seq == null || cache.seq.getContigIndex() != contigInfo.getSequenceIndex()) {
            cacheMisses++;
            cache.start = Math.max(start - cacheMissBackup, 0);
            cache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength());
            cache.seq = super.getSubsequenceAt(contig, cache.start, cache.stop);
            // convert all of the bases in the sequence to upper case if we aren't preserving cases
            if (!preserveCase)
                StringUtil.toUpperCase(cache.seq.getBases());
            if (!preserveIUPAC)
                BaseUtils.convertIUPACtoN(cache.seq.getBases(), true, cache.start == 0);
        } else {
            cacheHits++;
        }
        // at this point we determine where in the cache we want to extract the requested subsequence
        final int cacheOffsetStart = (int) (start - cache.start);
        final int cacheOffsetStop = (int) (stop - start + cacheOffsetStart + 1);
        try {
            result = new ReferenceSequence(cache.seq.getName(), cache.seq.getContigIndex(), Arrays.copyOfRange(cache.seq.getBases(), cacheOffsetStart, cacheOffsetStop));
        } catch (ArrayIndexOutOfBoundsException e) {
            throw new GATKException(String.format("BUG: bad array indexing.  Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d", cache.start, cache.stop, start, stop, cacheOffsetStart, cacheOffsetStop, cache.seq.getBases().length), e);
        }
    }
    // for debugging -- print out our efficiency if requested
    if (PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0)
        printEfficiency(Level.INFO);
    return result;
}
Also used : SAMException(htsjdk.samtools.SAMException) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) GATKException(org.broadinstitute.hellbender.exceptions.GATKException)

Aggregations

SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)72 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)35 Test (org.testng.annotations.Test)26 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)24 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)13 File (java.io.File)10 SAMFileHeader (htsjdk.samtools.SAMFileHeader)9 UserException (org.broadinstitute.hellbender.exceptions.UserException)8 DataProvider (org.testng.annotations.DataProvider)8 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)7 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)7 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)6 QueryInterval (htsjdk.samtools.QueryInterval)5 Allele (htsjdk.variant.variantcontext.Allele)4 VariantContext (htsjdk.variant.variantcontext.VariantContext)4 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)4 java.util (java.util)4 Collectors (java.util.stream.Collectors)4 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)4