Search in sources :

Example 26 with IndexedFastaSequenceFile

use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.

the class CachingIndexedFastaSequenceFile method checkAndCreate.

/**
     * Create reference data source from fasta file, after performing several preliminary checks on the file.
     * This static utility was refactored from the constructor of ReferenceDataSource.
     * Possibly may be better as an overloaded constructor.
     * @param fastaFile Fasta file to be used as reference
     * @return A new instance of a CachingIndexedFastaSequenceFile.
     */
public static CachingIndexedFastaSequenceFile checkAndCreate(final File fastaFile) {
    // does the fasta file exist? check that first...
    if (!fastaFile.exists()) {
        throw new UserException.MissingReference("The specified fasta file (" + fastaFile.getAbsolutePath() + ") does not exist.");
    }
    final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz");
    if (isGzipped) {
        throw new UserException.CannotHandleGzippedRef();
    }
    final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
    // determine the name for the dict file
    // TODO: use the htsjdk method implemented in https://github.com/samtools/htsjdk/pull/774
    final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? "\\.fa$" : "\\.fasta$";
    final File dictFile = new File(fastaFile.getAbsolutePath().replaceAll(fastaExt, IOUtil.DICT_FILE_EXTENSION));
    // for creating these files.
    if (!indexFile.exists()) {
        throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile);
    }
    if (!dictFile.exists()) {
        throw new UserException.MissingReferenceDictFile(dictFile, fastaFile);
    }
    // Read reference data by creating an IndexedFastaSequenceFile.
    try {
        return new CachingIndexedFastaSequenceFile(fastaFile);
    } catch (IllegalArgumentException e) {
        throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence.  The FASTA must have either a .fasta or .fa extension", e);
    } catch (Exception e) {
        throw new UserException.CouldNotReadInputFile(fastaFile, e);
    }
}
Also used : UserException(org.broadinstitute.hellbender.exceptions.UserException) File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SAMException(htsjdk.samtools.SAMException) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) FileNotFoundException(java.io.FileNotFoundException) UserException(org.broadinstitute.hellbender.exceptions.UserException)

Example 27 with IndexedFastaSequenceFile

use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.

the class IntervalUtilsUnitTest method invalidIntervalDataProvider.

@DataProvider(name = "invalidIntervalTestData")
public Object[][] invalidIntervalDataProvider() throws Exception {
    File fastaFile = new File(publicTestDir + "exampleFASTA.fasta");
    GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));
    return new Object[][] { new Object[] { genomeLocParser, "1", 10000000, 20000000 }, new Object[] { genomeLocParser, "2", 1, 2 }, new Object[] { genomeLocParser, "1", -1, 50 } };
}
Also used : File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) DataProvider(org.testng.annotations.DataProvider)

Example 28 with IndexedFastaSequenceFile

use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.

the class IntervalUtilsUnitTest method negativeOneLengthIntervalDataProvider.

// TODO - remove once a corrected version of the exome interval list is released.
@DataProvider(name = "negativeOneLengthIntervalTestData")
public Object[][] negativeOneLengthIntervalDataProvider() throws Exception {
    File fastaFile = new File(publicTestDir + "exampleFASTA.fasta");
    GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));
    return new Object[][] { new Object[] { genomeLocParser, "1", 2, 1 }, new Object[] { genomeLocParser, "1", 500, 499 }, new Object[] { genomeLocParser, "1", 1000, 999 } };
}
Also used : File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) DataProvider(org.testng.annotations.DataProvider)

Example 29 with IndexedFastaSequenceFile

use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.

the class CachingIndexedFastaSequenceFileUnitTest method testSequential.

private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
    Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");
    final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
    SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
    for (int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE) {
        int start = i;
        int stop = start + querySize;
        if (stop <= contig.getSequenceLength()) {
            ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
            ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
            Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
            Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
            Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
        }
    }
    // asserts for efficiency.  We are going to make contig.length / STEP_SIZE queries
    // at each of range: start -> start + querySize against a cache with size of X.
    // we expect to hit the cache each time range falls within X.  We expect a hit
    // on the cache if range is within X.  Which should happen at least (X - query_size * 2) / STEP_SIZE
    // times.
    final int minExpectedHits = (int) Math.floor((Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
    caching.printEfficiency(Level.WARN);
    Assert.assertTrue(caching.getCacheHits() >= minExpectedHits, "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile)

Example 30 with IndexedFastaSequenceFile

use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.

the class GATKToolUnitTest method testBestSequenceDictionary_fromReadsAndReference.

@Test
public void testBestSequenceDictionary_fromReadsAndReference() throws Exception {
    final GATKTool tool = new TestGATKToolWithReads();
    final CommandLineParser clp = new CommandLineArgumentParser(tool);
    final File bamFile = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
    final String fastaFile = hg19MiniReference;
    final String[] args = { "-I", bamFile.getCanonicalPath(), "-R", fastaFile };
    clp.parseArguments(System.out, args);
    tool.onStartup();
    //read the dict back in and compare to reference dict
    final SAMSequenceDictionary toolDict = tool.getBestAvailableSequenceDictionary();
    final SAMSequenceDictionary fastaDict = new IndexedFastaSequenceFile(new File(fastaFile)).getSequenceDictionary();
    toolDict.assertSameDictionary(fastaDict);
    fastaDict.assertSameDictionary(toolDict);
    Assert.assertEquals(toolDict, fastaDict);
}
Also used : CommandLineArgumentParser(org.broadinstitute.barclay.argparser.CommandLineArgumentParser) CommandLineParser(org.broadinstitute.barclay.argparser.CommandLineParser) File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Aggregations

IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)57 File (java.io.File)34 SamReader (htsjdk.samtools.SamReader)22 SAMRecord (htsjdk.samtools.SAMRecord)20 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)16 SAMFileHeader (htsjdk.samtools.SAMFileHeader)16 ArrayList (java.util.ArrayList)16 IOException (java.io.IOException)15 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)14 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)13 SamReaderFactory (htsjdk.samtools.SamReaderFactory)12 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)11 CigarElement (htsjdk.samtools.CigarElement)11 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)11 List (java.util.List)11 FileNotFoundException (java.io.FileNotFoundException)10 BufferedReader (java.io.BufferedReader)9 Collectors (java.util.stream.Collectors)9 Cigar (htsjdk.samtools.Cigar)8 CigarOperator (htsjdk.samtools.CigarOperator)7