use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.
the class CachingIndexedFastaSequenceFile method checkAndCreate.
/**
* Create reference data source from fasta file, after performing several preliminary checks on the file.
* This static utility was refactored from the constructor of ReferenceDataSource.
* Possibly may be better as an overloaded constructor.
* @param fastaFile Fasta file to be used as reference
* @return A new instance of a CachingIndexedFastaSequenceFile.
*/
public static CachingIndexedFastaSequenceFile checkAndCreate(final File fastaFile) {
// does the fasta file exist? check that first...
if (!fastaFile.exists()) {
throw new UserException.MissingReference("The specified fasta file (" + fastaFile.getAbsolutePath() + ") does not exist.");
}
final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz");
if (isGzipped) {
throw new UserException.CannotHandleGzippedRef();
}
final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
// determine the name for the dict file
// TODO: use the htsjdk method implemented in https://github.com/samtools/htsjdk/pull/774
final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? "\\.fa$" : "\\.fasta$";
final File dictFile = new File(fastaFile.getAbsolutePath().replaceAll(fastaExt, IOUtil.DICT_FILE_EXTENSION));
// for creating these files.
if (!indexFile.exists()) {
throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile);
}
if (!dictFile.exists()) {
throw new UserException.MissingReferenceDictFile(dictFile, fastaFile);
}
// Read reference data by creating an IndexedFastaSequenceFile.
try {
return new CachingIndexedFastaSequenceFile(fastaFile);
} catch (IllegalArgumentException e) {
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
} catch (Exception e) {
throw new UserException.CouldNotReadInputFile(fastaFile, e);
}
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.
the class IntervalUtilsUnitTest method invalidIntervalDataProvider.
@DataProvider(name = "invalidIntervalTestData")
public Object[][] invalidIntervalDataProvider() throws Exception {
File fastaFile = new File(publicTestDir + "exampleFASTA.fasta");
GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));
return new Object[][] { new Object[] { genomeLocParser, "1", 10000000, 20000000 }, new Object[] { genomeLocParser, "2", 1, 2 }, new Object[] { genomeLocParser, "1", -1, 50 } };
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.
the class IntervalUtilsUnitTest method negativeOneLengthIntervalDataProvider.
// TODO - remove once a corrected version of the exome interval list is released.
@DataProvider(name = "negativeOneLengthIntervalTestData")
public Object[][] negativeOneLengthIntervalDataProvider() throws Exception {
File fastaFile = new File(publicTestDir + "exampleFASTA.fasta");
GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));
return new Object[][] { new Object[] { genomeLocParser, "1", 2, 1 }, new Object[] { genomeLocParser, "1", 500, 499 }, new Object[] { genomeLocParser, "1", 1000, 999 } };
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.
the class CachingIndexedFastaSequenceFileUnitTest method testSequential.
private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
for (int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE) {
int start = i;
int stop = start + querySize;
if (stop <= contig.getSequenceLength()) {
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
}
}
// asserts for efficiency. We are going to make contig.length / STEP_SIZE queries
// at each of range: start -> start + querySize against a cache with size of X.
// we expect to hit the cache each time range falls within X. We expect a hit
// on the cache if range is within X. Which should happen at least (X - query_size * 2) / STEP_SIZE
// times.
final int minExpectedHits = (int) Math.floor((Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
caching.printEfficiency(Level.WARN);
Assert.assertTrue(caching.getCacheHits() >= minExpectedHits, "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gatk by broadinstitute.
the class GATKToolUnitTest method testBestSequenceDictionary_fromReadsAndReference.
@Test
public void testBestSequenceDictionary_fromReadsAndReference() throws Exception {
final GATKTool tool = new TestGATKToolWithReads();
final CommandLineParser clp = new CommandLineArgumentParser(tool);
final File bamFile = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
final String fastaFile = hg19MiniReference;
final String[] args = { "-I", bamFile.getCanonicalPath(), "-R", fastaFile };
clp.parseArguments(System.out, args);
tool.onStartup();
//read the dict back in and compare to reference dict
final SAMSequenceDictionary toolDict = tool.getBestAvailableSequenceDictionary();
final SAMSequenceDictionary fastaDict = new IndexedFastaSequenceFile(new File(fastaFile)).getSequenceDictionary();
toolDict.assertSameDictionary(fastaDict);
fastaDict.assertSameDictionary(toolDict);
Assert.assertEquals(toolDict, fastaDict);
}
Aggregations