use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.
the class NormalizeFasta method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
Utils.validateArg(!INPUT.getAbsoluteFile().equals(OUTPUT.getAbsoluteFile()), "Input and output cannot be the same file.");
try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(INPUT, TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE);
final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT)) {
ReferenceSequence seq = null;
while ((seq = ref.nextSequence()) != null) {
final String name = seq.getName();
final byte[] bases = seq.getBases();
try {
out.write(">");
out.write(name);
out.newLine();
if (bases.length == 0) {
logger.warn("Sequence " + name + " contains 0 bases.");
} else {
for (int i = 0; i < bases.length; ++i) {
if (i > 0 && i % LINE_LENGTH == 0)
out.write("\n");
out.write(bases[i]);
}
out.write("\n");
}
} catch (IOException ioe) {
throw new RuntimeIOException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);
}
}
} catch (IOException e) {
throw new RuntimeIOException(e);
}
return null;
}
use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.
the class CachingIndexedFastaSequenceFile method getSubsequenceAt.
/**
* Gets the subsequence of the contig in the range [start,stop]
*
* Uses the sequence cache if possible, or updates the cache to handle the request. If the range
* is larger than the cache itself, just loads the sequence directly, not changing the cache at all
*
* @param contig Contig whose subsequence to retrieve.
* @param start inclusive, 1-based start of region.
* @param stop inclusive, 1-based stop of region.
* @return The partial reference sequence associated with this range. If preserveCase is false, then
* all of the bases in the ReferenceSequence returned by this method will be upper cased.
*/
@Override
public ReferenceSequence getSubsequenceAt(final String contig, long start, final long stop) {
final ReferenceSequence result;
if ((stop - start) >= cacheSize) {
cacheMisses++;
result = super.getSubsequenceAt(contig, start, stop);
if (!preserveCase)
StringUtil.toUpperCase(result.getBases());
if (!preserveIUPAC)
BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1);
} else {
// todo -- potential optimization is to check if contig.name == contig, as this in general will be true
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
if (contigInfo == null) {
throw new UserException.MissingContigInSequenceDictionary(contig, super.getSequenceDictionary());
}
if (stop > contigInfo.getSequenceLength())
throw new SAMException("Query asks for data past end of contig. Query contig " + contig + " start:" + start + " stop:" + stop + " contigLength:" + contigInfo.getSequenceLength());
if (start < cache.start || stop > cache.stop || cache.seq == null || cache.seq.getContigIndex() != contigInfo.getSequenceIndex()) {
cacheMisses++;
cache.start = Math.max(start - cacheMissBackup, 0);
cache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength());
cache.seq = super.getSubsequenceAt(contig, cache.start, cache.stop);
// convert all of the bases in the sequence to upper case if we aren't preserving cases
if (!preserveCase)
StringUtil.toUpperCase(cache.seq.getBases());
if (!preserveIUPAC)
BaseUtils.convertIUPACtoN(cache.seq.getBases(), true, cache.start == 0);
} else {
cacheHits++;
}
// at this point we determine where in the cache we want to extract the requested subsequence
final int cacheOffsetStart = (int) (start - cache.start);
final int cacheOffsetStop = (int) (stop - start + cacheOffsetStart + 1);
try {
result = new ReferenceSequence(cache.seq.getName(), cache.seq.getContigIndex(), Arrays.copyOfRange(cache.seq.getBases(), cacheOffsetStart, cacheOffsetStop));
} catch (ArrayIndexOutOfBoundsException e) {
throw new GATKException(String.format("BUG: bad array indexing. Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d", cache.start, cache.stop, start, stop, cacheOffsetStart, cacheOffsetStop, cache.seq.getBases().length), e);
}
}
// for debugging -- print out our efficiency if requested
if (PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0)
printEfficiency(Level.INFO);
return result;
}
use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.
the class AnnotateTargetsIntegrationTest method checkOutputGCContent.
private void checkOutputGCContent(final ReferenceFileSource reference, final SimpleInterval interval, final double observed) {
final ReferenceSequence sequence = reference.queryAndPrefetch(interval);
int total = 0;
int gc = 0;
for (final byte base : sequence.getBases()) {
switch(Character.toLowerCase(base)) {
case 'g':
case 'c':
gc++;
total++;
break;
case 'a':
case 't':
case 'u':
total++;
break;
default:
}
}
final double expectedGCContent = total == 0 ? Double.NaN : ((double) gc / (double) total);
if (Double.isNaN(expectedGCContent)) {
Assert.assertTrue(Double.isNaN(observed));
} else {
Assert.assertEquals(observed, expectedGCContent, 0.0001);
}
}
use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.
the class CachingIndexedFastaSequenceFileUnitTest method testSequential.
private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
for (int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE) {
int start = i;
int stop = start + querySize;
if (stop <= contig.getSequenceLength()) {
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
}
}
// asserts for efficiency. We are going to make contig.length / STEP_SIZE queries
// at each of range: start -> start + querySize against a cache with size of X.
// we expect to hit the cache each time range falls within X. We expect a hit
// on the cache if range is within X. Which should happen at least (X - query_size * 2) / STEP_SIZE
// times.
final int minExpectedHits = (int) Math.floor((Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
caching.printEfficiency(Level.WARN);
Assert.assertTrue(caching.getCacheHits() >= minExpectedHits, "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());
}
Aggregations