Use of htsjdk.samtools.QueryInterval in the gatk project by broadinstitute.
From class IntervalUtilsUnitTest, method testConvertSimpleIntervalToQueryInterval.
/**
 * Checks that converting a {@link SimpleInterval} against a one-contig dictionary yields a
 * {@link QueryInterval} with the contig's index and unchanged start/end coordinates.
 */
@Test
public void testConvertSimpleIntervalToQueryInterval() {
    // Dictionary holding a single contig named "1" of length 100.
    final SAMSequenceDictionary dictionary =
            new SAMSequenceDictionary(Arrays.asList(new SAMSequenceRecord("1", 100)));
    final SimpleInterval source = new SimpleInterval("1", 5, 10);

    final QueryInterval result = IntervalUtils.convertSimpleIntervalToQueryInterval(source, dictionary);

    // The only contig in the dictionary is expected at reference index 0.
    Assert.assertEquals(result.referenceIndex, 0);
    Assert.assertEquals(result.start, 5);
    Assert.assertEquals(result.end, 10);
}
Use of htsjdk.samtools.QueryInterval in the gatk project by broadinstitute.
From class NioBam, method getReads.
/**
 * Parses the BAM file into SAMRecords. Will be distributed onto at least 'numPartitions' partitions.
 *
 * <p>The BAM is opened once on the driver (together with an in-memory copy of its index) only to
 * compute the list of query intervals; that reader and its underlying streams are closed before
 * this method returns. Each interval is then read independently via {@code ReadsIterable}.
 *
 * @param ctx           the Spark context used to parallelize the query intervals
 * @param numPartitions passed to {@code getAllChunksBalanced} as the number of intervals
 *                      requested per contig
 * @return an RDD with one partition per computed query interval
 * @throws GATKException if an I/O error occurs while opening the BAM or its index
 */
public JavaRDD<SAMRecord> getReads(JavaSparkContext ctx, int numPartitions) {
    final byte[] index;
    final List<QueryInterval> chunks;
    try {
        final Path bamPath = IOUtils.getPath(bam);
        // Load the index before opening the channel so a failure here cannot leak the channel.
        index = getIndex();
        // try-with-resources: the reader and both streams are released even if opening the
        // reader or computing the chunks fails. Closing an already-closed stream is harmless.
        try (ChannelAsSeekableStream bamOverNIO = new ChannelAsSeekableStream(Files.newByteChannel(bamPath), bamPath.toString());
             SeekableStream indexInMemory = new ByteArraySeekableStream(index);
             SamReader reader = SamReaderFactory.makeDefault()
                     .validationStringency(ValidationStringency.LENIENT)
                     .enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES)
                     .open(SamInputResource.of(bamOverNIO).index(indexInMemory))) {
            chunks = getAllChunksBalanced(reader, numPartitions);
        }
    } catch (IOException e) {
        throw new GATKException("I/O error loading reads", e);
    }
    // Ideally we'd get exactly the number of chunks the user is asking for, but until then...
    logger.debug("We got: " + chunks.size() + " chunks.");
    // Never request zero slices: an empty chunk list should produce an empty RDD rather than
    // an invalid partition count.
    final int numSlices = Math.max(1, chunks.size());
    return ctx.parallelize(chunks, numSlices).flatMap(qi -> new ReadsIterable(bam, index, qi).iterator());
}
Use of htsjdk.samtools.QueryInterval in the gatk project by broadinstitute.
From class NioBam, method getChunksBalanced.
/**
 * Splits one reference sequence into query intervals whose indexed chunk sizes are roughly
 * balanced.
 *
 * <p>The sequence is scanned in fixed steps (about 1/100th of the per-interval share of the
 * sequence length). Whenever the chunks covering the current candidate range reach the current
 * target size an interval is emitted and the target is recomputed from what remains. The scan
 * always reaches the end of the sequence, and whatever is left after the last emitted interval
 * is returned as a final interval so the tail of the contig is not lost.
 *
 * @param bam           reader whose index and header are consulted
 * @param sequenceIndex index of the sequence in the reader's header
 * @param retCount      desired number of intervals for this sequence; must be positive when the
 *                      sequence has any indexed data
 * @return the intervals in increasing coordinate order; empty if the index reports no data for
 *         the sequence. The number of intervals may differ from {@code retCount}.
 * @throws IllegalArgumentException if the sequence has data and {@code retCount} is not positive
 */
private static List<QueryInterval> getChunksBalanced(SamReader bam, int sequenceIndex, int retCount) {
    final List<QueryInterval> ret = new ArrayList<>();
    final BAMIndex index = bam.indexing().getIndex();
    final SAMFileHeader header = bam.getFileHeader();
    final SAMSequenceRecord s = header.getSequence(sequenceIndex);
    final int end = s.getSequenceLength();
    final long totalLength = chunksLength(getChunks(index, sequenceIndex, 1, end + 1));
    if (totalLength == 0) {
        return ret;
    }
    // Checked after the empty-sequence shortcut so that callers which previously got an empty
    // list for an empty sequence still do; beyond this point retCount is used as a divisor.
    if (retCount < 1) {
        throw new IllegalArgumentException("retCount must be positive, got " + retCount);
    }
    // long: size values are long, and an int accumulator would silently overflow.
    long sofar = 0;
    long targetLength = totalLength / retCount;
    // Computed in long so that 100 * retCount cannot overflow; the result is at most 'end'.
    final int step = (int) Math.max(1L, end / (100L * retCount));
    int start = 1;
    int j = Math.min(step, end);
    while (true) {
        final boolean atEnd = j >= end;
        final long size = chunksLength(getChunks(index, sequenceIndex, start, j));
        // Before the end: emit once the candidate is big enough.
        // At the end: emit whatever data remains, regardless of the target.
        // TODO: when size > 2 * targetLength, search for a better separation point.
        final boolean emit = atEnd ? size > 0 : size >= targetLength;
        if (emit) {
            ret.add(new QueryInterval(sequenceIndex, start, j + 1));
            start = j;
            sofar += size;
            if (ret.size() < retCount) {
                targetLength = (totalLength - sofar) / (retCount - ret.size());
            } else {
                targetLength = totalLength / retCount;
            }
        }
        if (atEnd) {
            break;
        }
        // Advance in long arithmetic and clamp so the last candidate ends exactly at 'end'.
        j = (int) Math.min((long) j + step, (long) end);
    }
    return ret;
}
Use of htsjdk.samtools.QueryInterval in the gatk project by broadinstitute.
From class NioBam, method getAllChunksBalanced.
/**
 * Gathers balanced query intervals for every sequence listed in the reader's header.
 *
 * <p>This isn't very good yet: ideally the caller would get exactly the requested number of
 * query intervals overall, whereas here the count is applied to each contig separately.
 *
 * @param bam             reader supplying the header (and, downstream, the index)
 * @param countPerContig  number of intervals requested for each contig
 * @return the per-contig intervals concatenated in sequence-dictionary order
 */
private static List<QueryInterval> getAllChunksBalanced(SamReader bam, int countPerContig) {
    final List<QueryInterval> allIntervals = new ArrayList<>();
    bam.getFileHeader()
            .getSequenceDictionary()
            .getSequences()
            .forEach(contig -> allIntervals.addAll(
                    getChunksBalanced(bam, contig.getSequenceIndex(), countPerContig)));
    return allIntervals;
}
Aggregations