use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class SVUtils method getRefRDD.
/**
 * Create an RDD from the reference sequences.
 * The reference sequences are transformed into a single, large collection of byte arrays. The collection is then
 * parallelized into an RDD.
 * Each contig that exceeds a size given by REF_RECORD_LEN is broken into a series of REF_RECORD_LEN chunks with a
 * K-1 base overlap between successive chunks (i.e., for K=63, the last 62 bases in chunk n match the first 62
 * bases in chunk n+1), so that we don't miss any kmers due to the chunking -- we can just kmerize each record
 * independently.
 */
public static JavaRDD<byte[]> getRefRDD(final JavaSparkContext ctx, final int kSize, final ReferenceMultiSource ref,
                                        final PipelineOptions options, final SAMSequenceDictionary readsDict,
                                        final int ref_record_len, final int ref_records_per_partition) {
    final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(readsDict);
    if (dict == null)
        throw new GATKException("No reference dictionary available");
    final int effectiveRecLen = ref_record_len - kSize + 1;
    final List<byte[]> sequenceChunks = new ArrayList<>();
    for (final SAMSequenceRecord rec : dict.getSequences()) {
        final String seqName = rec.getSequenceName();
        final int seqLen = rec.getSequenceLength();
        final SimpleInterval interval = new SimpleInterval(seqName, 1, seqLen);
        try {
            final byte[] bases = ref.getReferenceBases(options, interval).getBases();
            for (int start = 0; start < seqLen; start += effectiveRecLen) {
                sequenceChunks.add(Arrays.copyOfRange(bases, start, Math.min(start + ref_record_len, seqLen)));
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't get reference sequence bases for " + interval, ioe);
        }
    }
    return ctx.parallelize(sequenceChunks, sequenceChunks.size() / ref_records_per_partition + 1);
}
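The overlap guarantee described in the Javadoc falls out of the stride arithmetic: each chunk spans ref_record_len bases, but the start only advances by effectiveRecLen = ref_record_len - kSize + 1, so interior chunks share kSize - 1 bases. Below is a minimal, self-contained sketch of that arithmetic using hypothetical values for the chunk size, kmer size, and contig length (these are illustrative, not the SVUtils defaults):

// Illustrative sketch of the chunking arithmetic used above. The start of each chunk
// advances by effectiveRecLen = refRecordLen - kSize + 1, so successive interior chunks
// share kSize - 1 bases and every kmer lands wholly inside some chunk.
public class ChunkOverlapDemo {
    public static void main(String[] args) {
        final int refRecordLen = 10_000;   // hypothetical chunk size
        final int kSize = 63;              // hypothetical kmer size
        final int seqLen = 25_000;         // hypothetical contig length
        final int effectiveRecLen = refRecordLen - kSize + 1;  // 9938

        int prevEnd = -1;
        for (int start = 0; start < seqLen; start += effectiveRecLen) {
            final int end = Math.min(start + refRecordLen, seqLen);
            final int overlap = (prevEnd < 0) ? 0 : prevEnd - start;   // kSize - 1 = 62 for interior chunks
            System.out.printf("chunk [%d, %d) overlaps previous chunk by %d bases%n", start, end, overlap);
            prevEnd = end;
        }
    }
}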
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class GenomeLocParserUnitTest method testParseUnknownSequenceLength.
@Test
public void testParseUnknownSequenceLength() {
    SAMSequenceDictionary seqDict = new SAMSequenceDictionary();
    seqDict.addSequence(new SAMSequenceRecord("1", SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH));
    Assert.assertEquals(seqDict.getSequence("1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
    GenomeLocParser myLocParser = new GenomeLocParser(seqDict);
    GenomeLoc genomeLoc = myLocParser.parseGenomeLoc("1:1-99");
    Assert.assertEquals(genomeLoc.getEnd(), 99);
}
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class GenomeLocParserUnitTest method testcreateGenomeLocOnContig.
@Test
public void testcreateGenomeLocOnContig() throws FileNotFoundException {
    final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(exampleReference));
    final SAMSequenceDictionary dict = seq.getSequenceDictionary();
    final GenomeLocParser genomeLocParser = new GenomeLocParser(dict);
    for (final SAMSequenceRecord rec : dict.getSequences()) {
        final GenomeLoc loc = genomeLocParser.createOverEntireContig(rec.getSequenceName());
        Assert.assertEquals(loc.getContig(), rec.getSequenceName());
        Assert.assertEquals(loc.getStart(), 1);
        Assert.assertEquals(loc.getStop(), rec.getSequenceLength());
    }
}
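For comparison with the getRefRDD snippet above, the same whole-contig spans that createOverEntireContig asserts can be built directly as SimpleIntervals. A small sketch, assuming only that some SAMSequenceDictionary is already in hand (e.g. from a reference or a BAM header):

// Sketch: build one whole-contig SimpleInterval per dictionary entry, mirroring what the
// test above asserts for createOverEntireContig. Uses htsjdk's SAMSequenceDictionary /
// SAMSequenceRecord and GATK's SimpleInterval.
static List<SimpleInterval> wholeContigIntervals(final SAMSequenceDictionary dict) {
    final List<SimpleInterval> intervals = new ArrayList<>();
    for (final SAMSequenceRecord rec : dict.getSequences()) {
        intervals.add(new SimpleInterval(rec.getSequenceName(), 1, rec.getSequenceLength()));
    }
    return intervals;
}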
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class GenomeLocParserUnitTest method testContigHasColon.
@Test
public void testContigHasColon() {
    SAMFileHeader header = new SAMFileHeader();
    header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate);
    SAMSequenceDictionary dict = new SAMSequenceDictionary();
    SAMSequenceRecord rec = new SAMSequenceRecord("c:h:r1", 10);
    rec.setSequenceLength(10);
    dict.addSequence(rec);
    header.setSequenceDictionary(dict);
    final GenomeLocParser myGenomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
    GenomeLoc loc = myGenomeLocParser.parseGenomeLoc("c:h:r1:4-5");
    assertEquals(0, loc.getContigIndex());
    assertEquals(loc.getStart(), 4);
    assertEquals(loc.getStop(), 5);
}
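The point of the test is that a contig name may itself contain ':', so a location string like "c:h:r1:4-5" cannot be split naively at the first colon; the contig portion has to be matched against the dictionary. A hypothetical illustration of that resolution step (not GATK's actual parsing code) is sketched below:

// Hypothetical helper (illustration only): try the whole query, then every prefix that
// ends just before a ':', longest first, and return the first one that names a known
// contig in the dictionary. For "c:h:r1:4-5" against the dictionary above this
// returns "c:h:r1".
static String resolveContigName(final String query, final SAMSequenceDictionary dict) {
    for (int i = query.length(); i > 0; i--) {
        if (i == query.length() || query.charAt(i) == ':') {
            final String candidate = query.substring(0, i);
            if (dict.getSequence(candidate) != null) {
                return candidate;
            }
        }
    }
    return null;   // no dictionary contig matches any prefix of the query
}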
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class MultiVariantDataSource method validateAllSequenceDictionaries.
/**
 * GATKTool only validates individual feature dictionaries against the reference dictionary, so cross-validate
 * all of the dictionaries against each other here by ensuring that each contig found in any dictionary has the
 * same length (and md5, when a value is present for that contig in both dictionaries) in every other dictionary
 * in which it is present.
 */
private void validateAllSequenceDictionaries() {
    final Map<String, FeatureDataSource<VariantContext>> contigMap = new HashMap<>();
    featureDataSources.forEach(ds -> {
        final SAMSequenceDictionary dictionary = ds.getSequenceDictionary();
        if (dictionary == null) {
            logger.warn("A sequence dictionary is required for each input when using multiple inputs, and one could" +
                    " not be obtained for feature input: " + ds.getName() +
                    ". The input may not exist or may not have a valid header");
        } else {
            dictionary.getSequences().forEach(sourceSequence -> {
                final String sourceSequenceName = sourceSequence.getSequenceName();
                final FeatureDataSource<VariantContext> previousDataSource =
                        contigMap.getOrDefault(sourceSequenceName, null);
                if (previousDataSource != null) {
                    final SAMSequenceDictionary previousDictionary = previousDataSource.getSequenceDictionary();
                    final SAMSequenceRecord previousSequence = previousDictionary.getSequence(sourceSequenceName);
                    validateSequenceDictionaryRecords(ds.getName(), dictionary, sourceSequence,
                            previousDataSource.getName(), previousDictionary, previousSequence);
                } else {
                    contigMap.put(sourceSequenceName, ds);
                }
            });
        }
    });
}
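validateSequenceDictionaryRecords itself is not shown in this snippet; a hypothetical sketch of the per-contig comparison the Javadoc describes (same length required, MD5 compared only when both dictionaries record one) might look like this:

// Hypothetical sketch of the per-contig check described above (not the real
// validateSequenceDictionaryRecords): two records for the same contig must agree on
// length, and on MD5 only when both dictionaries carry one. "M5" is the SAM-spec tag
// for the sequence MD5; GATKException is used as in the getRefRDD snippet above.
static void checkContigRecordsAgree(final String sourceA, final SAMSequenceRecord recA,
                                    final String sourceB, final SAMSequenceRecord recB) {
    if (recA.getSequenceLength() != recB.getSequenceLength()) {
        throw new GATKException("Contig " + recA.getSequenceName() + " has length "
                + recA.getSequenceLength() + " in " + sourceA + " but length "
                + recB.getSequenceLength() + " in " + sourceB);
    }
    final String md5A = recA.getAttribute("M5");
    final String md5B = recB.getAttribute("M5");
    if (md5A != null && md5B != null && !md5A.equals(md5B)) {
        throw new GATKException("Contig " + recA.getSequenceName()
                + " has mismatching MD5s in " + sourceA + " and " + sourceB);
    }
}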