Search in sources :

Example 1 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class SVUtils method getRefRDD.

/**
     * Create an RDD from the reference sequences.
     * The reference sequences are transformed into a single, large collection of byte arrays. The collection is then
     * parallelized into an RDD.
     * Each contig that exceeds a size given by REF_RECORD_LEN is broken into a series of REF_RECORD_LEN chunks with a
     * K-1 base overlap between successive chunks. (I.e., for K=63, the last 62 bases in chunk n match the first 62
     * bases in chunk n+1) so that we don't miss any kmers due to the chunking -- we can just kmerize each record
     * independently.
     */
public static JavaRDD<byte[]> getRefRDD(final JavaSparkContext ctx, final int kSize, final ReferenceMultiSource ref, final PipelineOptions options, final SAMSequenceDictionary readsDict, final int ref_record_len, final int ref_records_per_partition) {
    final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(readsDict);
    if (dict == null)
        throw new GATKException("No reference dictionary available");
    final int effectiveRecLen = ref_record_len - kSize + 1;
    final List<byte[]> sequenceChunks = new ArrayList<>();
    for (final SAMSequenceRecord rec : dict.getSequences()) {
        final String seqName = rec.getSequenceName();
        final int seqLen = rec.getSequenceLength();
        final SimpleInterval interval = new SimpleInterval(seqName, 1, seqLen);
        try {
            final byte[] bases = ref.getReferenceBases(options, interval).getBases();
            for (int start = 0; start < seqLen; start += effectiveRecLen) {
                sequenceChunks.add(Arrays.copyOfRange(bases, start, Math.min(start + ref_record_len, seqLen)));
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't get reference sequence bases for " + interval, ioe);
        }
    }
    return ctx.parallelize(sequenceChunks, sequenceChunks.size() / ref_records_per_partition + 1);
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 2 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class GenomeLocParserUnitTest method testParseUnknownSequenceLength.

@Test
public void testParseUnknownSequenceLength() {
    SAMSequenceDictionary seqDict = new SAMSequenceDictionary();
    seqDict.addSequence(new SAMSequenceRecord("1", SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH));
    Assert.assertEquals(seqDict.getSequence("1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
    GenomeLocParser myLocParser = new GenomeLocParser(seqDict);
    GenomeLoc genomeLoc = myLocParser.parseGenomeLoc("1:1-99");
    Assert.assertEquals(genomeLoc.getEnd(), 99);
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 3 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class GenomeLocParserUnitTest method testcreateGenomeLocOnContig.

@Test
public void testcreateGenomeLocOnContig() throws FileNotFoundException {
    final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(exampleReference));
    final SAMSequenceDictionary dict = seq.getSequenceDictionary();
    final GenomeLocParser genomeLocParser = new GenomeLocParser(dict);
    for (final SAMSequenceRecord rec : dict.getSequences()) {
        final GenomeLoc loc = genomeLocParser.createOverEntireContig(rec.getSequenceName());
        Assert.assertEquals(loc.getContig(), rec.getSequenceName());
        Assert.assertEquals(loc.getStart(), 1);
        Assert.assertEquals(loc.getStop(), rec.getSequenceLength());
    }
}
Also used : CachingIndexedFastaSequenceFile(org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) CachingIndexedFastaSequenceFile(org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile) File(java.io.File) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 4 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class GenomeLocParserUnitTest method testContigHasColon.

@Test
public void testContigHasColon() {
    SAMFileHeader header = new SAMFileHeader();
    header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate);
    SAMSequenceDictionary dict = new SAMSequenceDictionary();
    SAMSequenceRecord rec = new SAMSequenceRecord("c:h:r1", 10);
    rec.setSequenceLength(10);
    dict.addSequence(rec);
    header.setSequenceDictionary(dict);
    final GenomeLocParser myGenomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
    GenomeLoc loc = myGenomeLocParser.parseGenomeLoc("c:h:r1:4-5");
    assertEquals(0, loc.getContigIndex());
    assertEquals(loc.getStart(), 4);
    assertEquals(loc.getStop(), 5);
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 5 with SAMSequenceRecord

use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

the class MultiVariantDataSource method validateAllSequenceDictionaries.

/**
     * GATKTool only validates individual feature dictionaries against the reference dictionary, so cross-validate
     * all of the dictionaries against each other here by ensuring that each contig found in any dictionary has the
     * same length (and md5, when a value is present for that contig in both dictionaries) in every other dictionary
     * in which its present.
     */
private void validateAllSequenceDictionaries() {
    final Map<String, FeatureDataSource<VariantContext>> contigMap = new HashMap<>();
    featureDataSources.forEach(ds -> {
        final SAMSequenceDictionary dictionary = ds.getSequenceDictionary();
        if (dictionary == null) {
            logger.warn("A sequence dictionary is required for each input when using multiple inputs, and one could" + " not be obtained for feature input: " + ds.getName() + ". The input may not exist or may not have a valid header");
        } else {
            dictionary.getSequences().forEach(sourceSequence -> {
                final String sourceSequenceName = sourceSequence.getSequenceName();
                final FeatureDataSource<VariantContext> previousDataSource = contigMap.getOrDefault(sourceSequenceName, null);
                if (previousDataSource != null) {
                    final SAMSequenceDictionary previousDictionary = previousDataSource.getSequenceDictionary();
                    final SAMSequenceRecord previousSequence = previousDictionary.getSequence(sourceSequenceName);
                    validateSequenceDictionaryRecords(ds.getName(), dictionary, sourceSequence, previousDataSource.getName(), previousDictionary, previousSequence);
                } else {
                    contigMap.put(sourceSequenceName, ds);
                }
            });
        }
    });
}
Also used : VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Aggregations

SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)72 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)35 Test (org.testng.annotations.Test)26 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)24 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)13 File (java.io.File)10 SAMFileHeader (htsjdk.samtools.SAMFileHeader)9 UserException (org.broadinstitute.hellbender.exceptions.UserException)8 DataProvider (org.testng.annotations.DataProvider)8 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)7 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)7 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)6 QueryInterval (htsjdk.samtools.QueryInterval)5 Allele (htsjdk.variant.variantcontext.Allele)4 VariantContext (htsjdk.variant.variantcontext.VariantContext)4 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)4 java.util (java.util)4 Collectors (java.util.stream.Collectors)4 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)4