Example 11 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

Class RnaSeqMetricsCollector, method makeIgnoredSequenceIndicesSet.

public static Set<Integer> makeIgnoredSequenceIndicesSet(final SAMFileHeader header, final Set<String> ignoredSequence) {
    final Set<Integer> ignoredSequenceIndices = new HashSet<>();
    for (final String sequenceName : ignoredSequence) {
        final SAMSequenceRecord sequenceRecord = header.getSequence(sequenceName);
        if (sequenceRecord == null) {
            throw new UserException("Unrecognized sequence " + sequenceName + " passed as argument to IGNORE_SEQUENCE");
        }
        ignoredSequenceIndices.add(sequenceRecord.getSequenceIndex());
    }
    return ignoredSequenceIndices;
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) UserException(org.broadinstitute.hellbender.exceptions.UserException)
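For anyone who wants to try the same lookup in isolation, here is a minimal, self-contained sketch of the name-to-index resolution shown above, against a hand-built header. The contig names and lengths are made up, and a plain IllegalArgumentException stands in for GATK's UserException so the sketch needs only htsjdk.

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceRecord;

import java.util.HashSet;
import java.util.Set;

public class IgnoredSequenceIndicesSketch {
    public static void main(String[] args) {
        // Build a header with two reference sequences; indices are assigned in insertion order.
        final SAMFileHeader header = new SAMFileHeader();
        header.addSequence(new SAMSequenceRecord("chr1", 1000));
        header.addSequence(new SAMSequenceRecord("chrM", 500));

        // Resolve a set of sequence names to their dictionary indices, as the method above does.
        final Set<Integer> ignoredIndices = new HashSet<>();
        for (final String name : new String[] { "chrM" }) {
            final SAMSequenceRecord rec = header.getSequence(name);
            if (rec == null) {
                // GATK throws UserException here; a plain exception keeps this sketch dependency-free.
                throw new IllegalArgumentException("Unrecognized sequence " + name);
            }
            ignoredIndices.add(rec.getSequenceIndex());
        }
        System.out.println(ignoredIndices); // prints [1], since chrM is the second sequence added
    }
}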

Example 12 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

Class BedToIntervalList, method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
    IOUtil.assertFileIsWritable(OUTPUT);
    try {
        final SAMFileHeader header = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY);
        final IntervalList intervalList = new IntervalList(header);
        /**
         * NB: BED is zero-based, but a BEDCodec by default (since it returns tribble Features) has an offset of one,
         * so it returns 1-based starts.  Ugh.  Set the offset to zero.
         */
        final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(BEDCodec.StartOffset.ZERO), false);
        final CloseableTribbleIterator<BEDFeature> iterator = bedReader.iterator();
        final ProgressLogger progressLogger = new ProgressLogger(logger, (int) 1e6);
        while (iterator.hasNext()) {
            final BEDFeature bedFeature = iterator.next();
            final String sequenceName = bedFeature.getContig();
            /**
             * NB: BED is zero-based, so we need to add one here to make it one-based.  Please observe we set the start
             * offset to zero when creating the BEDCodec.
             */
            final int start = bedFeature.getStart() + 1;
            /**
             * NB: BED is 0-based OPEN (which, for the end, is equivalent to 1-based closed).
             */
            final int end = bedFeature.getEnd();
            // NB: do not use an empty name within an interval
            String name = bedFeature.getName();
            if (name.isEmpty())
                name = null;
            final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);
            // Do some validation
            if (null == sequenceRecord) {
                throw new GATKException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
            } else if (start < 1) {
                throw new GATKException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
            } else if (sequenceRecord.getSequenceLength() < start) {
                throw new GATKException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
            } else if (end < 1) {
                throw new GATKException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
            } else if (sequenceRecord.getSequenceLength() < end) {
                throw new GATKException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
            } else if (end < start - 1) {
                throw new GATKException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
            }
            final Interval interval = new Interval(sequenceName, start, end, bedFeature.getStrand() == Strand.POSITIVE, name);
            intervalList.add(interval);
            progressLogger.record(sequenceName, start);
        }
        CloserUtil.close(bedReader);
        // Sort and write the output
        intervalList.uniqued().write(OUTPUT);
    } catch (final IOException e) {
        throw new RuntimeException(e);
    }
    return null;
}
Also used : ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IOException(java.io.IOException) IntervalList(htsjdk.samtools.util.IntervalList) BEDFeature(htsjdk.tribble.bed.BEDFeature) SAMFileHeader(htsjdk.samtools.SAMFileHeader) BEDCodec(htsjdk.tribble.bed.BEDCodec) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) Interval(htsjdk.samtools.util.Interval)
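The subtle part of this example is the coordinate convention, so here is a small standalone sketch of just that arithmetic. The BED record is hypothetical, and the sketch assumes (as the tool does) that the codec was created with BEDCodec.StartOffset.ZERO so that starts come back zero-based.

import htsjdk.samtools.util.Interval;

public class BedCoordinateSketch {
    public static void main(String[] args) {
        // A hypothetical BED record "chr1  99  200": zero-based, half-open.
        final String contig = "chr1";
        final int bedStart = 99;  // zero-based, inclusive start
        final int bedEnd = 200;   // exclusive end, which equals the one-based, closed end

        // htsjdk's Interval is one-based and closed, so only the start needs shifting.
        final int start = bedStart + 1;
        final int end = bedEnd;

        final Interval interval = new Interval(contig, start, end);
        System.out.println(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
        // prints chr1:100-200 (101 bases, the same span the BED record describes)
    }
}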

Example 13 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

Class AddContextDataToReadSparkUnitTest, method addContextDataTest.

@Test(dataProvider = "bases", groups = "spark")
public void addContextDataTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, ReadContextData>> expectedReadContextData, JoinStrategy joinStrategy) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    when(mockSource.getReferenceBases(any(PipelineOptions.class), any())).then(new ReferenceBasesAnswer());
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SAMSequenceDictionary sd = new SAMSequenceDictionary(Lists.newArrayList(new SAMSequenceRecord("1", 100000), new SAMSequenceRecord("2", 100000)));
    when(mockSource.getReferenceSequenceDictionary(null)).thenReturn(sd);
    JavaPairRDD<GATKRead, ReadContextData> rddActual = AddContextDataToReadSpark.add(ctx, rddReads, mockSource, rddVariants, joinStrategy, sd, 10000, 1000);
    Map<GATKRead, ReadContextData> actual = rddActual.collectAsMap();
    Assert.assertEquals(actual.size(), expectedReadContextData.size());
    for (KV<GATKRead, ReadContextData> kv : expectedReadContextData) {
        ReadContextData readContextData = actual.get(kv.getKey());
        Assert.assertNotNull(readContextData);
        Assert.assertTrue(CollectionUtils.isEqualCollection(Lists.newArrayList(readContextData.getOverlappingVariants()), Lists.newArrayList(kv.getValue().getOverlappingVariants())));
        SimpleInterval minimalInterval = kv.getValue().getOverlappingReferenceBases().getInterval();
        ReferenceBases subset = readContextData.getOverlappingReferenceBases().getSubset(minimalInterval);
        Assert.assertEquals(subset, kv.getValue().getOverlappingReferenceBases());
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
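Since the SAMSequenceRecord usage here is the hand-built sequence dictionary, below is a hedged sketch of just that construction, with the same two 100,000 bp contigs the test uses; the queries at the end are illustrative only.

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;

import java.util.Arrays;

public class SequenceDictionarySketch {
    public static void main(String[] args) {
        // Two contigs named "1" and "2", each 100,000 bases, as in the test above.
        final SAMSequenceDictionary dict = new SAMSequenceDictionary(Arrays.asList(
                new SAMSequenceRecord("1", 100000),
                new SAMSequenceRecord("2", 100000)));

        final SAMSequenceRecord contig2 = dict.getSequence("2");
        System.out.println(contig2.getSequenceIndex());   // 1: indices follow list order
        System.out.println(contig2.getSequenceLength());  // 100000
        System.out.println(dict.size());                  // 2 sequences in the dictionary
    }
}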

Example 14 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

Class FindBadGenomicKmersSparkUnitTest, method miniRefTest.

@Test(groups = "spark")
public void miniRefTest() throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReferenceMultiSource ref = new ReferenceMultiSource((PipelineOptions) null, REFERENCE_FILE_NAME, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(null);
    if (dict == null)
        throw new GATKException("No reference dictionary available.");
    final Map<SVKmer, Long> kmerMap = new LinkedHashMap<>();
    for (final SAMSequenceRecord rec : dict.getSequences()) {
        final SimpleInterval interval = new SimpleInterval(rec.getSequenceName(), 1, rec.getSequenceLength());
        final byte[] bases = ref.getReferenceBases(null, interval).getBases();
        final SVKmerizer kmerizer = new SVKmerizer(bases, KMER_SIZE, new SVKmerLong());
        while (kmerizer.hasNext()) {
            final SVKmer kmer = kmerizer.next().canonical(KMER_SIZE);
            final Long currentCount = kmerMap.getOrDefault(kmer, 0L);
            kmerMap.put(kmer, currentCount + 1);
        }
    }
    final Iterator<Map.Entry<SVKmer, Long>> kmerIterator = kmerMap.entrySet().iterator();
    while (kmerIterator.hasNext()) {
        if (kmerIterator.next().getValue() <= FindBadGenomicKmersSpark.MAX_KMER_FREQ)
            kmerIterator.remove();
    }
    final List<SVKmer> badKmers = FindBadGenomicKmersSpark.findBadGenomicKmers(ctx, KMER_SIZE, Integer.MAX_VALUE, ref, null, null);
    final Set<SVKmer> badKmerSet = new HashSet<>(badKmers);
    Assert.assertEquals(badKmers.size(), badKmerSet.size());
    Assert.assertEquals(badKmerSet, kmerMap.keySet());
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
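The dictionary-driven loop at the top of this test is a reusable pattern: one whole-contig interval per SAMSequenceRecord. Below is a hedged sketch of that pattern in isolation, with a made-up dictionary standing in for the test reference.

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class WholeContigIntervalsSketch {
    // Build one SimpleInterval spanning each contig in the dictionary.
    public static List<SimpleInterval> wholeContigIntervals(final SAMSequenceDictionary dict) {
        final List<SimpleInterval> intervals = new ArrayList<>();
        for (final SAMSequenceRecord rec : dict.getSequences()) {
            // SimpleInterval is one-based and closed, so 1..length covers the whole contig.
            intervals.add(new SimpleInterval(rec.getSequenceName(), 1, rec.getSequenceLength()));
        }
        return intervals;
    }

    public static void main(String[] args) {
        final SAMSequenceDictionary dict = new SAMSequenceDictionary(Arrays.asList(
                new SAMSequenceRecord("chrA", 5000),
                new SAMSequenceRecord("chrB", 3000)));
        // Two intervals covering chrA:1-5000 and chrB:1-3000.
        System.out.println(wholeContigIntervals(dict));
    }
}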

Example 15 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

Class SATagBuilderUnitTests, method testGetArtificalReadsBasedOnSATag.

@Test
public void testGetArtificalReadsBasedOnSATag() {
    // setup the record
    final SAMFileHeader header = new SAMFileHeader();
    header.addSequence(new SAMSequenceRecord("1", 1000));
    header.addSequence(new SAMSequenceRecord("2", 1000));
    GATKRead read = ArtificialReadUtils.createArtificialRead(header, "name", "2", 1, "AAAAAAAAAA".getBytes(), "##########".getBytes());
    read.setIsFirstOfPair();
    read.setCigar(TextCigarCodec.decode("10M"));
    read.setMatePosition("2", 1);
    //spec says first 'SA' record will be the primary record
    read.setIsSupplementaryAlignment(true);
    SATagBuilder builder = new SATagBuilder(read);
    // check that no artificial reads are produced when there is no SA tag
    Assert.assertEquals(builder.getArtificialReadsBasedOnSATag(header).size(), 0);
    // Tests that parsing existing tags works properly
    read.setAttribute("SA", "2,500,+,3S2=1X2=2S,60,1;" + "1,191,-,8M2S,60,0;");
    builder = new SATagBuilder(read);
    List<GATKRead> suppl = builder.getArtificialReadsBasedOnSATag(header);
    Assert.assertNotNull(suppl);
    Assert.assertEquals(suppl.size(), 2);
    for (final GATKRead other : suppl) {
        Assert.assertFalse(other.isUnmapped());
        Assert.assertTrue(other.isPaired());
        Assert.assertFalse(other.mateIsUnmapped());
        Assert.assertEquals(other.getMateContig(), read.getMateContig());
        Assert.assertEquals(other.getName(), read.getName());
    }
    GATKRead other = suppl.get(0);
    // the first SA entry is the primary alignment (the original read is the supplementary one), so it is not supplementary
    Assert.assertFalse(other.isSupplementaryAlignment());
    Assert.assertEquals(other.getContig(), "2");
    Assert.assertEquals(other.getStart(), 500);
    Assert.assertEquals(other.getMappingQuality(), 60);
    Assert.assertEquals((int) other.getAttributeAsInteger("NM"), 1);
    Assert.assertEquals(other.getCigar().toString(), "3S2=1X2=2S");
    other = suppl.get(1);
    Assert.assertTrue(other.isSupplementaryAlignment());
    Assert.assertEquals(other.getContig(), "1");
    Assert.assertEquals(other.getStart(), 191);
    Assert.assertTrue(other.isReverseStrand());
    Assert.assertEquals(other.getMappingQuality(), 60);
    Assert.assertEquals((int) other.getAttributeAsInteger("NM"), 0);
    Assert.assertEquals(other.getCigar().toString(), "8M2S");
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Test(org.testng.annotations.Test)
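For reference, the 'SA' tag value set in this test follows the SAM spec layout: semicolon-terminated entries, each "rname,pos,strand,CIGAR,mapQ,NM", with the first entry describing the primary alignment. The sketch below simply splits that string to show the fields; it is a standalone illustration, not the SATagBuilder parser itself.

public class SaTagSketch {
    public static void main(String[] args) {
        // The same tag value used in the test above.
        final String saTag = "2,500,+,3S2=1X2=2S,60,1;" + "1,191,-,8M2S,60,0;";
        for (final String entry : saTag.split(";")) {
            final String[] fields = entry.split(",");
            // Fields per the SAM spec: rname, pos, strand, CIGAR, mapQ, NM.
            System.out.printf("contig=%s start=%s strand=%s cigar=%s mapq=%s nm=%s%n",
                    fields[0], fields[1], fields[2], fields[3], fields[4], fields[5]);
        }
    }
}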

Aggregations

SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord): 72
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 35
Test (org.testng.annotations.Test): 26
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 24
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 13
File (java.io.File): 10
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 9
UserException (org.broadinstitute.hellbender.exceptions.UserException): 8
DataProvider (org.testng.annotations.DataProvider): 8
IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile): 7
GATKException (org.broadinstitute.hellbender.exceptions.GATKException): 7
IOException (java.io.IOException): 6
ArrayList (java.util.ArrayList): 6
QueryInterval (htsjdk.samtools.QueryInterval): 5
Allele (htsjdk.variant.variantcontext.Allele): 4
VariantContext (htsjdk.variant.variantcontext.VariantContext): 4
VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder): 4
java.util (java.util): 4
Collectors (java.util.stream.Collectors): 4
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 4