Search in sources :

Example 6 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class ReferenceDataSourceUnitTest method testQueryAndPrefetch.

@Test(dataProvider = "ReferenceIntervalDataProvider")
public void testQueryAndPrefetch(final SimpleInterval interval, final String expectedBases) {
    try (ReferenceDataSource reference = new ReferenceFileSource(TEST_REFERENCE)) {
        ReferenceSequence queryResult = reference.queryAndPrefetch(interval);
        Assert.assertEquals(new String(queryResult.getBases()), expectedBases, "Wrong bases returned from queryAndPrefetch() for interval " + interval);
    }
}
Also used : ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 7 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk-protected by broadinstitute.

the class AnnotateTargetsIntegrationTest method checkOutputGCContent.

private void checkOutputGCContent(final ReferenceFileSource reference, final SimpleInterval interval, final double observed) {
    final ReferenceSequence sequence = reference.queryAndPrefetch(interval);
    int total = 0;
    int gc = 0;
    for (final byte base : sequence.getBases()) {
        switch(Character.toLowerCase(base)) {
            case 'g':
            case 'c':
                gc++;
                total++;
                break;
            case 'a':
            case 't':
            case 'u':
                total++;
                break;
            default:
        }
    }
    final double expectedGCContent = total == 0 ? Double.NaN : ((double) gc / (double) total);
    if (Double.isNaN(expectedGCContent)) {
        Assert.assertTrue(Double.isNaN(observed));
    } else {
        Assert.assertEquals(observed, expectedGCContent, 0.0001);
    }
}
Also used : ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence)

Example 8 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class CollectRrbsMetrics method doWork.

@Override
protected Object doWork() {
    if (!METRICS_FILE_PREFIX.endsWith(".")) {
        METRICS_FILE_PREFIX = METRICS_FILE_PREFIX + ".";
    }
    final File SUMMARY_OUT = new File(METRICS_FILE_PREFIX + SUMMARY_FILE_EXTENSION);
    final File DETAILS_OUT = new File(METRICS_FILE_PREFIX + DETAIL_FILE_EXTENSION);
    final File PLOTS_OUT = new File(METRICS_FILE_PREFIX + PDF_FILE_EXTENSION);
    assertIoFiles(SUMMARY_OUT, DETAILS_OUT, PLOTS_OUT);
    final SamReader samReader = SamReaderFactory.makeDefault().open(INPUT);
    if (!ASSUME_SORTED && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new UserException("The input file " + INPUT.getAbsolutePath() + " does not appear to be coordinate sorted");
    }
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final ProgressLogger progressLogger = new ProgressLogger(logger);
    final RrbsMetricsCollector metricsCollector = new RrbsMetricsCollector(METRIC_ACCUMULATION_LEVEL, samReader.getFileHeader().getReadGroups(), C_QUALITY_THRESHOLD, NEXT_BASE_QUALITY_THRESHOLD, MINIMUM_READ_LENGTH, MAX_MISMATCH_RATE);
    for (final SAMRecord samRecord : samReader) {
        progressLogger.record(samRecord);
        if (!samRecord.getReadUnmappedFlag() && !isSequenceFiltered(samRecord.getReferenceName())) {
            final ReferenceSequence referenceSequence = refWalker.get(samRecord.getReferenceIndex());
            metricsCollector.acceptRecord(samRecord, referenceSequence);
        }
    }
    metricsCollector.finish();
    final MetricsFile<RrbsMetrics, Long> rrbsMetrics = getMetricsFile();
    metricsCollector.addAllLevelsToFile(rrbsMetrics);
    // Using RrbsMetrics as a way to get both of the metrics objects through the MultiLevelCollector. Once
    // we get it out split it apart to the two separate MetricsFiles and write them to file
    final MetricsFile<RrbsSummaryMetrics, ?> summaryFile = getMetricsFile();
    final MetricsFile<RrbsCpgDetailMetrics, ?> detailsFile = getMetricsFile();
    for (final RrbsMetrics rrbsMetric : rrbsMetrics.getMetrics()) {
        summaryFile.addMetric(rrbsMetric.getSummaryMetrics());
        for (final RrbsCpgDetailMetrics detailMetric : rrbsMetric.getDetailMetrics()) {
            detailsFile.addMetric(detailMetric);
        }
    }
    summaryFile.write(SUMMARY_OUT);
    detailsFile.write(DETAILS_OUT);
    if (PRODUCE_PLOT) {
        final RScriptExecutor executor = new RScriptExecutor();
        executor.addScript(new Resource(R_SCRIPT, CollectRrbsMetrics.class));
        executor.addArgs(DETAILS_OUT.getAbsolutePath(), SUMMARY_OUT.getAbsolutePath(), PLOTS_OUT.getAbsolutePath());
        executor.exec();
    }
    CloserUtil.close(samReader);
    return null;
}
Also used : Resource(org.broadinstitute.hellbender.utils.io.Resource) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor) UserException(org.broadinstitute.hellbender.exceptions.UserException) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) MetricsFile(htsjdk.samtools.metrics.MetricsFile) File(java.io.File)

Example 9 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class CountingPairedFilter method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(logger, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
    // Load up the reference sequence and double check sequence dictionaries
    if (!in.getFileHeader().getSequenceDictionary().isEmpty()) {
        SequenceUtil.assertSequenceDictionariesEqual(in.getFileHeader().getSequenceDictionary(), refWalker.getSequenceDictionary());
    }
    final SamLocusIterator iterator = new SamLocusIterator(in);
    final List<SamRecordFilter> filters = new ArrayList<>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    filters.add(pairFilter);
    // Not a counting filter because we never want to count reads twice
    filters.add(new SecondaryAlignmentFilter());
    iterator.setSamFilters(filters);
    iterator.setEmitUncoveredLoci(true);
    // Handled separately because we want to count bases
    iterator.setMappingQualityScoreCutoff(0);
    // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0);
    iterator.setIncludeNonPfReads(false);
    final int max = COVERAGE_CAP;
    final long[] HistogramArray = new long[max + 1];
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;
    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;
    // Loop through all the loci
    while (iterator.hasNext()) {
        final SamLocusIterator.LocusInfo info = iterator.next();
        // Check that the reference is not N
        final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
        final byte base = ref.getBases()[info.getPosition() - 1];
        if (base == 'N')
            continue;
        // Figure out the coverage while not counting overlapping reads twice, and excluding various things
        final Set<String> readNames = new HashSet<>(info.getRecordAndOffsets().size());
        int pileupSize = 0;
        for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndOffsets()) {
            if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) {
                ++basesExcludedByBaseq;
                continue;
            }
            if (!readNames.add(recs.getRecord().getReadName())) {
                ++basesExcludedByOverlap;
                continue;
            }
            pileupSize++;
            if (pileupSize <= max) {
                baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
            }
        }
        final int depth = Math.min(readNames.size(), max);
        if (depth < readNames.size())
            basesExcludedByCapping += readNames.size() - max;
        HistogramArray[depth]++;
        // Record progress and perhaps stop
        progress.record(info.getSequenceName(), info.getPosition());
        if (usingStopAfter && ++counter > stopAfter)
            break;
    }
    // Construct and write the outputs
    final Histogram<Integer> histo = new Histogram<>("coverage", "count");
    for (int i = 0; i < HistogramArray.length; ++i) {
        histo.increment(i, HistogramArray[i]);
    }
    // Construct and write the outputs
    final Histogram<Integer> baseQHisto = new Histogram<>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
        baseQHisto.increment(i, baseQHistogramArray[i]);
    }
    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();
    final long basesExcludedByDupes = dupeFilter.getFilteredBases();
    final long basesExcludedByMapq = mapqFilter.getFilteredBases();
    final long basesExcludedByPairing = pairFilter.getFilteredBases();
    final double total = histo.getSum();
    final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq + basesExcludedByPairing + basesExcludedByBaseq + basesExcludedByOverlap + basesExcludedByCapping;
    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;
    metrics.PCT_5X = MathUtils.sum(HistogramArray, 5, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X = MathUtils.sum(HistogramArray, 10, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X = MathUtils.sum(HistogramArray, 15, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X = MathUtils.sum(HistogramArray, 20, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X = MathUtils.sum(HistogramArray, 25, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X = MathUtils.sum(HistogramArray, 30, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X = MathUtils.sum(HistogramArray, 40, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X = MathUtils.sum(HistogramArray, 50, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X = MathUtils.sum(HistogramArray, 60, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X = MathUtils.sum(HistogramArray, 70, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X = MathUtils.sum(HistogramArray, 80, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X = MathUtils.sum(HistogramArray, 90, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X = MathUtils.sum(HistogramArray, 100, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
        out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);
    return null;
}
Also used : Histogram(htsjdk.samtools.util.Histogram) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) ArrayList(java.util.ArrayList) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) SamReader(htsjdk.samtools.SamReader) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) SecondaryAlignmentFilter(htsjdk.samtools.filter.SecondaryAlignmentFilter) HashSet(java.util.HashSet) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator)

Example 10 with ReferenceSequence

use of htsjdk.samtools.reference.ReferenceSequence in project gatk by broadinstitute.

the class CreateSequenceDictionary method makeSequenceDictionary.

/**
     * Read all the sequences from the given reference file, and convert into SAMSequenceRecords
     * @param referenceFile fasta or fasta.gz
     * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments.
     */
SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
    final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
    ReferenceSequence refSeq;
    final List<SAMSequenceRecord> ret = new ArrayList<>();
    final Set<String> sequenceNames = new HashSet<>();
    for (int numSequences = 0; numSequences < NUM_SEQUENCES && (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) {
        if (sequenceNames.contains(refSeq.getName())) {
            throw new UserException.MalformedFile(referenceFile, "Sequence name appears more than once in reference: " + refSeq.getName());
        }
        sequenceNames.add(refSeq.getName());
        ret.add(makeSequenceRecord(refSeq));
    }
    return new SAMSequenceDictionary(ret);
}
Also used : ArrayList(java.util.ArrayList) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) HashSet(java.util.HashSet)

Aggregations

ReferenceSequence (htsjdk.samtools.reference.ReferenceSequence)14 ReferenceSequenceFile (htsjdk.samtools.reference.ReferenceSequenceFile)5 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)3 SamReader (htsjdk.samtools.SamReader)3 ReferenceSequenceFileWalker (htsjdk.samtools.reference.ReferenceSequenceFileWalker)3 File (java.io.File)3 UserException (org.broadinstitute.hellbender.exceptions.UserException)3 ProgressLogger (org.broadinstitute.hellbender.utils.runtime.ProgressLogger)3 SAMRecord (htsjdk.samtools.SAMRecord)2 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)2 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)2 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)2 Test (org.testng.annotations.Test)2 SAMException (htsjdk.samtools.SAMException)1 SortOrder (htsjdk.samtools.SAMFileHeader.SortOrder)1 SamRecordFilter (htsjdk.samtools.filter.SamRecordFilter)1 SecondaryAlignmentFilter (htsjdk.samtools.filter.SecondaryAlignmentFilter)1 MetricsFile (htsjdk.samtools.metrics.MetricsFile)1