Search in sources :

Example 6 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.

the class HetPulldownCalculatorUnitTest method inputGetPileupBaseCount.

@DataProvider(name = "inputGetPileupBaseCount")
public Object[][] inputGetPileupBaseCount() throws IOException {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().open(NORMAL_BAM_FILE)) {
        final IntervalList intervals = new IntervalList(bamReader.getFileHeader());
        intervals.add(new Interval("1", 100, 100));
        intervals.add(new Interval("1", 11000, 11000));
        intervals.add(new Interval("1", 14000, 14000));
        intervals.add(new Interval("1", 14630, 14630));
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, intervals);
        final Nucleotide.Counter baseCounts1 = makeBaseCounts(0, 0, 0, 0);
        final Nucleotide.Counter baseCounts2 = makeBaseCounts(0, 9, 0, 0);
        final Nucleotide.Counter baseCounts3 = makeBaseCounts(12, 0, 0, 0);
        final Nucleotide.Counter baseCounts4 = makeBaseCounts(0, 0, 8, 9);
        if (!locusIterator.hasNext()) {
            throw new SAMException("Can't get locus to start iteration. Check that " + NORMAL_BAM_FILE.toString() + " contains 1:0-16000.");
        }
        final SamLocusIterator.LocusInfo locus1 = locusIterator.next();
        final SamLocusIterator.LocusInfo locus2 = locusIterator.next();
        final SamLocusIterator.LocusInfo locus3 = locusIterator.next();
        final SamLocusIterator.LocusInfo locus4 = locusIterator.next();
        locusIterator.close();
        return new Object[][] { { locus1, baseCounts1 }, { locus2, baseCounts2 }, { locus3, baseCounts3 }, { locus4, baseCounts4 } };
    }
}
Also used : SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Interval(htsjdk.samtools.util.Interval) DataProvider(org.testng.annotations.DataProvider)

Example 7 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.

the class Pulldown method getIntervals.

/** Returns a new instance of an IntervalList, constructed from the intervals of the internally held
     * AllelicCounts.  This IntervalList is modifiable and does not change with the state of the Pulldown.   */
public IntervalList getIntervals() {
    final IntervalList intervals = new IntervalList(header);
    intervals.addall(getCounts().stream().map(AllelicCount::getInterval).map(si -> new Interval(si.getContig(), si.getStart(), si.getEnd())).collect(Collectors.toList()));
    return intervals;
}
Also used : IntervalList(htsjdk.samtools.util.IntervalList) Interval(htsjdk.samtools.util.Interval)

Example 8 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.

the class BedToIntervalList method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
    IOUtil.assertFileIsWritable(OUTPUT);
    try {
        final SAMFileHeader header = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY);
        final IntervalList intervalList = new IntervalList(header);
        /**
             * NB: BED is zero-based, but a BEDCodec by default (since it is returns tribble Features) has an offset of one,
             * so it returns 1-based starts.  Ugh.  Set to zero.
             */
        final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(BEDCodec.StartOffset.ZERO), false);
        final CloseableTribbleIterator<BEDFeature> iterator = bedReader.iterator();
        final ProgressLogger progressLogger = new ProgressLogger(logger, (int) 1e6);
        while (iterator.hasNext()) {
            final BEDFeature bedFeature = iterator.next();
            final String sequenceName = bedFeature.getContig();
            /**
                 * NB: BED is zero-based, so we need to add one here to make it one-based.  Please observe we set the start
                 * offset to zero when creating the BEDCodec.
                 */
            final int start = bedFeature.getStart() + 1;
            /**
                 * NB: BED is 0-based OPEN (which, for the end is equivalent to 1-based closed).
                 */
            final int end = bedFeature.getEnd();
            // NB: do not use an empty name within an interval
            String name = bedFeature.getName();
            if (name.isEmpty())
                name = null;
            final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);
            // Do some validation
            if (null == sequenceRecord) {
                throw new GATKException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
            } else if (start < 1) {
                throw new GATKException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
            } else if (sequenceRecord.getSequenceLength() < start) {
                throw new GATKException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
            } else if (end < 1) {
                throw new GATKException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
            } else if (sequenceRecord.getSequenceLength() < end) {
                throw new GATKException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
            } else if (end < start - 1) {
                throw new GATKException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
            }
            final Interval interval = new Interval(sequenceName, start, end, bedFeature.getStrand() == Strand.POSITIVE, name);
            intervalList.add(interval);
            progressLogger.record(sequenceName, start);
        }
        CloserUtil.close(bedReader);
        // Sort and write the output
        intervalList.uniqued().write(OUTPUT);
    } catch (final IOException e) {
        throw new RuntimeException(e);
    }
    return null;
}
Also used : ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IOException(java.io.IOException) IntervalList(htsjdk.samtools.util.IntervalList) BEDFeature(htsjdk.tribble.bed.BEDFeature) SAMFileHeader(htsjdk.samtools.SAMFileHeader) BEDCodec(htsjdk.tribble.bed.BEDCodec) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) Interval(htsjdk.samtools.util.Interval)

Example 9 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.

the class IntervalUtilsUnitTest method testIntervalFileToListNegativeOneLength.

// TODO - remove once a corrected version of the exome interval list is released.
@Test(dataProvider = "negativeOneLengthIntervalTestData")
public void testIntervalFileToListNegativeOneLength(GenomeLocParser genomeLocParser, String contig, int intervalStart, int intervalEnd) throws Exception {
    final SAMFileHeader picardFileHeader = new SAMFileHeader();
    picardFileHeader.addSequence(genomeLocParser.getContigInfo("1"));
    final IntervalList picardIntervals = new IntervalList(picardFileHeader);
    // Need one good interval or else a UserException.MalformedFile( is thrown if no intervals
    picardIntervals.add(new Interval(contig, 1, 2, true, "dummyname0"));
    picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname1"));
    final File picardIntervalFile = createTempFile("testIntervalFileToListNegativeOneLength", ".intervals");
    picardIntervals.write(picardIntervalFile);
    IntervalUtils.intervalFileToList(genomeLocParser, picardIntervalFile.getAbsolutePath());
}
Also used : IntervalList(htsjdk.samtools.util.IntervalList) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 10 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk-protected by broadinstitute.

the class HetPulldownCalculator method getHetPulldown.

/**
     * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
     * where intervals give positions of likely heterozygous SNP sites.
     *
     * <p>
     *     For a normal sample:
     *     <ul>
     *         The IntervalList snpIntervals gives common SNP sites in 1-based format.
     *     </ul>
     *     <ul>
     *         The p-value threshold must be specified for a two-sided binomial test,
     *         which is used to determine SNP sites from snpIntervals that are
     *         compatible with a heterozygous SNP, given the sample.  Only these sites are output.
     *     </ul>
     * </p>
     * <p>
     *     For a tumor sample:
     *     <ul>
     *         The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
     *         This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
     *         Only these sites are output.
     *     </ul>
     * </p>
     * @param bamFile           sorted BAM file for sample
     * @param snpIntervals      IntervalList of SNP sites
     * @param sampleType        flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
     *                          (determines whether to perform binomial test)
     * @param pvalThreshold     p-value threshold for two-sided binomial test, used for normal sample
     * @param minimumRawReads   minimum number of total reads that must be present at a het site
     * @return                  Pulldown of heterozygous SNP sites in 1-based format
     */
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType, final double pvalThreshold, final int minimumRawReads) {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
        final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }
        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final int totalNumberOfSNPs = snpIntervals.size();
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals, totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);
        //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(false);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);
        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;
            //include N, etc. reads here
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount < minimumRawReads) {
                continue;
            }
            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            //only include total ACGT counts in binomial test (exclude N, etc.)
            final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();
            if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
                continue;
            }
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            final int refReadCount = (int) baseCounts.get(refBase);
            final int altReadCount = totalBaseCount - refReadCount;
            hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount));
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
Also used : Arrays(java.util.Arrays) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) AlternativeHypothesis(org.apache.commons.math3.stat.inference.AlternativeHypothesis) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount) IOException(java.io.IOException) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) BinomialTest(org.apache.commons.math3.stat.inference.BinomialTest) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) List(java.util.List) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) VisibleForTesting(com.google.common.annotations.VisibleForTesting) htsjdk.samtools(htsjdk.samtools) LogManager(org.apache.logging.log4j.LogManager) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) IOException(java.io.IOException) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount)

Aggregations

IntervalList (htsjdk.samtools.util.IntervalList)37 Interval (htsjdk.samtools.util.Interval)23 File (java.io.File)18 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)12 UserException (org.broadinstitute.hellbender.exceptions.UserException)7 SAMFileHeader (htsjdk.samtools.SAMFileHeader)6 SamLocusIterator (htsjdk.samtools.util.SamLocusIterator)6 IOException (java.io.IOException)6 List (java.util.List)6 AllelicCount (org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount)6 Nucleotide (org.broadinstitute.hellbender.utils.Nucleotide)6 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)6 DataProvider (org.testng.annotations.DataProvider)6 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)5 Test (org.testng.annotations.Test)5 htsjdk.samtools (htsjdk.samtools)4 QueryInterval (htsjdk.samtools.QueryInterval)4 DuplicateReadFilter (htsjdk.samtools.filter.DuplicateReadFilter)4 NotPrimaryAlignmentFilter (htsjdk.samtools.filter.NotPrimaryAlignmentFilter)4 SamRecordFilter (htsjdk.samtools.filter.SamRecordFilter)4