Search in sources :

Example 1 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.

the class IntervalUtils method scatterContigIntervals.

/**
     * Splits an interval list into multiple files.
     * @param fileHeader The sam file header.
     * @param locs The genome locs to split.
     * @param scatterParts The output interval lists to write to.
     */
public static void scatterContigIntervals(final SAMFileHeader fileHeader, final List<GenomeLoc> locs, final List<File> scatterParts) {
    // Contract: must divide locs up so that each of scatterParts gets a sublist such that:
    // (a) all locs concerning a particular contig go to the same part
    // (b) locs are not split or combined, and remain in the same order (so scatterParts[0] + ... + scatterParts[n] == locs)
    // Locs are already sorted.
    long totalBases = 0;
    for (final GenomeLoc loc : locs) {
        totalBases += loc.size();
    }
    final long idealBasesPerPart = totalBases / scatterParts.size();
    if (idealBasesPerPart == 0) {
        throw new UserException.BadInput(String.format("Genome region is too short (%d bases) to split into %d parts", totalBases, scatterParts.size()));
    }
    // Find the indices in locs where we switch from one contig to the next.
    final ArrayList<Integer> contigStartLocs = new ArrayList<>();
    String prevContig = null;
    for (int i = 0; i < locs.size(); ++i) {
        final GenomeLoc loc = locs.get(i);
        if (prevContig == null || !loc.getContig().equals(prevContig)) {
            contigStartLocs.add(i);
        }
        prevContig = loc.getContig();
    }
    if (contigStartLocs.size() < scatterParts.size()) {
        throw new UserException.BadInput(String.format("Input genome region has too few contigs (%d) to split into %d parts", contigStartLocs.size(), scatterParts.size()));
    }
    long thisPartBases = 0;
    int partIdx = 0;
    IntervalList outList = new IntervalList(fileHeader);
    for (int i = 0; i < locs.size(); ++i) {
        final GenomeLoc loc = locs.get(i);
        thisPartBases += loc.getStop() - loc.getStart();
        outList.add(toInterval(loc, i));
        boolean partMustStop = false;
        if (partIdx < (scatterParts.size() - 1)) {
            // If there are n contigs and n parts remaining then we must split here,
            // otherwise we will run out of contigs.
            final int nextPart = partIdx + 1;
            final int nextPartMustStartBy = contigStartLocs.get(nextPart + (contigStartLocs.size() - scatterParts.size()));
            if (i + 1 == nextPartMustStartBy) {
                partMustStop = true;
            }
        } else if (i == locs.size() - 1) {
            // We're done! Write the last scatter file.
            partMustStop = true;
        }
        if (partMustStop || thisPartBases > idealBasesPerPart) {
            // Ideally we would split here. However, we must make sure to do so
            // on a contig boundary. Test always passes with partMustStop == true
            // since that indicates we're at a contig boundary.
            GenomeLoc nextLoc = null;
            if ((i + 1) < locs.size()) {
                nextLoc = locs.get(i + 1);
            }
            if (nextLoc == null || !nextLoc.getContig().equals(loc.getContig())) {
                // Write out this part:
                outList.write(scatterParts.get(partIdx));
                // Reset. If this part ran long, leave the excess in thisPartBases
                // and the next will be a little shorter to compensate.
                outList = new IntervalList(fileHeader);
                thisPartBases -= idealBasesPerPart;
                ++partIdx;
            }
        }
    }
}
Also used : IntervalList(htsjdk.samtools.util.IntervalList)

Example 2 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.

the class IntervalUtils method intervalFileToList.

/**
     * Read a file of genome locations to process. The file may be in Picard
     * or GATK interval format.
     *
     * @param glParser   GenomeLocParser
     * @param fileName  interval file
     * @return List<GenomeLoc> List of Genome Locs that have been parsed from file
     */
public static List<GenomeLoc> intervalFileToList(final GenomeLocParser glParser, final String fileName) {
    Utils.nonNull(glParser, "glParser is null");
    Utils.nonNull(fileName, "file name is null");
    final File inputFile = new File(fileName);
    final List<GenomeLoc> ret = new ArrayList<>();
    /**
         * First try to read the file as a Picard interval file since that's well structured --
         * we'll fail quickly if it's not a valid file.
         */
    boolean isPicardInterval = false;
    try {
        // Note: Picard will skip over intervals with contigs not in the sequence dictionary
        final IntervalList il = IntervalList.fromFile(inputFile);
        isPicardInterval = true;
        for (final Interval interval : il.getIntervals()) {
            // https://github.com/broadinstitute/gatk/issues/2089
            if (interval.getStart() - interval.getEnd() == 1) {
                logger.warn("Ignoring possibly incorrectly converted length 1 interval : " + interval);
            } else if (glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) {
                ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true));
            } else {
                throw new UserException(inputFile.getAbsolutePath() + " has an invalid interval : " + interval);
            }
        }
    }// if that didn't work, try parsing file as a GATK interval file
     catch (final Exception e) {
        if (// definitely a picard file, but we failed to parse
        isPicardInterval) {
            throw new UserException.CouldNotReadInputFile(inputFile, e);
        } else {
            try (XReadLines reader = new XReadLines(new File(fileName))) {
                for (final String line : reader) {
                    if (!line.trim().isEmpty()) {
                        ret.add(glParser.parseGenomeLoc(line));
                    }
                }
            } catch (final IOException e2) {
                throw new UserException.CouldNotReadInputFile(inputFile, e2);
            }
        }
    }
    if (ret.isEmpty()) {
        throw new UserException.MalformedFile(new File(fileName), "It contains no intervals.");
    }
    return ret;
}
Also used : IOException(java.io.IOException) IOException(java.io.IOException) UserException(org.broadinstitute.hellbender.exceptions.UserException) CommandLineException(org.broadinstitute.barclay.argparser.CommandLineException) XReadLines(org.broadinstitute.hellbender.utils.text.XReadLines) IntervalList(htsjdk.samtools.util.IntervalList) UserException(org.broadinstitute.hellbender.exceptions.UserException) CachingIndexedFastaSequenceFile(org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile) File(java.io.File) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval)

Example 3 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk-protected by broadinstitute.

the class CollectAllelicCounts method doWork.

@Override
protected Object doWork() {
    validateArguments();
    final File referenceFile = referenceArguments.getReferenceFile();
    final IntervalList siteIntervals = IntervalList.fromFile(inputSiteIntervalsFile);
    final AllelicCountCollector allelicCountCollector = new AllelicCountCollector(referenceFile, readValidationStringency);
    logger.info("Collecting allelic counts...");
    final AllelicCountCollection allelicCounts = allelicCountCollector.collect(inputBAMFile, siteIntervals, minimumMappingQuality, minimumBaseQuality);
    allelicCounts.write(outputAllelicCountsFile);
    logger.info("Allelic counts written to " + outputAllelicCountsFile.toString());
    return "SUCCESS";
}
Also used : AllelicCountCollector(org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollector) IntervalList(htsjdk.samtools.util.IntervalList) AllelicCountCollection(org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollection) File(java.io.File)

Example 4 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.

the class AllelicCountCollector method collect.

/**
     * Returns an {@link AllelicCountCollection} based on the pileup at sites (specified by an interval list)
     * in a sorted BAM file.  Reads and bases below the specified mapping quality and base quality, respectively,
     * are filtered out of the pileup.  The alt count is defined as the total count minus the ref count, and the
     * alt nucleotide is defined as the non-ref base with the highest count, with ties broken by the order of the
     * bases in {@link AllelicCountCollector#BASES}.
     * @param bamFile           sorted BAM file
     * @param siteIntervals     interval list of sites
     * @param minMappingQuality minimum mapping quality required for reads to be included in pileup
     * @param minBaseQuality    minimum base quality required for bases to be included in pileup
     * @return                  AllelicCountCollection of ref/alt counts at sites in BAM file
     */
public AllelicCountCollection collect(final File bamFile, final IntervalList siteIntervals, final int minMappingQuality, final int minBaseQuality) {
    try (final SamReader reader = readerFactory.open(bamFile)) {
        ParamUtils.isPositiveOrZero(minMappingQuality, "Minimum mapping quality must be nonnegative.");
        ParamUtils.isPositiveOrZero(minBaseQuality, "Minimum base quality must be nonnegative.");
        if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }
        final int numberOfSites = siteIntervals.size();
        final boolean useIndex = numberOfSites < MAX_INTERVALS_FOR_INDEX;
        final SamLocusIterator locusIterator = new SamLocusIterator(reader, siteIntervals, useIndex);
        //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(true);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);
        logger.info("Examining " + numberOfSites + " sites in total...");
        int locusCount = 0;
        final AllelicCountCollection counts = new AllelicCountCollection();
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " sites.");
            }
            locusCount++;
            final Nucleotide refBase = Nucleotide.valueOf(referenceWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            if (!BASES.contains(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Skipping...", locus.getPosition(), refBase.toString()));
                continue;
            }
            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            //only include total ACGT counts in binomial test (exclude N, etc.)
            final int totalBaseCount = BASES.stream().mapToInt(b -> (int) baseCounts.get(b)).sum();
            final int refReadCount = (int) baseCounts.get(refBase);
            //we take alt = total - ref instead of the actual alt count
            final int altReadCount = totalBaseCount - refReadCount;
            final Nucleotide altBase = inferAltFromPileupBaseCounts(baseCounts, refBase);
            counts.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount, refBase, altBase));
        }
        logger.info(locusCount + " sites out of " + numberOfSites + " total sites were examined.");
        return counts;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException("Unable to collect allelic counts from " + bamFile);
    }
}
Also used : Arrays(java.util.Arrays) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) IOException(java.io.IOException) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) List(java.util.List) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) Utils(org.broadinstitute.hellbender.utils.Utils) htsjdk.samtools(htsjdk.samtools) LogManager(org.apache.logging.log4j.LogManager) Collections(java.util.Collections) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) IOException(java.io.IOException) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException)

Example 5 with IntervalList

use of htsjdk.samtools.util.IntervalList in project gatk-protected by broadinstitute.

the class HetPulldownCalculatorUnitTest method inputGetTumorHetPulldown15.

@DataProvider(name = "inputGetTumorHetPulldownMin15")
public Object[][] inputGetTumorHetPulldown15() {
    final Pulldown tumorHetPulldown = new Pulldown(normalHeader);
    tumorHetPulldown.add(new AllelicCount(new SimpleInterval("1", 14630, 14630), 9, 8));
    tumorHetPulldown.add(new AllelicCount(new SimpleInterval("2", 14689, 14689), 6, 9));
    final IntervalList normalHetIntervals = new IntervalList(tumorHeader);
    normalHetIntervals.add(new Interval("1", 14630, 14630));
    normalHetIntervals.add(new Interval("2", 14689, 14689));
    return new Object[][] { { normalHetIntervals, tumorHetPulldown } };
}
Also used : IntervalList(htsjdk.samtools.util.IntervalList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Interval(htsjdk.samtools.util.Interval) DataProvider(org.testng.annotations.DataProvider)

Aggregations

IntervalList (htsjdk.samtools.util.IntervalList)42 Interval (htsjdk.samtools.util.Interval)28 File (java.io.File)22 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)12 SAMFileHeader (htsjdk.samtools.SAMFileHeader)10 SamLocusIterator (htsjdk.samtools.util.SamLocusIterator)8 IOException (java.io.IOException)7 List (java.util.List)7 UserException (org.broadinstitute.hellbender.exceptions.UserException)7 SamReader (htsjdk.samtools.SamReader)6 SamRecordFilter (htsjdk.samtools.filter.SamRecordFilter)6 AllelicCount (org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount)6 Nucleotide (org.broadinstitute.hellbender.utils.Nucleotide)6 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)6 DataProvider (org.testng.annotations.DataProvider)6 Arrays (java.util.Arrays)5 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)5 Test (org.testng.annotations.Test)5 htsjdk.samtools (htsjdk.samtools)4 QueryInterval (htsjdk.samtools.QueryInterval)4