Search in sources :

Example 1 with PeekableIterator

use of htsjdk.samtools.util.PeekableIterator in project gatk by broadinstitute.

the class EstimateLibraryComplexity method doWork.

/**
     * Method that does most of the work.  Reads through the input BAM file and extracts the
     * read sequences of each read pair and sorts them via a SortingCollection.  Then traverses
     * the sorted reads and looks at small groups at a time to find duplicates.
     */
@Override
protected Object doWork() {
    for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
    logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
    final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
    final int recordsRead = 0;
    final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
    // Loop through the input files and pick out the read sequences etc.
    final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
    for (final File f : INPUT) {
        final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
        final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
        readGroups.addAll(in.getFileHeader().getReadGroups());
        for (final SAMRecord rec : in) {
            if (!rec.getReadPairedFlag())
                continue;
            if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
                continue;
            }
            PairedReadSequence prs = pendingByName.remove(rec.getReadName());
            if (prs == null) {
                // Make a new paired read object and add RG and physical location information to it
                prs = new PairedReadSequence();
                if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
                    final SAMReadGroupRecord rg = rec.getReadGroup();
                    if (rg != null)
                        prs.setReadGroup((short) readGroups.indexOf(rg));
                }
                pendingByName.put(rec.getReadName(), prs);
            }
            // Read passes quality check if both ends meet the mean quality criteria
            final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
            prs.qualityOk = prs.qualityOk && passesQualityCheck;
            // Get the bases and restore them to their original orientation if necessary
            final byte[] bases = rec.getReadBases();
            if (rec.getReadNegativeStrandFlag())
                SequenceUtil.reverseComplement(bases);
            if (rec.getFirstOfPairFlag()) {
                prs.read1 = bases;
            } else {
                prs.read2 = bases;
            }
            if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
                sorter.add(prs);
            }
            progress.record(rec);
        }
        CloserUtil.close(in);
    }
    logger.info("Finished reading - moving on to scanning for duplicates.");
    // Now go through the sorted reads and attempt to find duplicates
    try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
        final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
        final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
        int groupsProcessed = 0;
        long lastLogTime = System.currentTimeMillis();
        final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
        while (iterator.hasNext()) {
            // Get the next group and split it apart by library
            final List<PairedReadSequence> group = getNextGroup(iterator);
            if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
                final PairedReadSequence prs = group.get(0);
                logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
            } else {
                final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
                // Now process the reads by library
                for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
                    final String library = entry.getKey();
                    final List<PairedReadSequence> seqs = entry.getValue();
                    Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
                    Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
                    if (duplicationHisto == null) {
                        duplicationHisto = new Histogram<>("duplication_group_count", library);
                        opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
                        duplicationHistosByLibrary.put(library, duplicationHisto);
                        opticalHistosByLibrary.put(library, opticalHisto);
                    }
                    // Figure out if any reads within this group are duplicates of one another
                    for (int i = 0; i < seqs.size(); ++i) {
                        final PairedReadSequence lhs = seqs.get(i);
                        if (lhs == null)
                            continue;
                        final List<PairedReadSequence> dupes = new ArrayList<>();
                        for (int j = i + 1; j < seqs.size(); ++j) {
                            final PairedReadSequence rhs = seqs.get(j);
                            if (rhs == null)
                                continue;
                            if (matches(lhs, rhs, MAX_DIFF_RATE)) {
                                dupes.add(rhs);
                                seqs.set(j, null);
                            }
                        }
                        if (!dupes.isEmpty()) {
                            dupes.add(lhs);
                            final int duplicateCount = dupes.size();
                            duplicationHisto.increment(duplicateCount);
                            final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
                            for (final boolean b : flags) {
                                if (b)
                                    opticalHisto.increment(duplicateCount);
                            }
                        } else {
                            duplicationHisto.increment(1);
                        }
                    }
                }
                ++groupsProcessed;
                if (lastLogTime < System.currentTimeMillis() - 60000) {
                    logger.info("Processed " + groupsProcessed + " groups.");
                    lastLogTime = System.currentTimeMillis();
                }
            }
        }
        sorter.cleanup();
        final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
        for (final String library : duplicationHistosByLibrary.keySet()) {
            final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
            final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
            final DuplicationMetrics metrics = new DuplicationMetrics();
            metrics.LIBRARY = library;
            // Filter out any bins that have only a single entry in them and calcu
            for (final Integer bin : duplicationHisto.keySet()) {
                final double duplicateGroups = duplicationHisto.get(bin).getValue();
                final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
                if (duplicateGroups > 1) {
                    metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
                    metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
                    metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
                }
            }
            metrics.calculateDerivedMetrics();
            file.addMetric(metrics);
            file.addHistogram(duplicationHisto);
        }
        file.write(OUTPUT);
    }
    return null;
}
Also used : Histogram(htsjdk.samtools.util.Histogram) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) HashMap(java.util.HashMap) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) ArrayList(java.util.ArrayList) List(java.util.List) SAMRecord(htsjdk.samtools.SAMRecord) PeekableIterator(htsjdk.samtools.util.PeekableIterator) MetricsFile(htsjdk.samtools.metrics.MetricsFile) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map)

Example 2 with PeekableIterator

use of htsjdk.samtools.util.PeekableIterator in project gatk by broadinstitute.

the class FixMateInformation method doWork.

@Override
protected Object doWork() {
    // Open up the input
    boolean allQueryNameSorted = true;
    final List<SamReader> readers = new ArrayList<>();
    for (final File f : INPUT) {
        IOUtil.assertFileIsReadable(f);
        final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
        readers.add(reader);
        if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname)
            allQueryNameSorted = false;
    }
    // or into a temporary file that will overwrite the INPUT file eventually
    if (OUTPUT != null)
        OUTPUT = OUTPUT.getAbsoluteFile();
    final boolean differentOutputSpecified = OUTPUT != null;
    if (differentOutputSpecified) {
        IOUtil.assertFileIsWritable(OUTPUT);
    } else if (INPUT.size() != 1) {
        throw new UserException("Must specify either an explicit OUTPUT file or a single INPUT file to be overridden.");
    } else {
        final File soleInput = INPUT.get(0).getAbsoluteFile();
        final File dir = soleInput.getParentFile().getAbsoluteFile();
        try {
            IOUtil.assertFileIsWritable(soleInput);
            IOUtil.assertDirectoryIsWritable(dir);
            OUTPUT = File.createTempFile(soleInput.getName() + ".being_fixed.", BamFileIoUtils.BAM_FILE_EXTENSION, dir);
        } catch (final IOException ioe) {
            throw new RuntimeIOException("Could not create tmp file in " + dir.getAbsolutePath());
        }
    }
    // Get the input records merged and sorted by query name as needed
    final PeekableIterator<SAMRecord> iterator;
    final SAMFileHeader header;
    {
        // Deal with merging if necessary
        final Iterator<SAMRecord> tmp;
        if (INPUT.size() > 1) {
            final List<SAMFileHeader> headers = new ArrayList<>(readers.size());
            for (final SamReader reader : readers) {
                headers.add(reader.getFileHeader());
            }
            final SAMFileHeader.SortOrder sortOrder = (allQueryNameSorted ? SAMFileHeader.SortOrder.queryname : SAMFileHeader.SortOrder.unsorted);
            final SamFileHeaderMerger merger = new SamFileHeaderMerger(sortOrder, headers, false);
            tmp = new MergingSamRecordIterator(merger, readers, false);
            header = merger.getMergedHeader();
        } else {
            tmp = readers.get(0).iterator();
            header = readers.get(0).getFileHeader();
        }
        // And now deal with re-sorting if necessary
        if (ASSUME_SORTED || allQueryNameSorted) {
            iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<>(tmp), ADD_MATE_CIGAR);
        } else {
            logger.info("Sorting input into queryname order.");
            final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
            while (tmp.hasNext()) {
                sorter.add(tmp.next());
            }
            iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<SAMRecord>(sorter.iterator()) {

                @Override
                public void close() {
                    super.close();
                    sorter.cleanup();
                }
            }, ADD_MATE_CIGAR);
            logger.info("Sorting by queryname complete.");
        }
        // Deal with the various sorting complications
        final SAMFileHeader.SortOrder outputSortOrder = SORT_ORDER == null ? readers.get(0).getFileHeader().getSortOrder() : SORT_ORDER;
        logger.info("Output will be sorted by " + outputSortOrder);
        header.setSortOrder(outputSortOrder);
    }
    if (CREATE_INDEX && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new UserException("Can't CREATE_INDEX unless sort order is coordinate");
    }
    try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, header, header.getSortOrder() == SAMFileHeader.SortOrder.queryname)) {
        logger.info("Traversing query name sorted records and fixing up mate pair information.");
        final ProgressLogger progress = new ProgressLogger(logger);
        while (iterator.hasNext()) {
            final SAMRecord record = iterator.next();
            out.addAlignment(record);
            progress.record(record);
        }
        iterator.close();
        if (header.getSortOrder() == SAMFileHeader.SortOrder.queryname) {
            logger.info("Closing output file.");
        } else {
            logger.info("Finished processing reads; re-sorting output file.");
        }
    }
    // TODO throw appropriate exceptions instead of writing to log.error and returning
    if (!differentOutputSpecified) {
        logger.info("Replacing input file with fixed file.");
        final File soleInput = INPUT.get(0).getAbsoluteFile();
        final File old = new File(soleInput.getParentFile(), soleInput.getName() + ".old");
        if (!old.exists() && soleInput.renameTo(old)) {
            if (OUTPUT.renameTo(soleInput)) {
                if (!old.delete()) {
                    logger.warn("Could not delete old file: " + old.getAbsolutePath());
                    return null;
                }
                if (CREATE_INDEX) {
                    final File newIndex = new File(OUTPUT.getParent(), OUTPUT.getName().substring(0, OUTPUT.getName().length() - 4) + ".bai");
                    final File oldIndex = new File(soleInput.getParent(), soleInput.getName().substring(0, soleInput.getName().length() - 4) + ".bai");
                    if (!newIndex.renameTo(oldIndex)) {
                        logger.warn("Could not overwrite index file: " + oldIndex.getAbsolutePath());
                    }
                }
            } else {
                logger.error("Could not move new file to " + soleInput.getAbsolutePath());
                logger.error("Input file preserved as: " + old.getAbsolutePath());
                logger.error("New file preserved as: " + OUTPUT.getAbsolutePath());
                return null;
            }
        } else {
            logger.error("Could not move input file out of the way: " + soleInput.getAbsolutePath());
            if (!OUTPUT.delete()) {
                logger.error("Could not delete temporary file: " + OUTPUT.getAbsolutePath());
            }
            return null;
        }
    }
    CloserUtil.close(readers);
    return null;
}
Also used : MergingSamRecordIterator(htsjdk.samtools.MergingSamRecordIterator) SortingCollection(htsjdk.samtools.util.SortingCollection) ArrayList(java.util.ArrayList) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) MergingSamRecordIterator(htsjdk.samtools.MergingSamRecordIterator) PeekableIterator(htsjdk.samtools.util.PeekableIterator) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) UserException(org.broadinstitute.hellbender.exceptions.UserException) SamFileHeaderMerger(htsjdk.samtools.SamFileHeaderMerger) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) BAMRecordCodec(htsjdk.samtools.BAMRecordCodec) SAMRecord(htsjdk.samtools.SAMRecord) SAMRecordQueryNameComparator(htsjdk.samtools.SAMRecordQueryNameComparator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File)

Example 3 with PeekableIterator

use of htsjdk.samtools.util.PeekableIterator in project gatk by broadinstitute.

the class AssemblyRegion method createFromReadShard.

/**
     * Divide a read shard up into one or more AssemblyRegions using the provided AssemblyRegionEvaluator to find
     * the borders between "active" and "inactive" regions within the shard.
     *
     * @param shard Shard to divide into assembly regions
     * @param readsHeader header for the reads
     * @param referenceContext reference data overlapping the shard's extended span (including padding)
     * @param features features overlapping the shard's extended span (including padding)
     * @param evaluator AssemblyRegionEvaluator used to label each locus as either active or inactive
     * @param minRegionSize minimum size for each assembly region
     * @param maxRegionSize maximum size for each assembly region
     * @param assemblyRegionPadding each assembly region will be padded by this amount on each side
     * @param activeProbThreshold minimum probability for a site to be considered active, as reported by the provided evaluator
     * @param maxProbPropagationDistance maximum number of bases probabilities can propagate in each direction when finding region boundaries
     * @return a Iterable over one or more AssemblyRegions, each marked as either "active" or "inactive", spanning
     *         part of the provided Shard, and filled with all reads that overlap the region.
     */
public static Iterable<AssemblyRegion> createFromReadShard(final Shard<GATKRead> shard, final SAMFileHeader readsHeader, final ReferenceContext referenceContext, final FeatureContext features, final AssemblyRegionEvaluator evaluator, final int minRegionSize, final int maxRegionSize, final int assemblyRegionPadding, final double activeProbThreshold, final int maxProbPropagationDistance) {
    Utils.nonNull(shard);
    Utils.nonNull(readsHeader);
    Utils.nonNull(referenceContext);
    Utils.nonNull(features);
    Utils.nonNull(evaluator);
    Utils.validateArg(minRegionSize >= 1, "minRegionSize must be >= 1");
    Utils.validateArg(maxRegionSize >= 1, "maxRegionSize must be >= 1");
    Utils.validateArg(minRegionSize <= maxRegionSize, "minRegionSize must be <= maxRegionSize");
    Utils.validateArg(assemblyRegionPadding >= 0, "assemblyRegionPadding must be >= 0");
    Utils.validateArg(activeProbThreshold >= 0.0, "activeProbThreshold must be >= 0.0");
    Utils.validateArg(maxProbPropagationDistance >= 0, "maxProbPropagationDistance must be >= 0");
    // TODO: refactor this method so that we don't need to load all reads from the shard into memory at once!
    final List<GATKRead> windowReads = new ArrayList<>();
    for (final GATKRead read : shard) {
        windowReads.add(read);
    }
    final LocusIteratorByState locusIterator = new LocusIteratorByState(windowReads.iterator(), DownsamplingMethod.NONE, false, ReadUtils.getSamplesFromHeader(readsHeader), readsHeader, false);
    final ActivityProfile activityProfile = new BandPassActivityProfile(null, maxProbPropagationDistance, activeProbThreshold, BandPassActivityProfile.MAX_FILTER_SIZE, BandPassActivityProfile.DEFAULT_SIGMA, readsHeader);
    // First, use our activity profile to determine the bounds of each assembly region:
    List<AssemblyRegion> assemblyRegions = determineAssemblyRegionBounds(shard, locusIterator, activityProfile, readsHeader, referenceContext, features, evaluator, minRegionSize, maxRegionSize, assemblyRegionPadding);
    // Then, fill the assembly regions with overlapping reads from the shard:
    final PeekableIterator<GATKRead> reads = new PeekableIterator<>(windowReads.iterator());
    fillAssemblyRegionsWithReads(assemblyRegions, reads);
    return assemblyRegions;
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) LocusIteratorByState(org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState) ActivityProfile(org.broadinstitute.hellbender.utils.activityprofile.ActivityProfile) BandPassActivityProfile(org.broadinstitute.hellbender.utils.activityprofile.BandPassActivityProfile) PeekableIterator(htsjdk.samtools.util.PeekableIterator) BandPassActivityProfile(org.broadinstitute.hellbender.utils.activityprofile.BandPassActivityProfile)

Aggregations

PeekableIterator (htsjdk.samtools.util.PeekableIterator)3 SAMRecord (htsjdk.samtools.SAMRecord)2 SamReader (htsjdk.samtools.SamReader)2 File (java.io.File)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 ProgressLogger (org.broadinstitute.hellbender.utils.runtime.ProgressLogger)2 BAMRecordCodec (htsjdk.samtools.BAMRecordCodec)1 MergingSamRecordIterator (htsjdk.samtools.MergingSamRecordIterator)1 SAMFileHeader (htsjdk.samtools.SAMFileHeader)1 SAMFileWriter (htsjdk.samtools.SAMFileWriter)1 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)1 SAMRecordQueryNameComparator (htsjdk.samtools.SAMRecordQueryNameComparator)1 SamFileHeaderMerger (htsjdk.samtools.SamFileHeaderMerger)1 MetricsFile (htsjdk.samtools.metrics.MetricsFile)1 Histogram (htsjdk.samtools.util.Histogram)1 RuntimeIOException (htsjdk.samtools.util.RuntimeIOException)1 SortingCollection (htsjdk.samtools.util.SortingCollection)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1