Search in sources :

Example 1 with SAMRecordQueryNameComparator

use of htsjdk.samtools.SAMRecordQueryNameComparator in project gatk by broadinstitute.

the class RevertOriginalBaseQualitiesAndAddMateCigar method doWork.

@Override
public Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    boolean foundPairedMappedReads = false;
    // Check if we can skip this file since it does not have OQ tags and the mate cigar tag is already there.
    final CanSkipSamFile skipSamFile = RevertOriginalBaseQualitiesAndAddMateCigar.canSkipSAMFile(INPUT, MAX_RECORDS_TO_EXAMINE, RESTORE_ORIGINAL_QUALITIES, REFERENCE_SEQUENCE);
    logger.info(skipSamFile.getMessage(MAX_RECORDS_TO_EXAMINE));
    if (skipSamFile.canSkip())
        return null;
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).enable(SamReaderFactory.Option.EAGERLY_DECODE).open(INPUT);
    final SAMFileHeader inHeader = in.getFileHeader();
    // Build the output writer based on the correct sort order
    final SAMFileHeader outHeader = ReadUtils.cloneSAMFileHeader(inHeader);
    // same as the input
    if (null == SORT_ORDER)
        this.SORT_ORDER = inHeader.getSortOrder();
    outHeader.setSortOrder(SORT_ORDER);
    try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, false)) {
        // Iterate over the records, revert original base qualities, and push them into a SortingCollection by queryname
        final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
        final ProgressLogger revertingProgress = new ProgressLogger(logger, 1000000, " reverted OQs");
        int numOriginalQualitiesRestored = 0;
        for (final SAMRecord record : in) {
            // Clean up reads that map off the end of the reference
            AbstractAlignmentMerger.createNewCigarsIfMapsOffEndOfReference(record);
            if (RESTORE_ORIGINAL_QUALITIES && null != record.getOriginalBaseQualities()) {
                // revert the original base qualities
                record.setBaseQualities(record.getOriginalBaseQualities());
                record.setOriginalBaseQualities(null);
                numOriginalQualitiesRestored++;
            }
            if (!foundPairedMappedReads && record.getReadPairedFlag() && !record.getReadUnmappedFlag())
                foundPairedMappedReads = true;
            revertingProgress.record(record);
            sorter.add(record);
        }
        CloserUtil.close(in);
        logger.info("Reverted the original base qualities for " + numOriginalQualitiesRestored + " records");
        /**
             * Iterator through sorting collection output
             * 1. Set mate cigar string and mate information
             * 2. push record into SAMFileWriter to the output
             */
        try (final SamPairUtil.SetMateInfoIterator sorterIterator = new SamPairUtil.SetMateInfoIterator(sorter.iterator(), true)) {
            final ProgressLogger sorterProgress = new ProgressLogger(logger, 1000000, " mate cigars added");
            while (sorterIterator.hasNext()) {
                final SAMRecord record = sorterIterator.next();
                out.addAlignment(record);
                sorterProgress.record(record);
            }
            CloserUtil.close(out);
            logger.info("Updated " + sorterIterator.getNumMateCigarsAdded() + " records with mate cigar");
            if (!foundPairedMappedReads)
                logger.info("Did not find any paired mapped reads.");
        }
    }
    return null;
}
Also used : SAMFileWriter(htsjdk.samtools.SAMFileWriter) SamPairUtil(htsjdk.samtools.SamPairUtil) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) BAMRecordCodec(htsjdk.samtools.BAMRecordCodec) SAMRecord(htsjdk.samtools.SAMRecord) SAMRecordQueryNameComparator(htsjdk.samtools.SAMRecordQueryNameComparator) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 2 with SAMRecordQueryNameComparator

use of htsjdk.samtools.SAMRecordQueryNameComparator in project gatk by broadinstitute.

the class RevertSam method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean sanitizing = SANITIZE;
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
    final SAMFileHeader inHeader = in.getFileHeader();
    // If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
    // groups have the same values.
    final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
    if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
        boolean allSampleAliasesIdentical = true;
        boolean allLibraryNamesIdentical = true;
        for (int i = 1; i < rgs.size(); i++) {
            if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
                allSampleAliasesIdentical = false;
            }
            if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
                allLibraryNamesIdentical = false;
            }
        }
        if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
            throw new UserException("Read groups have multiple values for sample.  " + "A value for SAMPLE_ALIAS cannot be supplied.");
        }
        if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
            throw new UserException("Read groups have multiple values for library name.  " + "A value for library name cannot be supplied.");
        }
    }
    ////////////////////////////////////////////////////////////////////////////
    // Build the output writer with an appropriate header based on the options
    ////////////////////////////////////////////////////////////////////////////
    final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
    final SAMFileHeader outHeader = new SAMFileHeader();
    for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
        if (SAMPLE_ALIAS != null) {
            rg.setSample(SAMPLE_ALIAS);
        }
        if (LIBRARY_NAME != null) {
            rg.setLibrary(LIBRARY_NAME);
        }
        outHeader.addReadGroup(rg);
    }
    outHeader.setSortOrder(SORT_ORDER);
    if (!REMOVE_ALIGNMENT_INFORMATION) {
        outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
        outHeader.setProgramRecords(inHeader.getProgramRecords());
    }
    final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
    ////////////////////////////////////////////////////////////////////////////
    // Build a sorting collection to use if we are sanitizing
    ////////////////////////////////////////////////////////////////////////////
    final SortingCollection<SAMRecord> sorter;
    if (sanitizing) {
        sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
    } else {
        sorter = null;
    }
    final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
    for (final SAMRecord rec : in) {
        // Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
        if (rec.isSecondaryOrSupplementary())
            continue;
        // Actually to the reverting of the remaining records
        revertSamRecord(rec);
        if (sanitizing)
            sorter.add(rec);
        else
            out.addAlignment(rec);
        progress.record(rec);
    }
    ////////////////////////////////////////////////////////////////////////////
    if (!sanitizing) {
        out.close();
    } else {
        long total = 0, discarded = 0;
        final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
        final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
        // Figure out the quality score encoding scheme for each read group.
        for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
            final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
            final SamRecordFilter filter = new SamRecordFilter() {

                @Override
                public boolean filterOut(final SAMRecord rec) {
                    return !rec.getReadGroup().getId().equals(rg.getId());
                }

                @Override
                public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                    throw new UnsupportedOperationException();
                }
            };
            readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
            CloserUtil.close(reader);
        }
        for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
            logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
        }
        if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
            logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
            return -1;
        }
        final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
        readNameLoop: while (iterator.hasNext()) {
            final List<SAMRecord> recs = fetchByReadName(iterator);
            total += recs.size();
            // Check that all the reads have bases and qualities of the same length
            for (final SAMRecord rec : recs) {
                if (rec.getReadBases().length != rec.getBaseQualities().length) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // Check that if the first read is marked as unpaired that there is in fact only one read
            if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
                logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
                discarded += recs.size();
                continue readNameLoop;
            }
            // Check that if we have paired reads there is exactly one first of pair and one second of pair
            if (recs.get(0).getReadPairedFlag()) {
                int firsts = 0, seconds = 0, unpaired = 0;
                for (final SAMRecord rec : recs) {
                    if (!rec.getReadPairedFlag())
                        ++unpaired;
                    if (rec.getFirstOfPairFlag())
                        ++firsts;
                    if (rec.getSecondOfPairFlag())
                        ++seconds;
                }
                if (unpaired > 0 || firsts != 1 || seconds != 1) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // If we've made it this far spit the records into the output!
            for (final SAMRecord rec : recs) {
                // The only valid quality score encoding scheme is standard; if it's not standard, change it.
                final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
                if (!recordFormat.equals(FastqQualityFormat.Standard)) {
                    final byte[] quals = rec.getBaseQualities();
                    for (int i = 0; i < quals.length; i++) {
                        quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
                    }
                    rec.setBaseQualities(quals);
                }
                out.addAlignment(rec);
                sanitizerProgress.record(rec);
            }
        }
        out.close();
        final double discardRate = discarded / (double) total;
        final NumberFormat fmt = new DecimalFormat("0.000%");
        logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
        if (discarded / (double) total > MAX_DISCARD_FRACTION) {
            throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
        }
    }
    CloserUtil.close(in);
    return null;
}
Also used : HashMap(java.util.HashMap) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) DecimalFormat(java.text.DecimalFormat) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) FilteringSamIterator(htsjdk.samtools.filter.FilteringSamIterator) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) UserException(org.broadinstitute.hellbender.exceptions.UserException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) BAMRecordCodec(htsjdk.samtools.BAMRecordCodec) SAMRecord(htsjdk.samtools.SAMRecord) SAMRecordQueryNameComparator(htsjdk.samtools.SAMRecordQueryNameComparator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) NumberFormat(java.text.NumberFormat)

Example 3 with SAMRecordQueryNameComparator

use of htsjdk.samtools.SAMRecordQueryNameComparator in project gatk by broadinstitute.

the class FixMateInformation method doWork.

@Override
protected Object doWork() {
    // Open up the input
    boolean allQueryNameSorted = true;
    final List<SamReader> readers = new ArrayList<>();
    for (final File f : INPUT) {
        IOUtil.assertFileIsReadable(f);
        final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
        readers.add(reader);
        if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname)
            allQueryNameSorted = false;
    }
    // or into a temporary file that will overwrite the INPUT file eventually
    if (OUTPUT != null)
        OUTPUT = OUTPUT.getAbsoluteFile();
    final boolean differentOutputSpecified = OUTPUT != null;
    if (differentOutputSpecified) {
        IOUtil.assertFileIsWritable(OUTPUT);
    } else if (INPUT.size() != 1) {
        throw new UserException("Must specify either an explicit OUTPUT file or a single INPUT file to be overridden.");
    } else {
        final File soleInput = INPUT.get(0).getAbsoluteFile();
        final File dir = soleInput.getParentFile().getAbsoluteFile();
        try {
            IOUtil.assertFileIsWritable(soleInput);
            IOUtil.assertDirectoryIsWritable(dir);
            OUTPUT = File.createTempFile(soleInput.getName() + ".being_fixed.", BamFileIoUtils.BAM_FILE_EXTENSION, dir);
        } catch (final IOException ioe) {
            throw new RuntimeIOException("Could not create tmp file in " + dir.getAbsolutePath());
        }
    }
    // Get the input records merged and sorted by query name as needed
    final PeekableIterator<SAMRecord> iterator;
    final SAMFileHeader header;
    {
        // Deal with merging if necessary
        final Iterator<SAMRecord> tmp;
        if (INPUT.size() > 1) {
            final List<SAMFileHeader> headers = new ArrayList<>(readers.size());
            for (final SamReader reader : readers) {
                headers.add(reader.getFileHeader());
            }
            final SAMFileHeader.SortOrder sortOrder = (allQueryNameSorted ? SAMFileHeader.SortOrder.queryname : SAMFileHeader.SortOrder.unsorted);
            final SamFileHeaderMerger merger = new SamFileHeaderMerger(sortOrder, headers, false);
            tmp = new MergingSamRecordIterator(merger, readers, false);
            header = merger.getMergedHeader();
        } else {
            tmp = readers.get(0).iterator();
            header = readers.get(0).getFileHeader();
        }
        // And now deal with re-sorting if necessary
        if (ASSUME_SORTED || allQueryNameSorted) {
            iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<>(tmp), ADD_MATE_CIGAR);
        } else {
            logger.info("Sorting input into queryname order.");
            final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
            while (tmp.hasNext()) {
                sorter.add(tmp.next());
            }
            iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<SAMRecord>(sorter.iterator()) {

                @Override
                public void close() {
                    super.close();
                    sorter.cleanup();
                }
            }, ADD_MATE_CIGAR);
            logger.info("Sorting by queryname complete.");
        }
        // Deal with the various sorting complications
        final SAMFileHeader.SortOrder outputSortOrder = SORT_ORDER == null ? readers.get(0).getFileHeader().getSortOrder() : SORT_ORDER;
        logger.info("Output will be sorted by " + outputSortOrder);
        header.setSortOrder(outputSortOrder);
    }
    if (CREATE_INDEX && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new UserException("Can't CREATE_INDEX unless sort order is coordinate");
    }
    try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, header, header.getSortOrder() == SAMFileHeader.SortOrder.queryname)) {
        logger.info("Traversing query name sorted records and fixing up mate pair information.");
        final ProgressLogger progress = new ProgressLogger(logger);
        while (iterator.hasNext()) {
            final SAMRecord record = iterator.next();
            out.addAlignment(record);
            progress.record(record);
        }
        iterator.close();
        if (header.getSortOrder() == SAMFileHeader.SortOrder.queryname) {
            logger.info("Closing output file.");
        } else {
            logger.info("Finished processing reads; re-sorting output file.");
        }
    }
    // TODO throw appropriate exceptions instead of writing to log.error and returning
    if (!differentOutputSpecified) {
        logger.info("Replacing input file with fixed file.");
        final File soleInput = INPUT.get(0).getAbsoluteFile();
        final File old = new File(soleInput.getParentFile(), soleInput.getName() + ".old");
        if (!old.exists() && soleInput.renameTo(old)) {
            if (OUTPUT.renameTo(soleInput)) {
                if (!old.delete()) {
                    logger.warn("Could not delete old file: " + old.getAbsolutePath());
                    return null;
                }
                if (CREATE_INDEX) {
                    final File newIndex = new File(OUTPUT.getParent(), OUTPUT.getName().substring(0, OUTPUT.getName().length() - 4) + ".bai");
                    final File oldIndex = new File(soleInput.getParent(), soleInput.getName().substring(0, soleInput.getName().length() - 4) + ".bai");
                    if (!newIndex.renameTo(oldIndex)) {
                        logger.warn("Could not overwrite index file: " + oldIndex.getAbsolutePath());
                    }
                }
            } else {
                logger.error("Could not move new file to " + soleInput.getAbsolutePath());
                logger.error("Input file preserved as: " + old.getAbsolutePath());
                logger.error("New file preserved as: " + OUTPUT.getAbsolutePath());
                return null;
            }
        } else {
            logger.error("Could not move input file out of the way: " + soleInput.getAbsolutePath());
            if (!OUTPUT.delete()) {
                logger.error("Could not delete temporary file: " + OUTPUT.getAbsolutePath());
            }
            return null;
        }
    }
    CloserUtil.close(readers);
    return null;
}
Also used : MergingSamRecordIterator(htsjdk.samtools.MergingSamRecordIterator) SortingCollection(htsjdk.samtools.util.SortingCollection) ArrayList(java.util.ArrayList) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) MergingSamRecordIterator(htsjdk.samtools.MergingSamRecordIterator) PeekableIterator(htsjdk.samtools.util.PeekableIterator) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) UserException(org.broadinstitute.hellbender.exceptions.UserException) SamFileHeaderMerger(htsjdk.samtools.SamFileHeaderMerger) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) BAMRecordCodec(htsjdk.samtools.BAMRecordCodec) SAMRecord(htsjdk.samtools.SAMRecord) SAMRecordQueryNameComparator(htsjdk.samtools.SAMRecordQueryNameComparator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File)

Aggregations

BAMRecordCodec (htsjdk.samtools.BAMRecordCodec)3 SAMFileHeader (htsjdk.samtools.SAMFileHeader)3 SAMFileWriter (htsjdk.samtools.SAMFileWriter)3 SAMRecord (htsjdk.samtools.SAMRecord)3 SAMRecordQueryNameComparator (htsjdk.samtools.SAMRecordQueryNameComparator)3 SamReader (htsjdk.samtools.SamReader)3 ProgressLogger (org.broadinstitute.hellbender.utils.runtime.ProgressLogger)3 ArrayList (java.util.ArrayList)2 List (java.util.List)2 UserException (org.broadinstitute.hellbender.exceptions.UserException)2 MergingSamRecordIterator (htsjdk.samtools.MergingSamRecordIterator)1 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)1 SamFileHeaderMerger (htsjdk.samtools.SamFileHeaderMerger)1 SamPairUtil (htsjdk.samtools.SamPairUtil)1 FilteringSamIterator (htsjdk.samtools.filter.FilteringSamIterator)1 SamRecordFilter (htsjdk.samtools.filter.SamRecordFilter)1 PeekableIterator (htsjdk.samtools.util.PeekableIterator)1 RuntimeIOException (htsjdk.samtools.util.RuntimeIOException)1 SortingCollection (htsjdk.samtools.util.SortingCollection)1 File (java.io.File)1