Search in sources :

Example 1 with FilteringSamIterator

use of htsjdk.samtools.filter.FilteringSamIterator in project gatk by broadinstitute.

the class AbstractAlignmentMerger method mergeAlignment.

/**
     * /**
     * Merges the alignment data with the non-aligned records from the source BAM file.
     */
public void mergeAlignment(final File referenceFasta) {
    // Open the file of unmapped records and write the read groups to the the header for the merged file
    final SamReader unmappedSam = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(this.unmappedBamFile);
    final CloseableIterator<SAMRecord> unmappedIterator = unmappedSam.iterator();
    this.header.setReadGroups(unmappedSam.getFileHeader().getReadGroups());
    int aligned = 0;
    int unmapped = 0;
    // Get the aligned records and set up the first one
    alignedIterator = new MultiHitAlignedReadIterator(new FilteringSamIterator(getQuerynameSortedAlignedRecords(), alignmentFilter), primaryAlignmentSelectionStrategy);
    HitsForInsert nextAligned = nextAligned();
    // sets the program group
    if (getProgramRecord() != null) {
        for (final SAMProgramRecord pg : unmappedSam.getFileHeader().getProgramRecords()) {
            if (pg.getId().equals(getProgramRecord().getId())) {
                throw new GATKException("Program Record ID already in use in unmapped BAM file.");
            }
        }
    }
    // Create the sorting collection that will write the records in the coordinate order
    // to the final bam file
    final SortingCollection<SAMRecord> sorted = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator(), MAX_RECORDS_IN_RAM);
    while (unmappedIterator.hasNext()) {
        // Load next unaligned read or read pair.
        final SAMRecord rec = unmappedIterator.next();
        rec.setHeaderStrict(this.header);
        maybeSetPgTag(rec);
        final SAMRecord secondOfPair;
        if (rec.getReadPairedFlag()) {
            secondOfPair = unmappedIterator.next();
            secondOfPair.setHeaderStrict(this.header);
            maybeSetPgTag(secondOfPair);
            // Validate that paired reads arrive as first of pair followed by second of pair
            if (!rec.getReadName().equals(secondOfPair.getReadName()))
                throw new GATKException("Second read from pair not found in unmapped bam: " + rec.getReadName() + ", " + secondOfPair.getReadName());
            if (!rec.getFirstOfPairFlag())
                throw new GATKException("First record in unmapped bam is not first of pair: " + rec.getReadName());
            if (!secondOfPair.getReadPairedFlag())
                throw new GATKException("Second record in unmapped bam is not marked as paired: " + secondOfPair.getReadName());
            if (!secondOfPair.getSecondOfPairFlag())
                throw new GATKException("Second record in unmapped bam is not second of pair: " + secondOfPair.getReadName());
        } else {
            secondOfPair = null;
        }
        // See if there are alignments for current unaligned read or read pair.
        if (nextAligned != null && rec.getReadName().equals(nextAligned.getReadName())) {
            // If there are multiple alignments for a read (pair), then the unaligned SAMRecord must be cloned
            // before copying info from the aligned record to the unaligned.
            final boolean clone = nextAligned.numHits() > 1 || nextAligned.hasSupplementalHits();
            SAMRecord r1Primary = null, r2Primary = null;
            if (rec.getReadPairedFlag()) {
                for (int i = 0; i < nextAligned.numHits(); ++i) {
                    // firstAligned or secondAligned may be null, if there wasn't an alignment for the end,
                    // or if the alignment was rejected by ignoreAlignment.
                    final SAMRecord firstAligned = nextAligned.getFirstOfPair(i);
                    final SAMRecord secondAligned = nextAligned.getSecondOfPair(i);
                    final boolean isPrimaryAlignment = (firstAligned != null && !firstAligned.isSecondaryOrSupplementary()) || (secondAligned != null && !secondAligned.isSecondaryOrSupplementary());
                    final SAMRecord firstToWrite;
                    final SAMRecord secondToWrite;
                    if (clone) {
                        firstToWrite = ReadUtils.cloneSAMRecord(rec);
                        secondToWrite = ReadUtils.cloneSAMRecord(secondOfPair);
                    } else {
                        firstToWrite = rec;
                        secondToWrite = secondOfPair;
                    }
                    // If these are the primary alignments then stash them for use on any supplemental alignments
                    if (isPrimaryAlignment) {
                        r1Primary = firstToWrite;
                        r2Primary = secondToWrite;
                    }
                    transferAlignmentInfoToPairedRead(firstToWrite, secondToWrite, firstAligned, secondAligned);
                    // Only write unmapped read when it has the mate info from the primary alignment.
                    if (!firstToWrite.getReadUnmappedFlag() || isPrimaryAlignment) {
                        addIfNotFiltered(sorted, firstToWrite);
                        if (firstToWrite.getReadUnmappedFlag())
                            ++unmapped;
                        else
                            ++aligned;
                    }
                    if (!secondToWrite.getReadUnmappedFlag() || isPrimaryAlignment) {
                        addIfNotFiltered(sorted, secondToWrite);
                        if (!secondToWrite.getReadUnmappedFlag())
                            ++aligned;
                        else
                            ++unmapped;
                    }
                }
                // Take all of the supplemental reads which had been stashed and add them (as appropriate) to sorted
                for (final boolean isRead1 : new boolean[] { true, false }) {
                    final List<SAMRecord> supplementals = isRead1 ? nextAligned.getSupplementalFirstOfPairOrFragment() : nextAligned.getSupplementalSecondOfPair();
                    final SAMRecord sourceRec = isRead1 ? rec : secondOfPair;
                    final SAMRecord matePrimary = isRead1 ? r2Primary : r1Primary;
                    for (final SAMRecord supp : supplementals) {
                        final SAMRecord out = ReadUtils.cloneSAMRecord(sourceRec);
                        transferAlignmentInfoToFragment(out, supp);
                        if (matePrimary != null)
                            SamPairUtil.setMateInformationOnSupplementalAlignment(out, matePrimary, addMateCigar);
                        ++aligned;
                        addIfNotFiltered(sorted, out);
                    }
                }
            } else {
                for (int i = 0; i < nextAligned.numHits(); ++i) {
                    final SAMRecord recToWrite = clone ? ReadUtils.cloneSAMRecord(rec) : rec;
                    transferAlignmentInfoToFragment(recToWrite, nextAligned.getFragment(i));
                    addIfNotFiltered(sorted, recToWrite);
                    if (recToWrite.getReadUnmappedFlag())
                        ++unmapped;
                    else
                        ++aligned;
                }
                // Take all of the supplemental reads which had been stashed and add them (as appropriate) to sorted
                for (final SAMRecord supplementalRec : nextAligned.getSupplementalFirstOfPairOrFragment()) {
                    final SAMRecord recToWrite = ReadUtils.cloneSAMRecord(rec);
                    transferAlignmentInfoToFragment(recToWrite, supplementalRec);
                    addIfNotFiltered(sorted, recToWrite);
                    ++aligned;
                }
            }
            nextAligned = nextAligned();
        } else {
            // There was no alignment for this read or read pair.
            if (nextAligned != null && SAMRecordQueryNameComparator.compareReadNames(rec.getReadName(), nextAligned.getReadName()) > 0) {
                throw new IllegalStateException("Aligned record iterator (" + nextAligned.getReadName() + ") is behind the unmapped reads (" + rec.getReadName() + ")");
            }
            // No matching read from alignedIterator -- just output reads as is.
            if (!alignedReadsOnly) {
                sorted.add(rec);
                ++unmapped;
                if (secondOfPair != null) {
                    sorted.add(secondOfPair);
                    ++unmapped;
                }
            }
        }
    }
    unmappedIterator.close();
    Utils.validate(!alignedIterator.hasNext(), () -> "Reads remaining on alignment iterator: " + alignedIterator.next().getReadName() + "!");
    alignedIterator.close();
    // Write the records to the output file in specified sorted order,
    header.setSortOrder(this.sortOrder);
    final boolean presorted = this.sortOrder == SAMFileHeader.SortOrder.coordinate;
    final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, presorted, this.targetBamFile, referenceFasta);
    writer.setProgressLogger(new ProgressLogger(logger, (int) 1e7, "Wrote", "records from a sorting collection"));
    final ProgressLogger finalProgress = new ProgressLogger(logger, 10000000, "Written in coordinate order to output", "records");
    for (final SAMRecord rec : sorted) {
        if (!rec.getReadUnmappedFlag()) {
            if (refSeq != null) {
                final byte[] referenceBases = refSeq.get(sequenceDictionary.getSequenceIndex(rec.getReferenceName())).getBases();
                rec.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(rec, referenceBases, 0, bisulfiteSequence));
                if (rec.getBaseQualities() != SAMRecord.NULL_QUALS) {
                    rec.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(rec, referenceBases, 0, bisulfiteSequence));
                }
            }
        }
        writer.addAlignment(rec);
        finalProgress.record(rec);
    }
    writer.close();
    sorted.cleanup();
    CloserUtil.close(unmappedSam);
    logger.info("Wrote " + aligned + " alignment records and " + (alignedReadsOnly ? 0 : unmapped) + " unmapped reads.");
}
Also used : ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) FilteringSamIterator(htsjdk.samtools.filter.FilteringSamIterator) GATKException(org.broadinstitute.hellbender.exceptions.GATKException)

Example 2 with FilteringSamIterator

use of htsjdk.samtools.filter.FilteringSamIterator in project gatk by broadinstitute.

the class RevertSam method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean sanitizing = SANITIZE;
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
    final SAMFileHeader inHeader = in.getFileHeader();
    // If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
    // groups have the same values.
    final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
    if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
        boolean allSampleAliasesIdentical = true;
        boolean allLibraryNamesIdentical = true;
        for (int i = 1; i < rgs.size(); i++) {
            if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
                allSampleAliasesIdentical = false;
            }
            if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
                allLibraryNamesIdentical = false;
            }
        }
        if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
            throw new UserException("Read groups have multiple values for sample.  " + "A value for SAMPLE_ALIAS cannot be supplied.");
        }
        if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
            throw new UserException("Read groups have multiple values for library name.  " + "A value for library name cannot be supplied.");
        }
    }
    ////////////////////////////////////////////////////////////////////////////
    // Build the output writer with an appropriate header based on the options
    ////////////////////////////////////////////////////////////////////////////
    final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
    final SAMFileHeader outHeader = new SAMFileHeader();
    for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
        if (SAMPLE_ALIAS != null) {
            rg.setSample(SAMPLE_ALIAS);
        }
        if (LIBRARY_NAME != null) {
            rg.setLibrary(LIBRARY_NAME);
        }
        outHeader.addReadGroup(rg);
    }
    outHeader.setSortOrder(SORT_ORDER);
    if (!REMOVE_ALIGNMENT_INFORMATION) {
        outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
        outHeader.setProgramRecords(inHeader.getProgramRecords());
    }
    final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
    ////////////////////////////////////////////////////////////////////////////
    // Build a sorting collection to use if we are sanitizing
    ////////////////////////////////////////////////////////////////////////////
    final SortingCollection<SAMRecord> sorter;
    if (sanitizing) {
        sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
    } else {
        sorter = null;
    }
    final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
    for (final SAMRecord rec : in) {
        // Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
        if (rec.isSecondaryOrSupplementary())
            continue;
        // Actually to the reverting of the remaining records
        revertSamRecord(rec);
        if (sanitizing)
            sorter.add(rec);
        else
            out.addAlignment(rec);
        progress.record(rec);
    }
    ////////////////////////////////////////////////////////////////////////////
    if (!sanitizing) {
        out.close();
    } else {
        long total = 0, discarded = 0;
        final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
        final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
        // Figure out the quality score encoding scheme for each read group.
        for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
            final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
            final SamRecordFilter filter = new SamRecordFilter() {

                @Override
                public boolean filterOut(final SAMRecord rec) {
                    return !rec.getReadGroup().getId().equals(rg.getId());
                }

                @Override
                public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                    throw new UnsupportedOperationException();
                }
            };
            readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
            CloserUtil.close(reader);
        }
        for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
            logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
        }
        if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
            logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
            return -1;
        }
        final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
        readNameLoop: while (iterator.hasNext()) {
            final List<SAMRecord> recs = fetchByReadName(iterator);
            total += recs.size();
            // Check that all the reads have bases and qualities of the same length
            for (final SAMRecord rec : recs) {
                if (rec.getReadBases().length != rec.getBaseQualities().length) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // Check that if the first read is marked as unpaired that there is in fact only one read
            if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
                logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
                discarded += recs.size();
                continue readNameLoop;
            }
            // Check that if we have paired reads there is exactly one first of pair and one second of pair
            if (recs.get(0).getReadPairedFlag()) {
                int firsts = 0, seconds = 0, unpaired = 0;
                for (final SAMRecord rec : recs) {
                    if (!rec.getReadPairedFlag())
                        ++unpaired;
                    if (rec.getFirstOfPairFlag())
                        ++firsts;
                    if (rec.getSecondOfPairFlag())
                        ++seconds;
                }
                if (unpaired > 0 || firsts != 1 || seconds != 1) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // If we've made it this far spit the records into the output!
            for (final SAMRecord rec : recs) {
                // The only valid quality score encoding scheme is standard; if it's not standard, change it.
                final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
                if (!recordFormat.equals(FastqQualityFormat.Standard)) {
                    final byte[] quals = rec.getBaseQualities();
                    for (int i = 0; i < quals.length; i++) {
                        quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
                    }
                    rec.setBaseQualities(quals);
                }
                out.addAlignment(rec);
                sanitizerProgress.record(rec);
            }
        }
        out.close();
        final double discardRate = discarded / (double) total;
        final NumberFormat fmt = new DecimalFormat("0.000%");
        logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
        if (discarded / (double) total > MAX_DISCARD_FRACTION) {
            throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
        }
    }
    CloserUtil.close(in);
    return null;
}
Also used : HashMap(java.util.HashMap) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) DecimalFormat(java.text.DecimalFormat) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) FilteringSamIterator(htsjdk.samtools.filter.FilteringSamIterator) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) UserException(org.broadinstitute.hellbender.exceptions.UserException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) BAMRecordCodec(htsjdk.samtools.BAMRecordCodec) SAMRecord(htsjdk.samtools.SAMRecord) SAMRecordQueryNameComparator(htsjdk.samtools.SAMRecordQueryNameComparator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) NumberFormat(java.text.NumberFormat)

Example 3 with FilteringSamIterator

use of htsjdk.samtools.filter.FilteringSamIterator in project gatk by broadinstitute.

the class FilterReads method doWork.

@Override
protected Object doWork() {
    try {
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);
        if (WRITE_READS_FILES)
            writeReadsFile(INPUT);
        switch(FILTER) {
            case includeAligned:
                filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new AlignedFilter(true), true));
                break;
            case excludeAligned:
                filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new AlignedFilter(false), true));
                break;
            case includeReadList:
                filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new ReadNameFilter(READ_LIST_FILE, true)));
                break;
            case excludeReadList:
                filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new ReadNameFilter(READ_LIST_FILE, false)));
                break;
            default:
                throw new UnsupportedOperationException(FILTER.name() + " has not been implemented!");
        }
        IOUtil.assertFileIsReadable(OUTPUT);
        if (WRITE_READS_FILES)
            writeReadsFile(OUTPUT);
    } catch (IOException e) {
        if (OUTPUT.exists() && !OUTPUT.delete()) {
            throw new UserException("Failed to delete existing output: " + OUTPUT.getAbsolutePath());
        } else {
            throw new UserException("Failed to filter " + INPUT.getName());
        }
    }
    return null;
}
Also used : AlignedFilter(htsjdk.samtools.filter.AlignedFilter) FilteringSamIterator(htsjdk.samtools.filter.FilteringSamIterator) ReadNameFilter(htsjdk.samtools.filter.ReadNameFilter) IOException(java.io.IOException) UserException(org.broadinstitute.hellbender.exceptions.UserException)

Aggregations

FilteringSamIterator (htsjdk.samtools.filter.FilteringSamIterator)3 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)2 UserException (org.broadinstitute.hellbender.exceptions.UserException)2 ProgressLogger (org.broadinstitute.hellbender.utils.runtime.ProgressLogger)2 BAMRecordCodec (htsjdk.samtools.BAMRecordCodec)1 SAMFileHeader (htsjdk.samtools.SAMFileHeader)1 SAMFileWriter (htsjdk.samtools.SAMFileWriter)1 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)1 SAMRecord (htsjdk.samtools.SAMRecord)1 SAMRecordQueryNameComparator (htsjdk.samtools.SAMRecordQueryNameComparator)1 SamReader (htsjdk.samtools.SamReader)1 AlignedFilter (htsjdk.samtools.filter.AlignedFilter)1 ReadNameFilter (htsjdk.samtools.filter.ReadNameFilter)1 SamRecordFilter (htsjdk.samtools.filter.SamRecordFilter)1 IOException (java.io.IOException)1 DecimalFormat (java.text.DecimalFormat)1 NumberFormat (java.text.NumberFormat)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 LinkedList (java.util.LinkedList)1