Search in sources :

Example 11 with SAMReadGroupRecord

use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.

the class ReadGroupCovariateUnitTest method testSingleRecord.

@Test
public void testSingleRecord() {
    final String id = "MY.ID";
    final String expected = "SAMPLE.1";
    final ReadGroupCovariate covariate = new ReadGroupCovariate(new RecalibrationArgumentCollection(), Arrays.asList(expected));
    SAMReadGroupRecord rg = new SAMReadGroupRecord(id);
    rg.setPlatformUnit(expected);
    runTest(rg, expected, covariate);
}
Also used : RecalibrationArgumentCollection(org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) Test(org.testng.annotations.Test)

Example 12 with SAMReadGroupRecord

use of htsjdk.samtools.SAMReadGroupRecord in project gatk-protected by broadinstitute.

the class ReferenceConfidenceModelUnitTest method setUp.

@BeforeClass
public void setUp() {
    header = ArtificialReadUtils.createArtificialSamHeader(1, 1, 1000);
    rg = new SAMReadGroupRecord(RGID);
    rg.setSample(sample);
    header.addReadGroup(rg);
    parser = new GenomeLocParser(header.getSequenceDictionary());
}
Also used : SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) GenomeLocParser(org.broadinstitute.hellbender.utils.GenomeLocParser) BeforeClass(org.testng.annotations.BeforeClass)

Example 13 with SAMReadGroupRecord

use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.

the class EstimateLibraryComplexity method doWork.

/**
     * Method that does most of the work.  Reads through the input BAM file and extracts the
     * read sequences of each read pair and sorts them via a SortingCollection.  Then traverses
     * the sorted reads and looks at small groups at a time to find duplicates.
     */
@Override
protected Object doWork() {
    for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
    logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
    final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
    final int recordsRead = 0;
    final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
    // Loop through the input files and pick out the read sequences etc.
    final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
    for (final File f : INPUT) {
        final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
        final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
        readGroups.addAll(in.getFileHeader().getReadGroups());
        for (final SAMRecord rec : in) {
            if (!rec.getReadPairedFlag())
                continue;
            if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
                continue;
            }
            PairedReadSequence prs = pendingByName.remove(rec.getReadName());
            if (prs == null) {
                // Make a new paired read object and add RG and physical location information to it
                prs = new PairedReadSequence();
                if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
                    final SAMReadGroupRecord rg = rec.getReadGroup();
                    if (rg != null)
                        prs.setReadGroup((short) readGroups.indexOf(rg));
                }
                pendingByName.put(rec.getReadName(), prs);
            }
            // Read passes quality check if both ends meet the mean quality criteria
            final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
            prs.qualityOk = prs.qualityOk && passesQualityCheck;
            // Get the bases and restore them to their original orientation if necessary
            final byte[] bases = rec.getReadBases();
            if (rec.getReadNegativeStrandFlag())
                SequenceUtil.reverseComplement(bases);
            if (rec.getFirstOfPairFlag()) {
                prs.read1 = bases;
            } else {
                prs.read2 = bases;
            }
            if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
                sorter.add(prs);
            }
            progress.record(rec);
        }
        CloserUtil.close(in);
    }
    logger.info("Finished reading - moving on to scanning for duplicates.");
    // Now go through the sorted reads and attempt to find duplicates
    try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
        final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
        final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
        int groupsProcessed = 0;
        long lastLogTime = System.currentTimeMillis();
        final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
        while (iterator.hasNext()) {
            // Get the next group and split it apart by library
            final List<PairedReadSequence> group = getNextGroup(iterator);
            if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
                final PairedReadSequence prs = group.get(0);
                logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
            } else {
                final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
                // Now process the reads by library
                for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
                    final String library = entry.getKey();
                    final List<PairedReadSequence> seqs = entry.getValue();
                    Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
                    Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
                    if (duplicationHisto == null) {
                        duplicationHisto = new Histogram<>("duplication_group_count", library);
                        opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
                        duplicationHistosByLibrary.put(library, duplicationHisto);
                        opticalHistosByLibrary.put(library, opticalHisto);
                    }
                    // Figure out if any reads within this group are duplicates of one another
                    for (int i = 0; i < seqs.size(); ++i) {
                        final PairedReadSequence lhs = seqs.get(i);
                        if (lhs == null)
                            continue;
                        final List<PairedReadSequence> dupes = new ArrayList<>();
                        for (int j = i + 1; j < seqs.size(); ++j) {
                            final PairedReadSequence rhs = seqs.get(j);
                            if (rhs == null)
                                continue;
                            if (matches(lhs, rhs, MAX_DIFF_RATE)) {
                                dupes.add(rhs);
                                seqs.set(j, null);
                            }
                        }
                        if (!dupes.isEmpty()) {
                            dupes.add(lhs);
                            final int duplicateCount = dupes.size();
                            duplicationHisto.increment(duplicateCount);
                            final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
                            for (final boolean b : flags) {
                                if (b)
                                    opticalHisto.increment(duplicateCount);
                            }
                        } else {
                            duplicationHisto.increment(1);
                        }
                    }
                }
                ++groupsProcessed;
                if (lastLogTime < System.currentTimeMillis() - 60000) {
                    logger.info("Processed " + groupsProcessed + " groups.");
                    lastLogTime = System.currentTimeMillis();
                }
            }
        }
        sorter.cleanup();
        final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
        for (final String library : duplicationHistosByLibrary.keySet()) {
            final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
            final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
            final DuplicationMetrics metrics = new DuplicationMetrics();
            metrics.LIBRARY = library;
            // Filter out any bins that have only a single entry in them and calcu
            for (final Integer bin : duplicationHisto.keySet()) {
                final double duplicateGroups = duplicationHisto.get(bin).getValue();
                final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
                if (duplicateGroups > 1) {
                    metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
                    metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
                    metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
                }
            }
            metrics.calculateDerivedMetrics();
            file.addMetric(metrics);
            file.addHistogram(duplicationHisto);
        }
        file.write(OUTPUT);
    }
    return null;
}
Also used : Histogram(htsjdk.samtools.util.Histogram) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) HashMap(java.util.HashMap) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) ArrayList(java.util.ArrayList) List(java.util.List) SAMRecord(htsjdk.samtools.SAMRecord) PeekableIterator(htsjdk.samtools.util.PeekableIterator) MetricsFile(htsjdk.samtools.metrics.MetricsFile) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map)

Example 14 with SAMReadGroupRecord

use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.

the class RevertSam method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean sanitizing = SANITIZE;
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
    final SAMFileHeader inHeader = in.getFileHeader();
    // If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
    // groups have the same values.
    final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
    if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
        boolean allSampleAliasesIdentical = true;
        boolean allLibraryNamesIdentical = true;
        for (int i = 1; i < rgs.size(); i++) {
            if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
                allSampleAliasesIdentical = false;
            }
            if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
                allLibraryNamesIdentical = false;
            }
        }
        if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
            throw new UserException("Read groups have multiple values for sample.  " + "A value for SAMPLE_ALIAS cannot be supplied.");
        }
        if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
            throw new UserException("Read groups have multiple values for library name.  " + "A value for library name cannot be supplied.");
        }
    }
    ////////////////////////////////////////////////////////////////////////////
    // Build the output writer with an appropriate header based on the options
    ////////////////////////////////////////////////////////////////////////////
    final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
    final SAMFileHeader outHeader = new SAMFileHeader();
    for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
        if (SAMPLE_ALIAS != null) {
            rg.setSample(SAMPLE_ALIAS);
        }
        if (LIBRARY_NAME != null) {
            rg.setLibrary(LIBRARY_NAME);
        }
        outHeader.addReadGroup(rg);
    }
    outHeader.setSortOrder(SORT_ORDER);
    if (!REMOVE_ALIGNMENT_INFORMATION) {
        outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
        outHeader.setProgramRecords(inHeader.getProgramRecords());
    }
    final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
    ////////////////////////////////////////////////////////////////////////////
    // Build a sorting collection to use if we are sanitizing
    ////////////////////////////////////////////////////////////////////////////
    final SortingCollection<SAMRecord> sorter;
    if (sanitizing) {
        sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
    } else {
        sorter = null;
    }
    final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
    for (final SAMRecord rec : in) {
        // Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
        if (rec.isSecondaryOrSupplementary())
            continue;
        // Actually to the reverting of the remaining records
        revertSamRecord(rec);
        if (sanitizing)
            sorter.add(rec);
        else
            out.addAlignment(rec);
        progress.record(rec);
    }
    ////////////////////////////////////////////////////////////////////////////
    if (!sanitizing) {
        out.close();
    } else {
        long total = 0, discarded = 0;
        final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
        final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
        // Figure out the quality score encoding scheme for each read group.
        for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
            final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
            final SamRecordFilter filter = new SamRecordFilter() {

                @Override
                public boolean filterOut(final SAMRecord rec) {
                    return !rec.getReadGroup().getId().equals(rg.getId());
                }

                @Override
                public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                    throw new UnsupportedOperationException();
                }
            };
            readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
            CloserUtil.close(reader);
        }
        for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
            logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
        }
        if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
            logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
            return -1;
        }
        final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
        readNameLoop: while (iterator.hasNext()) {
            final List<SAMRecord> recs = fetchByReadName(iterator);
            total += recs.size();
            // Check that all the reads have bases and qualities of the same length
            for (final SAMRecord rec : recs) {
                if (rec.getReadBases().length != rec.getBaseQualities().length) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // Check that if the first read is marked as unpaired that there is in fact only one read
            if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
                logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
                discarded += recs.size();
                continue readNameLoop;
            }
            // Check that if we have paired reads there is exactly one first of pair and one second of pair
            if (recs.get(0).getReadPairedFlag()) {
                int firsts = 0, seconds = 0, unpaired = 0;
                for (final SAMRecord rec : recs) {
                    if (!rec.getReadPairedFlag())
                        ++unpaired;
                    if (rec.getFirstOfPairFlag())
                        ++firsts;
                    if (rec.getSecondOfPairFlag())
                        ++seconds;
                }
                if (unpaired > 0 || firsts != 1 || seconds != 1) {
                    logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
                    discarded += recs.size();
                    continue readNameLoop;
                }
            }
            // If we've made it this far spit the records into the output!
            for (final SAMRecord rec : recs) {
                // The only valid quality score encoding scheme is standard; if it's not standard, change it.
                final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
                if (!recordFormat.equals(FastqQualityFormat.Standard)) {
                    final byte[] quals = rec.getBaseQualities();
                    for (int i = 0; i < quals.length; i++) {
                        quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
                    }
                    rec.setBaseQualities(quals);
                }
                out.addAlignment(rec);
                sanitizerProgress.record(rec);
            }
        }
        out.close();
        final double discardRate = discarded / (double) total;
        final NumberFormat fmt = new DecimalFormat("0.000%");
        logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
        if (discarded / (double) total > MAX_DISCARD_FRACTION) {
            throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
        }
    }
    CloserUtil.close(in);
    return null;
}
Also used : HashMap(java.util.HashMap) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) DecimalFormat(java.text.DecimalFormat) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) FilteringSamIterator(htsjdk.samtools.filter.FilteringSamIterator) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) UserException(org.broadinstitute.hellbender.exceptions.UserException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) BAMRecordCodec(htsjdk.samtools.BAMRecordCodec) SAMRecord(htsjdk.samtools.SAMRecord) SAMRecordQueryNameComparator(htsjdk.samtools.SAMRecordQueryNameComparator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) NumberFormat(java.text.NumberFormat)

Example 15 with SAMReadGroupRecord

use of htsjdk.samtools.SAMReadGroupRecord in project gatk by broadinstitute.

the class ReferenceConfidenceModelUnitTest method setUp.

@BeforeClass
public void setUp() {
    header = ArtificialReadUtils.createArtificialSamHeader(1, 1, 1000);
    rg = new SAMReadGroupRecord(RGID);
    rg.setSample(sample);
    header.addReadGroup(rg);
    parser = new GenomeLocParser(header.getSequenceDictionary());
}
Also used : SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) GenomeLocParser(org.broadinstitute.hellbender.utils.GenomeLocParser) BeforeClass(org.testng.annotations.BeforeClass)

Aggregations

SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)81 SAMFileHeader (htsjdk.samtools.SAMFileHeader)48 SAMRecord (htsjdk.samtools.SAMRecord)33 Test (org.testng.annotations.Test)31 SamReader (htsjdk.samtools.SamReader)29 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)26 File (java.io.File)23 ArrayList (java.util.ArrayList)22 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)20 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)20 HashMap (java.util.HashMap)18 CigarElement (htsjdk.samtools.CigarElement)17 Cigar (htsjdk.samtools.Cigar)16 HashSet (java.util.HashSet)16 SAMFileWriter (htsjdk.samtools.SAMFileWriter)15 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)15 CigarOperator (htsjdk.samtools.CigarOperator)14 IOException (java.io.IOException)14 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)13 List (java.util.List)12