Example 1 with DuplicationMetrics

Use of org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics in project gatk by broadinstitute.

From class MarkDuplicatesSpark, the runTool method:

@Override
protected void runTool(final JavaSparkContext ctx) {
    JavaRDD<GATKRead> reads = getReads();
    final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null
            ? new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null)
            : null;
    final JavaRDD<GATKRead> finalReadsForMetrics = mark(reads, getHeaderForReads(), duplicatesScoringStrategy, finder, getRecommendedNumReducers());
    if (metricsFile != null) {
        final JavaPairRDD<String, DuplicationMetrics> metricsByLibrary = MarkDuplicatesSparkUtils.generateMetrics(getHeaderForReads(), finalReadsForMetrics);
        final MetricsFile<DuplicationMetrics, Double> resultMetrics = getMetricsFile();
        MarkDuplicatesSparkUtils.saveMetricsRDD(resultMetrics, getHeaderForReads(), metricsByLibrary, metricsFile, getAuthHolder());
    }
    final JavaRDD<GATKRead> finalReads = cleanupTemporaryAttributes(finalReadsForMetrics);
    writeReads(ctx, output, finalReads);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) OpticalDuplicateFinder(org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder)
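
For context, here is a minimal sketch of the per-library metrics write that saveMetricsRDD performs, using only the htsjdk MetricsFile API and the DuplicationMetrics fields visible in these examples. The helper name, library name, and counts are illustrative, not GATK code.

import htsjdk.samtools.metrics.MetricsFile;
import org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics;

import java.io.File;

// Build one per-library record, derive the remaining fields from the raw
// counts, and write it out in the standard Picard metrics format.
static void writeSingleLibraryMetrics(final File outFile) {
    final DuplicationMetrics metrics = new DuplicationMetrics();
    metrics.LIBRARY = "library1";        // illustrative values
    metrics.READ_PAIRS_EXAMINED = 1000;
    metrics.READ_PAIR_DUPLICATES = 50;
    metrics.calculateDerivedMetrics();   // fills PERCENT_DUPLICATION and ESTIMATED_LIBRARY_SIZE

    final MetricsFile<DuplicationMetrics, Double> out = new MetricsFile<>();
    out.addMetric(metrics);
    out.write(outFile);                  // unchecked IO handling, as in file.write(OUTPUT) in Example 2
}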

Example 2 with DuplicationMetrics

Use of org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics in project gatk by broadinstitute.

From class EstimateLibraryComplexity, the doWork method:

/**
     * Method that does most of the work.  Reads through the input BAM file and extracts the
     * read sequences of each read pair and sorts them via a SortingCollection.  Then traverses
     * the sorted reads and looks at small groups at a time to find duplicates.
     */
@Override
protected Object doWork() {
    for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
    logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
    final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
    int recordsRead = 0; // counted in the read loop below and used for the meanGroupSize estimate
    final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
    // Loop through the input files and pick out the read sequences etc.
    final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
    for (final File f : INPUT) {
        final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
        final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
        readGroups.addAll(in.getFileHeader().getReadGroups());
        for (final SAMRecord rec : in) {
            if (!rec.getReadPairedFlag())
                continue;
            if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
                continue;
            }
            PairedReadSequence prs = pendingByName.remove(rec.getReadName());
            if (prs == null) {
                // Make a new paired read object and add RG and physical location information to it
                prs = new PairedReadSequence();
                if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
                    final SAMReadGroupRecord rg = rec.getReadGroup();
                    if (rg != null)
                        prs.setReadGroup((short) readGroups.indexOf(rg));
                }
                pendingByName.put(rec.getReadName(), prs);
            }
            // Read passes quality check if both ends meet the mean quality criteria
            final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
            prs.qualityOk = prs.qualityOk && passesQualityCheck;
            // Get the bases and restore them to their original orientation if necessary
            final byte[] bases = rec.getReadBases();
            if (rec.getReadNegativeStrandFlag())
                SequenceUtil.reverseComplement(bases);
            if (rec.getFirstOfPairFlag()) {
                prs.read1 = bases;
            } else {
                prs.read2 = bases;
            }
            if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
                sorter.add(prs);
            }
            progress.record(rec);
            ++recordsRead;
        }
        CloserUtil.close(in);
    }
    logger.info("Finished reading - moving on to scanning for duplicates.");
    // Now go through the sorted reads and attempt to find duplicates
    try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
        final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
        final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
        int groupsProcessed = 0;
        long lastLogTime = System.currentTimeMillis();
        final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
        while (iterator.hasNext()) {
            // Get the next group and split it apart by library
            final List<PairedReadSequence> group = getNextGroup(iterator);
            if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
                final PairedReadSequence prs = group.get(0);
                logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
            } else {
                final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
                // Now process the reads by library
                for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
                    final String library = entry.getKey();
                    final List<PairedReadSequence> seqs = entry.getValue();
                    Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
                    Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
                    if (duplicationHisto == null) {
                        duplicationHisto = new Histogram<>("duplication_group_count", library);
                        opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
                        duplicationHistosByLibrary.put(library, duplicationHisto);
                        opticalHistosByLibrary.put(library, opticalHisto);
                    }
                    // Figure out if any reads within this group are duplicates of one another
                    for (int i = 0; i < seqs.size(); ++i) {
                        final PairedReadSequence lhs = seqs.get(i);
                        if (lhs == null)
                            continue;
                        final List<PairedReadSequence> dupes = new ArrayList<>();
                        for (int j = i + 1; j < seqs.size(); ++j) {
                            final PairedReadSequence rhs = seqs.get(j);
                            if (rhs == null)
                                continue;
                            if (matches(lhs, rhs, MAX_DIFF_RATE)) {
                                dupes.add(rhs);
                                seqs.set(j, null);
                            }
                        }
                        if (!dupes.isEmpty()) {
                            dupes.add(lhs);
                            final int duplicateCount = dupes.size();
                            duplicationHisto.increment(duplicateCount);
                            final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
                            for (final boolean b : flags) {
                                if (b)
                                    opticalHisto.increment(duplicateCount);
                            }
                        } else {
                            duplicationHisto.increment(1);
                        }
                    }
                }
                ++groupsProcessed;
                if (lastLogTime < System.currentTimeMillis() - 60000) {
                    logger.info("Processed " + groupsProcessed + " groups.");
                    lastLogTime = System.currentTimeMillis();
                }
            }
        }
        sorter.cleanup();
        final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
        for (final String library : duplicationHistosByLibrary.keySet()) {
            final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
            final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
            final DuplicationMetrics metrics = new DuplicationMetrics();
            metrics.LIBRARY = library;
            // Filter out any bins that have only a single entry in them and calculate the metrics from the remaining bins
            for (final Integer bin : duplicationHisto.keySet()) {
                final double duplicateGroups = duplicationHisto.get(bin).getValue();
                final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
                if (duplicateGroups > 1) {
                    metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
                    metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
                    metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
                }
            }
            metrics.calculateDerivedMetrics();
            file.addMetric(metrics);
            file.addHistogram(duplicationHisto);
        }
        file.write(OUTPUT);
    }
    return null;
}
Also used : Histogram(htsjdk.samtools.util.Histogram) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) HashMap(java.util.HashMap) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) SamReader(htsjdk.samtools.SamReader) List(java.util.List) SAMRecord(htsjdk.samtools.SAMRecord) PeekableIterator(htsjdk.samtools.util.PeekableIterator) MetricsFile(htsjdk.samtools.metrics.MetricsFile) File(java.io.File) Map(java.util.Map)
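
The per-library loop above is the heart of the metric computation: a histogram bin for duplicate groups of size bin, seen duplicateGroups times, contributes bin * duplicateGroups examined read pairs and (bin - 1) * duplicateGroups duplicate pairs. Here is a minimal sketch of that roll-up as a standalone helper (the method name is an assumption for illustration, not part of GATK; optical-duplicate accounting is omitted for brevity):

import htsjdk.samtools.util.Histogram;
import org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics;

// Roll a duplication histogram (key = duplicate-group size, value = number of
// groups of that size) up into a DuplicationMetrics record, as the loop above does.
static DuplicationMetrics rollUp(final String library, final Histogram<Integer> duplicationHisto) {
    final DuplicationMetrics metrics = new DuplicationMetrics();
    metrics.LIBRARY = library;
    for (final Integer bin : duplicationHisto.keySet()) {
        final double duplicateGroups = duplicationHisto.get(bin).getValue();
        if (duplicateGroups > 1) { // skip singleton bins, as the tool does
            metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
            metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
        }
    }
    metrics.calculateDerivedMetrics(); // derives PERCENT_DUPLICATION and ESTIMATED_LIBRARY_SIZE
    return metrics;
}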

Example 3 with DuplicationMetrics

Use of org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics in project gatk by broadinstitute.

From class AbstractMarkDuplicatesTester, the test method:

@Override
public void test() {
    try {
        updateExpectedDuplicationMetrics();
        // Read the output and check the duplicate flag
        int outputRecords = 0;
        final SamReader reader = SamReaderFactory.makeDefault().open(getOutput());
        for (final SAMRecord record : reader) {
            outputRecords++;
            final String key = samRecordToDuplicatesFlagsKey(record);
            if (!this.duplicateFlags.containsKey(key)) {
                System.err.println("DOES NOT CONTAIN KEY: " + key);
            }
            Assert.assertTrue(this.duplicateFlags.containsKey(key));
            final boolean value = this.duplicateFlags.get(key);
            this.duplicateFlags.remove(key);
            if (value != record.getDuplicateReadFlag()) {
                System.err.println("Mismatching read:");
                System.err.print(record.getSAMString());
            }
            Assert.assertEquals(record.getDuplicateReadFlag(), value);
        }
        CloserUtil.close(reader);
        // Ensure the program output the same number of records as were read in
        Assert.assertEquals(outputRecords, this.getNumberOfRecords(), ("saw " + outputRecords + " output records, vs. " + this.getNumberOfRecords() + " input records"));
        // Check the values written to metrics.txt against our input expectations
        final MetricsFile<DuplicationMetrics, Comparable<?>> metricsOutput = new MetricsFile<>();
        try {
            metricsOutput.read(new FileReader(metricsFile));
        } catch (final FileNotFoundException ex) {
            System.err.println("Metrics file not found: " + ex);
        }
        // NB: Test writes an initial metrics line with a null entry for LIBRARY and 0 values for all metrics. So we find the first non-null one
        final DuplicationMetrics observedMetrics = metricsOutput.getMetrics().stream().filter(metric -> metric.LIBRARY != null).findFirst().get();
        Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedMetrics.UNPAIRED_READS_EXAMINED, "UNPAIRED_READS_EXAMINED does not match expected");
        Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedMetrics.READ_PAIRS_EXAMINED, "READ_PAIRS_EXAMINED does not match expected");
        Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedMetrics.UNMAPPED_READS, "UNMAPPED_READS does not match expected");
        Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedMetrics.UNPAIRED_READ_DUPLICATES, "UNPAIRED_READ_DUPLICATES does not match expected");
        Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedMetrics.READ_PAIR_DUPLICATES, "READ_PAIR_DUPLICATES does not match expected");
        Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match expected");
        Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedMetrics.PERCENT_DUPLICATION, "PERCENT_DUPLICATION does not match expected");
        // ESTIMATED_LIBRARY_SIZE may legitimately be null on either side, so normalize nulls to 0L;
        // the assertion below needs to have real values for each field.
        if (observedMetrics.ESTIMATED_LIBRARY_SIZE == null) {
            observedMetrics.ESTIMATED_LIBRARY_SIZE = 0L;
        }
        if (expectedMetrics.ESTIMATED_LIBRARY_SIZE == null) {
            expectedMetrics.ESTIMATED_LIBRARY_SIZE = 0L;
        }
        Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedMetrics.ESTIMATED_LIBRARY_SIZE, "ESTIMATED_LIBRARY_SIZE does not match expected");
    } finally {
        TestUtil.recursiveDelete(getOutputDir());
    }
}
Also used : MetricsFile(htsjdk.samtools.metrics.MetricsFile) SamReader(htsjdk.samtools.SamReader) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) SAMRecord(htsjdk.samtools.SAMRecord) FileNotFoundException(java.io.FileNotFoundException) FileReader(java.io.FileReader)
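
The read-back-and-filter step above is reusable on its own; here is a minimal sketch as an assumed helper (the name and error handling are illustrative) that parses a metrics file and returns the first row with a non-null LIBRARY:

import htsjdk.samtools.metrics.MetricsFile;
import org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics;

import java.io.FileReader;
import java.io.IOException;

static DuplicationMetrics readFirstLibraryMetrics(final String path) throws IOException {
    final MetricsFile<DuplicationMetrics, Comparable<?>> metricsOutput = new MetricsFile<>();
    metricsOutput.read(new FileReader(path)); // parses the Picard-style metrics file
    // Skip the initial row that has a null LIBRARY and all-zero values.
    return metricsOutput.getMetrics().stream()
            .filter(metric -> metric.LIBRARY != null)
            .findFirst()
            .orElseThrow(() -> new IllegalStateException("no per-library metrics row in " + path));
}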

Example 4 with DuplicationMetrics

Use of org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics in project gatk by broadinstitute.

From class MarkDuplicatesSparkIntegrationTest, the testMarkDuplicatesSparkIntegrationTestLocal method:

@Test(groups = "spark", dataProvider = "md")
public void testMarkDuplicatesSparkIntegrationTestLocal(final File input, final long totalExpected, final long dupsExpected, Map<String, List<String>> metricsExpected) throws IOException {
    ArgumentsBuilder args = new ArgumentsBuilder();
    args.add("--" + StandardArgumentDefinitions.INPUT_LONG_NAME);
    args.add(input.getPath());
    args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME);
    File outputFile = createTempFile("markdups", ".bam");
    outputFile.delete();
    args.add(outputFile.getAbsolutePath());
    args.add("--METRICS_FILE");
    File metricsFile = createTempFile("markdups_metrics", ".txt");
    args.add(metricsFile.getAbsolutePath());
    runCommandLine(args.getArgsArray());
    Assert.assertTrue(outputFile.exists(), "Can't find expected MarkDuplicates output file at " + outputFile.getAbsolutePath());
    int totalReads = 0;
    int duplicateReads = 0;
    try (final ReadsDataSource outputReads = new ReadsDataSource(outputFile.toPath())) {
        for (GATKRead read : outputReads) {
            ++totalReads;
            if (read.isDuplicate()) {
                ++duplicateReads;
            }
        }
    }
    Assert.assertEquals(totalReads, totalExpected, "Wrong number of reads in output BAM");
    Assert.assertEquals(duplicateReads, dupsExpected, "Wrong number of duplicate reads in output BAM");
    final MetricsFile<DuplicationMetrics, Comparable<?>> metricsOutput = new MetricsFile<>();
    try {
        metricsOutput.read(new FileReader(metricsFile));
    } catch (final FileNotFoundException ex) {
        System.err.println("Metrics file not found: " + ex);
    }
    final List<DuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream()
            .filter(metric -> metric.UNPAIRED_READS_EXAMINED != 0L
                    || metric.READ_PAIRS_EXAMINED != 0L
                    || metric.UNMAPPED_READS != 0L
                    || metric.UNPAIRED_READ_DUPLICATES != 0L
                    || metric.READ_PAIR_DUPLICATES != 0L
                    || metric.READ_PAIR_OPTICAL_DUPLICATES != 0L
                    || (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION))
                    || (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L))
            .collect(Collectors.toList());
    Assert.assertEquals(nonEmptyMetrics.size(), metricsExpected.size(), "Wrong number of metrics with non-zero fields.");
    for (int i = 0; i < nonEmptyMetrics.size(); i++) {
        final DuplicationMetrics observedMetrics = nonEmptyMetrics.get(i);
        List<?> expectedList = metricsExpected.get(observedMetrics.LIBRARY);
        Assert.assertNotNull(expectedList, "Unexpected library found: " + observedMetrics.LIBRARY);
        Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedList.get(0));
        Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedList.get(1));
        Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedList.get(2));
        Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedList.get(3));
        Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedList.get(4));
        Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedList.get(5));
        Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedList.get(6));
        // ESTIMATED_LIBRARY_SIZE cannot be predicted for every input,
        // so we work around it by passing in an 'expected 0L' and only comparing to it if the actual value is non-null
        if (observedMetrics.ESTIMATED_LIBRARY_SIZE != null && (Long) expectedList.get(7) != 0L) {
            Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedList.get(7));
        }
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) MetricsFile(htsjdk.samtools.metrics.MetricsFile) ReadsDataSource(org.broadinstitute.hellbender.engine.ReadsDataSource) MarkDuplicatesSparkTester(org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesSparkTester) MarkDuplicatesIntegrationTest(org.broadinstitute.hellbender.tools.picard.sam.markduplicates.MarkDuplicatesIntegrationTest) CommandLineProgram(org.broadinstitute.hellbender.cmdline.CommandLineProgram) DuplicationMetrics(org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) DataProvider(org.testng.annotations.DataProvider) ImmutableMap(com.google.common.collect.ImmutableMap) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) ArgumentsBuilder(org.broadinstitute.hellbender.utils.test.ArgumentsBuilder) Test(org.testng.annotations.Test) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) MarkDuplicatesSpark(org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark) AbstractMarkDuplicatesTester(org.broadinstitute.hellbender.utils.test.testers.AbstractMarkDuplicatesTester) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Assert(org.testng.Assert) Map(java.util.Map) FileReader(java.io.FileReader) AbstractMarkDuplicatesCommandLineProgramTest(org.broadinstitute.hellbender.utils.test.testers.AbstractMarkDuplicatesCommandLineProgramTest)
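
For reference, here is a sketch of how one entry of the metricsExpected map lines up with the indexed assertions above; the library name and values are illustrative stand-ins for what the "md" data provider supplies:

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import java.util.List;
import java.util.Map;

// Index positions match expectedList.get(0..7) in the test above.
final Map<String, List<?>> metricsExpected = ImmutableMap.of(
        "library1", ImmutableList.of(
                0L,    // 0: UNPAIRED_READS_EXAMINED
                3L,    // 1: READ_PAIRS_EXAMINED
                0L,    // 2: UNMAPPED_READS
                0L,    // 3: UNPAIRED_READ_DUPLICATES
                2L,    // 4: READ_PAIR_DUPLICATES
                0L,    // 5: READ_PAIR_OPTICAL_DUPLICATES
                0.66,  // 6: PERCENT_DUPLICATION
                0L));  // 7: ESTIMATED_LIBRARY_SIZE (0L means "skip the comparison")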

Aggregations

DuplicationMetrics (org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics) 4
MetricsFile (htsjdk.samtools.metrics.MetricsFile) 3
SAMRecord (htsjdk.samtools.SAMRecord) 2
SamReader (htsjdk.samtools.SamReader) 2
File (java.io.File) 2
FileNotFoundException (java.io.FileNotFoundException) 2
FileReader (java.io.FileReader) 2
List (java.util.List) 2
Map (java.util.Map) 2
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead) 2
ImmutableList (com.google.common.collect.ImmutableList) 1
ImmutableMap (com.google.common.collect.ImmutableMap) 1
SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord) 1
Histogram (htsjdk.samtools.util.Histogram) 1
PeekableIterator (htsjdk.samtools.util.PeekableIterator) 1
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
HashMap (java.util.HashMap) 1
Collectors (java.util.stream.Collectors) 1
CommandLineProgram (org.broadinstitute.hellbender.cmdline.CommandLineProgram) 1