Use of htsjdk.samtools.metrics.MetricsFile in project gatk by broadinstitute.
The class CollectJumpingLibraryMetrics, method doWork.
/**
* Calculates detailed statistics about the jumping library and writes the results.
*/
@Override
protected Object doWork() {
for (File f : INPUT) {
IOUtil.assertFileIsReadable(f);
}
IOUtil.assertFileIsWritable(OUTPUT);
Histogram<Integer> innieHistogram = new Histogram<>();
Histogram<Integer> outieHistogram = new Histogram<>();
int fragments = 0;
int innies = 0;
int outies = 0;
int innieDupes = 0;
int outieDupes = 0;
int crossChromPairs = 0;
int superSized = 0;
int tandemPairs = 0;
double chimeraSizeMinimum = Math.max(getOutieMode(), (double) CHIMERA_KB_MIN);
for (File f : INPUT) {
SamReader reader = SamReaderFactory.makeDefault().open(f);
if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException("SAM file " + f.getName() + " must be sorted in coordinate order");
}
for (SAMRecord sam : reader) {
// We're getting all our info from the first of each pair.
if (!sam.getFirstOfPairFlag()) {
continue;
}
// Ignore unmapped read pairs
if (sam.getReadUnmappedFlag()) {
if (!sam.getMateUnmappedFlag()) {
fragments++;
continue;
}
// If both ends are unmapped, we've reached the unaligned reads at the end of the coordinate-sorted file, so we're done
if (sam.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
break;
}
continue;
}
if (sam.getMateUnmappedFlag()) {
fragments++;
continue;
}
// Ignore low-quality reads. If we don't have the mate mapping quality, assume it's OK
if ((sam.getAttribute(SAMTag.MQ.name()) != null && sam.getIntegerAttribute(SAMTag.MQ.name()) < MINIMUM_MAPPING_QUALITY) || sam.getMappingQuality() < MINIMUM_MAPPING_QUALITY) {
continue;
}
final int absInsertSize = Math.abs(sam.getInferredInsertSize());
if (absInsertSize > chimeraSizeMinimum) {
superSized++;
} else if (sam.getMateNegativeStrandFlag() == sam.getReadNegativeStrandFlag()) {
tandemPairs++;
} else if (!sam.getMateReferenceIndex().equals(sam.getReferenceIndex())) {
crossChromPairs++;
} else {
final PairOrientation pairOrientation = SamPairUtil.getPairOrientation(sam);
if (pairOrientation == PairOrientation.RF) {
outieHistogram.increment(absInsertSize);
outies++;
if (sam.getDuplicateReadFlag()) {
outieDupes++;
}
} else if (pairOrientation == PairOrientation.FR) {
innieHistogram.increment(absInsertSize);
innies++;
if (sam.getDuplicateReadFlag()) {
innieDupes++;
}
} else {
throw new IllegalStateException("Unexpected pair orientation: " + pairOrientation);
}
}
}
CloserUtil.close(reader);
}
MetricsFile<JumpingLibraryMetrics, Integer> metricsFile = getMetricsFile();
JumpingLibraryMetrics metrics = new JumpingLibraryMetrics();
metrics.JUMP_PAIRS = outies;
metrics.JUMP_DUPLICATE_PAIRS = outieDupes;
metrics.JUMP_DUPLICATE_PCT = outies != 0 ? outieDupes / (double) outies : 0;
metrics.JUMP_LIBRARY_SIZE = (outies > 0 && outieDupes > 0) ? DuplicationMetrics.estimateLibrarySize(outies, outies - outieDupes) : 0;
outieHistogram.trimByTailLimit(TAIL_LIMIT);
metrics.JUMP_MEAN_INSERT_SIZE = outieHistogram.getMean();
metrics.JUMP_STDEV_INSERT_SIZE = outieHistogram.getStandardDeviation();
metrics.NONJUMP_PAIRS = innies;
metrics.NONJUMP_DUPLICATE_PAIRS = innieDupes;
metrics.NONJUMP_DUPLICATE_PCT = innies != 0 ? innieDupes / (double) innies : 0;
metrics.NONJUMP_LIBRARY_SIZE = (innies > 0 && innieDupes > 0) ? DuplicationMetrics.estimateLibrarySize(innies, innies - innieDupes) : 0;
innieHistogram.trimByTailLimit(TAIL_LIMIT);
metrics.NONJUMP_MEAN_INSERT_SIZE = innieHistogram.getMean();
metrics.NONJUMP_STDEV_INSERT_SIZE = innieHistogram.getStandardDeviation();
metrics.CHIMERIC_PAIRS = crossChromPairs + superSized + tandemPairs;
metrics.FRAGMENTS = fragments;
double totalPairs = outies + innies + metrics.CHIMERIC_PAIRS;
metrics.PCT_JUMPS = totalPairs != 0 ? outies / totalPairs : 0;
metrics.PCT_NONJUMPS = totalPairs != 0 ? innies / totalPairs : 0;
metrics.PCT_CHIMERAS = totalPairs != 0 ? metrics.CHIMERIC_PAIRS / totalPairs : 0;
metricsFile.addMetric(metrics);
metricsFile.write(OUTPUT);
return null;
}
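Distilled from the tool above, here is a minimal, self-contained sketch of the write-side MetricsFile pattern. The InsertSizeSummary bean, its field names, and the output file name are hypothetical; only the MetricsFile, MetricBase, and Histogram calls are real htsjdk API, and a GATK tool would normally obtain its MetricsFile via the getMetricsFile() helper rather than constructing one directly.

import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.Histogram;
import java.io.File;

public class MetricsWriteSketch {
    /** Hypothetical metric bean: each public field becomes a column in the metrics file. */
    public static class InsertSizeSummary extends MetricBase {
        public long PAIRS;
        public double MEAN_INSERT_SIZE;
    }

    public static void main(final String[] args) {
        // Tally insert sizes, as the innie/outie histograms above do.
        final Histogram<Integer> sizes = new Histogram<>("insert_size", "count");
        sizes.increment(300);
        sizes.increment(310);
        sizes.increment(320);

        final InsertSizeSummary summary = new InsertSizeSummary();
        summary.PAIRS = (long) sizes.getCount();
        summary.MEAN_INSERT_SIZE = sizes.getMean();

        // One metrics file can carry both the metric row(s) and the histogram(s).
        final MetricsFile<InsertSizeSummary, Integer> metricsFile = new MetricsFile<>();
        metricsFile.addMetric(summary);
        metricsFile.addHistogram(sizes);
        metricsFile.write(new File("insert_size_summary.metrics"));
    }
}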
Use of htsjdk.samtools.metrics.MetricsFile in project gatk by broadinstitute.
The class MeanQualityByCycleSparkIntegrationTest, method test.
@Test(dataProvider = "filenames", groups = { "R", "spark" })
public void test(final String inputFile, final String referenceName) throws IOException {
final File input = new File(TEST_DATA_DIR, inputFile);
final File expectedFile = new File(TEST_DATA_DIR, "meanqualbycycle.txt");
final File outfile = BaseTest.createTempFile("testMeanQualityByCycle", ".metrics");
final File pdf = BaseTest.createTempFile("testMeanQualityByCycle", ".pdf");
outfile.deleteOnExit();
pdf.deleteOnExit();
final ArgumentsBuilder args = new ArgumentsBuilder();
args.add("--input");
args.add(input.getAbsolutePath());
args.add("--output");
args.add(outfile.getAbsolutePath());
args.add("--chart");
args.add(pdf.getAbsolutePath());
if (null != referenceName) {
final File REF = new File(referenceName);
args.add("-R");
args.add(REF.getAbsolutePath());
}
runCommandLine(args.getArgsArray());
try (final FileReader actualReader = new FileReader(outfile)) {
final MetricsFile<?, Integer> output = new MetricsFile<>();
output.read(actualReader);
Assert.assertEquals(output.getAllHistograms().size(), 1);
Assert.assertEquals(output.getHistogram().size(), 202);
}
Assert.assertTrue(pdf.exists(), "exists");
Assert.assertTrue(pdf.length() > 0, "length");
IntegrationTestSpec.assertEqualTextFiles(outfile, expectedFile, "#");
}
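The read side of the API, exercised by the assertions above, is symmetric: read(Reader) parses the header, metric rows, and histograms in one pass. A minimal sketch, assuming only a metrics file produced by a tool such as MeanQualityByCycle:

import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.Histogram;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

public static void inspectMetricsFile(final File metrics) throws IOException {
    try (final FileReader reader = new FileReader(metrics)) {
        final MetricsFile<?, Integer> metricsFile = new MetricsFile<>();
        metricsFile.read(reader);
        System.out.println("metric rows: " + metricsFile.getMetrics().size());
        // Each histogram keeps the bin/value labels it was created with.
        for (final Histogram<Integer> histogram : metricsFile.getAllHistograms()) {
            System.out.println(histogram.getValueLabel() + ": n=" + histogram.getCount() + ", mean=" + histogram.getMean());
        }
    }
}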
Use of htsjdk.samtools.metrics.MetricsFile in project gatk by broadinstitute.
The class EstimateLibraryComplexity, method doWork.
/**
* Does most of the work: reads through the input BAM file, extracts the read sequences
* of each read pair, and sorts them via a SortingCollection. Then traverses the sorted
* reads, examining small groups at a time to find duplicates.
*/
@Override
protected Object doWork() {
for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
int recordsRead = 0;
final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
// Loop through the input files and pick out the read sequences etc.
final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
for (final File f : INPUT) {
final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
readGroups.addAll(in.getFileHeader().getReadGroups());
for (final SAMRecord rec : in) {
if (!rec.getReadPairedFlag())
continue;
if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
continue;
}
PairedReadSequence prs = pendingByName.remove(rec.getReadName());
if (prs == null) {
// Make a new paired read object and add RG and physical location information to it
prs = new PairedReadSequence();
if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg != null)
prs.setReadGroup((short) readGroups.indexOf(rg));
}
pendingByName.put(rec.getReadName(), prs);
}
// Read passes quality check if both ends meet the mean quality criteria
final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
prs.qualityOk = prs.qualityOk && passesQualityCheck;
// Get the bases and restore them to their original orientation if necessary
final byte[] bases = rec.getReadBases();
if (rec.getReadNegativeStrandFlag())
SequenceUtil.reverseComplement(bases);
if (rec.getFirstOfPairFlag()) {
prs.read1 = bases;
} else {
prs.read2 = bases;
}
if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
sorter.add(prs);
}
// Count the record so the expected group size computed below reflects the input
recordsRead++;
progress.record(rec);
}
CloserUtil.close(in);
}
logger.info("Finished reading - moving on to scanning for duplicates.");
// Now go through the sorted reads and attempt to find duplicates
try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
int groupsProcessed = 0;
long lastLogTime = System.currentTimeMillis();
final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
while (iterator.hasNext()) {
// Get the next group and split it apart by library
final List<PairedReadSequence> group = getNextGroup(iterator);
if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
final PairedReadSequence prs = group.get(0);
logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
} else {
final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
// Now process the reads by library
for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
final String library = entry.getKey();
final List<PairedReadSequence> seqs = entry.getValue();
Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
if (duplicationHisto == null) {
duplicationHisto = new Histogram<>("duplication_group_count", library);
opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
duplicationHistosByLibrary.put(library, duplicationHisto);
opticalHistosByLibrary.put(library, opticalHisto);
}
// Figure out if any reads within this group are duplicates of one another
for (int i = 0; i < seqs.size(); ++i) {
final PairedReadSequence lhs = seqs.get(i);
if (lhs == null)
continue;
final List<PairedReadSequence> dupes = new ArrayList<>();
for (int j = i + 1; j < seqs.size(); ++j) {
final PairedReadSequence rhs = seqs.get(j);
if (rhs == null)
continue;
if (matches(lhs, rhs, MAX_DIFF_RATE)) {
dupes.add(rhs);
seqs.set(j, null);
}
}
if (!dupes.isEmpty()) {
dupes.add(lhs);
final int duplicateCount = dupes.size();
duplicationHisto.increment(duplicateCount);
final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
for (final boolean b : flags) {
if (b)
opticalHisto.increment(duplicateCount);
}
} else {
duplicationHisto.increment(1);
}
}
}
++groupsProcessed;
if (lastLogTime < System.currentTimeMillis() - 60000) {
logger.info("Processed " + groupsProcessed + " groups.");
lastLogTime = System.currentTimeMillis();
}
}
}
sorter.cleanup();
final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
for (final String library : duplicationHistosByLibrary.keySet()) {
final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
final DuplicationMetrics metrics = new DuplicationMetrics();
metrics.LIBRARY = library;
// Filter out any bins that have only a single entry in them, and calculate the summary metrics from the rest
for (final Integer bin : duplicationHisto.keySet()) {
final double duplicateGroups = duplicationHisto.get(bin).getValue();
final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
if (duplicateGroups > 1) {
metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
}
}
metrics.calculateDerivedMetrics();
file.addMetric(metrics);
file.addHistogram(duplicationHisto);
}
file.write(OUTPUT);
}
return null;
}
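The per-library tallies above funnel into DuplicationMetrics, whose calculateDerivedMetrics() fills in PERCENT_DUPLICATION and ESTIMATED_LIBRARY_SIZE from the raw counts. A minimal sketch of that flow in isolation (the counts and output file are made up; the class, fields, and method are the ones used in the snippet above, though the package differs between gatk and picard, so adjust the import to your dependency):

import htsjdk.samtools.metrics.MetricsFile;
import java.io.File;

static void writeDuplicationMetricsSketch(final File out) {
    final DuplicationMetrics metrics = new DuplicationMetrics();
    metrics.LIBRARY = "lib1";
    metrics.READ_PAIRS_EXAMINED = 10_000;
    metrics.READ_PAIR_DUPLICATES = 500;
    metrics.READ_PAIR_OPTICAL_DUPLICATES = 50;
    // Derives PERCENT_DUPLICATION and ESTIMATED_LIBRARY_SIZE from the raw tallies
    metrics.calculateDerivedMetrics();

    final MetricsFile<DuplicationMetrics, Integer> file = new MetricsFile<>();
    file.addMetric(metrics);
    file.write(out);
}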
Use of htsjdk.samtools.metrics.MetricsFile in project gatk by broadinstitute.
The class AbstractMarkDuplicatesTester, method test.
@Override
public void test() {
try {
updateExpectedDuplicationMetrics();
// Read the output and check the duplicate flag
int outputRecords = 0;
final SamReader reader = SamReaderFactory.makeDefault().open(getOutput());
for (final SAMRecord record : reader) {
outputRecords++;
final String key = samRecordToDuplicatesFlagsKey(record);
if (!this.duplicateFlags.containsKey(key)) {
System.err.println("DOES NOT CONTAIN KEY: " + key);
}
Assert.assertTrue(this.duplicateFlags.containsKey(key));
final boolean value = this.duplicateFlags.get(key);
this.duplicateFlags.remove(key);
if (value != record.getDuplicateReadFlag()) {
System.err.println("Mismatching read:");
System.err.print(record.getSAMString());
}
Assert.assertEquals(record.getDuplicateReadFlag(), value);
}
CloserUtil.close(reader);
// Ensure the program output the same number of records as were read in
Assert.assertEquals(outputRecords, this.getNumberOfRecords(), ("saw " + outputRecords + " output records, vs. " + this.getNumberOfRecords() + " input records"));
// Check the values written to metrics.txt against our input expectations
final MetricsFile<DuplicationMetrics, Comparable<?>> metricsOutput = new MetricsFile<>();
try (final FileReader metricsReader = new FileReader(metricsFile)) {
metricsOutput.read(metricsReader);
} catch (final IOException ex) {
Assert.fail("Could not read metrics file: " + ex);
}
// NB: the test writes an initial metrics line with a null LIBRARY and zeros for all metrics, so we take the first metric whose LIBRARY is non-null
final DuplicationMetrics observedMetrics = metricsOutput.getMetrics().stream().filter(metric -> metric.LIBRARY != null).findFirst().get();
Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedMetrics.UNPAIRED_READS_EXAMINED, "UNPAIRED_READS_EXAMINED does not match expected");
Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedMetrics.READ_PAIRS_EXAMINED, "READ_PAIRS_EXAMINED does not match expected");
Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedMetrics.UNMAPPED_READS, "UNMAPPED_READS does not match expected");
Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedMetrics.UNPAIRED_READ_DUPLICATES, "UNPAIRED_READ_DUPLICATES does not match expected");
Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedMetrics.READ_PAIR_DUPLICATES, "READ_PAIR_DUPLICATES does not match expected");
Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match expected");
Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedMetrics.PERCENT_DUPLICATION, "PERCENT_DUPLICATION does not match expected");
// ESTIMATED_LIBRARY_SIZE may be null in the metrics file, so normalize nulls to 0 before comparing.
if (observedMetrics.ESTIMATED_LIBRARY_SIZE == null) {
observedMetrics.ESTIMATED_LIBRARY_SIZE = 0L;
}
if (expectedMetrics.ESTIMATED_LIBRARY_SIZE == null) {
expectedMetrics.ESTIMATED_LIBRARY_SIZE = 0L;
}
Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedMetrics.ESTIMATED_LIBRARY_SIZE, "ESTIMATED_LIBRARY_SIZE does not match expected");
} finally {
TestUtil.recursiveDelete(getOutputDir());
}
}
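If more nullable columns needed the same treatment, the null-to-zero normalization above could be factored into a small helper; a hypothetical sketch using the same TestNG Assert:

/** Hypothetical helper: compare nullable Long metric values, treating null as 0. */
private static void assertNullableLongEquals(final Long observed, final Long expected, final String name) {
    Assert.assertEquals(observed == null ? 0L : observed.longValue(),
            expected == null ? 0L : expected.longValue(),
            name + " does not match expected");
}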
Use of htsjdk.samtools.metrics.MetricsFile in project gridss by PapenfussLab.
The class CollectMapqMetricsTest, method loadHistogram.
public static Histogram<Integer> loadHistogram(File file) throws IOException {
try (FileReader reader = new FileReader(file)) {
MetricsFile<MapqMetrics, Integer> metricsFile = new MetricsFile<>();
metricsFile.read(reader);
return metricsFile.getHistogram();
}
}
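A caller might use the helper like this (the file name is illustrative):

final Histogram<Integer> mapqHistogram = loadHistogram(new File("collect_mapq_metrics.txt"));
System.out.println("mean MAPQ = " + mapqHistogram.getMean());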