use of htsjdk.samtools.SAMRecord in project gatk by broadinstitute.
the class CollectRnaSeqMetricsTest method testMultiLevel.
/**
 * Builds a small synthetic SAM file with reads in two read groups (libraries "foo" and "bar" of
 * sample "Sample"), runs the tool through {@code runCommandLine} with SAMPLE and LIBRARY levels,
 * and checks the base and strand counts reported for each level.
 */
@Test
public void testMultiLevel() throws Exception {
    final String sequence = "chr1";
    final String ignoredSequence = "chrM";
    // Create some alignments that hit the ribosomal sequence, various parts of the gene, and intergenic.
    final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate, false);
    // Set seed so that strandedness is consistent among runs.
    builder.setRandomSeed(0);
    final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence);

    // First read group: library "foo".
    final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("2");
    rg1.setSample("Sample");
    rg1.setLibrary("foo");
    builder.setReadGroup(rg1);
    builder.addPair("pair1", sequenceIndex, 45, 475);
    builder.addPair("pair2", sequenceIndex, 90, 225);
    builder.addFrag("frag1", sequenceIndex, 150, true);
    builder.addFrag("frag2", sequenceIndex, 450, true);

    // Second read group: library "bar", same sample.
    final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("3");
    rg2.setSample("Sample");
    rg2.setLibrary("bar");
    builder.setReadGroup(rg2);
    builder.addPair("pair3", sequenceIndex, 120, 600);
    builder.addFrag("frag3", sequenceIndex, 225, false);
    builder.addPair("rrnaPair", sequenceIndex, 400, 500);
    builder.addFrag("ignoredFrag", builder.getHeader().getSequenceIndex(ignoredSequence), 1, false);

    final File samFile = BaseTest.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
    try (final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile)) {
        for (final SAMRecord rec : builder.getRecords()) {
            samWriter.addAlignment(rec);
        }
    }

    // Create an interval list with one ribosomal interval.
    final Interval rRnaInterval = new Interval(sequence, 300, 520, true, "rRNA");
    final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader());
    rRnaIntervalList.add(rRnaInterval);
    final File rRnaIntervalsFile = BaseTest.createTempFile("tmp.rRna.", ".interval_list");
    rRnaIntervalList.write(rRnaIntervalsFile);

    // Generate the metrics.
    final File metricsFile = BaseTest.createTempFile("tmp.", ".rna_metrics");
    final String[] args = new String[] {
        "--input", samFile.getAbsolutePath(),
        "--output", metricsFile.getAbsolutePath(),
        "--REF_FLAT", getRefFlatFile(sequence).getAbsolutePath(),
        "--RIBOSOMAL_INTERVALS", rRnaIntervalsFile.getAbsolutePath(),
        "--STRAND_SPECIFICITY", "SECOND_READ_TRANSCRIPTION_STRAND",
        "--IGNORE_SEQUENCE", ignoredSequence,
        "--LEVEL", "SAMPLE",
        "--LEVEL", "LIBRARY"
    };
    runCommandLine(args);

    // Read the metrics back; the reader is closed as soon as parsing is done.
    final MetricsFile<RnaSeqMetrics, Comparable<?>> output = new MetricsFile<>();
    try (final FileReader metricsReader = new FileReader(metricsFile)) {
        output.read(metricsReader);
    }

    for (final RnaSeqMetrics metrics : output.getMetrics()) {
        if (metrics.LIBRARY == null) {
            // Row with no library set: totals across both libraries.
            Assert.assertEquals(metrics.PF_ALIGNED_BASES, 396);
            Assert.assertEquals(metrics.PF_BASES, 432);
            Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 108L);
            Assert.assertEquals(metrics.CODING_BASES, 136);
            Assert.assertEquals(metrics.UTR_BASES, 51);
            Assert.assertEquals(metrics.INTRONIC_BASES, 50);
            Assert.assertEquals(metrics.INTERGENIC_BASES, 51);
            Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
            Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4);
            Assert.assertEquals(metrics.IGNORED_READS, 1);
        } else if (metrics.LIBRARY.equals("foo")) {
            Assert.assertEquals(metrics.PF_ALIGNED_BASES, 216);
            Assert.assertEquals(metrics.PF_BASES, 216);
            Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 36L);
            Assert.assertEquals(metrics.CODING_BASES, 89);
            Assert.assertEquals(metrics.UTR_BASES, 51);
            Assert.assertEquals(metrics.INTRONIC_BASES, 25);
            Assert.assertEquals(metrics.INTERGENIC_BASES, 15);
            Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
            Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
            Assert.assertEquals(metrics.IGNORED_READS, 0);
        } else if (metrics.LIBRARY.equals("bar")) {
            Assert.assertEquals(metrics.PF_ALIGNED_BASES, 180);
            Assert.assertEquals(metrics.PF_BASES, 216);
            Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 72L);
            Assert.assertEquals(metrics.CODING_BASES, 47);
            Assert.assertEquals(metrics.UTR_BASES, 0);
            Assert.assertEquals(metrics.INTRONIC_BASES, 25);
            Assert.assertEquals(metrics.INTERGENIC_BASES, 36);
            Assert.assertEquals(metrics.CORRECT_STRAND_READS, 0);
            Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
            Assert.assertEquals(metrics.IGNORED_READS, 1);
        }
    }
}
use of htsjdk.samtools.SAMRecord in project gatk by broadinstitute.
the class EstimateLibraryComplexity method doWork.
/**
 * Method that does most of the work. Reads through the input BAM file and extracts the
 * read sequences of each read pair and sorts them via a SortingCollection. Then traverses
 * the sorted reads and looks at small groups at a time to find duplicates.
 *
 * <p>Groups whose size exceeds {@code MAX_GROUP_RATIO} times the expected mean group size are
 * skipped with a warning. The expected mean is derived from the number of paired records read,
 * so the cutoff scales with the size of the input.
 *
 * @return always {@code null}
 */
@Override
protected Object doWork() {
    for (final File f : INPUT) {
        IOUtil.assertFileIsReadable(f);
    }
    logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
    final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
    // Counts every paired record consumed below; it feeds the expected mean group size.
    long recordsRead = 0;
    final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
    // Loop through the input files and pick out the read sequences etc.
    final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
    for (final File f : INPUT) {
        final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
        final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
        try {
            readGroups.addAll(in.getFileHeader().getReadGroups());
            for (final SAMRecord rec : in) {
                if (!rec.getReadPairedFlag()) {
                    continue;
                }
                if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
                    continue;
                }
                PairedReadSequence prs = pendingByName.remove(rec.getReadName());
                if (prs == null) {
                    // Make a new paired read object and add RG and physical location information to it
                    prs = new PairedReadSequence();
                    if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
                        final SAMReadGroupRecord rg = rec.getReadGroup();
                        if (rg != null) {
                            prs.setReadGroup((short) readGroups.indexOf(rg));
                        }
                    }
                    pendingByName.put(rec.getReadName(), prs);
                }
                // Read passes quality check if both ends meet the mean quality criteria
                final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
                prs.qualityOk = prs.qualityOk && passesQualityCheck;
                // Get the bases and restore them to their original orientation if necessary
                final byte[] bases = rec.getReadBases();
                if (rec.getReadNegativeStrandFlag()) {
                    SequenceUtil.reverseComplement(bases);
                }
                if (rec.getFirstOfPairFlag()) {
                    prs.read1 = bases;
                } else {
                    prs.read2 = bases;
                }
                if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
                    sorter.add(prs);
                }
                progress.record(rec);
                ++recordsRead;
            }
        } finally {
            // Release the reader even if iteration fails part-way through.
            CloserUtil.close(in);
        }
    }
    logger.info("Finished reading - moving on to scanning for duplicates.");
    // Now go through the sorted reads and attempt to find duplicates
    try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
        final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
        final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
        int groupsProcessed = 0;
        long lastLogTime = System.currentTimeMillis();
        // Expected pairs per group: read pairs divided by the number of possible prefix combinations
        // (4 bases ^ (MIN_IDENTICAL_BASES * 2 reads)). Clamped to [1, Integer.MAX_VALUE].
        final long prefixCombinations = Math.max(1L, (long) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
        final int meanGroupSize = (int) Math.min(Integer.MAX_VALUE, Math.max(1L, (recordsRead / 2) / prefixCombinations));
        while (iterator.hasNext()) {
            // Get the next group and split it apart by library
            final List<PairedReadSequence> group = getNextGroup(iterator);
            // Widen to long so a large mean times the ratio cannot overflow.
            if (group.size() > (long) meanGroupSize * MAX_GROUP_RATIO) {
                final PairedReadSequence prs = group.get(0);
                logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read2, 0, MIN_IDENTICAL_BASES));
            } else {
                final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
                // Now process the reads by library
                for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
                    final String library = entry.getKey();
                    final List<PairedReadSequence> seqs = entry.getValue();
                    Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
                    Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
                    if (duplicationHisto == null) {
                        // Both histograms are created and registered together, so one null check covers both.
                        duplicationHisto = new Histogram<>("duplication_group_count", library);
                        opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
                        duplicationHistosByLibrary.put(library, duplicationHisto);
                        opticalHistosByLibrary.put(library, opticalHisto);
                    }
                    // Figure out if any reads within this group are duplicates of one another
                    for (int i = 0; i < seqs.size(); ++i) {
                        final PairedReadSequence lhs = seqs.get(i);
                        if (lhs == null) {
                            continue;
                        }
                        final List<PairedReadSequence> dupes = new ArrayList<>();
                        for (int j = i + 1; j < seqs.size(); ++j) {
                            final PairedReadSequence rhs = seqs.get(j);
                            if (rhs == null) {
                                continue;
                            }
                            if (matches(lhs, rhs, MAX_DIFF_RATE)) {
                                dupes.add(rhs);
                                // Null out the slot so this pair is not matched again as a later lhs/rhs.
                                seqs.set(j, null);
                            }
                        }
                        if (!dupes.isEmpty()) {
                            dupes.add(lhs);
                            final int duplicateCount = dupes.size();
                            duplicationHisto.increment(duplicateCount);
                            final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
                            for (final boolean b : flags) {
                                if (b) {
                                    opticalHisto.increment(duplicateCount);
                                }
                            }
                        } else {
                            duplicationHisto.increment(1);
                        }
                    }
                }
                ++groupsProcessed;
                // Emit a progress line at most once a minute.
                if (lastLogTime < System.currentTimeMillis() - 60000) {
                    logger.info("Processed " + groupsProcessed + " groups.");
                    lastLogTime = System.currentTimeMillis();
                }
            }
        }
        sorter.cleanup();
        final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
        for (final String library : duplicationHistosByLibrary.keySet()) {
            final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
            final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
            final DuplicationMetrics metrics = new DuplicationMetrics();
            metrics.LIBRARY = library;
            // Filter out any bins that have only a single entry in them and calculate the
            // pair, duplicate and optical-duplicate totals from the remaining bins.
            for (final Integer bin : duplicationHisto.keySet()) {
                final double duplicateGroups = duplicationHisto.get(bin).getValue();
                final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
                if (duplicateGroups > 1) {
                    metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
                    metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
                    metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
                }
            }
            metrics.calculateDerivedMetrics();
            file.addMetric(metrics);
            file.addHistogram(duplicationHisto);
        }
        file.write(OUTPUT);
    }
    return null;
}
use of htsjdk.samtools.SAMRecord in project gatk by broadinstitute.
the class ReplaceSamHeader method standardReheader.
/**
 * Copies every record from INPUT to OUTPUT, setting the replacement header on each record
 * before it is written.
 *
 * @param replacementHeader header used for the output writer and attached to each record
 * @throws UserException if the sort order of INPUT's header and of {@code replacementHeader} differ
 */
private void standardReheader(final SAMFileHeader replacementHeader) {
    final SamReader recordReader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(ValidationStringency.SILENT).open(INPUT);
    try {
        if (replacementHeader.getSortOrder() != recordReader.getFileHeader().getSortOrder()) {
            throw new UserException("Sort orders of INPUT (" + recordReader.getFileHeader().getSortOrder().name() + ") and HEADER (" + replacementHeader.getSortOrder().name() + ") do not agree.");
        }
        try (final SAMFileWriter writer = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, replacementHeader, true)) {
            final ProgressLogger progress = new ProgressLogger(logger);
            for (final SAMRecord rec : recordReader) {
                rec.setHeaderStrict(replacementHeader);
                writer.addAlignment(rec);
                progress.record(rec);
            }
        }
    } finally {
        // Close the reader on every path, including the sort-order mismatch above.
        CloserUtil.close(recordReader);
    }
}
use of htsjdk.samtools.SAMRecord in project gatk by broadinstitute.
the class RevertOriginalBaseQualitiesAndAddMateCigar method doWork.
/**
 * Optionally restores original base qualities on every record of INPUT, sorts the records by
 * query name, fills in mate information (including mate cigar) and writes them to OUTPUT.
 * Returns early, writing nothing, when {@code canSkipSAMFile} reports the file can be skipped.
 *
 * @return always {@code null}
 */
@Override
public Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    boolean foundPairedMappedReads = false;
    // Check if we can skip this file since it does not have OQ tags and the mate cigar tag is already there.
    final CanSkipSamFile skipSamFile = RevertOriginalBaseQualitiesAndAddMateCigar.canSkipSAMFile(INPUT, MAX_RECORDS_TO_EXAMINE, RESTORE_ORIGINAL_QUALITIES, REFERENCE_SEQUENCE);
    logger.info(skipSamFile.getMessage(MAX_RECORDS_TO_EXAMINE));
    if (skipSamFile.canSkip()) {
        return null;
    }
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).enable(SamReaderFactory.Option.EAGERLY_DECODE).open(INPUT);
    try {
        final SAMFileHeader inHeader = in.getFileHeader();
        // Build the output writer based on the correct sort order
        final SAMFileHeader outHeader = ReadUtils.cloneSAMFileHeader(inHeader);
        // When no sort order was requested, use the same as the input
        if (null == SORT_ORDER) {
            this.SORT_ORDER = inHeader.getSortOrder();
        }
        outHeader.setSortOrder(SORT_ORDER);
        try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, false)) {
            // Iterate over the records, revert original base qualities, and push them into a SortingCollection by queryname
            final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
            final ProgressLogger revertingProgress = new ProgressLogger(logger, 1000000, " reverted OQs");
            int numOriginalQualitiesRestored = 0;
            for (final SAMRecord record : in) {
                // Clean up reads that map off the end of the reference
                AbstractAlignmentMerger.createNewCigarsIfMapsOffEndOfReference(record);
                if (RESTORE_ORIGINAL_QUALITIES && null != record.getOriginalBaseQualities()) {
                    // revert the original base qualities
                    record.setBaseQualities(record.getOriginalBaseQualities());
                    record.setOriginalBaseQualities(null);
                    numOriginalQualitiesRestored++;
                }
                if (!foundPairedMappedReads && record.getReadPairedFlag() && !record.getReadUnmappedFlag()) {
                    foundPairedMappedReads = true;
                }
                revertingProgress.record(record);
                sorter.add(record);
            }
            logger.info("Reverted the original base qualities for " + numOriginalQualitiesRestored + " records");
            // Iterate through the sorting collection output:
            //   1. Set mate cigar string and mate information
            //   2. Push each record into the SAMFileWriter
            try (final SamPairUtil.SetMateInfoIterator sorterIterator = new SamPairUtil.SetMateInfoIterator(sorter.iterator(), true)) {
                final ProgressLogger sorterProgress = new ProgressLogger(logger, 1000000, " mate cigars added");
                while (sorterIterator.hasNext()) {
                    final SAMRecord record = sorterIterator.next();
                    out.addAlignment(record);
                    sorterProgress.record(record);
                }
                // The writer is closed once, by the enclosing try-with-resources.
                logger.info("Updated " + sorterIterator.getNumMateCigarsAdded() + " records with mate cigar");
                if (!foundPairedMappedReads) {
                    logger.info("Did not find any paired mapped reads.");
                }
            }
        }
    } finally {
        // Single close point for the reader, reached on both normal and exceptional exit.
        CloserUtil.close(in);
    }
    return null;
}
use of htsjdk.samtools.SAMRecord in project gatk by broadinstitute.
the class RevertOriginalBaseQualitiesAndAddMateCigar method canSkipSAMFile.
/**
 * Checks if we can skip the SAM/BAM file when reverting original base qualities and adding mate cigars.
 *
 * <p>Records are examined in file order until one of them decides the outcome, the file runs
 * out, or {@code maxRecordsToExamine} records have been looked at.
 *
 * @param inputFile the SAM/BAM input file
 * @param maxRecordsToExamine the maximum number of records to examine before quitting
 * @param revertOriginalBaseQualities true if we are to revert original base qualities, false otherwise
 * @param referenceFasta reference handed to the reader factory when opening {@code inputFile}
 * @return whether we can skip or not, and the explanation why.
 */
public static CanSkipSamFile canSkipSAMFile(final File inputFile, final int maxRecordsToExamine, boolean revertOriginalBaseQualities, final File referenceFasta) {
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).enable(SamReaderFactory.Option.EAGERLY_DECODE).open(inputFile);
    try {
        final Iterator<SAMRecord> iterator = in.iterator();
        int numRecordsExamined = 0;
        CanSkipSamFile returnType = CanSkipSamFile.FOUND_NO_EVIDENCE;
        while (iterator.hasNext() && numRecordsExamined < maxRecordsToExamine) {
            final SAMRecord record = iterator.next();
            if (revertOriginalBaseQualities && null != record.getOriginalBaseQualities()) {
                // has OQ that we were asked to revert: the file must be processed
                returnType = CanSkipSamFile.CANNOT_SKIP_FOUND_OQ;
                break;
            }
            // check if mate pair and its mate is mapped
            if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
                if (null == SAMUtils.getMateCigar(record)) {
                    // has no MC: the file must be processed
                    returnType = CanSkipSamFile.CANNOT_SKIP_FOUND_NO_MC;
                } else {
                    // has MC, and no record so far had an OQ to revert: nothing to do
                    returnType = CanSkipSamFile.CAN_SKIP;
                }
                break;
            }
            numRecordsExamined++;
        }
        // no more records anyhow, so we can skip
        if (!iterator.hasNext() && CanSkipSamFile.FOUND_NO_EVIDENCE == returnType) {
            returnType = CanSkipSamFile.CAN_SKIP;
        }
        return returnType;
    } finally {
        // Close the reader even if decoding a record throws.
        CloserUtil.close(in);
    }
}
Aggregations