use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class EstimateLibraryComplexity method doWork.
/**
* Method that does most of the work. Reads through the input BAM file and extracts the
* read sequences of each read pair and sorts them via a SortingCollection. Then traverses
* the sorted reads and looks at small groups at a time to find duplicates.
*/
@Override
protected Object doWork() {
for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
final int recordsRead = 0;
final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
// Loop through the input files and pick out the read sequences etc.
final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
for (final File f : INPUT) {
final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
readGroups.addAll(in.getFileHeader().getReadGroups());
for (final SAMRecord rec : in) {
if (!rec.getReadPairedFlag())
continue;
if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
continue;
}
PairedReadSequence prs = pendingByName.remove(rec.getReadName());
if (prs == null) {
// Make a new paired read object and add RG and physical location information to it
prs = new PairedReadSequence();
if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg != null)
prs.setReadGroup((short) readGroups.indexOf(rg));
}
pendingByName.put(rec.getReadName(), prs);
}
// Read passes quality check if both ends meet the mean quality criteria
final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
prs.qualityOk = prs.qualityOk && passesQualityCheck;
// Get the bases and restore them to their original orientation if necessary
final byte[] bases = rec.getReadBases();
if (rec.getReadNegativeStrandFlag())
SequenceUtil.reverseComplement(bases);
if (rec.getFirstOfPairFlag()) {
prs.read1 = bases;
} else {
prs.read2 = bases;
}
if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
sorter.add(prs);
}
progress.record(rec);
}
CloserUtil.close(in);
}
logger.info("Finished reading - moving on to scanning for duplicates.");
// Now go through the sorted reads and attempt to find duplicates
try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
int groupsProcessed = 0;
long lastLogTime = System.currentTimeMillis();
final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
while (iterator.hasNext()) {
// Get the next group and split it apart by library
final List<PairedReadSequence> group = getNextGroup(iterator);
if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
final PairedReadSequence prs = group.get(0);
logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
} else {
final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
// Now process the reads by library
for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
final String library = entry.getKey();
final List<PairedReadSequence> seqs = entry.getValue();
Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
if (duplicationHisto == null) {
duplicationHisto = new Histogram<>("duplication_group_count", library);
opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
duplicationHistosByLibrary.put(library, duplicationHisto);
opticalHistosByLibrary.put(library, opticalHisto);
}
// Figure out if any reads within this group are duplicates of one another
for (int i = 0; i < seqs.size(); ++i) {
final PairedReadSequence lhs = seqs.get(i);
if (lhs == null)
continue;
final List<PairedReadSequence> dupes = new ArrayList<>();
for (int j = i + 1; j < seqs.size(); ++j) {
final PairedReadSequence rhs = seqs.get(j);
if (rhs == null)
continue;
if (matches(lhs, rhs, MAX_DIFF_RATE)) {
dupes.add(rhs);
seqs.set(j, null);
}
}
if (!dupes.isEmpty()) {
dupes.add(lhs);
final int duplicateCount = dupes.size();
duplicationHisto.increment(duplicateCount);
final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
for (final boolean b : flags) {
if (b)
opticalHisto.increment(duplicateCount);
}
} else {
duplicationHisto.increment(1);
}
}
}
++groupsProcessed;
if (lastLogTime < System.currentTimeMillis() - 60000) {
logger.info("Processed " + groupsProcessed + " groups.");
lastLogTime = System.currentTimeMillis();
}
}
}
sorter.cleanup();
final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
for (final String library : duplicationHistosByLibrary.keySet()) {
final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
final DuplicationMetrics metrics = new DuplicationMetrics();
metrics.LIBRARY = library;
// Filter out any bins that have only a single entry in them and calcu
for (final Integer bin : duplicationHisto.keySet()) {
final double duplicateGroups = duplicationHisto.get(bin).getValue();
final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
if (duplicateGroups > 1) {
metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
}
}
metrics.calculateDerivedMetrics();
file.addMetric(metrics);
file.addHistogram(duplicationHisto);
}
file.write(OUTPUT);
}
return null;
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class ReplaceSamHeader method standardReheader.
private void standardReheader(final SAMFileHeader replacementHeader) {
final SamReader recordReader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(ValidationStringency.SILENT).open(INPUT);
if (replacementHeader.getSortOrder() != recordReader.getFileHeader().getSortOrder()) {
throw new UserException("Sort orders of INPUT (" + recordReader.getFileHeader().getSortOrder().name() + ") and HEADER (" + replacementHeader.getSortOrder().name() + ") do not agree.");
}
try (final SAMFileWriter writer = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, replacementHeader, true)) {
final ProgressLogger progress = new ProgressLogger(logger);
for (final SAMRecord rec : recordReader) {
rec.setHeaderStrict(replacementHeader);
writer.addAlignment(rec);
progress.record(rec);
}
}
CloserUtil.close(recordReader);
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class RevertOriginalBaseQualitiesAndAddMateCigar method doWork.
@Override
public Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
boolean foundPairedMappedReads = false;
// Check if we can skip this file since it does not have OQ tags and the mate cigar tag is already there.
final CanSkipSamFile skipSamFile = RevertOriginalBaseQualitiesAndAddMateCigar.canSkipSAMFile(INPUT, MAX_RECORDS_TO_EXAMINE, RESTORE_ORIGINAL_QUALITIES, REFERENCE_SEQUENCE);
logger.info(skipSamFile.getMessage(MAX_RECORDS_TO_EXAMINE));
if (skipSamFile.canSkip())
return null;
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).enable(SamReaderFactory.Option.EAGERLY_DECODE).open(INPUT);
final SAMFileHeader inHeader = in.getFileHeader();
// Build the output writer based on the correct sort order
final SAMFileHeader outHeader = ReadUtils.cloneSAMFileHeader(inHeader);
// same as the input
if (null == SORT_ORDER)
this.SORT_ORDER = inHeader.getSortOrder();
outHeader.setSortOrder(SORT_ORDER);
try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, false)) {
// Iterate over the records, revert original base qualities, and push them into a SortingCollection by queryname
final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
final ProgressLogger revertingProgress = new ProgressLogger(logger, 1000000, " reverted OQs");
int numOriginalQualitiesRestored = 0;
for (final SAMRecord record : in) {
// Clean up reads that map off the end of the reference
AbstractAlignmentMerger.createNewCigarsIfMapsOffEndOfReference(record);
if (RESTORE_ORIGINAL_QUALITIES && null != record.getOriginalBaseQualities()) {
// revert the original base qualities
record.setBaseQualities(record.getOriginalBaseQualities());
record.setOriginalBaseQualities(null);
numOriginalQualitiesRestored++;
}
if (!foundPairedMappedReads && record.getReadPairedFlag() && !record.getReadUnmappedFlag())
foundPairedMappedReads = true;
revertingProgress.record(record);
sorter.add(record);
}
CloserUtil.close(in);
logger.info("Reverted the original base qualities for " + numOriginalQualitiesRestored + " records");
/**
* Iterator through sorting collection output
* 1. Set mate cigar string and mate information
* 2. push record into SAMFileWriter to the output
*/
try (final SamPairUtil.SetMateInfoIterator sorterIterator = new SamPairUtil.SetMateInfoIterator(sorter.iterator(), true)) {
final ProgressLogger sorterProgress = new ProgressLogger(logger, 1000000, " mate cigars added");
while (sorterIterator.hasNext()) {
final SAMRecord record = sorterIterator.next();
out.addAlignment(record);
sorterProgress.record(record);
}
CloserUtil.close(out);
logger.info("Updated " + sorterIterator.getNumMateCigarsAdded() + " records with mate cigar");
if (!foundPairedMappedReads)
logger.info("Did not find any paired mapped reads.");
}
}
return null;
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class RevertOriginalBaseQualitiesAndAddMateCigar method canSkipSAMFile.
/**
* Checks if we can skip the SAM/BAM file when reverting origin base qualities and adding mate cigars.
*
* @param inputFile the SAM/BAM input file
* @param maxRecordsToExamine the maximum number of records to examine before quitting
* @param revertOriginalBaseQualities true if we are to revert original base qualities, false otherwise
* @return whether we can skip or not, and the explanation why.
*/
public static CanSkipSamFile canSkipSAMFile(final File inputFile, final int maxRecordsToExamine, boolean revertOriginalBaseQualities, final File referenceFasta) {
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).enable(SamReaderFactory.Option.EAGERLY_DECODE).open(inputFile);
final Iterator<SAMRecord> iterator = in.iterator();
int numRecordsExamined = 0;
CanSkipSamFile returnType = CanSkipSamFile.FOUND_NO_EVIDENCE;
while (iterator.hasNext() && numRecordsExamined < maxRecordsToExamine) {
final SAMRecord record = iterator.next();
if (revertOriginalBaseQualities && null != record.getOriginalBaseQualities()) {
// has OQ, break and return case #2
returnType = CanSkipSamFile.CANNOT_SKIP_FOUND_OQ;
break;
}
// check if mate pair and its mate is mapped
if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
if (null == SAMUtils.getMateCigar(record)) {
// has no MC, break and return case #2
returnType = CanSkipSamFile.CANNOT_SKIP_FOUND_NO_MC;
break;
} else {
// has MC, previously checked that it does not have OQ, break and return case #1
returnType = CanSkipSamFile.CAN_SKIP;
break;
}
}
numRecordsExamined++;
}
// no more records anyhow, so we can skip
if (!iterator.hasNext() && CanSkipSamFile.FOUND_NO_EVIDENCE == returnType) {
returnType = CanSkipSamFile.CAN_SKIP;
}
CloserUtil.close(in);
return returnType;
}
use of htsjdk.samtools.SamReader in project gatk by broadinstitute.
the class RevertSam method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final boolean sanitizing = SANITIZE;
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SAMFileHeader inHeader = in.getFileHeader();
// If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
// groups have the same values.
final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
boolean allSampleAliasesIdentical = true;
boolean allLibraryNamesIdentical = true;
for (int i = 1; i < rgs.size(); i++) {
if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
allSampleAliasesIdentical = false;
}
if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
allLibraryNamesIdentical = false;
}
}
if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
throw new UserException("Read groups have multiple values for sample. " + "A value for SAMPLE_ALIAS cannot be supplied.");
}
if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
throw new UserException("Read groups have multiple values for library name. " + "A value for library name cannot be supplied.");
}
}
////////////////////////////////////////////////////////////////////////////
// Build the output writer with an appropriate header based on the options
////////////////////////////////////////////////////////////////////////////
final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
final SAMFileHeader outHeader = new SAMFileHeader();
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
if (SAMPLE_ALIAS != null) {
rg.setSample(SAMPLE_ALIAS);
}
if (LIBRARY_NAME != null) {
rg.setLibrary(LIBRARY_NAME);
}
outHeader.addReadGroup(rg);
}
outHeader.setSortOrder(SORT_ORDER);
if (!REMOVE_ALIGNMENT_INFORMATION) {
outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
outHeader.setProgramRecords(inHeader.getProgramRecords());
}
final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
////////////////////////////////////////////////////////////////////////////
// Build a sorting collection to use if we are sanitizing
////////////////////////////////////////////////////////////////////////////
final SortingCollection<SAMRecord> sorter;
if (sanitizing) {
sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
} else {
sorter = null;
}
final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
for (final SAMRecord rec : in) {
// Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary())
continue;
// Actually to the reverting of the remaining records
revertSamRecord(rec);
if (sanitizing)
sorter.add(rec);
else
out.addAlignment(rec);
progress.record(rec);
}
////////////////////////////////////////////////////////////////////////////
if (!sanitizing) {
out.close();
} else {
long total = 0, discarded = 0;
final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
// Figure out the quality score encoding scheme for each read group.
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SamRecordFilter filter = new SamRecordFilter() {
@Override
public boolean filterOut(final SAMRecord rec) {
return !rec.getReadGroup().getId().equals(rg.getId());
}
@Override
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
throw new UnsupportedOperationException();
}
};
readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
CloserUtil.close(reader);
}
for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
}
if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
return -1;
}
final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
readNameLoop: while (iterator.hasNext()) {
final List<SAMRecord> recs = fetchByReadName(iterator);
total += recs.size();
// Check that all the reads have bases and qualities of the same length
for (final SAMRecord rec : recs) {
if (rec.getReadBases().length != rec.getBaseQualities().length) {
logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
discarded += recs.size();
continue readNameLoop;
}
}
// Check that if the first read is marked as unpaired that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
discarded += recs.size();
continue readNameLoop;
}
// Check that if we have paired reads there is exactly one first of pair and one second of pair
if (recs.get(0).getReadPairedFlag()) {
int firsts = 0, seconds = 0, unpaired = 0;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag())
++unpaired;
if (rec.getFirstOfPairFlag())
++firsts;
if (rec.getSecondOfPairFlag())
++seconds;
}
if (unpaired > 0 || firsts != 1 || seconds != 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
discarded += recs.size();
continue readNameLoop;
}
}
// If we've made it this far spit the records into the output!
for (final SAMRecord rec : recs) {
// The only valid quality score encoding scheme is standard; if it's not standard, change it.
final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
if (!recordFormat.equals(FastqQualityFormat.Standard)) {
final byte[] quals = rec.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
}
rec.setBaseQualities(quals);
}
out.addAlignment(rec);
sanitizerProgress.record(rec);
}
}
out.close();
final double discardRate = discarded / (double) total;
final NumberFormat fmt = new DecimalFormat("0.000%");
logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
if (discarded / (double) total > MAX_DISCARD_FRACTION) {
throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
}
}
CloserUtil.close(in);
return null;
}
Aggregations