use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class AbstractAlignmentMerger method mergeAlignment.
/**
* /**
* Merges the alignment data with the non-aligned records from the source BAM file.
*/
public void mergeAlignment(final File referenceFasta) {
// Open the file of unmapped records and write the read groups to the the header for the merged file
final SamReader unmappedSam = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(this.unmappedBamFile);
final CloseableIterator<SAMRecord> unmappedIterator = unmappedSam.iterator();
this.header.setReadGroups(unmappedSam.getFileHeader().getReadGroups());
int aligned = 0;
int unmapped = 0;
// Get the aligned records and set up the first one
alignedIterator = new MultiHitAlignedReadIterator(new FilteringSamIterator(getQuerynameSortedAlignedRecords(), alignmentFilter), primaryAlignmentSelectionStrategy);
HitsForInsert nextAligned = nextAligned();
// sets the program group
if (getProgramRecord() != null) {
for (final SAMProgramRecord pg : unmappedSam.getFileHeader().getProgramRecords()) {
if (pg.getId().equals(getProgramRecord().getId())) {
throw new GATKException("Program Record ID already in use in unmapped BAM file.");
}
}
}
// Create the sorting collection that will write the records in the coordinate order
// to the final bam file
final SortingCollection<SAMRecord> sorted = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator(), MAX_RECORDS_IN_RAM);
while (unmappedIterator.hasNext()) {
// Load next unaligned read or read pair.
final SAMRecord rec = unmappedIterator.next();
rec.setHeaderStrict(this.header);
maybeSetPgTag(rec);
final SAMRecord secondOfPair;
if (rec.getReadPairedFlag()) {
secondOfPair = unmappedIterator.next();
secondOfPair.setHeaderStrict(this.header);
maybeSetPgTag(secondOfPair);
// Validate that paired reads arrive as first of pair followed by second of pair
if (!rec.getReadName().equals(secondOfPair.getReadName()))
throw new GATKException("Second read from pair not found in unmapped bam: " + rec.getReadName() + ", " + secondOfPair.getReadName());
if (!rec.getFirstOfPairFlag())
throw new GATKException("First record in unmapped bam is not first of pair: " + rec.getReadName());
if (!secondOfPair.getReadPairedFlag())
throw new GATKException("Second record in unmapped bam is not marked as paired: " + secondOfPair.getReadName());
if (!secondOfPair.getSecondOfPairFlag())
throw new GATKException("Second record in unmapped bam is not second of pair: " + secondOfPair.getReadName());
} else {
secondOfPair = null;
}
// See if there are alignments for current unaligned read or read pair.
if (nextAligned != null && rec.getReadName().equals(nextAligned.getReadName())) {
// If there are multiple alignments for a read (pair), then the unaligned SAMRecord must be cloned
// before copying info from the aligned record to the unaligned.
final boolean clone = nextAligned.numHits() > 1 || nextAligned.hasSupplementalHits();
SAMRecord r1Primary = null, r2Primary = null;
if (rec.getReadPairedFlag()) {
for (int i = 0; i < nextAligned.numHits(); ++i) {
// firstAligned or secondAligned may be null, if there wasn't an alignment for the end,
// or if the alignment was rejected by ignoreAlignment.
final SAMRecord firstAligned = nextAligned.getFirstOfPair(i);
final SAMRecord secondAligned = nextAligned.getSecondOfPair(i);
final boolean isPrimaryAlignment = (firstAligned != null && !firstAligned.isSecondaryOrSupplementary()) || (secondAligned != null && !secondAligned.isSecondaryOrSupplementary());
final SAMRecord firstToWrite;
final SAMRecord secondToWrite;
if (clone) {
firstToWrite = ReadUtils.cloneSAMRecord(rec);
secondToWrite = ReadUtils.cloneSAMRecord(secondOfPair);
} else {
firstToWrite = rec;
secondToWrite = secondOfPair;
}
// If these are the primary alignments then stash them for use on any supplemental alignments
if (isPrimaryAlignment) {
r1Primary = firstToWrite;
r2Primary = secondToWrite;
}
transferAlignmentInfoToPairedRead(firstToWrite, secondToWrite, firstAligned, secondAligned);
// Only write unmapped read when it has the mate info from the primary alignment.
if (!firstToWrite.getReadUnmappedFlag() || isPrimaryAlignment) {
addIfNotFiltered(sorted, firstToWrite);
if (firstToWrite.getReadUnmappedFlag())
++unmapped;
else
++aligned;
}
if (!secondToWrite.getReadUnmappedFlag() || isPrimaryAlignment) {
addIfNotFiltered(sorted, secondToWrite);
if (!secondToWrite.getReadUnmappedFlag())
++aligned;
else
++unmapped;
}
}
// Take all of the supplemental reads which had been stashed and add them (as appropriate) to sorted
for (final boolean isRead1 : new boolean[] { true, false }) {
final List<SAMRecord> supplementals = isRead1 ? nextAligned.getSupplementalFirstOfPairOrFragment() : nextAligned.getSupplementalSecondOfPair();
final SAMRecord sourceRec = isRead1 ? rec : secondOfPair;
final SAMRecord matePrimary = isRead1 ? r2Primary : r1Primary;
for (final SAMRecord supp : supplementals) {
final SAMRecord out = ReadUtils.cloneSAMRecord(sourceRec);
transferAlignmentInfoToFragment(out, supp);
if (matePrimary != null)
SamPairUtil.setMateInformationOnSupplementalAlignment(out, matePrimary, addMateCigar);
++aligned;
addIfNotFiltered(sorted, out);
}
}
} else {
for (int i = 0; i < nextAligned.numHits(); ++i) {
final SAMRecord recToWrite = clone ? ReadUtils.cloneSAMRecord(rec) : rec;
transferAlignmentInfoToFragment(recToWrite, nextAligned.getFragment(i));
addIfNotFiltered(sorted, recToWrite);
if (recToWrite.getReadUnmappedFlag())
++unmapped;
else
++aligned;
}
// Take all of the supplemental reads which had been stashed and add them (as appropriate) to sorted
for (final SAMRecord supplementalRec : nextAligned.getSupplementalFirstOfPairOrFragment()) {
final SAMRecord recToWrite = ReadUtils.cloneSAMRecord(rec);
transferAlignmentInfoToFragment(recToWrite, supplementalRec);
addIfNotFiltered(sorted, recToWrite);
++aligned;
}
}
nextAligned = nextAligned();
} else {
// There was no alignment for this read or read pair.
if (nextAligned != null && SAMRecordQueryNameComparator.compareReadNames(rec.getReadName(), nextAligned.getReadName()) > 0) {
throw new IllegalStateException("Aligned record iterator (" + nextAligned.getReadName() + ") is behind the unmapped reads (" + rec.getReadName() + ")");
}
// No matching read from alignedIterator -- just output reads as is.
if (!alignedReadsOnly) {
sorted.add(rec);
++unmapped;
if (secondOfPair != null) {
sorted.add(secondOfPair);
++unmapped;
}
}
}
}
unmappedIterator.close();
Utils.validate(!alignedIterator.hasNext(), () -> "Reads remaining on alignment iterator: " + alignedIterator.next().getReadName() + "!");
alignedIterator.close();
// Write the records to the output file in specified sorted order,
header.setSortOrder(this.sortOrder);
final boolean presorted = this.sortOrder == SAMFileHeader.SortOrder.coordinate;
final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, presorted, this.targetBamFile, referenceFasta);
writer.setProgressLogger(new ProgressLogger(logger, (int) 1e7, "Wrote", "records from a sorting collection"));
final ProgressLogger finalProgress = new ProgressLogger(logger, 10000000, "Written in coordinate order to output", "records");
for (final SAMRecord rec : sorted) {
if (!rec.getReadUnmappedFlag()) {
if (refSeq != null) {
final byte[] referenceBases = refSeq.get(sequenceDictionary.getSequenceIndex(rec.getReferenceName())).getBases();
rec.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(rec, referenceBases, 0, bisulfiteSequence));
if (rec.getBaseQualities() != SAMRecord.NULL_QUALS) {
rec.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(rec, referenceBases, 0, bisulfiteSequence));
}
}
}
writer.addAlignment(rec);
finalProgress.record(rec);
}
writer.close();
sorted.cleanup();
CloserUtil.close(unmappedSam);
logger.info("Wrote " + aligned + " alignment records and " + (alignedReadsOnly ? 0 : unmapped) + " unmapped reads.");
}
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class SinglePassSamProgram method makeItSo.
public static void makeItSo(final File input, final File referenceSequence, final boolean assumeSorted, final long stopAfter, final Collection<SinglePassSamProgram> programs) {
// Setup the standard inputs
IOUtil.assertFileIsReadable(input);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(referenceSequence).open(input);
// Optionally load up the reference sequence and double check sequence dictionaries
final ReferenceSequenceFileWalker walker;
if (referenceSequence == null) {
walker = null;
} else {
IOUtil.assertFileIsReadable(referenceSequence);
walker = new ReferenceSequenceFileWalker(referenceSequence);
if (!in.getFileHeader().getSequenceDictionary().isEmpty()) {
SequenceUtil.assertSequenceDictionariesEqual(in.getFileHeader().getSequenceDictionary(), walker.getSequenceDictionary());
}
}
// Check on the sort order of the BAM file
{
final SortOrder sort = in.getFileHeader().getSortOrder();
if (sort != SortOrder.coordinate) {
if (assumeSorted) {
logger.warn("File reports sort order '" + sort + "', assuming it's coordinate sorted anyway.");
} else {
throw new UserException("File " + input.getAbsolutePath() + " should be coordinate sorted but " + "the header says the sort order is " + sort + ". If you believe the file " + "to be coordinate sorted you may pass ASSUME_SORTED=true");
}
}
}
// Call the abstract setup method!
boolean anyUseNoRefReads = false;
for (final SinglePassSamProgram program : programs) {
program.setup(in.getFileHeader(), input);
anyUseNoRefReads = anyUseNoRefReads || program.usesNoRefReads();
}
final ProgressLogger progress = new ProgressLogger(logger);
for (final SAMRecord rec : in) {
final ReferenceSequence ref;
if (walker == null || rec.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
ref = null;
} else {
ref = walker.get(rec.getReferenceIndex());
}
for (final SinglePassSamProgram program : programs) {
program.acceptRead(rec, ref);
}
progress.record(rec);
// See if we need to terminate early?
if (stopAfter > 0 && progress.getCount() >= stopAfter) {
break;
}
// And see if we're into the unmapped reads at the end
if (!anyUseNoRefReads && rec.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
break;
}
}
CloserUtil.close(in);
for (final SinglePassSamProgram program : programs) {
program.finish();
}
}
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class BedToIntervalList method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
IOUtil.assertFileIsWritable(OUTPUT);
try {
final SAMFileHeader header = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY);
final IntervalList intervalList = new IntervalList(header);
/**
* NB: BED is zero-based, but a BEDCodec by default (since it is returns tribble Features) has an offset of one,
* so it returns 1-based starts. Ugh. Set to zero.
*/
final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(BEDCodec.StartOffset.ZERO), false);
final CloseableTribbleIterator<BEDFeature> iterator = bedReader.iterator();
final ProgressLogger progressLogger = new ProgressLogger(logger, (int) 1e6);
while (iterator.hasNext()) {
final BEDFeature bedFeature = iterator.next();
final String sequenceName = bedFeature.getContig();
/**
* NB: BED is zero-based, so we need to add one here to make it one-based. Please observe we set the start
* offset to zero when creating the BEDCodec.
*/
final int start = bedFeature.getStart() + 1;
/**
* NB: BED is 0-based OPEN (which, for the end is equivalent to 1-based closed).
*/
final int end = bedFeature.getEnd();
// NB: do not use an empty name within an interval
String name = bedFeature.getName();
if (name.isEmpty())
name = null;
final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);
// Do some validation
if (null == sequenceRecord) {
throw new GATKException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
} else if (start < 1) {
throw new GATKException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
} else if (sequenceRecord.getSequenceLength() < start) {
throw new GATKException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
} else if (end < 1) {
throw new GATKException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
} else if (sequenceRecord.getSequenceLength() < end) {
throw new GATKException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
} else if (end < start - 1) {
throw new GATKException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
}
final Interval interval = new Interval(sequenceName, start, end, bedFeature.getStrand() == Strand.POSITIVE, name);
intervalList.add(interval);
progressLogger.record(sequenceName, start);
}
CloserUtil.close(bedReader);
// Sort and write the output
intervalList.uniqued().write(OUTPUT);
} catch (final IOException e) {
throw new RuntimeException(e);
}
return null;
}
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class EstimateLibraryComplexity method doWork.
/**
* Method that does most of the work. Reads through the input BAM file and extracts the
* read sequences of each read pair and sorts them via a SortingCollection. Then traverses
* the sorted reads and looks at small groups at a time to find duplicates.
*/
@Override
protected Object doWork() {
for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
logger.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
final int recordsRead = 0;
final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
// Loop through the input files and pick out the read sequences etc.
final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
for (final File f : INPUT) {
final Map<String, PairedReadSequence> pendingByName = new HashMap<>();
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
readGroups.addAll(in.getFileHeader().getReadGroups());
for (final SAMRecord rec : in) {
if (!rec.getReadPairedFlag())
continue;
if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
continue;
}
PairedReadSequence prs = pendingByName.remove(rec.getReadName());
if (prs == null) {
// Make a new paired read object and add RG and physical location information to it
prs = new PairedReadSequence();
if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg != null)
prs.setReadGroup((short) readGroups.indexOf(rg));
}
pendingByName.put(rec.getReadName(), prs);
}
// Read passes quality check if both ends meet the mean quality criteria
final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY);
prs.qualityOk = prs.qualityOk && passesQualityCheck;
// Get the bases and restore them to their original orientation if necessary
final byte[] bases = rec.getReadBases();
if (rec.getReadNegativeStrandFlag())
SequenceUtil.reverseComplement(bases);
if (rec.getFirstOfPairFlag()) {
prs.read1 = bases;
} else {
prs.read2 = bases;
}
if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
sorter.add(prs);
}
progress.record(rec);
}
CloserUtil.close(in);
}
logger.info("Finished reading - moving on to scanning for duplicates.");
// Now go through the sorted reads and attempt to find duplicates
try (final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<>(sorter.iterator())) {
final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<>();
final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<>();
int groupsProcessed = 0;
long lastLogTime = System.currentTimeMillis();
final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4.0, (double) MIN_IDENTICAL_BASES * 2));
while (iterator.hasNext()) {
// Get the next group and split it apart by library
final List<PairedReadSequence> group = getNextGroup(iterator);
if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
final PairedReadSequence prs = group.get(0);
logger.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES));
} else {
final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
// Now process the reads by library
for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
final String library = entry.getKey();
final List<PairedReadSequence> seqs = entry.getValue();
Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
if (duplicationHisto == null) {
duplicationHisto = new Histogram<>("duplication_group_count", library);
opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
duplicationHistosByLibrary.put(library, duplicationHisto);
opticalHistosByLibrary.put(library, opticalHisto);
}
// Figure out if any reads within this group are duplicates of one another
for (int i = 0; i < seqs.size(); ++i) {
final PairedReadSequence lhs = seqs.get(i);
if (lhs == null)
continue;
final List<PairedReadSequence> dupes = new ArrayList<>();
for (int j = i + 1; j < seqs.size(); ++j) {
final PairedReadSequence rhs = seqs.get(j);
if (rhs == null)
continue;
if (matches(lhs, rhs, MAX_DIFF_RATE)) {
dupes.add(rhs);
seqs.set(j, null);
}
}
if (!dupes.isEmpty()) {
dupes.add(lhs);
final int duplicateCount = dupes.size();
duplicationHisto.increment(duplicateCount);
final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
for (final boolean b : flags) {
if (b)
opticalHisto.increment(duplicateCount);
}
} else {
duplicationHisto.increment(1);
}
}
}
++groupsProcessed;
if (lastLogTime < System.currentTimeMillis() - 60000) {
logger.info("Processed " + groupsProcessed + " groups.");
lastLogTime = System.currentTimeMillis();
}
}
}
sorter.cleanup();
final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
for (final String library : duplicationHistosByLibrary.keySet()) {
final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
final DuplicationMetrics metrics = new DuplicationMetrics();
metrics.LIBRARY = library;
// Filter out any bins that have only a single entry in them and calcu
for (final Integer bin : duplicationHisto.keySet()) {
final double duplicateGroups = duplicationHisto.get(bin).getValue();
final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
if (duplicateGroups > 1) {
metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
}
}
metrics.calculateDerivedMetrics();
file.addMetric(metrics);
file.addHistogram(duplicationHisto);
}
file.write(OUTPUT);
}
return null;
}
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class ReplaceSamHeader method standardReheader.
private void standardReheader(final SAMFileHeader replacementHeader) {
final SamReader recordReader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(ValidationStringency.SILENT).open(INPUT);
if (replacementHeader.getSortOrder() != recordReader.getFileHeader().getSortOrder()) {
throw new UserException("Sort orders of INPUT (" + recordReader.getFileHeader().getSortOrder().name() + ") and HEADER (" + replacementHeader.getSortOrder().name() + ") do not agree.");
}
try (final SAMFileWriter writer = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, replacementHeader, true)) {
final ProgressLogger progress = new ProgressLogger(logger);
for (final SAMRecord rec : recordReader) {
rec.setHeaderStrict(replacementHeader);
writer.addAlignment(rec);
progress.record(rec);
}
}
CloserUtil.close(recordReader);
}
Aggregations