use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class MarkDuplicates method doWork.
/**
 * Main work method. Reads the BAM file once and collects sorted information about
 * the 5' ends of both reads of each pair (or the single 5' end for unpaired reads).
 * Then makes a pass through that information to determine duplicates before re-reading the
 * input file and writing it out with the duplication flags set correctly.
 */
@Override
protected Object doWork() {
    IOUtil.assertFilesAreReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(METRICS_FILE);
    reportMemoryStats("Start of doWork");
    logger.info("Reading input file and constructing read end information.");
    buildSortedReadEndLists();
    reportMemoryStats("After buildSortedReadEndLists");
    generateDuplicateIndexes();
    reportMemoryStats("After generateDuplicateIndexes");
    logger.info("Marking " + this.numDuplicateIndices + " records as duplicates.");
    if (this.opticalDuplicatesArgumentCollection.READ_NAME_REGEX == null) {
        logger.warn("Skipped optical duplicate cluster discovery; library size estimation may be inaccurate!");
    } else {
        logger.info("Found " + (this.libraryIdGenerator.getNumberOfOpticalDuplicateClusters()) + " optical duplicate clusters.");
    }
    try (final SamHeaderAndIterator headerAndIterator = openInputs()) {
        final SAMFileHeader header = headerAndIterator.header;
        final SAMFileHeader outputHeader = ReadUtils.cloneSAMFileHeader(header);
        outputHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        for (final String comment : COMMENT) outputHeader.addComment(comment);
        // Key: previous PG ID on a SAM Record (or null). Value: new PG ID to replace it.
        final Map<String, String> chainedPgIds = getChainedPgIds(outputHeader);
        try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outputHeader, true)) {
            // Now copy over the file while marking all the necessary indexes as duplicates
            long recordInFileIndex = 0;
            long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : -1);
            final ProgressLogger progress = new ProgressLogger(logger, (int) 1e7, "Written");
            try (final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator) {
                while (iterator.hasNext()) {
                    final SAMRecord rec = iterator.next();
                    if (!rec.isSecondaryOrSupplementary()) {
                        final String library = LibraryIdGenerator.getLibraryName(header, rec);
                        DuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library);
                        if (metrics == null) {
                            metrics = new DuplicationMetrics();
                            metrics.LIBRARY = library;
                            libraryIdGenerator.addMetricsByLibrary(library, metrics);
                        }
                        // First bring the simple metrics up to date
                        if (rec.getReadUnmappedFlag()) {
                            ++metrics.UNMAPPED_READS;
                        } else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
                            ++metrics.UNPAIRED_READS_EXAMINED;
                        } else {
                            // will need to be divided by 2 at the end
                            ++metrics.READ_PAIRS_EXAMINED;
                        }
                        if (recordInFileIndex == nextDuplicateIndex) {
                            rec.setDuplicateReadFlag(true);
                            // Update the duplication metrics
                            if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
                                ++metrics.UNPAIRED_READ_DUPLICATES;
                            } else {
                                // will need to be divided by 2 at the end
                                ++metrics.READ_PAIR_DUPLICATES;
                            }
                            // Now try and figure out the next duplicate index
                            if (this.duplicateIndexes.hasNext()) {
                                nextDuplicateIndex = this.duplicateIndexes.next();
                            } else {
                                // Only happens once we've marked all the duplicates
                                nextDuplicateIndex = -1;
                            }
                        } else {
                            rec.setDuplicateReadFlag(false);
                        }
                    }
                    recordInFileIndex++;
                    if (!this.REMOVE_DUPLICATES || !rec.getDuplicateReadFlag()) {
                        if (PROGRAM_RECORD_ID != null) {
                            rec.setAttribute(SAMTag.PG.name(), chainedPgIds.get(rec.getStringAttribute(SAMTag.PG.name())));
                        }
                        out.addAlignment(rec);
                        progress.record(rec);
                    }
                }
            }
            this.duplicateIndexes.cleanup();
            reportMemoryStats("Before output close");
        }
        reportMemoryStats("After output close");
    }
    // Write out the metrics
    finalizeAndWriteMetrics(libraryIdGenerator);
    return null;
}
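The marking pass above is essentially a merge of two sorted streams: the records in file order against a sorted iterator of duplicate indexes, with a single lookahead variable (nextDuplicateIndex). Here is a minimal, self-contained sketch of that pattern; the boolean flags and index source are stand-ins, not GATK classes.

import java.util.Iterator;
import java.util.List;

/** Sketch: flag the positions that appear in a sorted stream of duplicate indexes. */
public final class SortedIndexMarkSketch {
    static boolean[] markDuplicates(final int recordCount, final Iterator<Long> sortedDuplicateIndexes) {
        final boolean[] isDuplicate = new boolean[recordCount];
        // Lookahead: the next index to mark, or -1 once the stream is exhausted.
        long next = sortedDuplicateIndexes.hasNext() ? sortedDuplicateIndexes.next() : -1;
        for (long i = 0; i < recordCount; i++) {
            if (i == next) {
                isDuplicate[(int) i] = true; // analogous to rec.setDuplicateReadFlag(true)
                next = sortedDuplicateIndexes.hasNext() ? sortedDuplicateIndexes.next() : -1;
            }
        }
        return isDuplicate;
    }

    public static void main(String[] args) {
        // Records 1 and 3 of five are duplicates.
        final boolean[] flags = markDuplicates(5, List.of(1L, 3L).iterator());
        System.out.println(java.util.Arrays.toString(flags)); // [false, true, false, true, false]
    }
}

Because both sides are already sorted by file index, the pass is O(n) with O(1) extra memory, which is the point of generating the duplicate indexes in sorted order before the second read of the input.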
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class MarkDuplicates method buildSortedReadEndLists.
/**
 * Goes through all the records in a file and generates a set of ReadEndsForMarkDuplicates objects that
 * hold the information needed for duplicate detection (reference index, 5' read coordinate),
 * caching to disk as necessary to sort them.
 */
private void buildSortedReadEndLists() {
    final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / ReadEndsForMarkDuplicates.SIZE_OF);
    logger.info("Will retain up to " + maxInMemory + " data points before spilling to disk.");
    this.pairSort = SortingCollection.newInstance(ReadEndsForMarkDuplicates.class, new ReadEndsForMarkDuplicatesCodec(), new ReadEndsMDComparator(), maxInMemory, TMP_DIR);
    this.fragSort = SortingCollection.newInstance(ReadEndsForMarkDuplicates.class, new ReadEndsForMarkDuplicatesCodec(), new ReadEndsMDComparator(), maxInMemory, TMP_DIR);
    try (final SamHeaderAndIterator headerAndIterator = openInputs()) {
        final SAMFileHeader header = headerAndIterator.header;
        final ReadEndsForMarkDuplicatesMap tmp = new DiskBasedReadEndsForMarkDuplicatesMap(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP);
        long index = 0;
        final ProgressLogger progress = new ProgressLogger(logger, (int) 1e6, "Read");
        final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
        if (null == this.libraryIdGenerator) {
            this.libraryIdGenerator = new LibraryIdGenerator(header);
        }
        while (iterator.hasNext()) {
            final SAMRecord rec = iterator.next();
            // This PG-tag bookkeeping has nothing to do with building sorted ReadEnds lists,
            // but it can be done in the same pass over the input.
            if (PROGRAM_RECORD_ID != null) {
                // Gather all PG IDs seen in merged input files in first pass. These are gathered for two reasons:
                // - to know how many different PG records to create to represent this program invocation.
                // - to know what PG IDs are already used to avoid collisions when creating new ones.
                // Note that if there are one or more records that do not have a PG tag, then a null value
                // will be stored in this set.
                pgIdsSeen.add(rec.getStringAttribute(SAMTag.PG.name()));
            }
            if (rec.getReadUnmappedFlag()) {
                if (rec.getReferenceIndex() == -1) {
                    // When we hit the unmapped reads with no coordinate, no reason to continue.
                    break;
                }
                // If this read is unmapped but sorted with the mapped reads, just skip it.
            } else if (!rec.isSecondaryOrSupplementary()) {
                final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, index, rec);
                this.fragSort.add(fragmentEnd);
                if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
                    final String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName();
                    ReadEndsForMarkDuplicates pairedEnds = tmp.remove(rec.getReferenceIndex(), key);
                    // See if we've already seen the first end or not
                    if (pairedEnds == null) {
                        pairedEnds = buildReadEnds(header, index, rec);
                        tmp.put(pairedEnds.read2ReferenceIndex, key, pairedEnds);
                    } else {
                        final int sequence = fragmentEnd.read1ReferenceIndex;
                        final int coordinate = fragmentEnd.read1Coordinate;
                        // Set orientationForOpticalDuplicates, which always goes by the first then the
                        // second end for the strands; NB: must do this before updating the orientation later.
                        if (rec.getFirstOfPairFlag()) {
                            pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);
                        } else {
                            pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());
                        }
                        // If the second read is actually later, just add the second read data, else flip the reads
                        if (sequence > pairedEnds.read1ReferenceIndex || (sequence == pairedEnds.read1ReferenceIndex && coordinate >= pairedEnds.read1Coordinate)) {
                            pairedEnds.read2ReferenceIndex = sequence;
                            pairedEnds.read2Coordinate = coordinate;
                            pairedEnds.read2IndexInFile = index;
                            pairedEnds.orientation = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());
                        } else {
                            pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
                            pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
                            pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
                            pairedEnds.read1ReferenceIndex = sequence;
                            pairedEnds.read1Coordinate = coordinate;
                            pairedEnds.read1IndexInFile = index;
                            pairedEnds.orientation = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);
                        }
                        pairedEnds.score += DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY);
                        this.pairSort.add(pairedEnds);
                    }
                }
            }
            // Print out some stats every 1m reads
            ++index;
            if (progress.record(rec)) {
                logger.info("Tracking " + tmp.size() + " as yet unmatched pairs. " + tmp.sizeInRam() + " records in RAM.");
            }
        }
        logger.info("Read " + index + " records. " + tmp.size() + " pairs never matched.");
        iterator.close();
    }
    // Tell these collections to free up memory if possible.
    this.pairSort.doneAdding();
    this.fragSort.doneAdding();
}
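The heart of this method is the mate rendezvous through the tmp map: the first end of a pair is parked under a read-group:read-name key until its mate arrives, at which point the two ends are merged into one paired entry. A minimal sketch of that pattern follows; the types and names are illustrative, and a plain HashMap stands in for the disk-backed DiskBasedReadEndsForMarkDuplicatesMap.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Sketch: pair up two ends that arrive independently, keyed by read name. */
final class MatePairingSketch {
    record End(String name, int coordinate) { }
    record Pair(End first, End second) { }

    static List<Pair> pairUp(final List<End> ends) {
        final Map<String, End> waiting = new HashMap<>(); // stands in for the disk-backed map
        final List<Pair> pairs = new ArrayList<>();
        for (final End end : ends) {
            final End mate = waiting.remove(end.name());
            if (mate == null) {
                waiting.put(end.name(), end); // first end seen: park it until the mate shows up
            } else {
                // Second end seen: emit one paired entry with the ends ordered by coordinate,
                // as the real code orders by reference index and then 5' coordinate.
                pairs.add(mate.coordinate() <= end.coordinate() ? new Pair(mate, end) : new Pair(end, mate));
            }
        }
        // Anything left in 'waiting' corresponds to the "pairs never matched" count logged above.
        return pairs;
    }

    public static void main(String[] args) {
        System.out.println(pairUp(List.of(new End("r1", 500), new End("r2", 100), new End("r1", 300))));
        // -> [Pair[first=End[name=r1, coordinate=300], second=End[name=r1, coordinate=500]]]
    }
}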
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class GatherVcfs method gatherConventionally.
/** Code for gathering multiple VCFs that works regardless of input format and output format, but can be slow. */
private static void gatherConventionally(final SAMSequenceDictionary sequenceDictionary, final boolean createIndex, final List<Path> inputFiles, final File outputFile, final int cloudPrefetchBuffer) {
    final EnumSet<Options> options = EnumSet.copyOf(VariantContextWriterBuilder.DEFAULT_OPTIONS);
    if (createIndex) options.add(Options.INDEX_ON_THE_FLY);
    else options.remove(Options.INDEX_ON_THE_FLY);
    try (final VariantContextWriter out = new VariantContextWriterBuilder().setOutputFile(outputFile).setReferenceDictionary(sequenceDictionary).setOptions(options).build()) {
        final ProgressLogger progress = new ProgressLogger(log, 10000);
        VariantContext lastContext = null;
        Path lastFile = null;
        VCFHeader firstHeader = null;
        VariantContextComparator comparator = null;
        for (final Path f : inputFiles) {
            try {
log.debug("Gathering from file: ", f.toUri().toString());
                final FeatureReader<VariantContext> variantReader = getReaderFromVCFUri(f, cloudPrefetchBuffer);
                final PeekableIterator<VariantContext> variantIterator = new PeekableIterator<>(variantReader.iterator());
                final VCFHeader header = (VCFHeader) variantReader.getHeader();
                if (firstHeader == null) {
                    firstHeader = header;
                    out.writeHeader(firstHeader);
                    comparator = new VariantContextComparator(firstHeader.getContigLines());
                }
                if (lastContext != null && variantIterator.hasNext()) {
                    final VariantContext vc = variantIterator.peek();
                    if (comparator.compare(vc, lastContext) <= 0) {
                        throw new IllegalStateException("First variant in file " + f.toUri().toString() + " is at " + vc.getSource() + " but last variant in earlier file " + lastFile.toUri().toString() + " is at " + lastContext.getSource());
                    }
                }
                while (variantIterator.hasNext()) {
                    lastContext = variantIterator.next();
                    out.add(lastContext);
                    progress.record(lastContext.getContig(), lastContext.getStart());
                }
                lastFile = f;
                CloserUtil.close(variantIterator);
                CloserUtil.close(variantReader);
            } catch (IOException e) {
                throw new UserException.CouldNotReadInputFile(f, e.getMessage(), e);
            }
        }
    }
}
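The interesting guard in this method is the ordering check: before consuming each file, the code peeks at its first variant and compares it against the last variant written from the previous file, so an out-of-order input list fails fast instead of producing a silently unsorted output. A generic sketch of that guard, with hypothetical names in place of the htsjdk iterator and comparator:

import java.util.Comparator;
import java.util.List;

/** Sketch: verify that concatenating pre-sorted chunks yields one globally sorted stream. */
final class SortedConcatCheckSketch {
    static <T> void checkChunks(final List<List<T>> chunks, final Comparator<T> comparator) {
        T last = null;
        for (final List<T> chunk : chunks) {
            if (last != null && !chunk.isEmpty()) {
                final T first = chunk.get(0); // the "peek" before consuming the chunk
                if (comparator.compare(first, last) <= 0) {
                    throw new IllegalStateException("Chunk starts at " + first + " but previous chunk ended at " + last);
                }
            }
            for (final T item : chunk) last = item; // consume the chunk
        }
    }

    public static void main(String[] args) {
        checkChunks(List.of(List.of(1, 2), List.of(3, 5)), Comparator.naturalOrder()); // passes
        try {
            checkChunks(List.of(List.of(1, 4), List.of(3, 5)), Comparator.naturalOrder());
        } catch (IllegalStateException e) {
            System.out.println("Rejected: " + e.getMessage()); // 3 is not after 4
        }
    }
}

Note the comparison uses <= 0, as in the GATK code: a first variant at exactly the same position as the previous file's last variant is also rejected.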
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class LiftOverVcf method doWork.
@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);

    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);
    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
        refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    CloserUtil.close(walker);

    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(walker.getSequenceDictionary());
    final VariantContextWriter out = new VariantContextWriterBuilder().setOption(Options.INDEX_ON_THE_FLY).setOutputFile(OUTPUT).setReferenceDictionary(walker.getSequenceDictionary()).build();
    out.writeHeader(outHeader);
    final VariantContextWriter rejects = new VariantContextWriterBuilder().setOutputFile(REJECT).unsetOption(Options.INDEX_ON_THE_FLY).build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);

    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");
    final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(outHeader), outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");
    for (final VariantContext ctx : in) {
        ++total;
        final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
        final Interval target = liftOver.liftOver(source, 1.0);
        if (target == null) {
            rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
            failedLiftover++;
        } else {
            // Fix the alleles if we went from positive to negative strand
            final List<Allele> alleles = new ArrayList<>();
            for (final Allele oldAllele : ctx.getAlleles()) {
                if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
                    alleles.add(oldAllele);
                } else {
                    alleles.add(Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()));
                }
            }
            // Build the new variant context
            final VariantContextBuilder builder = new VariantContextBuilder(ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);
            builder.id(ctx.getID());
            builder.attributes(ctx.getAttributes());
            builder.genotypes(ctx.getGenotypes());
            builder.filters(ctx.getFilters());
            builder.log10PError(ctx.getLog10PError());
            // Check that the reference allele still agrees with the reference sequence
            boolean mismatchesReference = false;
            for (final Allele allele : builder.getAlleles()) {
                if (allele.isReference()) {
                    final byte[] ref = refSeqs.get(target.getContig());
                    final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length());
                    if (!refString.equalsIgnoreCase(allele.getBaseString())) {
                        mismatchesReference = true;
                    }
                    break;
                }
            }
            if (mismatchesReference) {
                rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
                failedAlleleCheck++;
            } else {
                sorter.add(builder.make());
            }
        }
        progress.record(ctx.getContig(), ctx.getStart());
    }
    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
logger.info("Processed ", total, " variants.");
logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
logger.info(Long.toString(failedAlleleCheck), " variants lifted over but had mismatching reference alleles after lift over.");
logger.info(pct, " of variants were not successfully lifted over and written to the output.");
    rejects.close();
    in.close();

    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");
    for (final VariantContext ctx : sorter) {
        out.add(ctx);
        progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();
    return null;
}
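Most of the per-variant work above happens in two steps: reverse-complementing each allele when the lifted interval lands on the negative strand, and re-checking the (possibly flipped) reference allele against the target reference bases. A stripped-down sketch of both steps; the hand-rolled reverseComplement and the method names here are illustrative (GATK uses htsjdk's SequenceUtil and StringUtil).

/** Sketch: strand-flip an allele and validate it against the target reference. */
final class LiftoverAlleleSketch {
    static String reverseComplement(final String bases) {
        final StringBuilder sb = new StringBuilder(bases.length());
        for (int i = bases.length() - 1; i >= 0; i--) {
            switch (bases.charAt(i)) {
                case 'A': sb.append('T'); break;
                case 'T': sb.append('A'); break;
                case 'C': sb.append('G'); break;
                case 'G': sb.append('C'); break;
                default:  sb.append('N'); break; // ambiguity codes not handled in this sketch
            }
        }
        return sb.toString();
    }

    /** True when the (possibly flipped) ref allele matches the target reference at 1-based 'start'. */
    static boolean refAlleleMatches(final String refAllele, final byte[] targetRefBases,
                                    final int start, final boolean positiveStrand) {
        final String allele = positiveStrand ? refAllele : reverseComplement(refAllele);
        final String refString = new String(targetRefBases, start - 1, allele.length());
        return refString.equalsIgnoreCase(allele);
    }

    public static void main(String[] args) {
        final byte[] ref = "ACGTACGT".getBytes();
        System.out.println(refAlleleMatches("GT", ref, 3, true));  // true: bases 3-4 are GT
        System.out.println(refAlleleMatches("AC", ref, 3, false)); // true: reverseComplement("AC") is GT
    }
}

A variant failing this check is written to the rejects file with a filter rather than being dropped, which keeps the liftover auditable.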
use of org.broadinstitute.hellbender.utils.runtime.ProgressLogger in project gatk by broadinstitute.
the class SplitVcfs method doWork.
@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    final ProgressLogger progress = new ProgressLogger(logger, 10000);
    final VCFFileReader fileReader = new VCFFileReader(INPUT);
    final VCFHeader fileHeader = fileReader.getFileHeader();
    final SAMSequenceDictionary sequenceDictionary = SEQUENCE_DICTIONARY != null
            ? SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY).getSequenceDictionary()
            : fileHeader.getSequenceDictionary();
    if (CREATE_INDEX && sequenceDictionary == null) {
        throw new UserException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
    }
    final VariantContextWriterBuilder builder = new VariantContextWriterBuilder().setReferenceDictionary(sequenceDictionary).clearOptions();
    if (CREATE_INDEX) builder.setOption(Options.INDEX_ON_THE_FLY);
    try (final VariantContextWriter snpWriter = builder.setOutputFile(SNP_OUTPUT).build();
         final VariantContextWriter indelWriter = builder.setOutputFile(INDEL_OUTPUT).build()) {
        snpWriter.writeHeader(fileHeader);
        indelWriter.writeHeader(fileHeader);
        int incorrectVariantCount = 0;
        final CloseableIterator<VariantContext> iterator = fileReader.iterator();
        while (iterator.hasNext()) {
            final VariantContext context = iterator.next();
            if (context.isIndel()) indelWriter.add(context);
            else if (context.isSNP()) snpWriter.add(context);
            else {
                if (STRICT) throw new IllegalStateException("Found a record with type " + context.getType().name());
                else incorrectVariantCount++;
            }
            progress.record(context.getContig(), context.getStart());
        }
        if (incorrectVariantCount > 0) {
            logger.debug("Found " + incorrectVariantCount + " records that didn't match SNP or INDEL");
        }
        CloserUtil.close(iterator);
        CloserUtil.close(fileReader);
    }
    return null;
}
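The splitting itself is a three-way dispatch; the subtler detail is that a single VariantContextWriterBuilder is configured once and reused to build both writers, so the dictionary and indexing options stay consistent across the two outputs. A minimal sketch of the dispatch with stand-in types (the enum and sinks are illustrative, not htsjdk classes):

import java.util.List;
import java.util.function.Consumer;

/** Sketch: route records to one of two sinks; count or reject everything else. */
final class SnpIndelSplitSketch {
    enum Kind { SNP, INDEL, OTHER }

    static int split(final List<Kind> records, final Consumer<Kind> snpSink,
                     final Consumer<Kind> indelSink, final boolean strict) {
        int unmatched = 0;
        for (final Kind k : records) {
            if (k == Kind.INDEL) indelSink.accept(k);
            else if (k == Kind.SNP) snpSink.accept(k);
            else if (strict) throw new IllegalStateException("Found a record with type " + k);
            else unmatched++; // tolerated, but reported afterwards, as in SplitVcfs
        }
        return unmatched;
    }

    public static void main(String[] args) {
        final int skipped = split(List.of(Kind.SNP, Kind.INDEL, Kind.OTHER),
                k -> System.out.println("snp sink"), k -> System.out.println("indel sink"), false);
        System.out.println(skipped + " records matched neither SNP nor INDEL");
    }
}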