use of htsjdk.samtools.SAMFileHeader in project gatk by broadinstitute.
the class RevertOriginalBaseQualitiesAndAddMateCigar method doWork.
@Override
public Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
boolean foundPairedMappedReads = false;
// Check if we can skip this file since it does not have OQ tags and the mate cigar tag is already there.
final CanSkipSamFile skipSamFile = RevertOriginalBaseQualitiesAndAddMateCigar.canSkipSAMFile(INPUT, MAX_RECORDS_TO_EXAMINE, RESTORE_ORIGINAL_QUALITIES, REFERENCE_SEQUENCE);
logger.info(skipSamFile.getMessage(MAX_RECORDS_TO_EXAMINE));
if (skipSamFile.canSkip())
return null;
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).enable(SamReaderFactory.Option.EAGERLY_DECODE).open(INPUT);
final SAMFileHeader inHeader = in.getFileHeader();
// Build the output writer based on the correct sort order
final SAMFileHeader outHeader = ReadUtils.cloneSAMFileHeader(inHeader);
// same as the input
if (null == SORT_ORDER)
this.SORT_ORDER = inHeader.getSortOrder();
outHeader.setSortOrder(SORT_ORDER);
try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, false)) {
// Iterate over the records, revert original base qualities, and push them into a SortingCollection by queryname
final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
final ProgressLogger revertingProgress = new ProgressLogger(logger, 1000000, " reverted OQs");
int numOriginalQualitiesRestored = 0;
for (final SAMRecord record : in) {
// Clean up reads that map off the end of the reference
AbstractAlignmentMerger.createNewCigarsIfMapsOffEndOfReference(record);
if (RESTORE_ORIGINAL_QUALITIES && null != record.getOriginalBaseQualities()) {
// revert the original base qualities
record.setBaseQualities(record.getOriginalBaseQualities());
record.setOriginalBaseQualities(null);
numOriginalQualitiesRestored++;
}
if (!foundPairedMappedReads && record.getReadPairedFlag() && !record.getReadUnmappedFlag())
foundPairedMappedReads = true;
revertingProgress.record(record);
sorter.add(record);
}
CloserUtil.close(in);
logger.info("Reverted the original base qualities for " + numOriginalQualitiesRestored + " records");
/**
* Iterator through sorting collection output
* 1. Set mate cigar string and mate information
* 2. push record into SAMFileWriter to the output
*/
try (final SamPairUtil.SetMateInfoIterator sorterIterator = new SamPairUtil.SetMateInfoIterator(sorter.iterator(), true)) {
final ProgressLogger sorterProgress = new ProgressLogger(logger, 1000000, " mate cigars added");
while (sorterIterator.hasNext()) {
final SAMRecord record = sorterIterator.next();
out.addAlignment(record);
sorterProgress.record(record);
}
CloserUtil.close(out);
logger.info("Updated " + sorterIterator.getNumMateCigarsAdded() + " records with mate cigar");
if (!foundPairedMappedReads)
logger.info("Did not find any paired mapped reads.");
}
}
return null;
}
use of htsjdk.samtools.SAMFileHeader in project gatk by broadinstitute.
the class RevertSam method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final boolean sanitizing = SANITIZE;
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SAMFileHeader inHeader = in.getFileHeader();
// If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
// groups have the same values.
final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
boolean allSampleAliasesIdentical = true;
boolean allLibraryNamesIdentical = true;
for (int i = 1; i < rgs.size(); i++) {
if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
allSampleAliasesIdentical = false;
}
if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
allLibraryNamesIdentical = false;
}
}
if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
throw new UserException("Read groups have multiple values for sample. " + "A value for SAMPLE_ALIAS cannot be supplied.");
}
if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
throw new UserException("Read groups have multiple values for library name. " + "A value for library name cannot be supplied.");
}
}
////////////////////////////////////////////////////////////////////////////
// Build the output writer with an appropriate header based on the options
////////////////////////////////////////////////////////////////////////////
final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
final SAMFileHeader outHeader = new SAMFileHeader();
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
if (SAMPLE_ALIAS != null) {
rg.setSample(SAMPLE_ALIAS);
}
if (LIBRARY_NAME != null) {
rg.setLibrary(LIBRARY_NAME);
}
outHeader.addReadGroup(rg);
}
outHeader.setSortOrder(SORT_ORDER);
if (!REMOVE_ALIGNMENT_INFORMATION) {
outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
outHeader.setProgramRecords(inHeader.getProgramRecords());
}
final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
////////////////////////////////////////////////////////////////////////////
// Build a sorting collection to use if we are sanitizing
////////////////////////////////////////////////////////////////////////////
final SortingCollection<SAMRecord> sorter;
if (sanitizing) {
sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
} else {
sorter = null;
}
final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
for (final SAMRecord rec : in) {
// Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary())
continue;
// Actually to the reverting of the remaining records
revertSamRecord(rec);
if (sanitizing)
sorter.add(rec);
else
out.addAlignment(rec);
progress.record(rec);
}
////////////////////////////////////////////////////////////////////////////
if (!sanitizing) {
out.close();
} else {
long total = 0, discarded = 0;
final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
// Figure out the quality score encoding scheme for each read group.
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SamRecordFilter filter = new SamRecordFilter() {
@Override
public boolean filterOut(final SAMRecord rec) {
return !rec.getReadGroup().getId().equals(rg.getId());
}
@Override
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
throw new UnsupportedOperationException();
}
};
readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
CloserUtil.close(reader);
}
for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
}
if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
return -1;
}
final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
readNameLoop: while (iterator.hasNext()) {
final List<SAMRecord> recs = fetchByReadName(iterator);
total += recs.size();
// Check that all the reads have bases and qualities of the same length
for (final SAMRecord rec : recs) {
if (rec.getReadBases().length != rec.getBaseQualities().length) {
logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
discarded += recs.size();
continue readNameLoop;
}
}
// Check that if the first read is marked as unpaired that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
discarded += recs.size();
continue readNameLoop;
}
// Check that if we have paired reads there is exactly one first of pair and one second of pair
if (recs.get(0).getReadPairedFlag()) {
int firsts = 0, seconds = 0, unpaired = 0;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag())
++unpaired;
if (rec.getFirstOfPairFlag())
++firsts;
if (rec.getSecondOfPairFlag())
++seconds;
}
if (unpaired > 0 || firsts != 1 || seconds != 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
discarded += recs.size();
continue readNameLoop;
}
}
// If we've made it this far spit the records into the output!
for (final SAMRecord rec : recs) {
// The only valid quality score encoding scheme is standard; if it's not standard, change it.
final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
if (!recordFormat.equals(FastqQualityFormat.Standard)) {
final byte[] quals = rec.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
}
rec.setBaseQualities(quals);
}
out.addAlignment(rec);
sanitizerProgress.record(rec);
}
}
out.close();
final double discardRate = discarded / (double) total;
final NumberFormat fmt = new DecimalFormat("0.000%");
logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
if (discarded / (double) total > MAX_DISCARD_FRACTION) {
throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
}
}
CloserUtil.close(in);
return null;
}
use of htsjdk.samtools.SAMFileHeader in project gatk by broadinstitute.
the class FixMateInformation method doWork.
@Override
protected Object doWork() {
// Open up the input
boolean allQueryNameSorted = true;
final List<SamReader> readers = new ArrayList<>();
for (final File f : INPUT) {
IOUtil.assertFileIsReadable(f);
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
readers.add(reader);
if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname)
allQueryNameSorted = false;
}
// or into a temporary file that will overwrite the INPUT file eventually
if (OUTPUT != null)
OUTPUT = OUTPUT.getAbsoluteFile();
final boolean differentOutputSpecified = OUTPUT != null;
if (differentOutputSpecified) {
IOUtil.assertFileIsWritable(OUTPUT);
} else if (INPUT.size() != 1) {
throw new UserException("Must specify either an explicit OUTPUT file or a single INPUT file to be overridden.");
} else {
final File soleInput = INPUT.get(0).getAbsoluteFile();
final File dir = soleInput.getParentFile().getAbsoluteFile();
try {
IOUtil.assertFileIsWritable(soleInput);
IOUtil.assertDirectoryIsWritable(dir);
OUTPUT = File.createTempFile(soleInput.getName() + ".being_fixed.", BamFileIoUtils.BAM_FILE_EXTENSION, dir);
} catch (final IOException ioe) {
throw new RuntimeIOException("Could not create tmp file in " + dir.getAbsolutePath());
}
}
// Get the input records merged and sorted by query name as needed
final PeekableIterator<SAMRecord> iterator;
final SAMFileHeader header;
{
// Deal with merging if necessary
final Iterator<SAMRecord> tmp;
if (INPUT.size() > 1) {
final List<SAMFileHeader> headers = new ArrayList<>(readers.size());
for (final SamReader reader : readers) {
headers.add(reader.getFileHeader());
}
final SAMFileHeader.SortOrder sortOrder = (allQueryNameSorted ? SAMFileHeader.SortOrder.queryname : SAMFileHeader.SortOrder.unsorted);
final SamFileHeaderMerger merger = new SamFileHeaderMerger(sortOrder, headers, false);
tmp = new MergingSamRecordIterator(merger, readers, false);
header = merger.getMergedHeader();
} else {
tmp = readers.get(0).iterator();
header = readers.get(0).getFileHeader();
}
// And now deal with re-sorting if necessary
if (ASSUME_SORTED || allQueryNameSorted) {
iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<>(tmp), ADD_MATE_CIGAR);
} else {
logger.info("Sorting input into queryname order.");
final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
while (tmp.hasNext()) {
sorter.add(tmp.next());
}
iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<SAMRecord>(sorter.iterator()) {
@Override
public void close() {
super.close();
sorter.cleanup();
}
}, ADD_MATE_CIGAR);
logger.info("Sorting by queryname complete.");
}
// Deal with the various sorting complications
final SAMFileHeader.SortOrder outputSortOrder = SORT_ORDER == null ? readers.get(0).getFileHeader().getSortOrder() : SORT_ORDER;
logger.info("Output will be sorted by " + outputSortOrder);
header.setSortOrder(outputSortOrder);
}
if (CREATE_INDEX && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException("Can't CREATE_INDEX unless sort order is coordinate");
}
try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, header, header.getSortOrder() == SAMFileHeader.SortOrder.queryname)) {
logger.info("Traversing query name sorted records and fixing up mate pair information.");
final ProgressLogger progress = new ProgressLogger(logger);
while (iterator.hasNext()) {
final SAMRecord record = iterator.next();
out.addAlignment(record);
progress.record(record);
}
iterator.close();
if (header.getSortOrder() == SAMFileHeader.SortOrder.queryname) {
logger.info("Closing output file.");
} else {
logger.info("Finished processing reads; re-sorting output file.");
}
}
// TODO throw appropriate exceptions instead of writing to log.error and returning
if (!differentOutputSpecified) {
logger.info("Replacing input file with fixed file.");
final File soleInput = INPUT.get(0).getAbsoluteFile();
final File old = new File(soleInput.getParentFile(), soleInput.getName() + ".old");
if (!old.exists() && soleInput.renameTo(old)) {
if (OUTPUT.renameTo(soleInput)) {
if (!old.delete()) {
logger.warn("Could not delete old file: " + old.getAbsolutePath());
return null;
}
if (CREATE_INDEX) {
final File newIndex = new File(OUTPUT.getParent(), OUTPUT.getName().substring(0, OUTPUT.getName().length() - 4) + ".bai");
final File oldIndex = new File(soleInput.getParent(), soleInput.getName().substring(0, soleInput.getName().length() - 4) + ".bai");
if (!newIndex.renameTo(oldIndex)) {
logger.warn("Could not overwrite index file: " + oldIndex.getAbsolutePath());
}
}
} else {
logger.error("Could not move new file to " + soleInput.getAbsolutePath());
logger.error("Input file preserved as: " + old.getAbsolutePath());
logger.error("New file preserved as: " + OUTPUT.getAbsolutePath());
return null;
}
} else {
logger.error("Could not move input file out of the way: " + soleInput.getAbsolutePath());
if (!OUTPUT.delete()) {
logger.error("Could not delete temporary file: " + OUTPUT.getAbsolutePath());
}
return null;
}
}
CloserUtil.close(readers);
return null;
}
use of htsjdk.samtools.SAMFileHeader in project gatk by broadinstitute.
the class ReorderSam method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
IOUtil.assertFileIsWritable(OUTPUT);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
SAMSequenceDictionary refDict = reference.getSequenceDictionary();
if (refDict == null) {
CloserUtil.close(in);
throw new UserException("No reference sequence dictionary found. Aborting. " + "You can create a sequence dictionary for the reference fasta using CreateSequenceDictionary.jar.");
}
printDictionary("SAM/BAM file", in.getFileHeader().getSequenceDictionary());
printDictionary("Reference", refDict);
Map<Integer, Integer> newOrder = buildSequenceDictionaryMap(refDict, in.getFileHeader().getSequenceDictionary());
// has to be after we create the newOrder
SAMFileHeader outHeader = ReadUtils.cloneSAMFileHeader(in.getFileHeader());
outHeader.setSequenceDictionary(refDict);
logger.info("Writing reads...");
if (in.hasIndex()) {
try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, true)) {
// write the reads in contig order
for (final SAMSequenceRecord contig : refDict.getSequences()) {
final SAMRecordIterator it = in.query(contig.getSequenceName(), 0, 0, false);
writeReads(out, it, newOrder, contig.getSequenceName());
}
// don't forget the unmapped reads
writeReads(out, in.queryUnmapped(), newOrder, "unmapped");
}
} else {
try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, false)) {
writeReads(out, in.iterator(), newOrder, "All reads");
}
}
// cleanup
CloserUtil.close(in);
return null;
}
use of htsjdk.samtools.SAMFileHeader in project gatk by broadinstitute.
the class MarkDuplicatesSparkUnitTest method markDupesTest.
@Test(dataProvider = "md", groups = "spark")
public void markDupesTest(final String input, final long totalExpected, final long dupsExpected) throws IOException {
JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
ReadsSparkSource readSource = new ReadsSparkSource(ctx);
JavaRDD<GATKRead> reads = readSource.getParallelReads(input, null);
Assert.assertEquals(reads.count(), totalExpected);
SAMFileHeader header = readSource.getHeader(input, null);
OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection();
final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ? new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;
JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(reads, header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, 1);
Assert.assertEquals(markedReads.count(), totalExpected);
JavaRDD<GATKRead> dupes = markedReads.filter(GATKRead::isDuplicate);
Assert.assertEquals(dupes.count(), dupsExpected);
}
Aggregations