Use of htsjdk.samtools.filter.FilteringSamIterator in project gatk by broadinstitute.
In class AbstractAlignmentMerger, method mergeAlignment:
/**
* Merges the alignment data with the non-aligned records from the source BAM file.
*/
public void mergeAlignment(final File referenceFasta) {
// Open the file of unmapped records and write the read groups to the header for the merged file
final SamReader unmappedSam = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(this.unmappedBamFile);
final CloseableIterator<SAMRecord> unmappedIterator = unmappedSam.iterator();
this.header.setReadGroups(unmappedSam.getFileHeader().getReadGroups());
int aligned = 0;
int unmapped = 0;
// Get the aligned records and set up the first one
alignedIterator = new MultiHitAlignedReadIterator(new FilteringSamIterator(getQuerynameSortedAlignedRecords(), alignmentFilter), primaryAlignmentSelectionStrategy);
HitsForInsert nextAligned = nextAligned();
// Ensure the program record ID is not already in use in the unmapped BAM
if (getProgramRecord() != null) {
for (final SAMProgramRecord pg : unmappedSam.getFileHeader().getProgramRecords()) {
if (pg.getId().equals(getProgramRecord().getId())) {
throw new GATKException("Program Record ID already in use in unmapped BAM file.");
}
}
}
// Create the sorting collection that will write the records in coordinate order
// to the final bam file
final SortingCollection<SAMRecord> sorted = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), new SAMRecordCoordinateComparator(), MAX_RECORDS_IN_RAM);
while (unmappedIterator.hasNext()) {
// Load next unaligned read or read pair.
final SAMRecord rec = unmappedIterator.next();
rec.setHeaderStrict(this.header);
maybeSetPgTag(rec);
final SAMRecord secondOfPair;
if (rec.getReadPairedFlag()) {
secondOfPair = unmappedIterator.next();
secondOfPair.setHeaderStrict(this.header);
maybeSetPgTag(secondOfPair);
// Validate that paired reads arrive as first of pair followed by second of pair
if (!rec.getReadName().equals(secondOfPair.getReadName()))
throw new GATKException("Second read from pair not found in unmapped bam: " + rec.getReadName() + ", " + secondOfPair.getReadName());
if (!rec.getFirstOfPairFlag())
throw new GATKException("First record in unmapped bam is not first of pair: " + rec.getReadName());
if (!secondOfPair.getReadPairedFlag())
throw new GATKException("Second record in unmapped bam is not marked as paired: " + secondOfPair.getReadName());
if (!secondOfPair.getSecondOfPairFlag())
throw new GATKException("Second record in unmapped bam is not second of pair: " + secondOfPair.getReadName());
} else {
secondOfPair = null;
}
// See if there are alignments for current unaligned read or read pair.
if (nextAligned != null && rec.getReadName().equals(nextAligned.getReadName())) {
// If there are multiple alignments for a read (pair), then the unaligned SAMRecord must be cloned
// before copying info from the aligned record to the unaligned.
final boolean clone = nextAligned.numHits() > 1 || nextAligned.hasSupplementalHits();
SAMRecord r1Primary = null, r2Primary = null;
if (rec.getReadPairedFlag()) {
for (int i = 0; i < nextAligned.numHits(); ++i) {
// firstAligned or secondAligned may be null, if there wasn't an alignment for the end,
// or if the alignment was rejected by ignoreAlignment.
final SAMRecord firstAligned = nextAligned.getFirstOfPair(i);
final SAMRecord secondAligned = nextAligned.getSecondOfPair(i);
final boolean isPrimaryAlignment = (firstAligned != null && !firstAligned.isSecondaryOrSupplementary()) || (secondAligned != null && !secondAligned.isSecondaryOrSupplementary());
final SAMRecord firstToWrite;
final SAMRecord secondToWrite;
if (clone) {
firstToWrite = ReadUtils.cloneSAMRecord(rec);
secondToWrite = ReadUtils.cloneSAMRecord(secondOfPair);
} else {
firstToWrite = rec;
secondToWrite = secondOfPair;
}
// If these are the primary alignments then stash them for use on any supplemental alignments
if (isPrimaryAlignment) {
r1Primary = firstToWrite;
r2Primary = secondToWrite;
}
transferAlignmentInfoToPairedRead(firstToWrite, secondToWrite, firstAligned, secondAligned);
// Only write unmapped read when it has the mate info from the primary alignment.
if (!firstToWrite.getReadUnmappedFlag() || isPrimaryAlignment) {
addIfNotFiltered(sorted, firstToWrite);
if (firstToWrite.getReadUnmappedFlag())
++unmapped;
else
++aligned;
}
if (!secondToWrite.getReadUnmappedFlag() || isPrimaryAlignment) {
addIfNotFiltered(sorted, secondToWrite);
if (!secondToWrite.getReadUnmappedFlag())
++aligned;
else
++unmapped;
}
}
// Take all of the supplemental reads which had been stashed and add them (as appropriate) to sorted
for (final boolean isRead1 : new boolean[] { true, false }) {
final List<SAMRecord> supplementals = isRead1 ? nextAligned.getSupplementalFirstOfPairOrFragment() : nextAligned.getSupplementalSecondOfPair();
final SAMRecord sourceRec = isRead1 ? rec : secondOfPair;
final SAMRecord matePrimary = isRead1 ? r2Primary : r1Primary;
for (final SAMRecord supp : supplementals) {
final SAMRecord out = ReadUtils.cloneSAMRecord(sourceRec);
transferAlignmentInfoToFragment(out, supp);
if (matePrimary != null)
SamPairUtil.setMateInformationOnSupplementalAlignment(out, matePrimary, addMateCigar);
++aligned;
addIfNotFiltered(sorted, out);
}
}
} else {
for (int i = 0; i < nextAligned.numHits(); ++i) {
final SAMRecord recToWrite = clone ? ReadUtils.cloneSAMRecord(rec) : rec;
transferAlignmentInfoToFragment(recToWrite, nextAligned.getFragment(i));
addIfNotFiltered(sorted, recToWrite);
if (recToWrite.getReadUnmappedFlag())
++unmapped;
else
++aligned;
}
// Take all of the supplemental reads which had been stashed and add them (as appropriate) to sorted
for (final SAMRecord supplementalRec : nextAligned.getSupplementalFirstOfPairOrFragment()) {
final SAMRecord recToWrite = ReadUtils.cloneSAMRecord(rec);
transferAlignmentInfoToFragment(recToWrite, supplementalRec);
addIfNotFiltered(sorted, recToWrite);
++aligned;
}
}
nextAligned = nextAligned();
} else {
// There was no alignment for this read or read pair.
if (nextAligned != null && SAMRecordQueryNameComparator.compareReadNames(rec.getReadName(), nextAligned.getReadName()) > 0) {
throw new IllegalStateException("Aligned record iterator (" + nextAligned.getReadName() + ") is behind the unmapped reads (" + rec.getReadName() + ")");
}
// No matching read from alignedIterator -- just output reads as is.
if (!alignedReadsOnly) {
sorted.add(rec);
++unmapped;
if (secondOfPair != null) {
sorted.add(secondOfPair);
++unmapped;
}
}
}
}
unmappedIterator.close();
Utils.validate(!alignedIterator.hasNext(), () -> "Reads remaining on alignment iterator: " + alignedIterator.next().getReadName() + "!");
alignedIterator.close();
// Write the records to the output file in the specified sort order.
header.setSortOrder(this.sortOrder);
final boolean presorted = this.sortOrder == SAMFileHeader.SortOrder.coordinate;
final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, presorted, this.targetBamFile, referenceFasta);
writer.setProgressLogger(new ProgressLogger(logger, (int) 1e7, "Wrote", "records from a sorting collection"));
final ProgressLogger finalProgress = new ProgressLogger(logger, 10000000, "Written in coordinate order to output", "records");
for (final SAMRecord rec : sorted) {
if (!rec.getReadUnmappedFlag()) {
if (refSeq != null) {
final byte[] referenceBases = refSeq.get(sequenceDictionary.getSequenceIndex(rec.getReferenceName())).getBases();
rec.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(rec, referenceBases, 0, bisulfiteSequence));
if (rec.getBaseQualities() != SAMRecord.NULL_QUALS) {
rec.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(rec, referenceBases, 0, bisulfiteSequence));
}
}
}
writer.addAlignment(rec);
finalProgress.record(rec);
}
writer.close();
sorted.cleanup();
CloserUtil.close(unmappedSam);
logger.info("Wrote " + aligned + " alignment records and " + (alignedReadsOnly ? 0 : unmapped) + " unmapped reads.");
}
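For context on the FilteringSamIterator call near the top of mergeAlignment: it simply wraps an iterator of aligned records in a SamRecordFilter, so only records that pass the filter are handed to MultiHitAlignedReadIterator. Below is a minimal standalone sketch of that wrapping pattern; the file name and the drop-secondary filter are illustrative placeholders, not GATK's actual alignmentFilter.

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.FilteringSamIterator;
import htsjdk.samtools.filter.SamRecordFilter;
import java.io.File;
import java.io.IOException;

public class FilteringSamIteratorSketch {
    public static void main(final String[] args) throws IOException {
        // Hypothetical input; stands in for the queryname-sorted aligned BAM used by mergeAlignment.
        final File alignedBam = new File("aligned.bam");
        // Illustrative filter that drops secondary/supplementary records.
        // This is NOT GATK's actual alignmentFilter, just an example SamRecordFilter.
        final SamRecordFilter dropSecondaryOrSupplementary = new SamRecordFilter() {
            @Override
            public boolean filterOut(final SAMRecord rec) {
                return rec.isSecondaryOrSupplementary();
            }
            @Override
            public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                return filterOut(first) && filterOut(second);
            }
        };
        long passed = 0;
        try (final SamReader reader = SamReaderFactory.makeDefault().open(alignedBam);
             final FilteringSamIterator filtered = new FilteringSamIterator(reader.iterator(), dropSecondaryOrSupplementary)) {
            // Only records that pass the filter are returned; downstream code
            // (MultiHitAlignedReadIterator in mergeAlignment) never sees the rest.
            while (filtered.hasNext()) {
                filtered.next();
                passed++;
            }
        }
        System.out.println("Records passing filter: " + passed);
    }
}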
Use of htsjdk.samtools.filter.FilteringSamIterator in project gatk by broadinstitute.
In class RevertSam, method doWork:
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final boolean sanitizing = SANITIZE;
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SAMFileHeader inHeader = in.getFileHeader();
// If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
// groups have the same values.
final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
boolean allSampleAliasesIdentical = true;
boolean allLibraryNamesIdentical = true;
for (int i = 1; i < rgs.size(); i++) {
if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
allSampleAliasesIdentical = false;
}
if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
allLibraryNamesIdentical = false;
}
}
if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
throw new UserException("Read groups have multiple values for sample. " + "A value for SAMPLE_ALIAS cannot be supplied.");
}
if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
throw new UserException("Read groups have multiple values for library name. " + "A value for library name cannot be supplied.");
}
}
////////////////////////////////////////////////////////////////////////////
// Build the output writer with an appropriate header based on the options
////////////////////////////////////////////////////////////////////////////
final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
final SAMFileHeader outHeader = new SAMFileHeader();
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
if (SAMPLE_ALIAS != null) {
rg.setSample(SAMPLE_ALIAS);
}
if (LIBRARY_NAME != null) {
rg.setLibrary(LIBRARY_NAME);
}
outHeader.addReadGroup(rg);
}
outHeader.setSortOrder(SORT_ORDER);
if (!REMOVE_ALIGNMENT_INFORMATION) {
outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
outHeader.setProgramRecords(inHeader.getProgramRecords());
}
final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
////////////////////////////////////////////////////////////////////////////
// Build a sorting collection to use if we are sanitizing
////////////////////////////////////////////////////////////////////////////
final SortingCollection<SAMRecord> sorter;
if (sanitizing) {
sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
} else {
sorter = null;
}
final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
for (final SAMRecord rec : in) {
// Weed out non-primary and supplemental reads as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary())
continue;
// Actually do the reverting of the remaining records
revertSamRecord(rec);
if (sanitizing)
sorter.add(rec);
else
out.addAlignment(rec);
progress.record(rec);
}
////////////////////////////////////////////////////////////////////////////
if (!sanitizing) {
out.close();
} else {
long total = 0, discarded = 0;
final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
// Figure out the quality score encoding scheme for each read group.
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SamRecordFilter filter = new SamRecordFilter() {
@Override
public boolean filterOut(final SAMRecord rec) {
return !rec.getReadGroup().getId().equals(rg.getId());
}
@Override
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
throw new UnsupportedOperationException();
}
};
readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
CloserUtil.close(reader);
}
for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
}
if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
return -1;
}
final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
readNameLoop: while (iterator.hasNext()) {
final List<SAMRecord> recs = fetchByReadName(iterator);
total += recs.size();
// Check that all the reads have bases and qualities of the same length
for (final SAMRecord rec : recs) {
if (rec.getReadBases().length != rec.getBaseQualities().length) {
logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
discarded += recs.size();
continue readNameLoop;
}
}
// Check that if the first read is marked as unpaired that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
discarded += recs.size();
continue readNameLoop;
}
// Check that if we have paired reads there is exactly one first of pair and one second of pair
if (recs.get(0).getReadPairedFlag()) {
int firsts = 0, seconds = 0, unpaired = 0;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag())
++unpaired;
if (rec.getFirstOfPairFlag())
++firsts;
if (rec.getSecondOfPairFlag())
++seconds;
}
if (unpaired > 0 || firsts != 1 || seconds != 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
discarded += recs.size();
continue readNameLoop;
}
}
// If we've made it this far spit the records into the output!
for (final SAMRecord rec : recs) {
// The only valid quality score encoding scheme is standard; if it's not standard, change it.
final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
if (!recordFormat.equals(FastqQualityFormat.Standard)) {
final byte[] quals = rec.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
}
rec.setBaseQualities(quals);
}
out.addAlignment(rec);
sanitizerProgress.record(rec);
}
}
out.close();
final double discardRate = discarded / (double) total;
final NumberFormat fmt = new DecimalFormat("0.000%");
logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
if (discardRate > MAX_DISCARD_FRACTION) {
throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
}
}
CloserUtil.close(in);
return null;
}
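The quality-format detection above builds one FilteringSamIterator per read group, using an anonymous SamRecordFilter that rejects every record outside that group. A minimal standalone sketch of the same per-read-group filtering pattern (the BAM path and read-group ID are illustrative, and QualityEncodingDetector is omitted for brevity) might look like this:

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.FilteringSamIterator;
import htsjdk.samtools.filter.SamRecordFilter;
import java.io.File;
import java.io.IOException;

public class ReadGroupFilterSketch {
    public static void main(final String[] args) throws IOException {
        final File input = new File("input.bam");  // illustrative path
        final String readGroupId = "RG1";          // illustrative read-group ID
        // Keep only records belonging to the chosen read group, mirroring the anonymous
        // filter built per read group in RevertSam above.
        final SamRecordFilter keepReadGroup = new SamRecordFilter() {
            @Override
            public boolean filterOut(final SAMRecord rec) {
                return rec.getReadGroup() == null || !rec.getReadGroup().getId().equals(readGroupId);
            }
            @Override
            public boolean filterOut(final SAMRecord first, final SAMRecord second) {
                throw new UnsupportedOperationException();
            }
        };
        long kept = 0;
        try (final SamReader reader = SamReaderFactory.makeDefault().open(input);
             final FilteringSamIterator filtered = new FilteringSamIterator(reader.iterator(), keepReadGroup)) {
            while (filtered.hasNext()) {
                filtered.next();
                kept++;
            }
        }
        System.out.println("Records in read group " + readGroupId + ": " + kept);
    }
}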
Use of htsjdk.samtools.filter.FilteringSamIterator in project gatk by broadinstitute.
In class FilterReads, method doWork:
@Override
protected Object doWork() {
try {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
if (WRITE_READS_FILES)
writeReadsFile(INPUT);
switch(FILTER) {
case includeAligned:
filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new AlignedFilter(true), true));
break;
case excludeAligned:
filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new AlignedFilter(false), true));
break;
case includeReadList:
filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new ReadNameFilter(READ_LIST_FILE, true)));
break;
case excludeReadList:
filterReads(new FilteringSamIterator(SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT).iterator(), new ReadNameFilter(READ_LIST_FILE, false)));
break;
default:
throw new UnsupportedOperationException(FILTER.name() + " has not been implemented!");
}
IOUtil.assertFileIsReadable(OUTPUT);
if (WRITE_READS_FILES)
writeReadsFile(OUTPUT);
} catch (IOException e) {
if (OUTPUT.exists() && !OUTPUT.delete()) {
throw new UserException("Failed to delete existing output: " + OUTPUT.getAbsolutePath());
} else {
throw new UserException("Failed to filter " + INPUT.getName());
}
}
return null;
}
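Each case of the switch above wraps the input iterator in a FilteringSamIterator with a different stock filter; the third constructor argument enables pair-aware filtering. A minimal sketch of the includeAligned case in isolation (file names are illustrative, and the input is assumed to be queryname-sorted so that mates are adjacent for pair filtering) might look like this:

import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.AlignedFilter;
import htsjdk.samtools.filter.FilteringSamIterator;
import java.io.File;
import java.io.IOException;

public class IncludeAlignedSketch {
    public static void main(final String[] args) throws IOException {
        final File input = new File("input.bam");          // illustrative paths, input assumed queryname-sorted
        final File output = new File("aligned_only.bam");
        try (final SamReader reader = SamReaderFactory.makeDefault().open(input)) {
            // AlignedFilter(true) keeps aligned records; the trailing 'true' enables pair-aware
            // filtering, so both ends of a pair are evaluated together (mates must be adjacent).
            final FilteringSamIterator filtered =
                    new FilteringSamIterator(reader.iterator(), new AlignedFilter(true), true);
            // Filtering preserves the input order, so presorted=true is safe relative to the header.
            try (final SAMFileWriter writer =
                         new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, output)) {
                while (filtered.hasNext()) {
                    writer.addAlignment(filtered.next());
                }
            }
            filtered.close();
        }
    }
}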