Use of htsjdk.samtools.filter.SamRecordFilter in project gatk by broadinstitute.
The class AllelicCountCollector, method collect.
/**
* Returns an {@link AllelicCountCollection} based on the pileup at sites (specified by an interval list)
* in a sorted BAM file. Reads and bases below the specified mapping quality and base quality, respectively,
* are filtered out of the pileup. The alt count is defined as the total count minus the ref count, and the
* alt nucleotide is defined as the non-ref base with the highest count, with ties broken by the order of the
* bases in {@link AllelicCountCollector#BASES}.
* @param bamFile sorted BAM file
* @param siteIntervals interval list of sites
* @param minMappingQuality minimum mapping quality required for reads to be included in pileup
* @param minBaseQuality minimum base quality required for bases to be included in pileup
* @return AllelicCountCollection of ref/alt counts at sites in BAM file
*/
public AllelicCountCollection collect(final File bamFile, final IntervalList siteIntervals, final int minMappingQuality, final int minBaseQuality) {
try (final SamReader reader = readerFactory.open(bamFile)) {
ParamUtils.isPositiveOrZero(minMappingQuality, "Minimum mapping quality must be nonnegative.");
ParamUtils.isPositiveOrZero(minBaseQuality, "Minimum base quality must be nonnegative.");
if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
}
final int numberOfSites = siteIntervals.size();
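// Querying by index pays off only for a modest number of intervals; past MAX_INTERVALS_FOR_INDEX a single pass over the coordinate-sorted BAM is cheaper than repeated index lookups.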
final boolean useIndex = numberOfSites < MAX_INTERVALS_FOR_INDEX;
final SamLocusIterator locusIterator = new SamLocusIterator(reader, siteIntervals, useIndex);
//set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
locusIterator.setSamFilters(samFilters);
locusIterator.setEmitUncoveredLoci(true);
locusIterator.setIncludeNonPfReads(false);
locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
locusIterator.setQualityScoreCutoff(minBaseQuality);
logger.info("Examining " + numberOfSites + " sites in total...");
int locusCount = 0;
final AllelicCountCollection counts = new AllelicCountCollection();
for (final SamLocusIterator.LocusInfo locus : locusIterator) {
if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
logger.info("Examined " + locusCount + " sites.");
}
locusCount++;
final Nucleotide refBase = Nucleotide.valueOf(referenceWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
if (!BASES.contains(refBase)) {
logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Skipping...", locus.getPosition(), refBase.toString()));
continue;
}
final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
//only include ACGT counts in the total (exclude N, etc.)
final int totalBaseCount = BASES.stream().mapToInt(b -> (int) baseCounts.get(b)).sum();
final int refReadCount = (int) baseCounts.get(refBase);
//we take alt = total - ref instead of the actual alt count
final int altReadCount = totalBaseCount - refReadCount;
final Nucleotide altBase = inferAltFromPileupBaseCounts(baseCounts, refBase);
counts.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount, refBase, altBase));
}
logger.info(locusCount + " sites out of " + numberOfSites + " total sites were examined.");
return counts;
} catch (final IOException | SAMFormatException e) {
throw new UserException("Unable to collect allelic counts from " + bamFile);
}
}
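The alt-inference rule the javadoc describes (the non-ref base with the highest count, ties broken by the order of BASES) is easy to sketch outside of GATK. The class and method names below are illustrative stand-ins, not the project's actual inferAltFromPileupBaseCounts; a LinkedHashMap plays the role of the ordered BASES list:

import java.util.LinkedHashMap;
import java.util.Map;

public final class AltAlleleInference {

    /**
     * Picks the alt base: the non-ref base with the highest count, ties broken
     * by the insertion order of the counts map (analogous to the order of BASES).
     * A hypothetical stand-in for AllelicCountCollector.inferAltFromPileupBaseCounts.
     */
    static char inferAlt(final Map<Character, Long> baseCounts, final char refBase) {
        char alt = 'N';
        long best = -1;
        for (final Map.Entry<Character, Long> e : baseCounts.entrySet()) {
            // Strictly-greater comparison keeps the earliest base on ties.
            if (e.getKey() != refBase && e.getValue() > best) {
                alt = e.getKey();
                best = e.getValue();
            }
        }
        return alt;
    }

    public static void main(final String[] args) {
        final Map<Character, Long> counts = new LinkedHashMap<>();
        counts.put('A', 12L);
        counts.put('C', 12L);
        counts.put('G', 3L);
        counts.put('T', 0L);
        // ref = A, so C (12) beats G (3); the alt count would be 12 + 3 + 0 = 15.
        System.out.println(inferAlt(counts, 'A')); // prints C
    }
}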
Use of htsjdk.samtools.filter.SamRecordFilter in project gatk by broadinstitute.
The class CollectOxoGMetrics, method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
if (INTERVALS != null)
IOUtil.assertFileIsReadable(INTERVALS);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final SamReader in = SamReaderFactory.makeDefault().open(INPUT);
final Set<String> samples = new HashSet<>();
final Set<String> libraries = new HashSet<>();
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE));
libraries.add(getOrElse(rec.getLibrary(), UNKNOWN_LIBRARY));
}
// Setup the calculators
final Set<String> contexts = CONTEXTS.isEmpty() ? makeContextStrings(CONTEXT_SIZE) : CONTEXTS;
final ListMap<String, Calculator> calculators = new ListMap<>();
for (final String context : contexts) {
for (final String library : libraries) {
calculators.add(context, new Calculator(library, context));
}
}
// Load up dbSNP if available
logger.info("Loading dbSNP File: " + DB_SNP);
final DbSnpBitSetUtil dbSnp;
if (DB_SNP != null)
dbSnp = new DbSnpBitSetUtil(DB_SNP, in.getFileHeader().getSequenceDictionary());
else
dbSnp = null;
// Make an iterator that will filter out funny looking things
final SamLocusIterator iterator;
if (INTERVALS != null) {
final IntervalList intervals = IntervalList.fromFile(INTERVALS);
iterator = new SamLocusIterator(in, intervals.uniqued(), false);
} else {
iterator = new SamLocusIterator(in);
}
iterator.setEmitUncoveredLoci(false);
iterator.setMappingQualityScoreCutoff(MINIMUM_MAPPING_QUALITY);
final List<SamRecordFilter> filters = new ArrayList<>();
filters.add(new NotPrimaryAlignmentFilter());
filters.add(new DuplicateReadFilter());
if (MINIMUM_INSERT_SIZE > 0 || MAXIMUM_INSERT_SIZE > 0) {
filters.add(new InsertSizeFilter(MINIMUM_INSERT_SIZE, MAXIMUM_INSERT_SIZE));
}
iterator.setSamFilters(filters);
logger.info("Starting iteration.");
long nextLogTime = 0;
int sites = 0;
for (final SamLocusIterator.LocusInfo info : iterator) {
// Skip dbSNP sites
final String chrom = info.getSequenceName();
final int pos = info.getPosition();
final int index = pos - 1;
if (dbSnp != null && dbSnp.isDbSnpSite(chrom, pos))
continue;
// Skip sites too close to either end of the chromosome to extract a full context
final byte[] bases = refWalker.get(info.getSequenceIndex()).getBases();
if (pos < 3 || pos > bases.length - 3)
continue;
// Skip non C-G bases
final byte base = StringUtil.toUpperCase(bases[index]);
if (base != 'C' && base != 'G')
continue;
// Get the context string
final String context;
{
final String tmp = StringUtil.bytesToString(bases, index - CONTEXT_SIZE, 1 + (2 * CONTEXT_SIZE)).toUpperCase();
if (base == 'C')
context = tmp;
else
/* if G */
context = SequenceUtil.reverseComplement(tmp);
}
final List<Calculator> calculatorsForContext = calculators.get(context);
// happens if we get ambiguous bases in the reference
if (calculatorsForContext == null)
continue;
for (final Calculator calc : calculatorsForContext) calc.accept(info, base);
// Log progress periodically
if (++sites % 100 == 0) {
final long now = System.currentTimeMillis();
if (now > nextLogTime) {
logger.info("Visited " + sites + " sites of interest. Last site: " + chrom + ":" + pos);
nextLogTime = now + 60000;
}
}
// See if we need to stop
if (sites >= STOP_AFTER)
break;
}
final MetricsFile<CpcgMetrics, Integer> file = getMetricsFile();
for (final List<Calculator> calcs : calculators.values()) {
for (final Calculator calc : calcs) {
final CpcgMetrics m = calc.finish();
m.SAMPLE_ALIAS = StringUtil.join(",", new ArrayList<>(samples));
file.addMetric(m);
}
}
file.write(OUTPUT);
CloserUtil.close(in);
return null;
}
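One step worth isolating is the context-string block above: take CONTEXT_SIZE reference bases on each side of a C/G site, and reverse-complement the window when the center is a G so that every context is reported C-centric. The helper below is a minimal sketch under that reading, using htsjdk's SequenceUtil; the class and method names are hypothetical:

import htsjdk.samtools.util.SequenceUtil;

public final class ContextStrings {

    /**
     * Returns the C-centric context around a 0-based index in the reference,
     * or null when the window would run off the end of the sequence or the
     * center base is not C/G. A hypothetical reduction of the inline block
     * in CollectOxoGMetrics.doWork.
     */
    static String contextAt(final byte[] refBases, final int index, final int contextSize) {
        if (index < contextSize || index + contextSize >= refBases.length) {
            return null; // not enough flanking sequence
        }
        final StringBuilder sb = new StringBuilder();
        for (int i = index - contextSize; i <= index + contextSize; i++) {
            sb.append((char) Character.toUpperCase(refBases[i]));
        }
        final String window = sb.toString();
        final char center = window.charAt(contextSize);
        if (center == 'C') return window;
        if (center == 'G') return SequenceUtil.reverseComplement(window); // re-orient to a C center
        return null; // only C/G sites are of interest for OxoG
    }

    public static void main(final String[] args) {
        final byte[] ref = "AACGT".getBytes();
        System.out.println(contextAt(ref, 2, 1)); // "ACG": center is C, reported as-is
        System.out.println(contextAt(ref, 3, 1)); // center is G -> reverse complement of "CGT" = "ACG"
    }
}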
Use of htsjdk.samtools.filter.SamRecordFilter in project gatk-protected by broadinstitute.
The class BayesianHetPulldownCalculator, method getSamLocusIteratorWithDefaultFilters.
/**
* Returns a {@link SamLocusIterator} object for a given {@link SamReader}, over the calculator's snpIntervals,
* with minimum base quality and minimum mapping quality filters applied.
*
* @param samReader a SamReader object
* @return a SamLocusIterator object
*/
private SamLocusIterator getSamLocusIteratorWithDefaultFilters(final SamReader samReader) {
final SamLocusIterator locusIterator = new SamLocusIterator(samReader, snpIntervals, false);
/* set read and locus filters */
final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
locusIterator.setSamFilters(samFilters);
locusIterator.setEmitUncoveredLoci(false);
locusIterator.setIncludeNonPfReads(false);
locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
locusIterator.setQualityScoreCutoff(minBaseQuality);
return locusIterator;
}
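For context, a caller might drive such an iterator as shown below. Since the method above is private, the sketch sets up the same default filters inline; the file paths and the quality cutoffs of 20 are placeholders, and the loop just prints pileup depth per locus:

import java.io.File;
import java.util.Arrays;
import java.util.List;

import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.DuplicateReadFilter;
import htsjdk.samtools.filter.NotPrimaryAlignmentFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.SamLocusIterator;

public final class LocusIterationExample {
    public static void main(final String[] args) throws Exception {
        final File bam = new File("sample.bam");                    // placeholder path
        final File intervalsFile = new File("snps.interval_list");  // placeholder path
        try (final SamReader reader = SamReaderFactory.makeDefault().open(bam)) {
            final IntervalList snpIntervals = IntervalList.fromFile(intervalsFile);
            final SamLocusIterator locusIterator = new SamLocusIterator(reader, snpIntervals, false);
            // Same default filters as in the method above.
            final List<SamRecordFilter> filters =
                    Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
            locusIterator.setSamFilters(filters);
            locusIterator.setEmitUncoveredLoci(false);
            locusIterator.setMappingQualityScoreCutoff(20); // placeholder threshold
            locusIterator.setQualityScoreCutoff(20);        // placeholder threshold
            for (final SamLocusIterator.LocusInfo locus : locusIterator) {
                System.out.println(locus.getSequenceName() + ":" + locus.getPosition()
                        + " depth=" + locus.getRecordAndOffsets().size());
            }
        }
    }
}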
Use of htsjdk.samtools.filter.SamRecordFilter in project gatk-protected by broadinstitute.
The class HetPulldownCalculator, method getHetPulldown.
/**
* For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
* where intervals give positions of likely heterozygous SNP sites.
*
* <p>
* For a normal sample:
* <ul>
* The IntervalList snpIntervals gives common SNP sites in 1-based format.
* </ul>
* <ul>
* The p-value threshold must be specified for a two-sided binomial test,
* which is used to determine SNP sites from snpIntervals that are
* compatible with a heterozygous SNP, given the sample. Only these sites are output.
* </ul>
* </p>
* <p>
* For a tumor sample:
* <ul>
* The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
* This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
* Only these sites are output.
* </ul>
* </p>
* @param bamFile sorted BAM file for sample
* @param snpIntervals IntervalList of SNP sites
* @param sampleType flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
* (determines whether to perform binomial test)
* @param pvalThreshold p-value threshold for two-sided binomial test, used for normal sample
* @param minimumRawReads minimum number of total reads that must be present at a het site
* @return Pulldown of heterozygous SNP sites in 1-based format
*/
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType, final double pvalThreshold, final int minimumRawReads) {
try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
}
final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
final int totalNumberOfSNPs = snpIntervals.size();
final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals, totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);
//set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
locusIterator.setSamFilters(samFilters);
locusIterator.setEmitUncoveredLoci(false);
locusIterator.setIncludeNonPfReads(false);
locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
locusIterator.setQualityScoreCutoff(minBaseQuality);
logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
int locusCount = 0;
for (final SamLocusIterator.LocusInfo locus : locusIterator) {
if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
logger.info("Examined " + locusCount + " covered sites.");
}
locusCount++;
//include N, etc. reads here
final int totalReadCount = locus.getRecordAndOffsets().size();
if (totalReadCount < minimumRawReads) {
continue;
}
final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
//only include total ACGT counts in binomial test (exclude N, etc.)
final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();
if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
continue;
}
final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
final int refReadCount = (int) baseCounts.get(refBase);
final int altReadCount = totalBaseCount - refReadCount;
hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount));
}
logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
return hetPulldown;
} catch (final IOException | SAMFormatException e) {
throw new UserException(e.getMessage());
}
}
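The het-compatibility check (isPileupHetCompatible) is not shown in this snippet. A plausible reduction of the two-sided binomial test it performs, using commons-math3 (already a GATK dependency), looks like the following; the method name, the expected het allele fraction of 0.5, and the values in main are assumptions for illustration:

import org.apache.commons.math3.stat.inference.AlternativeHypothesis;
import org.apache.commons.math3.stat.inference.BinomialTest;

public final class HetCompatibility {

    private static final BinomialTest BINOMIAL_TEST = new BinomialTest();

    /**
     * A hypothetical reduction of isPileupHetCompatible: tests whether the ref
     * fraction is consistent with the 0.5 expected at a heterozygous site.
     */
    static boolean isHetCompatible(final int refCount, final int totalBaseCount, final double pvalThreshold) {
        if (totalBaseCount == 0) {
            return false;
        }
        final double pval = BINOMIAL_TEST.binomialTest(
                totalBaseCount, refCount, 0.5, AlternativeHypothesis.TWO_SIDED);
        return pval >= pvalThreshold; // fail to reject the het hypothesis
    }

    public static void main(final String[] args) {
        System.out.println(isHetCompatible(24, 50, 0.05)); // roughly balanced: true
        System.out.println(isHetCompatible(48, 50, 0.05)); // heavily skewed: false
    }
}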
Use of htsjdk.samtools.filter.SamRecordFilter in project gatk by broadinstitute.
The class RevertSam, method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final boolean sanitizing = SANITIZE;
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SAMFileHeader inHeader = in.getFileHeader();
// If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read
// groups have the same values.
final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups();
if (SAMPLE_ALIAS != null || LIBRARY_NAME != null) {
boolean allSampleAliasesIdentical = true;
boolean allLibraryNamesIdentical = true;
for (int i = 1; i < rgs.size(); i++) {
if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
allSampleAliasesIdentical = false;
}
if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
allLibraryNamesIdentical = false;
}
}
if (SAMPLE_ALIAS != null && !allSampleAliasesIdentical) {
throw new UserException("Read groups have multiple values for sample. " + "A value for SAMPLE_ALIAS cannot be supplied.");
}
if (LIBRARY_NAME != null && !allLibraryNamesIdentical) {
throw new UserException("Read groups have multiple values for library name. " + "A value for library name cannot be supplied.");
}
}
////////////////////////////////////////////////////////////////////////////
// Build the output writer with an appropriate header based on the options
////////////////////////////////////////////////////////////////////////////
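// Output is presorted when the input order already matches, or when sanitizing to queryname order: the sorting collection used below emits records sorted by queryname.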
final boolean presorted = (inHeader.getSortOrder() == SORT_ORDER) || (SORT_ORDER == SAMFileHeader.SortOrder.queryname && SANITIZE);
final SAMFileHeader outHeader = new SAMFileHeader();
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
if (SAMPLE_ALIAS != null) {
rg.setSample(SAMPLE_ALIAS);
}
if (LIBRARY_NAME != null) {
rg.setLibrary(LIBRARY_NAME);
}
outHeader.addReadGroup(rg);
}
outHeader.setSortOrder(SORT_ORDER);
if (!REMOVE_ALIGNMENT_INFORMATION) {
outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
outHeader.setProgramRecords(inHeader.getProgramRecords());
}
final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, outHeader, presorted);
////////////////////////////////////////////////////////////////////////////
// Build a sorting collection to use if we are sanitizing
////////////////////////////////////////////////////////////////////////////
final SortingCollection<SAMRecord> sorter;
if (sanitizing) {
sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
} else {
sorter = null;
}
final ProgressLogger progress = new ProgressLogger(logger, 1000000, "Reverted");
for (final SAMRecord rec : in) {
// Weed out secondary and supplementary reads, as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary())
continue;
// Actually do the reverting of the remaining records
revertSamRecord(rec);
if (sanitizing)
sorter.add(rec);
else
out.addAlignment(rec);
progress.record(rec);
}
////////////////////////////////////////////////////////////////////////////
if (!sanitizing) {
out.close();
} else {
long total = 0, discarded = 0;
final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator());
final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>();
// Figure out the quality score encoding scheme for each read group.
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SamRecordFilter filter = new SamRecordFilter() {
@Override
public boolean filterOut(final SAMRecord rec) {
return !rec.getReadGroup().getId().equals(rg.getId());
}
@Override
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
throw new UnsupportedOperationException();
}
};
readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
CloserUtil.close(reader);
}
for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
logger.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
}
if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
logger.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
return -1;
}
final ProgressLogger sanitizerProgress = new ProgressLogger(logger, 1000000, "Sanitized");
readNameLoop: while (iterator.hasNext()) {
final List<SAMRecord> recs = fetchByReadName(iterator);
total += recs.size();
// Check that all the reads have bases and qualities of the same length
for (final SAMRecord rec : recs) {
if (rec.getReadBases().length != rec.getBaseQualities().length) {
logger.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
discarded += recs.size();
continue readNameLoop;
}
}
// If the first read is marked as unpaired, check that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
discarded += recs.size();
continue readNameLoop;
}
// Check that if we have paired reads there is exactly one first of pair and one second of pair
if (recs.get(0).getReadPairedFlag()) {
int firsts = 0, seconds = 0, unpaired = 0;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag())
++unpaired;
if (rec.getFirstOfPairFlag())
++firsts;
if (rec.getSecondOfPairFlag())
++seconds;
}
if (unpaired > 0 || firsts != 1 || seconds != 1) {
logger.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
discarded += recs.size();
continue readNameLoop;
}
}
// If we've made it this far spit the records into the output!
for (final SAMRecord rec : recs) {
// The only valid quality score encoding scheme is standard; if it's not standard, change it.
final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
if (!recordFormat.equals(FastqQualityFormat.Standard)) {
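// Qualities decoded as Phred+33 from a Phred+64 (Illumina 1.3+) file come out inflated by a constant offset; subtracting ILLUMINA_TO_PHRED_SUBTRAHEND normalizes them to standard encoding.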
final byte[] quals = rec.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
}
rec.setBaseQualities(quals);
}
out.addAlignment(rec);
sanitizerProgress.record(rec);
}
}
out.close();
final double discardRate = discarded / (double) total;
final NumberFormat fmt = new DecimalFormat("0.000%");
logger.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");
if (discardRate > MAX_DISCARD_FRACTION) {
throw new GATKException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION));
}
}
}
CloserUtil.close(in);
return null;
}
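The anonymous SamRecordFilter above, which keeps only one read group's records while QualityEncodingDetector sniffs the encoding, is a reusable pattern. Below is the same idea as a small named class; the class name is hypothetical, not part of htsjdk:

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.filter.SamRecordFilter;

/** Filters out records that do not belong to the given read group id. */
public final class ReadGroupFilter implements SamRecordFilter {

    private final String readGroupId;

    public ReadGroupFilter(final String readGroupId) {
        this.readGroupId = readGroupId;
    }

    @Override
    public boolean filterOut(final SAMRecord rec) {
        // filterOut == true means the record is dropped by FilteringSamIterator.
        return rec.getReadGroup() == null || !rec.getReadGroup().getId().equals(readGroupId);
    }

    @Override
    public boolean filterOut(final SAMRecord first, final SAMRecord second) {
        // Paired evaluation is never needed for this use; mirror RevertSam and refuse.
        throw new UnsupportedOperationException();
    }
}

Wrapped in a FilteringSamIterator over a fresh reader, an instance of this filter could feed QualityEncodingDetector.detect exactly as the loop above does.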