Use of htsjdk.samtools.util.SamLocusIterator in project gatk-protected by broadinstitute.
The class HetPulldownCalculatorUnitTest, method inputGetPileupBaseCount.
@DataProvider(name = "inputGetPileupBaseCount")
public Object[][] inputGetPileupBaseCount() throws IOException {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().open(NORMAL_BAM_FILE)) {
        final IntervalList intervals = new IntervalList(bamReader.getFileHeader());
        intervals.add(new Interval("1", 100, 100));
        intervals.add(new Interval("1", 11000, 11000));
        intervals.add(new Interval("1", 14000, 14000));
        intervals.add(new Interval("1", 14630, 14630));
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, intervals);
        final Nucleotide.Counter baseCounts1 = makeBaseCounts(0, 0, 0, 0);
        final Nucleotide.Counter baseCounts2 = makeBaseCounts(0, 9, 0, 0);
        final Nucleotide.Counter baseCounts3 = makeBaseCounts(12, 0, 0, 0);
        final Nucleotide.Counter baseCounts4 = makeBaseCounts(0, 0, 8, 9);
        if (!locusIterator.hasNext()) {
            throw new SAMException("Can't get locus to start iteration. Check that " + NORMAL_BAM_FILE.toString() + " contains 1:0-16000.");
        }
        final SamLocusIterator.LocusInfo locus1 = locusIterator.next();
        final SamLocusIterator.LocusInfo locus2 = locusIterator.next();
        final SamLocusIterator.LocusInfo locus3 = locusIterator.next();
        final SamLocusIterator.LocusInfo locus4 = locusIterator.next();
        locusIterator.close();
        return new Object[][] { { locus1, baseCounts1 }, { locus2, baseCounts2 }, { locus3, baseCounts3 }, { locus4, baseCounts4 } };
    }
}
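For readers unfamiliar with the API, here is a minimal, self-contained sketch of the same pattern reduced to plain htsjdk, with the GATK test scaffolding (Nucleotide.Counter, makeBaseCounts) stripped out. The BAM path and coordinates are placeholders; any coordinate-sorted BAM will do.

import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.SamLocusIterator;

import java.io.File;
import java.io.IOException;

public final class LocusPileupDemo {
    public static void main(final String[] args) throws IOException {
        final File bam = new File("sample.bam"); // placeholder input
        try (final SamReader reader = SamReaderFactory.makeDefault().open(bam)) {
            // Interval coordinates are 1-based and inclusive.
            final IntervalList sites = new IntervalList(reader.getFileHeader());
            sites.add(new Interval("1", 11000, 11000));
            final SamLocusIterator locusIterator = new SamLocusIterator(reader, sites);
            for (final SamLocusIterator.LocusInfo locus : locusIterator) {
                // Tally the pileup bases at this locus, much as makeBaseCounts expects.
                final int[] acgt = new int[4];
                for (final SamLocusIterator.RecordAndOffset rao : locus.getRecordAndOffsets()) {
                    switch (Character.toUpperCase((char) rao.getReadBase())) {
                        case 'A': acgt[0]++; break;
                        case 'C': acgt[1]++; break;
                        case 'G': acgt[2]++; break;
                        case 'T': acgt[3]++; break;
                        default: break; // ignore N and other calls
                    }
                }
                System.out.printf("%s:%d A=%d C=%d G=%d T=%d%n",
                        locus.getSequenceName(), locus.getPosition(),
                        acgt[0], acgt[1], acgt[2], acgt[3]);
            }
            locusIterator.close();
        }
    }
}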
Use of htsjdk.samtools.util.SamLocusIterator in project gatk by broadinstitute.
The class CountingPairedFilter (a counting read filter defined in the WGS coverage metrics collector), as used in method doWork.
@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    // Set up all the inputs
    final ProgressLogger progress = new ProgressLogger(logger, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT);
    // Load up the reference sequence and double check sequence dictionaries
    if (!in.getFileHeader().getSequenceDictionary().isEmpty()) {
        SequenceUtil.assertSequenceDictionariesEqual(in.getFileHeader().getSequenceDictionary(), refWalker.getSequenceDictionary());
    }
    final SamLocusIterator iterator = new SamLocusIterator(in);
    final List<SamRecordFilter> filters = new ArrayList<>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    filters.add(pairFilter);
    // Not a counting filter because we never want to count reads twice
    filters.add(new SecondaryAlignmentFilter());
    iterator.setSamFilters(filters);
    iterator.setEmitUncoveredLoci(true);
    // Handled separately because we want to count bases
    iterator.setMappingQualityScoreCutoff(0);
    // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0);
    iterator.setIncludeNonPfReads(false);
    final int max = COVERAGE_CAP;
    final long[] HistogramArray = new long[max + 1];
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;
    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;
    // Loop through all the loci
    while (iterator.hasNext()) {
        final SamLocusIterator.LocusInfo info = iterator.next();
        // Check that the reference is not N
        final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
        final byte base = ref.getBases()[info.getPosition() - 1];
        if (base == 'N')
            continue;
        // Figure out the coverage while not counting overlapping reads twice, and excluding various things
        final Set<String> readNames = new HashSet<>(info.getRecordAndOffsets().size());
        int pileupSize = 0;
        for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndOffsets()) {
            if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) {
                ++basesExcludedByBaseq;
                continue;
            }
            if (!readNames.add(recs.getRecord().getReadName())) {
                ++basesExcludedByOverlap;
                continue;
            }
            pileupSize++;
            if (pileupSize <= max) {
                baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
            }
        }
        final int depth = Math.min(readNames.size(), max);
        if (depth < readNames.size())
            basesExcludedByCapping += readNames.size() - max;
        HistogramArray[depth]++;
        // Record progress and perhaps stop
        progress.record(info.getSequenceName(), info.getPosition());
        if (usingStopAfter && ++counter > stopAfter)
            break;
    }
    // Construct the coverage histogram
    final Histogram<Integer> histo = new Histogram<>("coverage", "count");
    for (int i = 0; i < HistogramArray.length; ++i) {
        histo.increment(i, HistogramArray[i]);
    }
    // Construct the base-quality histogram
    final Histogram<Integer> baseQHisto = new Histogram<>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
        baseQHisto.increment(i, baseQHistogramArray[i]);
    }
    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();
    final long basesExcludedByDupes = dupeFilter.getFilteredBases();
    final long basesExcludedByMapq = mapqFilter.getFilteredBases();
    final long basesExcludedByPairing = pairFilter.getFilteredBases();
    final double total = histo.getSum();
    final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq + basesExcludedByPairing + basesExcludedByBaseq + basesExcludedByOverlap + basesExcludedByCapping;
    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;
    metrics.PCT_5X = MathUtils.sum(HistogramArray, 5, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X = MathUtils.sum(HistogramArray, 10, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X = MathUtils.sum(HistogramArray, 15, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X = MathUtils.sum(HistogramArray, 20, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X = MathUtils.sum(HistogramArray, 25, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X = MathUtils.sum(HistogramArray, 30, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X = MathUtils.sum(HistogramArray, 40, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X = MathUtils.sum(HistogramArray, 50, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X = MathUtils.sum(HistogramArray, 60, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X = MathUtils.sum(HistogramArray, 70, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X = MathUtils.sum(HistogramArray, 80, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X = MathUtils.sum(HistogramArray, 90, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X = MathUtils.sum(HistogramArray, 100, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
        out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);
    return null;
}
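The CountingFilter classes above are GATK/Picard wrappers that tally how many bases each filter removes; htsjdk itself ships plain, non-counting equivalents. Below is a minimal sketch of the same filter wiring using only stock htsjdk filters. The BAM path, mapping-quality cutoff, and coverage cap are illustrative values, not taken from the tool.

import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.DuplicateReadFilter;
import htsjdk.samtools.filter.MappingQualityFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.SecondaryAlignmentFilter;
import htsjdk.samtools.util.SamLocusIterator;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public final class FilteredDepthDemo {
    public static void main(final String[] args) throws IOException {
        final File bam = new File("sample.bam"); // placeholder path
        final int coverageCap = 250;             // illustrative cap
        final long[] depthHistogram = new long[coverageCap + 1];
        try (final SamReader reader = SamReaderFactory.makeDefault().open(bam)) {
            final SamLocusIterator iterator = new SamLocusIterator(reader);
            // Stock htsjdk filters standing in for GATK's counting variants.
            final List<SamRecordFilter> filters = Arrays.asList(
                    new SecondaryAlignmentFilter(),
                    new DuplicateReadFilter(),
                    new MappingQualityFilter(20));
            iterator.setSamFilters(filters);
            iterator.setEmitUncoveredLoci(true);
            for (final SamLocusIterator.LocusInfo info : iterator) {
                // Cap the depth, as the tool does, so the histogram stays bounded.
                final int depth = Math.min(info.getRecordAndOffsets().size(), coverageCap);
                depthHistogram[depth]++;
            }
            iterator.close();
        }
        System.out.println("Loci at depth 0: " + depthHistogram[0]);
    }
}

Unlike the tool above, this sketch counts overlapping mates twice at a locus; the read-name deduplication in doWork is what prevents that.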
Use of htsjdk.samtools.util.SamLocusIterator in project gatk-protected by broadinstitute.
The class BayesianHetPulldownCalculator, method getTumorHetPulldownFromNormalPulldown.
/**
 * Calculates the Het pulldown from a tumor file, given the tumor BAM file and the pulldown from a matched
 * normal BAM file.
 *
 * Note: this method does not perform any statistical inference. The Het SNP sites are directly carried over
 * from the matched normal pulldown. Here, we only collect statistics (ref count, alt count, read depth) and
 * save them to a pulldown. The verbosity level of the pulldown is INTERMEDIATE (see {@link AllelicCountTableColumn}).
 *
 * @param tumorBamFile the tumor BAM file
 * @param normalHetPulldown the matched normal Het pulldown
 * @return tumor Het pulldown
 */
public Pulldown getTumorHetPulldownFromNormalPulldown(final File tumorBamFile, final Pulldown normalHetPulldown) {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(tumorBamFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + tumorBamFile.toString() + " must be coordinate sorted.");
        }
        final Pulldown tumorHetPulldown = new Pulldown(bamReader.getFileHeader());
        final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(bamReader);
        /* get a map of SimpleIntervals in the pulldown to their index */
        final Map<SimpleInterval, Integer> normalPulldownIndexMap = normalHetPulldown.getSimpleIntervalToIndexMap();
        final int totalNumberOfSNPs = snpIntervals.size();
        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount <= readDepthThreshold) {
                continue;
            }
            /* find the AllelicCount from the normal pulldown */
            final int indexInNormalPulldown;
            try {
                indexInNormalPulldown = normalPulldownIndexMap.get(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()));
            } catch (final NullPointerException e) {
                throw new GATKException.ShouldNeverReachHereException("Cannot find the required AllelicCount object in the normal pulldown. Stopping.");
            }
            /* just count the alt and ref nucleotides and add to the tumor pulldown */
            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            tumorHetPulldown.add(new AllelicCount(
                    new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                    (int) baseCounts.get(normalHetPulldown.getCounts().get(indexInNormalPulldown).getRefNucleotide()),
                    (int) baseCounts.get(normalHetPulldown.getCounts().get(indexInNormalPulldown).getAltNucleotide()),
                    normalHetPulldown.getCounts().get(indexInNormalPulldown).getRefNucleotide(),
                    normalHetPulldown.getCounts().get(indexInNormalPulldown).getAltNucleotide(),
                    totalReadCount));
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return tumorHetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
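One detail worth noting in the lookup above: normalPulldownIndexMap.get(...) returns a boxed Integer, so the NullPointerException that the catch block converts into a ShouldNeverReachHereException actually arises from auto-unboxing a null result for a missing key. A small standalone illustration of that hazard and the usual explicit null check; all names here are illustrative, not from GATK:

import java.util.HashMap;
import java.util.Map;

public final class UnboxingLookupDemo {
    public static void main(final String[] args) {
        final Map<String, Integer> indexMap = new HashMap<>();
        indexMap.put("1:11000-11000", 0);

        // Pattern used in the snippet: let the unboxing NPE signal a missing key.
        try {
            final int missing = indexMap.get("1:99999-99999"); // get() returns null -> NPE on unboxing
            System.out.println(missing);
        } catch (final NullPointerException e) {
            System.out.println("Key absent (detected via unboxing NPE)");
        }

        // More direct alternative: test the boxed value before unboxing.
        final Integer boxed = indexMap.get("1:11000-11000");
        if (boxed == null) {
            throw new IllegalStateException("Key absent");
        }
        final int index = boxed;
        System.out.println("index = " + index);
    }
}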
Use of htsjdk.samtools.util.SamLocusIterator in project gatk by broadinstitute.
The class BayesianHetPulldownCalculator, method getHetPulldown.
/**
 * For a given normal or tumor BAM file, walks through the list of common SNPs
 * ({@link BayesianHetPulldownCalculator#snpIntervals}), detects heterozygous sites, and returns
 * a {@link Pulldown} containing detailed information on the called heterozygous SNP sites.
 *
 * The {@code hetCallingStringency} parameter sets the threshold posterior for calling a Het SNP site:
 *
 *     hetPosteriorThreshold = 1 - 10^{-hetCallingStringency}
 *     hetThresholdLogOdds = log(hetPosteriorThreshold / (1 - hetPosteriorThreshold))
 *                         = log(10^{hetCallingStringency} - 1)
 *
 * (see CNV-methods.pdf for details)
 *
 * @param bamFile sorted BAM file for the sample
 * @param hetCallingStringency stringency for calling a Het site
 * @return Pulldown of heterozygous SNP sites in 1-based format
 */
public Pulldown getHetPulldown(final File bamFile, final double hetCallingStringency) {
    /* log odds from stringency */
    final double hetThresholdLogOdds = FastMath.log(FastMath.pow(10, hetCallingStringency) - 1);
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
         final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }
        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(bamReader);
        final int totalNumberOfSNPs = snpIntervals.size();
        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount <= readDepthThreshold) {
                continue;
            }
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            if (!isProperBase(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Even though" +
                        " this position is indicated to be a possible heterozygous SNP in the provided SNP interval list," +
                        " no inference can be made. Continuing ...", locus.getPosition(), refBase.toString()));
                continue;
            }
            final Map<Nucleotide, List<BaseQuality>> baseQualities = getPileupBaseQualities(locus);
            final Nucleotide altBase = inferAltFromPileup(baseQualities, refBase);
            /* calculate Het log odds */
            final double hetLogLikelihood = getHetLogLikelihood(baseQualities, refBase, altBase);
            final double homLogLikelihood = getHomLogLikelihood(baseQualities, refBase, altBase, DEFAULT_PRIOR_REF_HOM);
            final double hetLogOdds = (hetLogLikelihood + FastMath.log(DEFAULT_PRIOR_HET)) - (homLogLikelihood + FastMath.log(1 - DEFAULT_PRIOR_HET));
            if (hetLogOdds > hetThresholdLogOdds) {
                hetPulldown.add(new AllelicCount(
                        new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()),
                        baseQualities.get(refBase).size(), baseQualities.get(altBase).size(),
                        refBase, altBase, totalReadCount, hetLogOdds));
            }
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
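As a sanity check on the javadoc algebra: substituting p = 1 - 10^{-s} into log(p / (1 - p)) gives log((1 - 10^{-s}) / 10^{-s}) = log(10^s - 1), which is exactly the closed form computed with FastMath at the top of the method. A standalone numeric check follows; the stringency value is arbitrary, and plain java.lang.Math stands in for commons-math's FastMath.

public final class HetThresholdDemo {
    public static void main(final String[] args) {
        final double stringency = 2.0; // arbitrary example value
        final double p = 1.0 - Math.pow(10.0, -stringency);   // posterior threshold
        final double logOddsDirect = Math.log(p / (1.0 - p)); // log(p / (1 - p))
        final double logOddsClosed = Math.log(Math.pow(10.0, stringency) - 1.0);
        // The two forms agree up to floating-point error.
        System.out.printf("direct=%.12f closed=%.12f%n", logOddsDirect, logOddsClosed);
    }
}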
Use of htsjdk.samtools.util.SamLocusIterator in project gatk by broadinstitute.
The class BayesianHetPulldownCalculator, method getTumorHetPulldownFromNormalPulldown: identical, line for line, to the gatk-protected version of this method shown above.