Use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.
The class PairedVariantSubContextIterator, as used in the method doWork.
// TODO: add an optimization for when the samples are in the same file
// TODO: add an option to auto-detect pairs based on matching sample names
// TODO: allow multiple sample-pairs in one pass
@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(TRUTH_VCF);
    IOUtil.assertFileIsReadable(CALL_VCF);
    final File summaryMetricsFile = new File(OUTPUT + SUMMARY_METRICS_FILE_EXTENSION);
    final File detailedMetricsFile = new File(OUTPUT + DETAILED_METRICS_FILE_EXTENSION);
    final File contingencyMetricsFile = new File(OUTPUT + CONTINGENCY_METRICS_FILE_EXTENSION);
    IOUtil.assertFileIsWritable(summaryMetricsFile);
    IOUtil.assertFileIsWritable(detailedMetricsFile);
    IOUtil.assertFileIsWritable(contingencyMetricsFile);

    final boolean usingIntervals = this.INTERVALS != null && !this.INTERVALS.isEmpty();
    IntervalList intervals = null;
    SAMSequenceDictionary intervalsSamSequenceDictionary = null;
    if (usingIntervals) {
        logger.info("Loading up region lists.");
        long genomeBaseCount = 0;
        for (final File f : INTERVALS) {
            IOUtil.assertFileIsReadable(f);
            final IntervalList tmpIntervalList = IntervalList.fromFile(f);
            if (genomeBaseCount == 0) {
                // Don't count the reference length more than once.
                intervalsSamSequenceDictionary = tmpIntervalList.getHeader().getSequenceDictionary();
                genomeBaseCount = intervalsSamSequenceDictionary.getReferenceLength();
            }
            if (intervals == null) {
                intervals = tmpIntervalList;
            } else if (INTERSECT_INTERVALS) {
                intervals = IntervalList.intersection(intervals, tmpIntervalList);
            } else {
                intervals = IntervalList.union(intervals, tmpIntervalList);
            }
        }
        if (intervals != null) {
            intervals = intervals.uniqued();
        }
        logger.info("Finished loading up region lists.");
    }

    final VCFFileReader truthReader = new VCFFileReader(TRUTH_VCF, USE_VCF_INDEX);
    final VCFFileReader callReader = new VCFFileReader(CALL_VCF, USE_VCF_INDEX);
    // Check that the samples actually exist in the files!
    if (!truthReader.getFileHeader().getGenotypeSamples().contains(TRUTH_SAMPLE)) {
        throw new UserException("File " + TRUTH_VCF.getAbsolutePath() + " does not contain genotypes for sample " + TRUTH_SAMPLE);
    }
    if (!callReader.getFileHeader().getGenotypeSamples().contains(CALL_SAMPLE)) {
        throw new UserException("File " + CALL_VCF.getAbsolutePath() + " does not contain genotypes for sample " + CALL_SAMPLE);
    }
    // Verify that both VCFs have the same sequence dictionary.
    SequenceUtil.assertSequenceDictionariesEqual(truthReader.getFileHeader().getSequenceDictionary(), callReader.getFileHeader().getSequenceDictionary());
    if (usingIntervals) {
        // If using intervals, verify that the interval sequence dictionary agrees with those of the VCFs.
        SequenceUtil.assertSequenceDictionariesEqual(intervalsSamSequenceDictionary, truthReader.getFileHeader().getSequenceDictionary());
    }

    // Build the pair of iterators over the regions of interest.
    final Iterator<VariantContext> truthIterator, callIterator;
    if (usingIntervals) {
        truthIterator = new ByIntervalListVariantContextIterator(truthReader, intervals);
        callIterator = new ByIntervalListVariantContextIterator(callReader, intervals);
    } else {
        truthIterator = truthReader.iterator();
        callIterator = callReader.iterator();
    }

    // Now do the iteration and count things up.
    final PairedVariantSubContextIterator pairedIterator = new PairedVariantSubContextIterator(truthIterator, TRUTH_SAMPLE, callIterator, CALL_SAMPLE, truthReader.getFileHeader().getSequenceDictionary());
    snpCounter = new GenotypeConcordanceCounts();
    indelCounter = new GenotypeConcordanceCounts();

    // A map to keep track of the counts of truth/call states that we could not successfully classify.
    final Map<String, Integer> unClassifiedStatesMap = new HashMap<>();

    logger.info("Starting iteration over variants.");
    while (pairedIterator.hasNext()) {
        final VcTuple tuple = pairedIterator.next();
        final VariantContext.Type truthVariantContextType = tuple.truthVariantContext != null ? tuple.truthVariantContext.getType() : NO_VARIATION;
        final VariantContext.Type callVariantContextType = tuple.callVariantContext != null ? tuple.callVariantContext.getType() : NO_VARIATION;

        // A flag to keep track of whether we have been able to successfully classify the truth/call states.
        // Unclassified states include MIXED/MNP/Symbolic...
        boolean stateClassified = false;
        final TruthAndCallStates truthAndCallStates = determineState(tuple.truthVariantContext, TRUTH_SAMPLE, tuple.callVariantContext, CALL_SAMPLE, MIN_GQ, MIN_DP);
        if (truthVariantContextType == SNP) {
            if ((callVariantContextType == SNP) || (callVariantContextType == MIXED) || (callVariantContextType == NO_VARIATION)) {
                // Note: if truth is SNP and call is MIXED, the event is logged in the snpCounter, with column = MIXED.
                snpCounter.increment(truthAndCallStates);
                stateClassified = true;
            }
        } else if (truthVariantContextType == INDEL) {
            if ((callVariantContextType == INDEL) || (callVariantContextType == MIXED) || (callVariantContextType == NO_VARIATION)) {
                // Note: if truth is INDEL and call is MIXED, the event is logged in the indelCounter, with column = MIXED.
                indelCounter.increment(truthAndCallStates);
                stateClassified = true;
            }
        } else if (truthVariantContextType == MIXED) {
            if (callVariantContextType == SNP) {
                // Note: if truth is MIXED and call is SNP, the event is logged in the snpCounter, with row = MIXED.
                snpCounter.increment(truthAndCallStates);
                stateClassified = true;
            } else if (callVariantContextType == INDEL) {
                // Note: if truth is MIXED and call is INDEL, the event is logged in the indelCounter, with row = MIXED.
                indelCounter.increment(truthAndCallStates);
                stateClassified = true;
            }
        } else if (truthVariantContextType == NO_VARIATION) {
            if (callVariantContextType == SNP) {
                snpCounter.increment(truthAndCallStates);
                stateClassified = true;
            } else if (callVariantContextType == INDEL) {
                indelCounter.increment(truthAndCallStates);
                stateClassified = true;
            }
        }
        if (!stateClassified) {
            final String condition = truthVariantContextType + " " + callVariantContextType;
            unClassifiedStatesMap.merge(condition, 1, Integer::sum);
        }

        final VariantContext variantContextForLogging = tuple.truthVariantContext != null ? tuple.truthVariantContext : tuple.callVariantContext;
        progress.record(variantContextForLogging.getContig(), variantContextForLogging.getStart());
    }

    // Calculate and store the summary-level metrics.
    final MetricsFile<GenotypeConcordanceSummaryMetrics, ?> genotypeConcordanceSummaryMetricsFile = getMetricsFile();
    GenotypeConcordanceSummaryMetrics summaryMetrics = new GenotypeConcordanceSummaryMetrics(SNP, snpCounter, TRUTH_SAMPLE, CALL_SAMPLE);
    genotypeConcordanceSummaryMetricsFile.addMetric(summaryMetrics);
    summaryMetrics = new GenotypeConcordanceSummaryMetrics(INDEL, indelCounter, TRUTH_SAMPLE, CALL_SAMPLE);
    genotypeConcordanceSummaryMetricsFile.addMetric(summaryMetrics);
    genotypeConcordanceSummaryMetricsFile.write(summaryMetricsFile);

    // Calculate and store the detailed metrics for both SNPs and indels.
    final MetricsFile<GenotypeConcordanceDetailMetrics, ?> genotypeConcordanceDetailMetrics = getMetricsFile();
    outputDetailMetricsFile(SNP, genotypeConcordanceDetailMetrics, snpCounter, TRUTH_SAMPLE, CALL_SAMPLE);
    outputDetailMetricsFile(INDEL, genotypeConcordanceDetailMetrics, indelCounter, TRUTH_SAMPLE, CALL_SAMPLE);
    genotypeConcordanceDetailMetrics.write(detailedMetricsFile);

    // Calculate and store the contingency metrics.
    final MetricsFile<GenotypeConcordanceContingencyMetrics, ?> genotypeConcordanceContingencyMetricsFile = getMetricsFile();
    GenotypeConcordanceContingencyMetrics contingencyMetrics = new GenotypeConcordanceContingencyMetrics(SNP, snpCounter, TRUTH_SAMPLE, CALL_SAMPLE);
    genotypeConcordanceContingencyMetricsFile.addMetric(contingencyMetrics);
    contingencyMetrics = new GenotypeConcordanceContingencyMetrics(INDEL, indelCounter, TRUTH_SAMPLE, CALL_SAMPLE);
    genotypeConcordanceContingencyMetricsFile.addMetric(contingencyMetrics);
    genotypeConcordanceContingencyMetricsFile.write(contingencyMetricsFile);

    for (final Map.Entry<String, Integer> entry : unClassifiedStatesMap.entrySet()) {
        logger.info("Unclassified truth/call variant context type counts: " + entry.getKey() + " " + entry.getValue());
    }
    return null;
}
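The interval handling above is the IntervalList usage this entry illustrates: fold several interval files into a single list with intersection() or union(), then call uniqued() to sort and merge overlaps. Below is a minimal standalone sketch of that fold; the file names and the IntervalListFold class are hypothetical.

import htsjdk.samtools.util.IntervalList;
import java.io.File;
import java.util.Arrays;
import java.util.List;

public final class IntervalListFold {
    public static void main(final String[] args) {
        // Hypothetical Picard-style interval files; any *.interval_list with an @SQ header works.
        final List<File> files = Arrays.asList(new File("capture.interval_list"), new File("confident.interval_list"));
        final boolean intersect = true; // mirrors the INTERSECT_INTERVALS flag above

        IntervalList combined = null;
        for (final File f : files) {
            final IntervalList next = IntervalList.fromFile(f);
            if (combined == null) {
                combined = next;
            } else {
                combined = intersect ? IntervalList.intersection(combined, next)
                                     : IntervalList.union(combined, next);
            }
        }
        // uniqued() sorts and merges overlapping/adjacent intervals into a canonical list.
        System.out.println(combined.uniqued().size() + " intervals after combining.");
    }
}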
Use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.
The class HetPulldownCalculator, method getHetPulldown.
/**
 * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
 * where the intervals give the positions of likely heterozygous SNP sites.
 *
 * <p>
 * For a normal sample:
 * <ul>
 *     <li>The IntervalList snpIntervals gives common SNP sites in 1-based format.</li>
 *     <li>The p-value threshold must be specified for a two-sided binomial test,
 *     which is used to determine which SNP sites from snpIntervals are
 *     compatible with a heterozygous SNP, given the sample. Only these sites are output.</li>
 * </ul>
 * </p>
 * <p>
 * For a tumor sample:
 * <ul>
 *     <li>The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
 *     This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
 *     Only these sites are output.</li>
 * </ul>
 * </p>
 *
 * @param bamFile         sorted BAM file for the sample
 * @param snpIntervals    IntervalList of SNP sites
 * @param sampleType      flag indicating the type of sample (SampleType.NORMAL or SampleType.TUMOR);
 *                        determines whether to perform the binomial test
 * @param pvalThreshold   p-value threshold for the two-sided binomial test, used for the normal sample
 * @param minimumRawReads minimum number of total reads that must be present at a het site
 * @return Pulldown of heterozygous SNP sites in 1-based format
 */
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType, final double pvalThreshold, final int minimumRawReads) {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
         final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }

        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final int totalNumberOfSNPs = snpIntervals.size();
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals, totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);

        // Set read and locus filters. [Note: read counts match IGV, but are off by a few from pysam.mpileup.]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(false);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);

        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;

            // Include N, etc. reads here.
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount < minimumRawReads) {
                continue;
            }

            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            // Only include total ACGT counts in the binomial test (exclude N, etc.).
            final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();

            if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
                continue;
            }

            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            final int refReadCount = (int) baseCounts.get(refBase);
            final int altReadCount = totalBaseCount - refReadCount;
            hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount));
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
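The two-sided binomial test mentioned in the javadoc lives in isPileupHetCompatible, which this entry does not show. The sketch below is a plausible reconstruction using Apache Commons Math, not the actual GATK implementation; the HetCompatibility class and the isHetCompatible signature are assumptions.

import org.apache.commons.math3.stat.inference.AlternativeHypothesis;
import org.apache.commons.math3.stat.inference.BinomialTest;

final class HetCompatibility {
    // Hypothetical stand-in for isPileupHetCompatible. A site is het-compatible if, at
    // pvalThreshold, we cannot reject the null hypothesis that the ref-base count is
    // drawn from Binomial(totalAcgtCount, 0.5).
    static boolean isHetCompatible(final int refCount, final int totalAcgtCount, final double pvalThreshold) {
        if (totalAcgtCount == 0) {
            return false; // no usable ACGT bases at this site
        }
        final double pval = new BinomialTest().binomialTest(
                totalAcgtCount, refCount, 0.5, AlternativeHypothesis.TWO_SIDED);
        // A small p-value means the allele fraction is far from 0.5, i.e. unlikely heterozygous.
        return pval >= pvalThreshold;
    }
}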
Use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.
The class IntervalUtilsUnitTest, method testLoadIntervalsInvalidPicardIntervalHandling.
@Test(expectedExceptions = UserException.MalformedFile.class, dataProvider = "invalidIntervalTestData")
public void testLoadIntervalsInvalidPicardIntervalHandling(GenomeLocParser genomeLocParser, String contig, int intervalStart, int intervalEnd) throws Exception {
    SAMFileHeader picardFileHeader = new SAMFileHeader();
    picardFileHeader.addSequence(genomeLocParser.getContigInfo("1"));
    // Intentionally add an extra contig not in our genomeLocParser's sequence dictionary
    // so that we can add intervals to the Picard interval file for contigs not in our dictionary.
    picardFileHeader.addSequence(new SAMSequenceRecord("2", 100000));

    IntervalList picardIntervals = new IntervalList(picardFileHeader);
    picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname"));

    File picardIntervalFile = createTempFile("testLoadIntervalsInvalidPicardIntervalHandling", ".intervals");
    picardIntervals.write(picardIntervalFile);

    List<String> intervalArgs = new ArrayList<>(1);
    intervalArgs.add(picardIntervalFile.getAbsolutePath());

    // loadIntervals() will validate all intervals against the sequence dictionary in our genomeLocParser,
    // and should throw for all intervals in our invalidIntervalTestData set.
    IntervalUtils.loadIntervals(intervalArgs, IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser);
}
Use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.
The class IntervalUtilsUnitTest, method testIntervalFileToListInvalidPicardIntervalHandling.
@Test(expectedExceptions = UserException.CouldNotReadInputFile.class, dataProvider = "invalidIntervalTestData")
public void testIntervalFileToListInvalidPicardIntervalHandling(GenomeLocParser genomeLocParser, String contig, int intervalStart, int intervalEnd) throws Exception {
    final SAMFileHeader picardFileHeader = new SAMFileHeader();
    picardFileHeader.addSequence(genomeLocParser.getContigInfo("1"));
    // Intentionally add an extra contig not in our genomeLocParser's sequence dictionary.
    picardFileHeader.addSequence(new SAMSequenceRecord("2", 100000));

    final IntervalList picardIntervals = new IntervalList(picardFileHeader);
    picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname"));

    final File picardIntervalFile = createTempFile("testIntervalFileToListInvalidPicardIntervalHandling", ".intervals");
    picardIntervals.write(picardIntervalFile);

    // intervalFileToList() will validate all intervals against the sequence dictionary in our genomeLocParser,
    // and should throw for all intervals in our invalidIntervalTestData set.
    IntervalUtils.intervalFileToList(genomeLocParser, picardIntervalFile.getAbsolutePath());
}
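Both tests rely on the same round trip: build an IntervalList against a SAM-style header, write() it to a Picard interval_list file, and have the loader parse it back. Below is a minimal self-contained sketch of that round trip; the class name and temp-file prefix are hypothetical.

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import java.io.File;

public final class IntervalListRoundTrip {
    public static void main(final String[] args) throws Exception {
        final SAMFileHeader header = new SAMFileHeader();
        header.addSequence(new SAMSequenceRecord("2", 100000));

        final IntervalList intervals = new IntervalList(header);
        // Interval(contig, start, end, negativeStrand, name); coordinates are 1-based, inclusive.
        intervals.add(new Interval("2", 1, 100, true, "dummyname"));

        final File f = File.createTempFile("roundtrip", ".interval_list");
        f.deleteOnExit();
        intervals.write(f);

        // fromFile() parses the SAM-style header back into the list's header.
        final IntervalList readBack = IntervalList.fromFile(f);
        System.out.println(readBack.size() + " interval(s); dictionary size: "
                + readBack.getHeader().getSequenceDictionary().size());
    }
}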
Use of htsjdk.samtools.util.IntervalList in project gatk-protected by broadinstitute.
The class AllelicCountCollector, method collect.
/**
 * Returns an {@link AllelicCountCollection} based on the pileup at sites (specified by an interval list)
 * in a sorted BAM file. Reads and bases below the specified mapping quality and base quality, respectively,
 * are filtered out of the pileup. The alt count is defined as the total count minus the ref count, and the
 * alt nucleotide is defined as the non-ref base with the highest count, with ties broken by the order of the
 * bases in {@link AllelicCountCollector#BASES}.
 *
 * @param bamFile           sorted BAM file
 * @param siteIntervals     interval list of sites
 * @param minMappingQuality minimum mapping quality required for reads to be included in the pileup
 * @param minBaseQuality    minimum base quality required for bases to be included in the pileup
 * @return AllelicCountCollection of ref/alt counts at the sites in the BAM file
 */
public AllelicCountCollection collect(final File bamFile, final IntervalList siteIntervals, final int minMappingQuality, final int minBaseQuality) {
    try (final SamReader reader = readerFactory.open(bamFile)) {
        ParamUtils.isPositiveOrZero(minMappingQuality, "Minimum mapping quality must be nonnegative.");
        ParamUtils.isPositiveOrZero(minBaseQuality, "Minimum base quality must be nonnegative.");
        if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }

        final int numberOfSites = siteIntervals.size();
        final boolean useIndex = numberOfSites < MAX_INTERVALS_FOR_INDEX;
        final SamLocusIterator locusIterator = new SamLocusIterator(reader, siteIntervals, useIndex);

        // Set read and locus filters. [Note: read counts match IGV, but are off by a few from pysam.mpileup.]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(true);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);

        logger.info("Examining " + numberOfSites + " sites in total...");
        int locusCount = 0;
        final AllelicCountCollection counts = new AllelicCountCollection();
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " sites.");
            }
            locusCount++;

            final Nucleotide refBase = Nucleotide.valueOf(referenceWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            if (!BASES.contains(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Skipping...", locus.getPosition(), refBase.toString()));
                continue;
            }

            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            // Only include total ACGT counts (exclude N, etc.).
            final int totalBaseCount = BASES.stream().mapToInt(b -> (int) baseCounts.get(b)).sum();
            final int refReadCount = (int) baseCounts.get(refBase);
            // We take alt = total - ref instead of the actual alt count.
            final int altReadCount = totalBaseCount - refReadCount;
            final Nucleotide altBase = inferAltFromPileupBaseCounts(baseCounts, refBase);
            counts.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount, refBase, altBase));
        }
        logger.info(locusCount + " sites out of " + numberOfSites + " total sites were examined.");
        return counts;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException("Unable to collect allelic counts from " + bamFile);
    }
}
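inferAltFromPileupBaseCounts is not shown in this entry. The sketch below reconstructs it from the rule stated in the javadoc (highest non-ref count, ties broken by the order of BASES); the class name and the BASES constant are assumptions.

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.broadinstitute.hellbender.utils.Nucleotide; // GATK's Nucleotide, as used in the method above

final class AltBaseInference {
    // Assumed to mirror the collector's BASES constant: A, C, G, T in that order.
    private static final List<Nucleotide> BASES = Arrays.asList(Nucleotide.A, Nucleotide.C, Nucleotide.G, Nucleotide.T);

    // Hypothetical reconstruction: pick the non-ref base with the highest count.
    // Stream.max keeps the earlier element on ties, so BASES order breaks ties.
    static Nucleotide inferAltFromPileupBaseCounts(final Nucleotide.Counter baseCounts, final Nucleotide refBase) {
        return BASES.stream()
                .filter(b -> b != refBase)
                .max(Comparator.comparingLong(baseCounts::get))
                .orElseThrow(IllegalStateException::new);
    }
}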