use of htsjdk.samtools.reference.ReferenceSequenceFileWalker in project gatk by broadinstitute.
the class SinglePassSamProgram method makeItSo.
public static void makeItSo(final File input, final File referenceSequence, final boolean assumeSorted, final long stopAfter, final Collection<SinglePassSamProgram> programs) {
// Setup the standard inputs
IOUtil.assertFileIsReadable(input);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(referenceSequence).open(input);
// Optionally load up the reference sequence and double check sequence dictionaries
final ReferenceSequenceFileWalker walker;
if (referenceSequence == null) {
walker = null;
} else {
IOUtil.assertFileIsReadable(referenceSequence);
walker = new ReferenceSequenceFileWalker(referenceSequence);
if (!in.getFileHeader().getSequenceDictionary().isEmpty()) {
SequenceUtil.assertSequenceDictionariesEqual(in.getFileHeader().getSequenceDictionary(), walker.getSequenceDictionary());
}
}
// Check on the sort order of the BAM file
{
final SortOrder sort = in.getFileHeader().getSortOrder();
if (sort != SortOrder.coordinate) {
if (assumeSorted) {
logger.warn("File reports sort order '" + sort + "', assuming it's coordinate sorted anyway.");
} else {
throw new UserException("File " + input.getAbsolutePath() + " should be coordinate sorted but " + "the header says the sort order is " + sort + ". If you believe the file " + "to be coordinate sorted you may pass ASSUME_SORTED=true");
}
}
}
// Call the abstract setup method!
boolean anyUseNoRefReads = false;
for (final SinglePassSamProgram program : programs) {
program.setup(in.getFileHeader(), input);
anyUseNoRefReads = anyUseNoRefReads || program.usesNoRefReads();
}
final ProgressLogger progress = new ProgressLogger(logger);
for (final SAMRecord rec : in) {
final ReferenceSequence ref;
if (walker == null || rec.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
ref = null;
} else {
ref = walker.get(rec.getReferenceIndex());
}
for (final SinglePassSamProgram program : programs) {
program.acceptRead(rec, ref);
}
progress.record(rec);
// See if we need to terminate early?
if (stopAfter > 0 && progress.getCount() >= stopAfter) {
break;
}
// And see if we're into the unmapped reads at the end
if (!anyUseNoRefReads && rec.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
break;
}
}
CloserUtil.close(in);
for (final SinglePassSamProgram program : programs) {
program.finish();
}
}
use of htsjdk.samtools.reference.ReferenceSequenceFileWalker in project gatk by broadinstitute.
the class CollectOxoGMetrics method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
if (INTERVALS != null)
IOUtil.assertFileIsReadable(INTERVALS);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final SamReader in = SamReaderFactory.makeDefault().open(INPUT);
final Set<String> samples = new HashSet<>();
final Set<String> libraries = new HashSet<>();
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE));
libraries.add(getOrElse(rec.getLibrary(), UNKNOWN_LIBRARY));
}
// Setup the calculators
final Set<String> contexts = CONTEXTS.isEmpty() ? makeContextStrings(CONTEXT_SIZE) : CONTEXTS;
final ListMap<String, Calculator> calculators = new ListMap<>();
for (final String context : contexts) {
for (final String library : libraries) {
calculators.add(context, new Calculator(library, context));
}
}
// Load up dbSNP if available
logger.info("Loading dbSNP File: " + DB_SNP);
final DbSnpBitSetUtil dbSnp;
if (DB_SNP != null)
dbSnp = new DbSnpBitSetUtil(DB_SNP, in.getFileHeader().getSequenceDictionary());
else
dbSnp = null;
// Make an iterator that will filter out funny looking things
final SamLocusIterator iterator;
if (INTERVALS != null) {
final IntervalList intervals = IntervalList.fromFile(INTERVALS);
iterator = new SamLocusIterator(in, intervals.uniqued(), false);
} else {
iterator = new SamLocusIterator(in);
}
iterator.setEmitUncoveredLoci(false);
iterator.setMappingQualityScoreCutoff(MINIMUM_MAPPING_QUALITY);
final List<SamRecordFilter> filters = new ArrayList<>();
filters.add(new NotPrimaryAlignmentFilter());
filters.add(new DuplicateReadFilter());
if (MINIMUM_INSERT_SIZE > 0 || MAXIMUM_INSERT_SIZE > 0) {
filters.add(new InsertSizeFilter(MINIMUM_INSERT_SIZE, MAXIMUM_INSERT_SIZE));
}
iterator.setSamFilters(filters);
logger.info("Starting iteration.");
long nextLogTime = 0;
int sites = 0;
for (final SamLocusIterator.LocusInfo info : iterator) {
// Skip dbSNP sites
final String chrom = info.getSequenceName();
final int pos = info.getPosition();
final int index = pos - 1;
if (dbSnp != null && dbSnp.isDbSnpSite(chrom, pos))
continue;
// Skip sites at the end of chromosomes
final byte[] bases = refWalker.get(info.getSequenceIndex()).getBases();
if (pos < 3 || pos > bases.length - 3)
continue;
// Skip non C-G bases
final byte base = StringUtil.toUpperCase(bases[index]);
if (base != 'C' && base != 'G')
continue;
// Get the context string
final String context;
{
final String tmp = StringUtil.bytesToString(bases, index - CONTEXT_SIZE, 1 + (2 * CONTEXT_SIZE)).toUpperCase();
if (base == 'C')
context = tmp;
else
/* if G */
context = SequenceUtil.reverseComplement(tmp);
}
final List<Calculator> calculatorsForContext = calculators.get(context);
// happens if we get ambiguous bases in the reference
if (calculatorsForContext == null)
continue;
for (final Calculator calc : calculatorsForContext) calc.accept(info, base);
// See if we need to stop
if (++sites % 100 == 0) {
final long now = System.currentTimeMillis();
if (now > nextLogTime) {
logger.info("Visited " + sites + " sites of interest. Last site: " + chrom + ":" + pos);
nextLogTime = now + 60000;
}
}
if (sites >= STOP_AFTER)
break;
}
final MetricsFile<CpcgMetrics, Integer> file = getMetricsFile();
for (final List<Calculator> calcs : calculators.values()) {
for (final Calculator calc : calcs) {
final CpcgMetrics m = calc.finish();
m.SAMPLE_ALIAS = StringUtil.join(",", new ArrayList<>(samples));
file.addMetric(m);
}
}
file.write(OUTPUT);
CloserUtil.close(in);
return null;
}
use of htsjdk.samtools.reference.ReferenceSequenceFileWalker in project gatk-protected by broadinstitute.
the class BayesianHetPulldownCalculator method getHetPulldown.
/**
* For a given normal or tumor BAM file, walks through the list of common SNPs,
* {@link BayesianHetPulldownCalculator#snpIntervals}), detects heterozygous sites, and returns
* a {@link Pulldown} containing detailed information on the called heterozygous SNP sites.
*
* The {@code hetCallingStrigency} parameters sets the threshold posterior for calling a Het SNP site:
*
* hetPosteriorThreshold = 1 - 10^{-hetCallingStringency}
* hetThresholdLogOdds = log(hetPosteriorThreshold/(1-hetPosteriorThreshold))
* = log(10^{hetCallingStringency} - 1)
*
* (see CNV-methods.pdf for details)
*
* @param bamFile sorted BAM file for sample
* @param hetCallingStringency strigency for calling a Het site
* @return Pulldown of heterozygous SNP sites in 1-based format
*/
public Pulldown getHetPulldown(final File bamFile, final double hetCallingStringency) {
/* log odds from stringency */
final double hetThresholdLogOdds = FastMath.log(FastMath.pow(10, hetCallingStringency) - 1);
try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
}
final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(bamReader);
final int totalNumberOfSNPs = snpIntervals.size();
logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
int locusCount = 0;
for (final SamLocusIterator.LocusInfo locus : locusIterator) {
if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
logger.info("Examined " + locusCount + " covered sites.");
}
locusCount++;
final int totalReadCount = locus.getRecordAndOffsets().size();
if (totalReadCount <= readDepthThreshold) {
continue;
}
final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
if (!isProperBase(refBase)) {
logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Even though" + " this position is indicated to be a possible heterozygous SNP in the provided SNP interval list," + " no inference can be made. Continuing ...", locus.getPosition(), refBase.toString()));
continue;
}
final Map<Nucleotide, List<BaseQuality>> baseQualities = getPileupBaseQualities(locus);
final Nucleotide altBase = inferAltFromPileup(baseQualities, refBase);
/* calculate Het log odds */
final double hetLogLikelihood = getHetLogLikelihood(baseQualities, refBase, altBase);
final double homLogLikelihood = getHomLogLikelihood(baseQualities, refBase, altBase, DEFAULT_PRIOR_REF_HOM);
final double hetLogOdds = (hetLogLikelihood + FastMath.log(DEFAULT_PRIOR_HET)) - (homLogLikelihood + FastMath.log(1 - DEFAULT_PRIOR_HET));
if (hetLogOdds > hetThresholdLogOdds) {
hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), baseQualities.get(refBase).size(), baseQualities.get(altBase).size(), refBase, altBase, totalReadCount, hetLogOdds));
}
}
logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
return hetPulldown;
} catch (final IOException | SAMFormatException e) {
throw new UserException(e.getMessage());
}
}
use of htsjdk.samtools.reference.ReferenceSequenceFileWalker in project gatk-protected by broadinstitute.
the class HetPulldownCalculator method getHetPulldown.
/**
* For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
* where intervals give positions of likely heterozygous SNP sites.
*
* <p>
* For a normal sample:
* <ul>
* The IntervalList snpIntervals gives common SNP sites in 1-based format.
* </ul>
* <ul>
* The p-value threshold must be specified for a two-sided binomial test,
* which is used to determine SNP sites from snpIntervals that are
* compatible with a heterozygous SNP, given the sample. Only these sites are output.
* </ul>
* </p>
* <p>
* For a tumor sample:
* <ul>
* The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
* This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
* Only these sites are output.
* </ul>
* </p>
* @param bamFile sorted BAM file for sample
* @param snpIntervals IntervalList of SNP sites
* @param sampleType flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
* (determines whether to perform binomial test)
* @param pvalThreshold p-value threshold for two-sided binomial test, used for normal sample
* @param minimumRawReads minimum number of total reads that must be present at a het site
* @return Pulldown of heterozygous SNP sites in 1-based format
*/
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType, final double pvalThreshold, final int minimumRawReads) {
try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
}
final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
final int totalNumberOfSNPs = snpIntervals.size();
final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals, totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);
//set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
locusIterator.setSamFilters(samFilters);
locusIterator.setEmitUncoveredLoci(false);
locusIterator.setIncludeNonPfReads(false);
locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
locusIterator.setQualityScoreCutoff(minBaseQuality);
logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
int locusCount = 0;
for (final SamLocusIterator.LocusInfo locus : locusIterator) {
if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
logger.info("Examined " + locusCount + " covered sites.");
}
locusCount++;
//include N, etc. reads here
final int totalReadCount = locus.getRecordAndOffsets().size();
if (totalReadCount < minimumRawReads) {
continue;
}
final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
//only include total ACGT counts in binomial test (exclude N, etc.)
final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();
if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
continue;
}
final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
final int refReadCount = (int) baseCounts.get(refBase);
final int altReadCount = totalBaseCount - refReadCount;
hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount));
}
logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
return hetPulldown;
} catch (final IOException | SAMFormatException e) {
throw new UserException(e.getMessage());
}
}
use of htsjdk.samtools.reference.ReferenceSequenceFileWalker in project gatk by broadinstitute.
the class CollectRrbsMetrics method doWork.
@Override
protected Object doWork() {
if (!METRICS_FILE_PREFIX.endsWith(".")) {
METRICS_FILE_PREFIX = METRICS_FILE_PREFIX + ".";
}
final File SUMMARY_OUT = new File(METRICS_FILE_PREFIX + SUMMARY_FILE_EXTENSION);
final File DETAILS_OUT = new File(METRICS_FILE_PREFIX + DETAIL_FILE_EXTENSION);
final File PLOTS_OUT = new File(METRICS_FILE_PREFIX + PDF_FILE_EXTENSION);
assertIoFiles(SUMMARY_OUT, DETAILS_OUT, PLOTS_OUT);
final SamReader samReader = SamReaderFactory.makeDefault().open(INPUT);
if (!ASSUME_SORTED && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException("The input file " + INPUT.getAbsolutePath() + " does not appear to be coordinate sorted");
}
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final ProgressLogger progressLogger = new ProgressLogger(logger);
final RrbsMetricsCollector metricsCollector = new RrbsMetricsCollector(METRIC_ACCUMULATION_LEVEL, samReader.getFileHeader().getReadGroups(), C_QUALITY_THRESHOLD, NEXT_BASE_QUALITY_THRESHOLD, MINIMUM_READ_LENGTH, MAX_MISMATCH_RATE);
for (final SAMRecord samRecord : samReader) {
progressLogger.record(samRecord);
if (!samRecord.getReadUnmappedFlag() && !isSequenceFiltered(samRecord.getReferenceName())) {
final ReferenceSequence referenceSequence = refWalker.get(samRecord.getReferenceIndex());
metricsCollector.acceptRecord(samRecord, referenceSequence);
}
}
metricsCollector.finish();
final MetricsFile<RrbsMetrics, Long> rrbsMetrics = getMetricsFile();
metricsCollector.addAllLevelsToFile(rrbsMetrics);
// Using RrbsMetrics as a way to get both of the metrics objects through the MultiLevelCollector. Once
// we get it out split it apart to the two separate MetricsFiles and write them to file
final MetricsFile<RrbsSummaryMetrics, ?> summaryFile = getMetricsFile();
final MetricsFile<RrbsCpgDetailMetrics, ?> detailsFile = getMetricsFile();
for (final RrbsMetrics rrbsMetric : rrbsMetrics.getMetrics()) {
summaryFile.addMetric(rrbsMetric.getSummaryMetrics());
for (final RrbsCpgDetailMetrics detailMetric : rrbsMetric.getDetailMetrics()) {
detailsFile.addMetric(detailMetric);
}
}
summaryFile.write(SUMMARY_OUT);
detailsFile.write(DETAILS_OUT);
if (PRODUCE_PLOT) {
final RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(R_SCRIPT, CollectRrbsMetrics.class));
executor.addArgs(DETAILS_OUT.getAbsolutePath(), SUMMARY_OUT.getAbsolutePath(), PLOTS_OUT.getAbsolutePath());
executor.exec();
}
CloserUtil.close(samReader);
return null;
}
Aggregations