use of htsjdk.samtools.util.IntervalList in project gatk-protected by broadinstitute.
the class SplitIntervals method onTraversalStart.
@Override
public void onTraversalStart() {
    ParamUtils.isPositive(scatterCount, "scatter count must be > 0.");

    if (!outputDir.exists() && !outputDir.mkdir()) {
        throw new RuntimeIOException("Unable to create directory: " + outputDir.getAbsolutePath());
    }

    // In general the sequence dictionary will come from the reference, but taking it
    // from a BAM (-I reads.bam) or a VCF (-F variants.vcf) is also supported.
    final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();

    final List<SimpleInterval> intervals = hasIntervals()
            ? intervalArgumentCollection.getIntervals(sequenceDictionary)
            : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);

    final IntervalList intervalList = new IntervalList(sequenceDictionary);
    intervals.stream()
            .map(si -> new Interval(si.getContig(), si.getStart(), si.getEnd()))
            .forEach(intervalList::add);

    final IntervalListScatterer scatterer = new IntervalListScatterer(subdivisionMode);
    final List<IntervalList> scattered = scatterer.scatter(intervalList, scatterCount, false);

    final DecimalFormat formatter = new DecimalFormat("0000");
    IntStream.range(0, scattered.size())
            .forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + "-scattered.intervals")));
}
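For context, here is a minimal, self-contained sketch of the htsjdk IntervalList API the tool builds on. The contig name, length, and output path are invented for illustration; this is not GATK code.

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;

import java.io.File;
import java.util.Collections;

public class IntervalListSketch {
    public static void main(String[] args) {
        // A minimal dictionary with one hypothetical 1 Mb contig.
        final SAMSequenceDictionary dict = new SAMSequenceDictionary(
                Collections.singletonList(new SAMSequenceRecord("chr1", 1_000_000)));

        // An IntervalList is always tied to a sequence dictionary via its header.
        final IntervalList intervals = new IntervalList(dict);
        intervals.add(new Interval("chr1", 1, 500_000));
        intervals.add(new Interval("chr1", 500_001, 1_000_000));

        // write(File) emits the Picard-style interval_list format:
        // an embedded SAM header followed by one interval record per line.
        intervals.write(new File("example.interval_list"));
    }
}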
use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.
the class IntervalUtils method scatterContigIntervals.
/**
 * Splits an interval list into multiple files, one per scatter part.
 *
 * @param fileHeader   the SAM file header
 * @param locs         the genome locs to split
 * @param scatterParts the output interval list files to write to
 */
public static void scatterContigIntervals(final SAMFileHeader fileHeader, final List<GenomeLoc> locs, final List<File> scatterParts) {
    // Contract: must divide locs up so that each of scatterParts gets a sublist such that:
    // (a) all locs concerning a particular contig go to the same part
    // (b) locs are not split or combined, and remain in the same order
    //     (so scatterParts[0] + ... + scatterParts[n] == locs)
    // Locs are already sorted.
    long totalBases = 0;
    for (final GenomeLoc loc : locs) {
        totalBases += loc.size();
    }
    final long idealBasesPerPart = totalBases / scatterParts.size();
    if (idealBasesPerPart == 0) {
        throw new UserException.BadInput(String.format("Genome region is too short (%d bases) to split into %d parts", totalBases, scatterParts.size()));
    }

    // Find the indices in locs where we switch from one contig to the next.
    final ArrayList<Integer> contigStartLocs = new ArrayList<>();
    String prevContig = null;
    for (int i = 0; i < locs.size(); ++i) {
        final GenomeLoc loc = locs.get(i);
        if (prevContig == null || !loc.getContig().equals(prevContig)) {
            contigStartLocs.add(i);
        }
        prevContig = loc.getContig();
    }
    if (contigStartLocs.size() < scatterParts.size()) {
        throw new UserException.BadInput(String.format("Input genome region has too few contigs (%d) to split into %d parts", contigStartLocs.size(), scatterParts.size()));
    }

    long thisPartBases = 0;
    int partIdx = 0;
    IntervalList outList = new IntervalList(fileHeader);
    for (int i = 0; i < locs.size(); ++i) {
        final GenomeLoc loc = locs.get(i);
        thisPartBases += loc.getStop() - loc.getStart();
        outList.add(toInterval(loc, i));

        boolean partMustStop = false;
        if (partIdx < (scatterParts.size() - 1)) {
            // If there are n contigs and n parts remaining then we must split here,
            // otherwise we will run out of contigs.
            final int nextPart = partIdx + 1;
            final int nextPartMustStartBy = contigStartLocs.get(nextPart + (contigStartLocs.size() - scatterParts.size()));
            if (i + 1 == nextPartMustStartBy) {
                partMustStop = true;
            }
        } else if (i == locs.size() - 1) {
            // We're done! Write the last scatter file.
            partMustStop = true;
        }

        if (partMustStop || thisPartBases > idealBasesPerPart) {
            // Ideally we would split here. However, we must make sure to do so
            // on a contig boundary. The test always passes when partMustStop == true,
            // since that indicates we're at a contig boundary.
            GenomeLoc nextLoc = null;
            if ((i + 1) < locs.size()) {
                nextLoc = locs.get(i + 1);
            }
            if (nextLoc == null || !nextLoc.getContig().equals(loc.getContig())) {
                // Write out this part:
                outList.write(scatterParts.get(partIdx));
                // Reset. If this part ran long, leave the excess in thisPartBases
                // and the next part will be a little shorter to compensate.
                outList = new IntervalList(fileHeader);
                thisPartBases -= idealBasesPerPart;
                ++partIdx;
            }
        }
    }
}
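The same contract can be sketched with plain htsjdk types, without GATK's GenomeLoc machinery. The class and method names below are invented, and the logic is a simplified greedy variant: it cuts only on contig boundaries once a part has reached its ideal share of bases, but unlike scatterContigIntervals it does not guarantee exactly nParts output parts.

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;

import java.util.ArrayList;
import java.util.List;

public final class ContigChunker {
    public static List<IntervalList> chunkByContig(final IntervalList in, final int nParts) {
        long totalBases = 0;
        for (final Interval interval : in) {
            totalBases += interval.length();
        }
        final long idealBasesPerPart = Math.max(1, totalBases / nParts);

        final List<IntervalList> parts = new ArrayList<>();
        IntervalList current = new IntervalList(in.getHeader());
        long currentBases = 0;
        String prevContig = null;
        for (final Interval interval : in) {
            // Rule (a): we may only cut where the contig changes.
            final boolean contigBoundary = prevContig != null && !interval.getContig().equals(prevContig);
            if (contigBoundary && currentBases >= idealBasesPerPart) {
                parts.add(current);
                current = new IntervalList(in.getHeader());
                currentBases = 0;
            }
            current.add(interval);             // rule (b): intervals are never split or reordered
            currentBases += interval.length();
            prevContig = interval.getContig();
        }
        parts.add(current);
        return parts;
    }
}

The design point carried over from the original is that the cut test only ever fires on a contig boundary, which is what keeps all intervals for a given contig in the same part.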
use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.
the class IntervalUtils method intervalFileToList.
/**
 * Reads a file of genome locations to process. The file may be in Picard
 * or GATK interval format.
 *
 * @param glParser the GenomeLocParser to use for parsing and validation
 * @param fileName the interval file
 * @return the list of GenomeLocs parsed from the file
 */
public static List<GenomeLoc> intervalFileToList(final GenomeLocParser glParser, final String fileName) {
    Utils.nonNull(glParser, "glParser is null");
    Utils.nonNull(fileName, "file name is null");

    final File inputFile = new File(fileName);
    final List<GenomeLoc> ret = new ArrayList<>();

    // First try to read the file as a Picard interval file since that's well structured --
    // we'll fail quickly if it's not a valid file.
    boolean isPicardInterval = false;
    try {
        // Note: Picard will skip over intervals with contigs not in the sequence dictionary
        final IntervalList il = IntervalList.fromFile(inputFile);
        isPicardInterval = true;
        for (final Interval interval : il.getIntervals()) {
            // https://github.com/broadinstitute/gatk/issues/2089
            if (interval.getStart() - interval.getEnd() == 1) {
                logger.warn("Ignoring possibly incorrectly converted length 1 interval : " + interval);
            } else if (glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) {
                ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true));
            } else {
                throw new UserException(inputFile.getAbsolutePath() + " has an invalid interval : " + interval);
            }
        }
    } catch (final Exception e) {
        // If Picard parsing didn't work, try parsing the file as a GATK interval file.
        if (isPicardInterval) {
            // Definitely a Picard file, but we failed to parse it.
            throw new UserException.CouldNotReadInputFile(inputFile, e);
        } else {
            try (XReadLines reader = new XReadLines(new File(fileName))) {
                for (final String line : reader) {
                    if (!line.trim().isEmpty()) {
                        ret.add(glParser.parseGenomeLoc(line));
                    }
                }
            } catch (final IOException e2) {
                throw new UserException.CouldNotReadInputFile(inputFile, e2);
            }
        }
    }

    if (ret.isEmpty()) {
        throw new UserException.MalformedFile(new File(fileName), "It contains no intervals.");
    }
    return ret;
}
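For reference, the two accepted formats differ substantially: a Picard .interval_list carries an embedded SAM header (@HD and @SQ lines) followed by tab-separated contig/start/end/strand/name records, while a GATK-style interval file is one plain location (e.g. chr1:100-200) per line. Below is a minimal sketch of reading the Picard form with htsjdk alone; the file path is a placeholder.

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;

import java.io.File;

public class ReadPicardIntervals {
    public static void main(String[] args) {
        // fromFile() parses the embedded SAM header and the interval records.
        final IntervalList il = IntervalList.fromFile(new File("sites.interval_list"));
        for (final Interval interval : il) {
            System.out.println(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
        }
    }
}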
use of htsjdk.samtools.util.IntervalList in project gatk-protected by broadinstitute.
the class CollectAllelicCounts method doWork.
@Override
protected Object doWork() {
    validateArguments();

    final File referenceFile = referenceArguments.getReferenceFile();
    final IntervalList siteIntervals = IntervalList.fromFile(inputSiteIntervalsFile);

    final AllelicCountCollector allelicCountCollector = new AllelicCountCollector(referenceFile, readValidationStringency);

    logger.info("Collecting allelic counts...");
    final AllelicCountCollection allelicCounts = allelicCountCollector.collect(inputBAMFile, siteIntervals, minimumMappingQuality, minimumBaseQuality);
    allelicCounts.write(outputAllelicCountsFile);
    logger.info("Allelic counts written to " + outputAllelicCountsFile.toString());

    return "SUCCESS";
}
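The readValidationStringency argument controls how strictly htsjdk validates records in the input BAM. As a minimal sketch, this is the kind of reader factory the collector is constructed with; the file names here are placeholders.

import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;

import java.io.File;
import java.io.IOException;

public class OpenBamSketch {
    public static void main(String[] args) throws IOException {
        // Open a BAM with a chosen validation stringency.
        try (final SamReader reader = SamReaderFactory.makeDefault()
                .validationStringency(ValidationStringency.LENIENT) // STRICT, LENIENT, or SILENT
                .referenceSequence(new File("reference.fasta"))     // required for CRAM; optional for BAM
                .open(new File("tumor.bam"))) {
            System.out.println("Sort order: " + reader.getFileHeader().getSortOrder());
        }
    }
}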
use of htsjdk.samtools.util.IntervalList in project gatk by broadinstitute.
the class AllelicCountCollector method collect.
/**
 * Returns an {@link AllelicCountCollection} based on the pileup at sites (specified by an interval list)
 * in a sorted BAM file. Reads and bases below the specified mapping quality and base quality, respectively,
 * are filtered out of the pileup. The alt count is defined as the total count minus the ref count, and the
 * alt nucleotide is defined as the non-ref base with the highest count, with ties broken by the order of the
 * bases in {@link AllelicCountCollector#BASES}.
 *
 * @param bamFile           sorted BAM file
 * @param siteIntervals     interval list of sites
 * @param minMappingQuality minimum mapping quality required for reads to be included in pileup
 * @param minBaseQuality    minimum base quality required for bases to be included in pileup
 * @return AllelicCountCollection of ref/alt counts at sites in BAM file
 */
public AllelicCountCollection collect(final File bamFile, final IntervalList siteIntervals, final int minMappingQuality, final int minBaseQuality) {
    try (final SamReader reader = readerFactory.open(bamFile)) {
        ParamUtils.isPositiveOrZero(minMappingQuality, "Minimum mapping quality must be nonnegative.");
        ParamUtils.isPositiveOrZero(minBaseQuality, "Minimum base quality must be nonnegative.");
        if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }

        final int numberOfSites = siteIntervals.size();
        final boolean useIndex = numberOfSites < MAX_INTERVALS_FOR_INDEX;
        final SamLocusIterator locusIterator = new SamLocusIterator(reader, siteIntervals, useIndex);

        // Set read and locus filters. [Note: read counts match IGV, but are off by a few from pysam.mpileup.]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(true);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);

        logger.info("Examining " + numberOfSites + " sites in total...");
        int locusCount = 0;
        final AllelicCountCollection counts = new AllelicCountCollection();
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " sites.");
            }
            locusCount++;

            final Nucleotide refBase = Nucleotide.valueOf(referenceWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            if (!BASES.contains(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Skipping...", locus.getPosition(), refBase.toString()));
                continue;
            }

            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            // Only include total ACGT counts in the binomial test (exclude N, etc.).
            final int totalBaseCount = BASES.stream().mapToInt(b -> (int) baseCounts.get(b)).sum();
            final int refReadCount = (int) baseCounts.get(refBase);
            // We take alt = total - ref instead of the actual alt count.
            final int altReadCount = totalBaseCount - refReadCount;
            final Nucleotide altBase = inferAltFromPileupBaseCounts(baseCounts, refBase);

            counts.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount, refBase, altBase));
        }
        logger.info(locusCount + " sites out of " + numberOfSites + " total sites were examined.");
        return counts;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException("Unable to collect allelic counts from " + bamFile);
    }
}
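The tie-breaking rule described in the javadoc can be restated in plain JDK terms. GATK's actual implementation operates on its own Nucleotide.Counter type; the class, method, and Character-keyed map below are invented for illustration only.

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public final class AltBaseRule {
    private static final List<Character> BASES = Arrays.asList('A', 'C', 'G', 'T');

    // The alt base is the non-ref base with the highest count,
    // with ties broken by the order of BASES (A, C, G, T).
    public static char inferAltBase(final Map<Character, Long> baseCounts, final char refBase) {
        char altBase = 'N';
        long bestCount = -1;
        for (final char base : BASES) {          // iteration order breaks ties
            if (base == refBase) {
                continue;                        // never pick the ref base
            }
            final long count = baseCounts.getOrDefault(base, 0L);
            if (count > bestCount) {             // strict '>' keeps the earlier base on ties
                bestCount = count;
                altBase = base;
            }
        }
        return altBase;
    }
}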