use of org.broadinstitute.hellbender.exceptions.UserException in project gatk by broadinstitute.
the class SparkUtilsUnitTest method testConvertHeaderlessHadoopBamShardToBam.
@Test
public void testConvertHeaderlessHadoopBamShardToBam() throws Exception {
final File bamShard = new File(publicTestDir + "org/broadinstitute/hellbender/utils/spark/reads_data_source_test1.bam.headerless.part-r-00000");
final File output = createTempFile("testConvertHadoopBamShardToBam", ".bam");
final File headerSource = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
final int expectedReadCount = 11;
boolean shardIsNotValidBam = false;
try (final ReadsDataSource readsSource = new ReadsDataSource(bamShard.toPath())) {
for (final GATKRead read : readsSource) {
}
} catch (SAMFormatException e) {
shardIsNotValidBam = true;
}
Assert.assertTrue(shardIsNotValidBam, "Input shard should not be a valid BAM");
SAMFileHeader header = null;
try (final SamReader headerReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(headerSource)) {
header = headerReader.getFileHeader();
} catch (IOException e) {
throw new UserException("Error reading header from " + headerSource.getAbsolutePath(), e);
}
SparkUtils.convertHeaderlessHadoopBamShardToBam(bamShard, header, output);
int actualCount = 0;
try (final ReadsDataSource readsSource = new ReadsDataSource(output.toPath())) {
for (final GATKRead read : readsSource) {
++actualCount;
}
}
Assert.assertEquals(actualCount, expectedReadCount, "Wrong number of reads in final BAM file");
}
use of org.broadinstitute.hellbender.exceptions.UserException in project gatk-protected by broadinstitute.
the class SparkGenomeReadCounts method collectReads.
private void collectReads() {
if (readArguments.getReadFilesNames().size() != 1) {
throw new UserException("This tool only accepts a single bam/sam/cram as input");
}
final SampleCollection sampleCollection = new SampleCollection(getHeaderForReads());
if (sampleCollection.sampleCount() > 1) {
throw new UserException.BadInput("We do not support bams with more than one sample.");
}
final String sampleName = sampleCollection.sampleIds().get(0);
final String[] commentsForRawCoverage = { "##fileFormat = tsv", "##commandLine = " + getCommandLine(), String.format("##title = Coverage counts in %d base bins for WGS", binsize) };
final ReadFilter filter = makeGenomeReadFilter();
final SAMSequenceDictionary sequenceDictionary = getReferenceSequenceDictionary();
logger.info("Starting Spark coverage collection...");
final long coverageCollectionStartTime = System.currentTimeMillis();
final JavaRDD<GATKRead> rawReads = getReads();
final JavaRDD<GATKRead> reads = rawReads.filter(read -> filter.test(read));
//Note: using a field inside a closure will pull in the whole enclosing object to serialization
// (which leads to bad performance and can blow up if some objects in the fields are not
// Serializable - closures always use java Serializable and not Kryo)
//Solution here is to use a temp variable for binsize because it's just an int.
final int binsize_tmp = binsize;
final JavaRDD<SimpleInterval> readIntervals = reads.filter(read -> sequenceDictionary.getSequence(read.getContig()) != null).map(read -> SparkGenomeReadCounts.createKey(read, sequenceDictionary, binsize_tmp));
final Map<SimpleInterval, Long> byKey = readIntervals.countByValue();
final Set<SimpleInterval> readIntervalKeySet = byKey.keySet();
final long totalReads = byKey.values().stream().mapToLong(v -> v).sum();
final long coverageCollectionEndTime = System.currentTimeMillis();
logger.info(String.format("Finished the spark coverage collection with %d targets and %d reads. Elapse of %d seconds", readIntervalKeySet.size(), totalReads, (coverageCollectionEndTime - coverageCollectionStartTime) / 1000));
final String[] commentsForProportionalCoverage = { commentsForRawCoverage[0], commentsForRawCoverage[1], String.format("##title = Proportional coverage counts in %d base bins for WGS (total reads: %d)", binsize, totalReads) };
logger.info("Creating full genome bins...");
final long createGenomeBinsStartTime = System.currentTimeMillis();
final List<SimpleInterval> fullGenomeBins = createFullGenomeBins(binsize);
List<Target> fullGenomeTargetCollection = createTargetListFromSimpleInterval(fullGenomeBins);
TargetWriter.writeTargetsToFile(new File(outputFile.getAbsolutePath() + ".targets.tsv"), fullGenomeTargetCollection);
final long createGenomeBinsEndTime = System.currentTimeMillis();
logger.info(String.format("Finished creating genome bins. Elapse of %d seconds", (createGenomeBinsEndTime - createGenomeBinsStartTime) / 1000));
logger.info("Creating missing genome bins...");
final long createMissingGenomeBinsStartTime = System.currentTimeMillis();
logger.info("Creating missing genome bins: Creating a mutable mapping...");
final Map<SimpleInterval, Long> byKeyMutable = new HashMap<>();
byKeyMutable.putAll(byKey);
logger.info("Creating missing genome bins: Populating mutable mapping with zero counts for empty regions...");
fullGenomeBins.stream().forEach(b -> byKeyMutable.putIfAbsent(b, 0l));
final long createMissingGenomeBinsEndTime = System.currentTimeMillis();
logger.info(String.format("Finished creating missing genome bins. Elapse of %d seconds", (createMissingGenomeBinsEndTime - createMissingGenomeBinsStartTime) / 1000));
logger.info("Creating final map...");
final long createFinalMapStartTime = System.currentTimeMillis();
final SortedMap<SimpleInterval, Long> byKeySorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
byKeySorted.putAll(byKeyMutable);
final long createFinalMapEndTime = System.currentTimeMillis();
logger.info(String.format("Finished creating final map. Elapse of %d seconds", (createFinalMapEndTime - createFinalMapStartTime) / 1000));
logger.info("Creating proportional coverage... ");
final long pCovFileStartTime = System.currentTimeMillis();
final SortedMap<SimpleInterval, Double> byKeyProportionalSorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
byKeySorted.entrySet().stream().forEach(e -> byKeyProportionalSorted.put(e.getKey(), (double) e.getValue() / totalReads));
final long pCovFileEndTime = System.currentTimeMillis();
logger.info(String.format("Finished creating proportional coverage map. Elapse of %d seconds", (pCovFileEndTime - pCovFileStartTime) / 1000));
logger.info("Writing raw coverage file ...");
final long writingCovFileStartTime = System.currentTimeMillis();
ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(new File(outputFile.getAbsolutePath() + RAW_COV_OUTPUT_EXTENSION), sampleName, byKeySorted, commentsForRawCoverage);
final long writingCovFileEndTime = System.currentTimeMillis();
logger.info(String.format("Finished writing coverage file. Elapse of %d seconds", (writingCovFileEndTime - writingCovFileStartTime) / 1000));
logger.info("Writing proportional coverage file ...");
final long writingPCovFileStartTime = System.currentTimeMillis();
ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(outputFile, sampleName, byKeyProportionalSorted, commentsForProportionalCoverage);
final long writingPCovFileEndTime = System.currentTimeMillis();
logger.info(String.format("Finished writing proportional coverage file. Elapse of %d seconds", (writingPCovFileEndTime - writingPCovFileStartTime) / 1000));
}
use of org.broadinstitute.hellbender.exceptions.UserException in project gatk-protected by broadinstitute.
the class BayesianHetPulldownCalculator method getHetPulldown.
/**
* For a given normal or tumor BAM file, walks through the list of common SNPs,
* {@link BayesianHetPulldownCalculator#snpIntervals}), detects heterozygous sites, and returns
* a {@link Pulldown} containing detailed information on the called heterozygous SNP sites.
*
* The {@code hetCallingStrigency} parameters sets the threshold posterior for calling a Het SNP site:
*
* hetPosteriorThreshold = 1 - 10^{-hetCallingStringency}
* hetThresholdLogOdds = log(hetPosteriorThreshold/(1-hetPosteriorThreshold))
* = log(10^{hetCallingStringency} - 1)
*
* (see CNV-methods.pdf for details)
*
* @param bamFile sorted BAM file for sample
* @param hetCallingStringency strigency for calling a Het site
* @return Pulldown of heterozygous SNP sites in 1-based format
*/
public Pulldown getHetPulldown(final File bamFile, final double hetCallingStringency) {
/* log odds from stringency */
final double hetThresholdLogOdds = FastMath.log(FastMath.pow(10, hetCallingStringency) - 1);
try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
}
final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(bamReader);
final int totalNumberOfSNPs = snpIntervals.size();
logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
int locusCount = 0;
for (final SamLocusIterator.LocusInfo locus : locusIterator) {
if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
logger.info("Examined " + locusCount + " covered sites.");
}
locusCount++;
final int totalReadCount = locus.getRecordAndOffsets().size();
if (totalReadCount <= readDepthThreshold) {
continue;
}
final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
if (!isProperBase(refBase)) {
logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Even though" + " this position is indicated to be a possible heterozygous SNP in the provided SNP interval list," + " no inference can be made. Continuing ...", locus.getPosition(), refBase.toString()));
continue;
}
final Map<Nucleotide, List<BaseQuality>> baseQualities = getPileupBaseQualities(locus);
final Nucleotide altBase = inferAltFromPileup(baseQualities, refBase);
/* calculate Het log odds */
final double hetLogLikelihood = getHetLogLikelihood(baseQualities, refBase, altBase);
final double homLogLikelihood = getHomLogLikelihood(baseQualities, refBase, altBase, DEFAULT_PRIOR_REF_HOM);
final double hetLogOdds = (hetLogLikelihood + FastMath.log(DEFAULT_PRIOR_HET)) - (homLogLikelihood + FastMath.log(1 - DEFAULT_PRIOR_HET));
if (hetLogOdds > hetThresholdLogOdds) {
hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), baseQualities.get(refBase).size(), baseQualities.get(altBase).size(), refBase, altBase, totalReadCount, hetLogOdds));
}
}
logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
return hetPulldown;
} catch (final IOException | SAMFormatException e) {
throw new UserException(e.getMessage());
}
}
use of org.broadinstitute.hellbender.exceptions.UserException in project gatk-protected by broadinstitute.
the class HetPulldownCalculator method getHetPulldown.
/**
* For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
* where intervals give positions of likely heterozygous SNP sites.
*
* <p>
* For a normal sample:
* <ul>
* The IntervalList snpIntervals gives common SNP sites in 1-based format.
* </ul>
* <ul>
* The p-value threshold must be specified for a two-sided binomial test,
* which is used to determine SNP sites from snpIntervals that are
* compatible with a heterozygous SNP, given the sample. Only these sites are output.
* </ul>
* </p>
* <p>
* For a tumor sample:
* <ul>
* The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
* This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
* Only these sites are output.
* </ul>
* </p>
* @param bamFile sorted BAM file for sample
* @param snpIntervals IntervalList of SNP sites
* @param sampleType flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
* (determines whether to perform binomial test)
* @param pvalThreshold p-value threshold for two-sided binomial test, used for normal sample
* @param minimumRawReads minimum number of total reads that must be present at a het site
* @return Pulldown of heterozygous SNP sites in 1-based format
*/
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType, final double pvalThreshold, final int minimumRawReads) {
try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
}
final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
final int totalNumberOfSNPs = snpIntervals.size();
final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals, totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);
//set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
locusIterator.setSamFilters(samFilters);
locusIterator.setEmitUncoveredLoci(false);
locusIterator.setIncludeNonPfReads(false);
locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
locusIterator.setQualityScoreCutoff(minBaseQuality);
logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
int locusCount = 0;
for (final SamLocusIterator.LocusInfo locus : locusIterator) {
if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
logger.info("Examined " + locusCount + " covered sites.");
}
locusCount++;
//include N, etc. reads here
final int totalReadCount = locus.getRecordAndOffsets().size();
if (totalReadCount < minimumRawReads) {
continue;
}
final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
//only include total ACGT counts in binomial test (exclude N, etc.)
final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();
if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
continue;
}
final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
final int refReadCount = (int) baseCounts.get(refBase);
final int altReadCount = totalBaseCount - refReadCount;
hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount));
}
logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
return hetPulldown;
} catch (final IOException | SAMFormatException e) {
throw new UserException(e.getMessage());
}
}
use of org.broadinstitute.hellbender.exceptions.UserException in project gatk by broadinstitute.
the class ConvertHeaderlessHadoopBamShardToBam method doWork.
@Override
protected Object doWork() {
SAMFileHeader header = null;
try (final SamReader headerReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamWithHeader)) {
header = headerReader.getFileHeader();
} catch (IOException e) {
throw new UserException("Error reading header from " + bamWithHeader.getAbsolutePath(), e);
}
SparkUtils.convertHeaderlessHadoopBamShardToBam(bamShard, header, outputBam);
return null;
}
Aggregations