Example 11 with AllelicCount

     * Write segment file based on maximum-likelihood estimates of the minor allele fraction at SNP sites,
     * assuming the specified allelic bias.  These estimates are converted to target coverages,
     * which are written to a temporary file and then passed to {@link RCBSSegmenter}.
     * @param snps                  TargetCollection of allelic counts at SNP sites
     * @param sampleName            sample name
     * @param outputFile            segment file to write to
     * @param allelicBias           allelic bias to use in estimate of minor allele fraction
public static void writeSegmentFile(final TargetCollection<AllelicCount> snps, final String sampleName, final File outputFile, final double allelicBias) {
    // Fail fast when the SNP collection is empty.
    Utils.validateArg(snps.totalSize() > 0, "Must have a positive number of SNPs to perform SNP segmentation.");
    try {
        // NOTE(review): no deletion of this temporary file is visible in this excerpt.
        final File targetsFromSNPCountsFile = File.createTempFile("targets-from-snps", ".tsv");
        // One Target per SNP site, named by name(ac) and located at the site's interval.
        final List<Target> targets = snps.targets().stream().map(ac -> new Target(name(ac), ac.getInterval())).collect(Collectors.toList());
        // Single-column matrix (one row per SNP) holding the minor-allele-fraction estimate for each site.
        final RealMatrix minorAlleleFractions = new Array2DRowRealMatrix(snps.targetCount(), 1);
        minorAlleleFractions.setColumn(0, snps.targets().stream().mapToDouble(ac -> ac.estimateMinorAlleleFraction(allelicBias)).toArray());
        // Write the estimates to the temporary file as a ReadCountCollection with sampleName as its only column.
        ReadCountCollectionUtils.write(targetsFromSNPCountsFile, new ReadCountCollection(targets, Collections.singletonList(sampleName), minorAlleleFractions));
        //segment SNPs based on observed log_2 minor allele fraction (log_2 is applied in CBS.R)
        RCBSSegmenter.writeSegmentFile(sampleName, targetsFromSNPCountsFile.getAbsolutePath(), outputFile.getAbsolutePath(), false);
    } catch (final IOException e) {
        // Rethrow as a UserException, keeping the IOException as the cause.
        throw new UserException.CouldNotCreateOutputFile("Could not create temporary output file during " + "SNP segmentation.", e);
// NOTE(review): the closing braces of the catch block and of the method are missing from this excerpt.
Also used : Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) List(java.util.List) UserException(org.broadinstitute.hellbender.exceptions.UserException) RCBSSegmenter(org.broadinstitute.hellbender.utils.segmenter.RCBSSegmenter) AllelicCount( Utils(org.broadinstitute.hellbender.utils.Utils) RealMatrix(org.apache.commons.math3.linear.RealMatrix) IOException( Collections(java.util.Collections) Collectors( File( Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOException( UserException(org.broadinstitute.hellbender.exceptions.UserException) File(

Example 12 with AllelicCount

Use of AllelicCount in project gatk by broadinstitute.

the class AlleleFractionLikelihoodsUnitTest method testHetLogLikelihoodMinorFractionNearZero.

//if f is very close to 0 we have an analytic result for comparison
// Sweeps a grid of minor fractions, bias means/variances and alt/ref counts, and checks that
// AlleleFractionLikelihoods.hetLogLikelihood agrees with a closed-form expression to within 1e-3.
// NOTE(review): the closing braces of the loops and of the method are missing from this excerpt.
public void testHetLogLikelihoodMinorFractionNearZero() {
    //pi is just a prefactor so we don't need to test it thoroughly here
    final double pi = 0.01;
    for (final double f : Arrays.asList(1e-6, 1e-7, 1e-8)) {
        for (final double mean : Arrays.asList(0.9, 1.0, 1.1)) {
            for (final double variance : Arrays.asList(0.01, 0.005, 0.001)) {
                // alpha and beta are derived from mean and variance so that alpha / beta = mean and
                // alpha / beta^2 = variance (presumably gamma shape and rate parameters -- TODO confirm).
                final double alpha = mean * mean / variance;
                final double beta = mean / variance;
                final AlleleFractionGlobalParameters parameters = new AlleleFractionGlobalParameters(mean, variance, pi);
                for (final int a : Arrays.asList(1, 2, 3)) {
                    //alt count
                    for (final int r : Arrays.asList(50, 100, 200)) {
                        //ref count
                        // DUMMY is defined outside this excerpt; r (ref count) is passed before a (alt count).
                        final AllelicCount count = new AllelicCount(DUMMY, r, a);
                        final double actual = AlleleFractionLikelihoods.hetLogLikelihood(parameters, f, count, AlleleFractionIndicator.ALT_MINOR);
                        // Closed-form value used for comparison; note that it does not depend on r.
                        final double expected = a * log(beta) + Gamma.logGamma(alpha - a) - Gamma.logGamma(alpha) + log((1 - pi) / 2) + a * log(f / (1 - f));
                        Assert.assertEquals(actual, expected, 1e-3);
Also used : AllelicCount( Test(org.testng.annotations.Test)

Example 13 with AllelicCount

Use of AllelicCount in project gatk by broadinstitute.

the class AlleleFractionLikelihoodsUnitTest method testHetLogLikelihoodTightDistribution.

//if variance is tiny we can approximate lambda ~ mu in the tricky part of the integral to get an analytic result
// Sweeps a grid of minor fractions, bias means, very small variances and alt/ref counts, and checks that
// AlleleFractionLikelihoods.hetLogLikelihood agrees with a closed-form expression to within 1e-3.
// NOTE(review): the closing braces of the loops and of the method are missing from this excerpt.
public void testHetLogLikelihoodTightDistribution() {
    //pi is just a prefactor so we don't need to test it thoroughly here
    final double pi = 0.01;
    for (final double f : Arrays.asList(0.1, 0.2, 0.3)) {
        for (final double mean : Arrays.asList(0.9, 1.0, 1.1)) {
            for (final double variance : Arrays.asList(1e-6, 1e-7, 1e-8)) {
                final AlleleFractionGlobalParameters parameters = new AlleleFractionGlobalParameters(mean, variance, pi);
                for (final int a : Arrays.asList(1, 10, 20)) {
                    //alt count
                    for (final int r : Arrays.asList(1, 10, 20)) {
                        //ref count
                        // DUMMY is defined outside this excerpt; r (ref count) is passed before a (alt count).
                        final AllelicCount count = new AllelicCount(DUMMY, r, a);
                        final double actual = AlleleFractionLikelihoods.hetLogLikelihood(parameters, f, count, AlleleFractionIndicator.ALT_MINOR);
                        // Closed-form value used for comparison; it depends on mean but not on variance.
                        final double expected = log((1 - pi) / 2) + a * log(f) + r * log(1 - f) + r * log(mean) - (a + r) * log(f + (1 - f) * mean);
                        Assert.assertEquals(actual, expected, 1e-3);
Also used : AllelicCount( Test(org.testng.annotations.Test)

Example 14 with AllelicCount

Use of AllelicCount in project gatk-protected by broadinstitute.

the class BayesianHetPulldownCalculator method getHetPulldown.

     * For a given normal or tumor BAM file, walks through the list of common SNPs
     * ({@link BayesianHetPulldownCalculator#snpIntervals}), detects heterozygous sites, and returns
     * a {@link Pulldown} containing detailed information on the called heterozygous SNP sites.
     * The {@code hetCallingStringency} parameter sets the threshold posterior for calling a Het SNP site:
     *      hetPosteriorThreshold = 1 - 10^{-hetCallingStringency}
     *      hetThresholdLogOdds = log(hetPosteriorThreshold/(1-hetPosteriorThreshold))
     *                          = log(10^{hetCallingStringency} - 1)
     * (see CNV-methods.pdf for details)
     * @param bamFile sorted BAM file for sample
     * @param hetCallingStringency stringency for calling a Het site
     * @return Pulldown of heterozygous SNP sites in 1-based format
public Pulldown getHetPulldown(final File bamFile, final double hetCallingStringency) {
    /* log odds from stringency */
    // threshold = log(10^hetCallingStringency - 1)
    final double hetThresholdLogOdds = FastMath.log(FastMath.pow(10, hetCallingStringency) - 1);
    // The BAM reader and the reference walker are both managed by try-with-resources.
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
        final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        // Only coordinate-sorted input is accepted.
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        // NOTE(review): the closing brace of the if above is missing from this excerpt.
        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final SamLocusIterator locusIterator = getSamLocusIteratorWithDefaultFilters(bamReader);
        // NOTE(review): the string expression after size(); looks like the remains of a logging call whose start was lost.
        final int totalNumberOfSNPs = snpIntervals.size();"Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
      // NOTE(review): the line below is a fragment; no update of locusCount is visible in this excerpt either.
      "Examined " + locusCount + " covered sites.");
            // Number of reads overlapping this locus.
            final int totalReadCount = locus.getRecordAndOffsets().size();
            // NOTE(review): the body of this if is missing from this excerpt (presumably the site is skipped -- confirm).
            if (totalReadCount <= readDepthThreshold) {
            // Reference base at this locus; getPosition() - 1 is used as the array index (presumably a 1-based position).
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            // NOTE(review): whatever followed the warning inside this if is missing from this excerpt.
            if (!isProperBase(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Even though" + " this position is indicated to be a possible heterozygous SNP in the provided SNP interval list," + " no inference can be made. Continuing ...", locus.getPosition(), refBase.toString()));
            // Base qualities grouped by nucleotide, then the alt base inferred from that pileup.
            final Map<Nucleotide, List<BaseQuality>> baseQualities = getPileupBaseQualities(locus);
            final Nucleotide altBase = inferAltFromPileup(baseQualities, refBase);
            /* calculate Het log odds */
            final double hetLogLikelihood = getHetLogLikelihood(baseQualities, refBase, altBase);
            final double homLogLikelihood = getHomLogLikelihood(baseQualities, refBase, altBase, DEFAULT_PRIOR_REF_HOM);
            // log odds = (het log-likelihood + log prior(het)) - (hom log-likelihood + log(1 - prior(het)))
            final double hetLogOdds = (hetLogLikelihood + FastMath.log(DEFAULT_PRIOR_HET)) - (homLogLikelihood + FastMath.log(1 - DEFAULT_PRIOR_HET));
            // Record the site when its log odds exceed the threshold computed at the top of the method.
            if (hetLogOdds > hetThresholdLogOdds) {
                hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), baseQualities.get(refBase).size(), baseQualities.get(altBase).size(), refBase, altBase, totalReadCount, hetLogOdds));
        // NOTE(review): the line below is a fragment of a statement whose start was lost in this excerpt.
        } + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        // NOTE(review): only e.getMessage() is forwarded; the caught exception is not attached as the cause.
        throw new UserException(e.getMessage());
Also used : IOException( SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) UserException(org.broadinstitute.hellbender.exceptions.UserException) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) AllelicCount(

Example 15 with AllelicCount

Use of AllelicCount in project gatk-protected by broadinstitute.

the class HetPulldownCalculator method getHetPulldown.

     * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
     * where intervals give positions of likely heterozygous SNP sites.
     * <p>
     *     For a normal sample:
     *     <ul>
     *         <li>The IntervalList snpIntervals gives common SNP sites in 1-based format.</li>
     *     </ul>
     *     <ul>
     *         <li>The p-value threshold must be specified for a two-sided binomial test,
     *         which is used to determine SNP sites from snpIntervals that are
     *         compatible with a heterozygous SNP, given the sample.  Only these sites are output.</li>
     *     </ul>
     * </p>
     * <p>
     *     For a tumor sample:
     *     <ul>
     *         <li>The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
     *         This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
     *         Only these sites are output.</li>
     *     </ul>
     * </p>
     * @param bamFile           sorted BAM file for sample
     * @param snpIntervals      IntervalList of SNP sites
     * @param sampleType        flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
     *                          (determines whether to perform binomial test)
     * @param pvalThreshold     p-value threshold for two-sided binomial test, used for normal sample
     * @param minimumRawReads   minimum number of total reads that must be present at a het site
     * @return                  Pulldown of heterozygous SNP sites in 1-based format
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType, final double pvalThreshold, final int minimumRawReads) {
    // The BAM reader and the reference walker are both managed by try-with-resources.
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
        final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        // Only coordinate-sorted input is accepted.
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        // NOTE(review): the closing brace of the if above is missing from this excerpt.
        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final int totalNumberOfSNPs = snpIntervals.size();
        // The third constructor argument is true only when there are fewer than MAX_INTERVALS_FOR_INDEX intervals
        // (presumably the htsjdk useIndex flag -- confirm against the SamLocusIterator documentation).
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals, totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);
        //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        // NOTE(review): samFilters is not used anywhere in the visible code; the statement applying it may have been lost.
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        // NOTE(review): the string expression after the setter call looks like the remains of a logging call whose start was lost.
        locusIterator.setQualityScoreCutoff(minBaseQuality);"Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
      // NOTE(review): the line below is a fragment; no update of locusCount is visible in this excerpt either.
      "Examined " + locusCount + " covered sites.");
            //include N, etc. reads here
            final int totalReadCount = locus.getRecordAndOffsets().size();
            // NOTE(review): the body of this if is missing from this excerpt (presumably the site is skipped -- confirm).
            if (totalReadCount < minimumRawReads) {
            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            //only include total ACGT counts in binomial test (exclude N, etc.)
            // NOTE(review): the start of the expression below (the source of the summed values) is missing from this excerpt.
            final int totalBaseCount = -> (int) baseCounts.get(b)).sum();
            // The het-compatibility test is applied to NORMAL samples only; the if body is missing from this excerpt.
            if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
            // Reference base at this locus; getPosition() - 1 is used as the array index (presumably a 1-based position).
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            final int refReadCount = (int) baseCounts.get(refBase);
            // Every counted base that is not the reference base is attributed to the alt count.
            final int altReadCount = totalBaseCount - refReadCount;
            hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount));
        // NOTE(review): the line below is a fragment of a statement whose start was lost in this excerpt.
        } + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        // NOTE(review): only e.getMessage() is forwarded; the caught exception is not attached as the cause.
        throw new UserException(e.getMessage());
Also used : Arrays(java.util.Arrays) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) AlternativeHypothesis(org.apache.commons.math3.stat.inference.AlternativeHypothesis) AllelicCount( IOException( Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File( BinomialTest(org.apache.commons.math3.stat.inference.BinomialTest) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) List(java.util.List) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) VisibleForTesting( htsjdk.samtools(htsjdk.samtools) LogManager(org.apache.logging.log4j.LogManager) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) IOException( SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) AllelicCount(


