Search in sources :

Example 51 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class IntegerCopyNumberTransitionMatrixUnitTest method testUnnormalizedProbability.

@Test
public void testUnnormalizedProbability() {
    /* it should normalize unnormalized transition matrices and give a warning */
    final IntegerCopyNumberTransitionMatrix transitionMatrix = new IntegerCopyNumberTransitionMatrix(new Array2DRowRealMatrix(new double[][] { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 } }), 0);
    for (int i = 0; i < 3; i++) {
        final double[] col = transitionMatrix.getTransitionMatrix().getColumn(i);
        Assert.assertEquals(Arrays.stream(col).sum(), 1.0, 1e-12);
    }
}
Also used : Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 52 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class IntegerCopyNumberTransitionProbabilityCacheUnitTest method testBasicSoundness.

@Test
public void testBasicSoundness() {
    for (final RealMatrix transitionMatrix : TRANSITION_MATRICES) {
        final IntegerCopyNumberTransitionProbabilityCache cache = new IntegerCopyNumberTransitionProbabilityCache(new IntegerCopyNumberTransitionMatrix(transitionMatrix, 0));
        for (final int dist : DISTANCES) {
            final RealMatrix transitionMatrixExponentiated = cache.getTransitionProbabilityMatrix(dist);
            /* assert positivity */
            Assert.assertTrue(Arrays.stream(transitionMatrixExponentiated.getData()).flatMapToDouble(Arrays::stream).allMatch(d -> d >= 0));
            /* assert conservation of probability */
            for (int c = 0; c < transitionMatrix.getColumnDimension(); c++) {
                Assert.assertEquals(Arrays.stream(transitionMatrixExponentiated.getColumn(c)).sum(), 1.0, EPSILON);
            }
            /* assert correctness, T(2*d) = T(d)*T(d) */
            assertEqualMatrices(cache.getTransitionProbabilityMatrix(2 * dist), transitionMatrixExponentiated.multiply(transitionMatrixExponentiated));
        }
        /* assert loss of initial state over long distances, i.e. all columns must be equal */
        final RealMatrix longRangeTransitionMatrix = cache.getTransitionProbabilityMatrix(Integer.MAX_VALUE);
        final double[] firstColumn = longRangeTransitionMatrix.getColumn(0);
        final RealMatrix syntheticLongRangeTransitionMatrix = new Array2DRowRealMatrix(firstColumn.length, firstColumn.length);
        for (int i = 0; i < firstColumn.length; i++) {
            syntheticLongRangeTransitionMatrix.setColumn(i, firstColumn);
        }
        assertEqualMatrices(longRangeTransitionMatrix, syntheticLongRangeTransitionMatrix);
        final double[] stationary = cache.getStationaryProbabilityVector().toArray();
        ArrayAsserts.assertArrayEquals(stationary, firstColumn, EPSILON);
    }
}
Also used : Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) Arrays(java.util.Arrays) Assert(org.testng.Assert) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Test(org.testng.annotations.Test) ArrayAsserts(org.testng.internal.junit.ArrayAsserts) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) Arrays(java.util.Arrays) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 53 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class IntegerCopyNumberTransitionProbabilityCacheUnitTest method testStationaryProbabilities.

@Test
public void testStationaryProbabilities() {
    for (final int padding : PADDINGS) {
        for (final RealMatrix transitionMatrix : TRANSITION_MATRICES) {
            final IntegerCopyNumberTransitionProbabilityCache cache = new IntegerCopyNumberTransitionProbabilityCache(new IntegerCopyNumberTransitionMatrix(transitionMatrix, padding));
            final double[] stationary = cache.getStationaryProbabilityVector().toArray();
            final double[] stationaryFromMultiplication = cache.getTransitionProbabilityMatrix(Integer.MAX_VALUE).getColumn(0);
            ArrayAsserts.assertArrayEquals(stationary, stationaryFromMultiplication, EPSILON);
            Assert.assertEquals(Arrays.stream(stationary).sum(), 1.0, EPSILON);
        }
    }
}
Also used : Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 54 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class HetPulldownCalculator method getHetPulldown.

/**
     * For a normal or tumor sample, returns a data structure giving (intervals, reference counts, alternate counts),
     * where intervals give positions of likely heterozygous SNP sites.
     *
     * <p>
     *     For a normal sample:
     *     <ul>
     *         The IntervalList snpIntervals gives common SNP sites in 1-based format.
     *     </ul>
     *     <ul>
     *         The p-value threshold must be specified for a two-sided binomial test,
     *         which is used to determine SNP sites from snpIntervals that are
     *         compatible with a heterozygous SNP, given the sample.  Only these sites are output.
     *     </ul>
     * </p>
     * <p>
     *     For a tumor sample:
     *     <ul>
     *         The IntervalList snpIntervals gives heterozygous SNP sites likely to be present in the normal sample.
     *         This should be from {@link HetPulldownCalculator#getNormal} in 1-based format.
     *         Only these sites are output.
     *     </ul>
     * </p>
     * @param bamFile           sorted BAM file for sample
     * @param snpIntervals      IntervalList of SNP sites
     * @param sampleType        flag indicating type of sample (SampleType.NORMAL or SampleType.TUMOR)
     *                          (determines whether to perform binomial test)
     * @param pvalThreshold     p-value threshold for two-sided binomial test, used for normal sample
     * @param minimumRawReads   minimum number of total reads that must be present at a het site
     * @return                  Pulldown of heterozygous SNP sites in 1-based format
     */
private Pulldown getHetPulldown(final File bamFile, final IntervalList snpIntervals, final SampleType sampleType, final double pvalThreshold, final int minimumRawReads) {
    try (final SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(refFile).open(bamFile);
        final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(refFile)) {
        if (bamReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }
        final Pulldown hetPulldown = new Pulldown(bamReader.getFileHeader());
        final int totalNumberOfSNPs = snpIntervals.size();
        final SamLocusIterator locusIterator = new SamLocusIterator(bamReader, snpIntervals, totalNumberOfSNPs < MAX_INTERVALS_FOR_INDEX);
        //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(false);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);
        logger.info("Examining " + totalNumberOfSNPs + " sites in total...");
        int locusCount = 0;
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " covered sites.");
            }
            locusCount++;
            //include N, etc. reads here
            final int totalReadCount = locus.getRecordAndOffsets().size();
            if (totalReadCount < minimumRawReads) {
                continue;
            }
            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            //only include total ACGT counts in binomial test (exclude N, etc.)
            final int totalBaseCount = Arrays.stream(BASES).mapToInt(b -> (int) baseCounts.get(b)).sum();
            if (sampleType == SampleType.NORMAL && !isPileupHetCompatible(baseCounts, totalBaseCount, pvalThreshold)) {
                continue;
            }
            final Nucleotide refBase = Nucleotide.valueOf(refWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            final int refReadCount = (int) baseCounts.get(refBase);
            final int altReadCount = totalBaseCount - refReadCount;
            hetPulldown.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount));
        }
        logger.info(locusCount + " covered sites out of " + totalNumberOfSNPs + " total sites were examined.");
        return hetPulldown;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException(e.getMessage());
    }
}
Also used : Arrays(java.util.Arrays) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) AlternativeHypothesis(org.apache.commons.math3.stat.inference.AlternativeHypothesis) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount) IOException(java.io.IOException) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) BinomialTest(org.apache.commons.math3.stat.inference.BinomialTest) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) List(java.util.List) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) VisibleForTesting(com.google.common.annotations.VisibleForTesting) htsjdk.samtools(htsjdk.samtools) LogManager(org.apache.logging.log4j.LogManager) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) IOException(java.io.IOException) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount)

Example 55 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class TargetCoverageSexGenotypeCalculator method calculateSexGenotypeData.

/**
     * Calculates the likelihood of different sex genotypes for a given sample index
     *
     * @param sampleIndex sample index
     * @return an instance of {@link SexGenotypeData}
     */
private SexGenotypeData calculateSexGenotypeData(final int sampleIndex) {
    final int[] allosomalReadCounts = Arrays.stream(processedReadCounts.getColumnOnSpecifiedTargets(sampleIndex, allosomalTargetList, true)).mapToInt(n -> (int) n).toArray();
    final double readDepth = getSampleReadDepthFromAutosomalTargets(sampleIndex);
    logger.debug("Read depth for " + processedReadCounts.columnNames().get(sampleIndex) + ": " + readDepth);
    final List<Double> logLikelihoods = new ArrayList<>();
    final List<String> sexGenotypesList = new ArrayList<>();
    final int numAllosomalTargets = allosomalTargetList.size();
    for (final String genotypeName : sexGenotypeIdentifiers) {
        /* calculate log likelihood */
        final int[] currentAllosomalTargetPloidies = allosomalTargetPloidies.get(genotypeName);
        final double[] poissonParameters = IntStream.range(0, numAllosomalTargets).mapToDouble(i -> readDepth * (currentAllosomalTargetPloidies[i] > 0 ? currentAllosomalTargetPloidies[i] : baselineMappingErrorProbability)).toArray();
        final double currentLogLikelihood = IntStream.range(0, numAllosomalTargets).mapToDouble(i -> {
            final PoissonDistribution pois = new PoissonDistribution(poissonParameters[i]);
            return pois.logProbability(allosomalReadCounts[i]);
        }).sum();
        sexGenotypesList.add(genotypeName);
        logLikelihoods.add(currentLogLikelihood);
    }
    /* infer the most likely sex genotype */
    final Integer[] indices = new Integer[sexGenotypesList.size()];
    IntStream.range(0, sexGenotypesList.size()).forEach(i -> indices[i] = i);
    Arrays.sort(indices, (li, ri) -> Double.compare(logLikelihoods.get(ri), logLikelihoods.get(li)));
    return new SexGenotypeData(processedReadCounts.columnNames().get(sampleIndex), sexGenotypesList.get(indices[0]), sexGenotypesList, logLikelihoods.stream().mapToDouble(d -> d).toArray());
}
Also used : IntStream(java.util.stream.IntStream) java.util(java.util) Collectors(java.util.stream.Collectors) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) PoissonDistribution(org.apache.commons.math3.distribution.PoissonDistribution) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) Sets(com.google.cloud.dataflow.sdk.repackaged.com.google.common.collect.Sets) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) UserException(org.broadinstitute.hellbender.exceptions.UserException) Target(org.broadinstitute.hellbender.tools.exome.Target) Median(org.apache.commons.math3.stat.descriptive.rank.Median) ReadCountCollectionUtils(org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils) LogManager(org.apache.logging.log4j.LogManager) Nonnull(javax.annotation.Nonnull) PoissonDistribution(org.apache.commons.math3.distribution.PoissonDistribution)

Aggregations

RealMatrix (org.apache.commons.math3.linear.RealMatrix)22 Collectors (java.util.stream.Collectors)15 java.util (java.util)12 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)12 List (java.util.List)11 IntStream (java.util.stream.IntStream)11 Logger (org.apache.logging.log4j.Logger)11 ArrayList (java.util.ArrayList)9 LogManager (org.apache.logging.log4j.LogManager)9 IOException (java.io.IOException)8 Map (java.util.Map)8 TooManyEvaluationsException (org.apache.commons.math3.exception.TooManyEvaluationsException)8 UserException (org.broadinstitute.hellbender.exceptions.UserException)8 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)8 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)8 Test (org.testng.annotations.Test)8 File (java.io.File)7 VisibleForTesting (com.google.common.annotations.VisibleForTesting)6 Arrays (java.util.Arrays)6 Nonnull (javax.annotation.Nonnull)6