Search in sources :

Example 46 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute.

the class IntegerCopyNumberTransitionProbabilityCacheUnitTest method testStationaryProbabilities.

@Test
public void testStationaryProbabilities() {
    for (final int padding : PADDINGS) {
        for (final RealMatrix transitionMatrix : TRANSITION_MATRICES) {
            final IntegerCopyNumberTransitionProbabilityCache cache = new IntegerCopyNumberTransitionProbabilityCache(new IntegerCopyNumberTransitionMatrix(transitionMatrix, padding));
            final double[] stationary = cache.getStationaryProbabilityVector().toArray();
            final double[] stationaryFromMultiplication = cache.getTransitionProbabilityMatrix(Integer.MAX_VALUE).getColumn(0);
            ArrayAsserts.assertArrayEquals(stationary, stationaryFromMultiplication, EPSILON);
            Assert.assertEquals(Arrays.stream(stationary).sum(), 1.0, EPSILON);
        }
    }
}
Also used : Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 47 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk-protected by broadinstitute.

the class TestHMM method fromPhredProbabilities.

static TestHMM fromPhredProbabilities(final double... phred) {
    final double[] priors = Arrays.copyOfRange(phred, 0, State.values().length);
    final double[][] transition = reshape(Arrays.copyOfRange(phred, State.values().length, State.values().length * (State.values().length + 1)), State.values().length);
    final double[][] emission = reshape(Arrays.copyOfRange(phred, (State.values().length + 1) * State.values().length, phred.length), State.values().length);
    final DoubleUnaryOperator phredToLog = d -> d * -.1 * LN_OF_10;
    final double[] logPriorsRaw = DoubleStream.of(priors).map(phredToLog).toArray();
    final double logPriorsSum = GATKProtectedMathUtils.logSumExp(logPriorsRaw);
    final double[] logPriors = DoubleStream.of(logPriorsRaw).map(d -> d - logPriorsSum).toArray();
    final double[][] logEmissionProbs = Stream.of(emission).map(x -> {
        final double[] result = DoubleStream.of(x).map(phredToLog).toArray();
        final double sum = GATKProtectedMathUtils.logSumExp(result);
        return DoubleStream.of(result).map(d -> d - sum).toArray();
    }).toArray(double[][]::new);
    final double[][] logTransitionProbs = Stream.of(transition).map(x -> {
        final double[] result = DoubleStream.of(x).map(phredToLog).toArray();
        final double sum = GATKProtectedMathUtils.logSumExp(result);
        return DoubleStream.of(result).map(d -> d - sum).toArray();
    }).toArray(double[][]::new);
    return new TestHMM(logPriors, logEmissionProbs, logTransitionProbs);
}
Also used : java.util(java.util) GATKProtectedMathUtils(org.broadinstitute.hellbender.utils.GATKProtectedMathUtils) Stream(java.util.stream.Stream) Pair(org.apache.commons.math3.util.Pair) Utils(org.broadinstitute.hellbender.utils.Utils) ArrayUtils(org.apache.commons.lang3.ArrayUtils) Collectors(java.util.stream.Collectors) DoubleUnaryOperator(java.util.function.DoubleUnaryOperator) DoubleStream(java.util.stream.DoubleStream) DoubleUnaryOperator(java.util.function.DoubleUnaryOperator)

Example 48 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class ReadCountCollectionUtils method imputeZeroCountsAsTargetMedians.

/**
     * Impute zero counts to the median of non-zero values in the enclosing target row.
     *
     * <p>The imputation is done in-place, thus the input matrix is well be modified as a result of this call.</p>
     *
     * @param readCounts the input and output read-count matrix.
     */
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final Median medianCalculator = new Median();
    int totalCounts = counts.getColumnDimension() * counts.getRowDimension();
    // Get the number of zeroes contained in the counts.
    final long totalZeroCounts = IntStream.range(0, targetCount).mapToLong(t -> DoubleStream.of(counts.getRow(t)).filter(c -> c == 0.0).count()).sum();
    // Get the median of each row, not including any entries that are zero.
    final double[] medians = IntStream.range(0, targetCount).mapToDouble(t -> medianCalculator.evaluate(DoubleStream.of(counts.getRow(t)).filter(c -> c != 0.0).toArray())).toArray();
    // Change any zeros in the counts to the median for the row.
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(final int row, final int column, final double value) {
            return value != 0 ? value : medians[row];
        }
    });
    if (totalZeroCounts > 0) {
        final double totalZeroCountsPercentage = 100.0 * (totalZeroCounts / totalCounts);
        logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target " + "non-zero median", totalZeroCounts, totalZeroCounts, totalZeroCountsPercentage));
    } else {
        logger.info("No count is 0.0 thus no count needed to be imputed.");
    }
}
Also used : IntStream(java.util.stream.IntStream) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) java.util(java.util) TableReader(org.broadinstitute.hellbender.utils.tsv.TableReader) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) StringUtils(org.apache.commons.lang3.StringUtils) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) SampleNameFinder(org.broadinstitute.hellbender.tools.exome.samplenamefinder.SampleNameFinder) DataLine(org.broadinstitute.hellbender.utils.tsv.DataLine) Median(org.apache.commons.math3.stat.descriptive.rank.Median) TableColumnCollection(org.broadinstitute.hellbender.utils.tsv.TableColumnCollection) Nonnull(javax.annotation.Nonnull) Locatable(htsjdk.samtools.util.Locatable) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) Predicate(java.util.function.Predicate) FastMath(org.apache.commons.math3.util.FastMath) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) DoubleStream(java.util.stream.DoubleStream) TableWriter(org.broadinstitute.hellbender.utils.tsv.TableWriter) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) java.io(java.io) SetUniqueList(org.apache.commons.collections4.list.SetUniqueList) Doubles(com.google.common.primitives.Doubles) Utils(org.broadinstitute.hellbender.utils.Utils) RealMatrix(org.apache.commons.math3.linear.RealMatrix) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) Median(org.apache.commons.math3.stat.descriptive.rank.Median)

Example 49 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class CalculateContamination method findConfidentHomAltSites.

private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }
    final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
    final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();
    for (final PileupSummary site : sites) {
        final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
        final List<PileupSummary> nearbySites = tc.targets(nearbySpan);
        final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount() / averageCoverage).average().orElseGet(() -> 0);
        smoothedCopyRatios.add(averageNearbyCopyRatio);
        final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2 * x * (1 - x)).sum();
        final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
        final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
        hetRatios.add(hetRatio);
    }
    final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
    final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size()).filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio).boxed().collect(Collectors.toList());
    final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size()).filter(n -> hetRatios.get(n) < meanHetRatio * 0.5).boxed().collect(Collectors.toList());
    //TODO: as extra security, filter out sites that are near too many hom alts
    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity", indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));
    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(indicesWithAnomalousCopyRatio);
    badSites.addAll(indicesWithLossOfHeterozygosity);
    return IntStream.range(0, sites.size()).filter(n -> !badSites.contains(n)).mapToObj(sites::get).filter(s -> s.getAltFraction() > 0.8).collect(Collectors.toList());
}
Also used : IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) CommandLineProgram(org.broadinstitute.hellbender.cmdline.CommandLineProgram) Argument(org.broadinstitute.barclay.argparser.Argument) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) VariantProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup) File(java.io.File) Logger(org.apache.logging.log4j.Logger) HashedListTargetCollection(org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) LogManager(org.apache.logging.log4j.LogManager) TargetCollection(org.broadinstitute.hellbender.tools.exome.TargetCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HashedListTargetCollection(org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 50 with Sum

use of org.apache.commons.math3.stat.descriptive.summary.Sum in project gatk by broadinstitute.

the class AlleleFrequencyCalculator method effectiveAlleleCounts.

// effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g]
// for numerical stability we will do this in log space:
// count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
// thanks to the log-sum-exp trick this lets us work with log posteriors alone
private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
    final int numAlleles = vc.getNAlleles();
    Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
    final double[] log10Result = new double[numAlleles];
    Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
    for (final Genotype g : vc.getGenotypes()) {
        if (!g.hasLikelihoods()) {
            continue;
        }
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);
        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
        new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex -> glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) -> log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.log10(count))));
    }
    return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) IntStream(java.util.stream.IntStream) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) MathArrays(org.apache.commons.math3.util.MathArrays) Dirichlet(org.broadinstitute.hellbender.utils.Dirichlet) GenotypeAlleleCounts(org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts) Collectors(java.util.stream.Collectors) GenotypeLikelihoodCalculator(org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculator) GenotypeLikelihoodCalculators(org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators) List(java.util.List) MathUtils(org.broadinstitute.hellbender.utils.MathUtils) Map(java.util.Map) VariantContext(htsjdk.variant.variantcontext.VariantContext) Utils(org.broadinstitute.hellbender.utils.Utils) IndexRange(org.broadinstitute.hellbender.utils.IndexRange) IndexRange(org.broadinstitute.hellbender.utils.IndexRange) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeLikelihoodCalculator(org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculator)

Aggregations

RealMatrix (org.apache.commons.math3.linear.RealMatrix)22 Collectors (java.util.stream.Collectors)15 java.util (java.util)12 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)12 List (java.util.List)11 IntStream (java.util.stream.IntStream)11 Logger (org.apache.logging.log4j.Logger)11 ArrayList (java.util.ArrayList)9 LogManager (org.apache.logging.log4j.LogManager)9 IOException (java.io.IOException)8 Map (java.util.Map)8 TooManyEvaluationsException (org.apache.commons.math3.exception.TooManyEvaluationsException)8 UserException (org.broadinstitute.hellbender.exceptions.UserException)8 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)8 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)8 Test (org.testng.annotations.Test)8 File (java.io.File)7 VisibleForTesting (com.google.common.annotations.VisibleForTesting)6 Arrays (java.util.Arrays)6 Nonnull (javax.annotation.Nonnull)6