Search in sources :

Example 16 with Mean

use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk-protected by broadinstitute.

the class PosteriorSummaryUtils method calculatePosteriorMode.

/**
     * Given a list of posterior samples, returns an estimate of the posterior mode (using
     * mllib kernel density estimation in {@link KernelDensity} and {@link BrentOptimizer}).
     * Note that estimate may be poor if number of samples is small (resulting in poor kernel density estimation),
     * or if posterior is not unimodal (or is sufficiently pathological otherwise). If the samples contain
     * {@link Double#NaN}, {@link Double#NaN} will be returned.
     * @param samples   posterior samples, cannot be {@code null} and number of samples must be greater than 0
     * @param ctx       {@link JavaSparkContext} used by {@link KernelDensity} for mllib kernel density estimation
     */
public static double calculatePosteriorMode(final List<Double> samples, final JavaSparkContext ctx) {
    Utils.nonNull(samples);
    Utils.validateArg(samples.size() > 0, "Number of samples must be greater than zero.");
    //calculate sample min, max, mean, and standard deviation
    final double sampleMin = Collections.min(samples);
    final double sampleMax = Collections.max(samples);
    final double sampleMean = new Mean().evaluate(Doubles.toArray(samples));
    final double sampleStandardDeviation = new StandardDeviation().evaluate(Doubles.toArray(samples));
    //if samples are all the same or contain NaN, can simply return mean
    if (sampleStandardDeviation == 0. || Double.isNaN(sampleMean)) {
        return sampleMean;
    }
    //use Silverman's rule to set bandwidth for kernel density estimation from sample standard deviation
    //see https://en.wikipedia.org/wiki/Kernel_density_estimation#Practical_estimation_of_the_bandwidth
    final double bandwidth = SILVERMANS_RULE_CONSTANT * sampleStandardDeviation * Math.pow(samples.size(), SILVERMANS_RULE_EXPONENT);
    //use kernel density estimation to approximate posterior from samples
    final KernelDensity pdf = new KernelDensity().setSample(ctx.parallelize(samples, 1)).setBandwidth(bandwidth);
    //use Brent optimization to find mode (i.e., maximum) of kernel-density-estimated posterior
    final BrentOptimizer optimizer = new BrentOptimizer(RELATIVE_TOLERANCE, RELATIVE_TOLERANCE * (sampleMax - sampleMin));
    final UnivariateObjectiveFunction objective = new UnivariateObjectiveFunction(f -> pdf.estimate(new double[] { f })[0]);
    //search for mode within sample range, start near sample mean
    final SearchInterval searchInterval = new SearchInterval(sampleMin, sampleMax, sampleMean);
    return optimizer.optimize(objective, GoalType.MAXIMIZE, searchInterval, BRENT_MAX_EVAL).getPoint();
}
Also used : Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) SearchInterval(org.apache.commons.math3.optim.univariate.SearchInterval) UnivariateObjectiveFunction(org.apache.commons.math3.optim.univariate.UnivariateObjectiveFunction) BrentOptimizer(org.apache.commons.math3.optim.univariate.BrentOptimizer) KernelDensity(org.apache.spark.mllib.stat.KernelDensity) StandardDeviation(org.apache.commons.math3.stat.descriptive.moment.StandardDeviation)

Example 17 with Mean

use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk by broadinstitute.

the class CoverageModelParameters method generateRandomModel.

/**
     * Generates random coverage model parameters.
     *
     * @param targetList list of targets
     * @param numLatents number of latent variables
     * @param seed random seed
     * @param randomMeanLogBiasStandardDeviation std of mean log bias (mean is set to 0)
     * @param randomBiasCovariatesStandardDeviation std of bias covariates (mean is set to 0)
     * @param randomMaxUnexplainedVariance max value of unexplained variance (samples are taken from a uniform
     *                                     distribution [0, {@code randomMaxUnexplainedVariance}])
     * @param initialBiasCovariatesARDCoefficients initial row vector of ARD coefficients
     * @return an instance of {@link CoverageModelParameters}
     */
public static CoverageModelParameters generateRandomModel(final List<Target> targetList, final int numLatents, final long seed, final double randomMeanLogBiasStandardDeviation, final double randomBiasCovariatesStandardDeviation, final double randomMaxUnexplainedVariance, final INDArray initialBiasCovariatesARDCoefficients) {
    Utils.validateArg(numLatents >= 0, "Dimension of the bias space must be non-negative");
    Utils.validateArg(randomBiasCovariatesStandardDeviation >= 0, "Standard deviation of random bias covariates" + " must be non-negative");
    Utils.validateArg(randomMeanLogBiasStandardDeviation >= 0, "Standard deviation of random mean log bias" + " must be non-negative");
    Utils.validateArg(randomMaxUnexplainedVariance >= 0, "Max random unexplained variance must be non-negative");
    Utils.validateArg(initialBiasCovariatesARDCoefficients == null || numLatents > 0 && initialBiasCovariatesARDCoefficients.length() == numLatents, "If ARD is enabled, the dimension" + " of the bias latent space must be positive and match the length of ARD coeffecient vector");
    final boolean biasCovariatesEnabled = numLatents > 0;
    final int numTargets = targetList.size();
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(seed));
    /* Gaussian random for mean log bias */
    final INDArray initialMeanLogBias = Nd4j.create(getNormalRandomNumbers(numTargets, 0, randomMeanLogBiasStandardDeviation, rng), new int[] { 1, numTargets });
    /* Uniform random for unexplained variance */
    final INDArray initialUnexplainedVariance = Nd4j.create(getUniformRandomNumbers(numTargets, 0, randomMaxUnexplainedVariance, rng), new int[] { 1, numTargets });
    final INDArray initialMeanBiasCovariates;
    if (biasCovariatesEnabled) {
        /* Gaussian random for bias covariates */
        initialMeanBiasCovariates = Nd4j.create(getNormalRandomNumbers(numTargets * numLatents, 0, randomBiasCovariatesStandardDeviation, rng), new int[] { numTargets, numLatents });
    } else {
        initialMeanBiasCovariates = null;
    }
    return new CoverageModelParameters(targetList, initialMeanLogBias, initialUnexplainedVariance, initialMeanBiasCovariates, initialBiasCovariatesARDCoefficients);
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) RandomGenerator(org.apache.commons.math3.random.RandomGenerator)

Example 18 with Mean

use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk by broadinstitute.

the class CoverageModelParameters method write.

/**
     * Writes the model to disk.
     *
     * @param outputPath model output path
     */
public static void write(@Nonnull CoverageModelParameters model, @Nonnull final String outputPath) {
    /* create output directory if it doesn't exist */
    createOutputPath(Utils.nonNull(outputPath, "The output path string must be non-null"));
    /* write targets list */
    final File targetListFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_LIST_OUTPUT_FILE);
    TargetWriter.writeTargetsToFile(targetListFile, model.getTargetList());
    final List<String> targetNames = model.getTargetList().stream().map(Target::getName).collect(Collectors.toList());
    /* write target mean bias to file */
    final File targetMeanBiasFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_MEAN_LOG_BIAS_OUTPUT_FILE);
    Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getTargetMeanLogBias().transpose(), targetMeanBiasFile, MEAN_LOG_BIAS_MATRIX_NAME, targetNames, null);
    /* write target unexplained variance to file */
    final File targetUnexplainedVarianceFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_UNEXPLAINED_VARIANCE_OUTPUT_FILE);
    Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getTargetUnexplainedVariance().transpose(), targetUnexplainedVarianceFile, TARGET_UNEXPLAINED_VARIANCE_MATRIX_NAME, targetNames, null);
    if (model.isBiasCovariatesEnabled()) {
        /* write mean bias covariates to file */
        final List<String> meanBiasCovariatesNames = IntStream.range(0, model.getNumLatents()).mapToObj(li -> String.format(BIAS_COVARIATE_COLUMN_NAME_FORMAT, li)).collect(Collectors.toList());
        final File meanBiasCovariatesFile = new File(outputPath, CoverageModelGlobalConstants.MEAN_BIAS_COVARIATES_OUTPUT_FILE);
        Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getMeanBiasCovariates(), meanBiasCovariatesFile, MEAN_BIAS_COVARIATES_MATRIX_NAME, targetNames, meanBiasCovariatesNames);
        /* write norm_2 of mean bias covariates to file */
        final INDArray WTW = model.getMeanBiasCovariates().transpose().mmul(model.getMeanBiasCovariates());
        final double[] biasCovariatesNorm2 = IntStream.range(0, model.getNumLatents()).mapToDouble(li -> WTW.getDouble(li, li)).toArray();
        final File biasCovariatesNorm2File = new File(outputPath, CoverageModelGlobalConstants.MEAN_BIAS_COVARIATES_NORM2_OUTPUT_FILE);
        Nd4jIOUtils.writeNDArrayMatrixToTextFile(Nd4j.create(biasCovariatesNorm2, new int[] { 1, model.getNumLatents() }), biasCovariatesNorm2File, MEAN_BIAS_COVARIATES_NORM_2_MATRIX_NAME, null, meanBiasCovariatesNames);
        /* if ARD is enabled, write the ARD coefficients and covariance of W as well */
        if (model.isARDEnabled()) {
            final File biasCovariatesARDCoefficientsFile = new File(outputPath, CoverageModelGlobalConstants.BIAS_COVARIATES_ARD_COEFFICIENTS_OUTPUT_FILE);
            Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getBiasCovariateARDCoefficients(), biasCovariatesARDCoefficientsFile, BIAS_COVARIATES_ARD_COEFFICIENTS_MATRIX_NAME, null, meanBiasCovariatesNames);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) java.util(java.util) NDArrayIndex(org.nd4j.linalg.indexing.NDArrayIndex) Nd4jIOUtils(org.broadinstitute.hellbender.tools.coveragemodel.nd4jutils.Nd4jIOUtils) Nd4j(org.nd4j.linalg.factory.Nd4j) Collectors(java.util.stream.Collectors) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) Sets(com.google.cloud.dataflow.sdk.repackaged.com.google.common.collect.Sets) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Pair(org.apache.commons.lang3.tuple.Pair) UserException(org.broadinstitute.hellbender.exceptions.UserException) java.io(java.io) RandomGenerator(org.apache.commons.math3.random.RandomGenerator) RandomGeneratorFactory(org.apache.commons.math3.random.RandomGeneratorFactory) Target(org.broadinstitute.hellbender.tools.exome.Target) TargetTableReader(org.broadinstitute.hellbender.tools.exome.TargetTableReader) INDArray(org.nd4j.linalg.api.ndarray.INDArray) TargetWriter(org.broadinstitute.hellbender.tools.exome.TargetWriter) Utils(org.broadinstitute.hellbender.utils.Utils) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) INDArray(org.nd4j.linalg.api.ndarray.INDArray)

Example 19 with Mean

use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method testCalculateReducedPanelAndPInversesUsingJollifesRule.

@Test(dataProvider = "readCountOnlyWithDiverseShapeData")
public void testCalculateReducedPanelAndPInversesUsingJollifesRule(final ReadCountCollection readCounts) {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReductionResult result = HDF5PCACoveragePoNCreationUtils.calculateReducedPanelAndPInverses(readCounts, OptionalInt.empty(), NULL_LOGGER, ctx);
    final RealMatrix counts = readCounts.counts();
    Assert.assertNotNull(result);
    Assert.assertNotNull(result.getPseudoInverse());
    Assert.assertNotNull(result.getReducedCounts());
    Assert.assertNotNull(result.getReducedPseudoInverse());
    Assert.assertNotNull(result.getAllSingularValues());
    Assert.assertEquals(counts.getColumnDimension(), result.getAllSingularValues().length);
    Assert.assertEquals(result.getReducedCounts().getRowDimension(), counts.getRowDimension());
    final int eigensamples = result.getReducedCounts().getColumnDimension();
    final Mean mean = new Mean();
    final double meanSingularValue = mean.evaluate(result.getAllSingularValues());
    final double threshold = HDF5PCACoveragePoNCreationUtils.JOLLIFES_RULE_MEAN_FACTOR * meanSingularValue;
    final int expectedEigensamples = (int) DoubleStream.of(result.getAllSingularValues()).filter(d -> d >= threshold).count();
    Assert.assertTrue(eigensamples <= counts.getColumnDimension());
    Assert.assertEquals(eigensamples, expectedEigensamples);
    assertPseudoInverse(counts, result.getPseudoInverse());
    assertPseudoInverse(result.getReducedCounts(), result.getReducedPseudoInverse());
}
Also used : Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 20 with Mean

use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk by broadinstitute.

the class HDF5LibraryUnitTest method createMatrixOfGaussianValues.

private RealMatrix createMatrixOfGaussianValues(int numRows, int numCols, final double mean, final double sigma) {
    final RealMatrix bigCounts = new Array2DRowRealMatrix(numRows, numCols);
    final RandomDataGenerator randomDataGenerator = new RandomDataGenerator();
    randomDataGenerator.reSeed(337337337);
    bigCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(int row, int column, double value) {
            return randomDataGenerator.nextGaussian(mean, sigma);
        }
    });
    return bigCounts;
}
Also used : Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RandomDataGenerator(org.apache.commons.math3.random.RandomDataGenerator) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor)

Aggregations

Test (org.testng.annotations.Test)27 Mean (org.apache.commons.math3.stat.descriptive.moment.Mean)23 List (java.util.List)17 RandomGenerator (org.apache.commons.math3.random.RandomGenerator)16 RealMatrix (org.apache.commons.math3.linear.RealMatrix)14 ArrayList (java.util.ArrayList)12 Collectors (java.util.stream.Collectors)12 StandardDeviation (org.apache.commons.math3.stat.descriptive.moment.StandardDeviation)12 Utils (org.broadinstitute.hellbender.utils.Utils)12 StoredDataStatistics (gdsc.core.utils.StoredDataStatistics)10 Arrays (java.util.Arrays)10 IntStream (java.util.stream.IntStream)10 NormalDistribution (org.apache.commons.math3.distribution.NormalDistribution)10 WeightedObservedPoint (org.apache.commons.math3.fitting.WeightedObservedPoint)10 Logger (org.apache.logging.log4j.Logger)10 ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection)10 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)10 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)10 Function (java.util.function.Function)9 DescriptiveStatistics (org.apache.commons.math3.stat.descriptive.DescriptiveStatistics)9