use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk-protected by broadinstitute.
the class CopyRatioModellerUnitTest method testRunMCMCOnCopyRatioSegmentedGenome.
/**
 * Verifies that MCMC-based Bayesian inference of the copy-ratio model recovers the truth values
 * that were used to generate the test data.
 * <p>
 * <b>Global parameters.</b> The model is fit with
 * {@link CopyRatioModellerUnitTest#NUM_SAMPLES} samples and
 * {@link CopyRatioModellerUnitTest#NUM_BURN_IN} burn-in samples. For both the variance and the
 * outlier probability, the posterior center must lie within
 * {@link CopyRatioModellerUnitTest#MULTIPLES_OF_SD_THRESHOLD} (analytic) posterior standard deviations
 * of the truth value, and the posterior standard deviation (taken as the half-width of the
 * credible interval) must match the analytic value to within a relative error of
 * {@link CopyRatioModellerUnitTest#RELATIVE_ERROR_THRESHOLD}.
 * </p>
 * <p>
 * <b>Segment-level means.</b> The number of truth values lying more than 1, 2, and 3 posterior
 * standard deviations away from the corresponding posterior center should be roughly what is
 * expected for a normal distribution (~32, ~5, and ~0, respectively), up to tolerances of
 * {@link CopyRatioModellerUnitTest#DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_1_SIGMA},
 * {@link CopyRatioModellerUnitTest#DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_2_SIGMA}, and
 * {@link CopyRatioModellerUnitTest#DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_3_SIGMA}.
 * The average of the per-segment posterior standard deviations must additionally match its expected
 * value to within a relative error of {@link CopyRatioModellerUnitTest#RELATIVE_ERROR_THRESHOLD}.
 * </p>
 * <p>
 * <b>Outlier indicators.</b> Each latent outlier indicator is called as outlier or non-outlier
 * according to the majority of its posterior samples; at least
 * {@link CopyRatioModellerUnitTest#FRACTION_OF_OUTLIER_INDICATORS_CORRECT_THRESHOLD}
 * of the indicators (one per target) must agree with the truth.
 * </p>
 * <p>
 * These tolerances are intended to be loose enough that the test passes for the large majority of
 * randomly generated data sets, yet tight enough that an incorrect sampler (for example, one using
 * a sufficiently incorrect likelihood) causes a failure.
 * </p>
 */
@Test
public void testRunMCMCOnCopyRatioSegmentedGenome() throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    LoggingUtils.setLoggingLevel(Log.LogLevel.INFO);

    //load the coverages and build a segmented genome that contains no SNPs
    final ReadCountCollection coverage = ReadCountCollectionUtils.parse(COVERAGES_FILE);
    final Genome genome = new Genome(coverage, Collections.emptyList());
    final SegmentedGenome segmentedGenome = new SegmentedGenome(SEGMENT_FILE, genome);

    //fit the model
    final CopyRatioModeller modeller = new CopyRatioModeller(segmentedGenome);
    modeller.fitMCMC(NUM_SAMPLES, NUM_BURN_IN);

    //global parameters: compare posterior center and half-width of the credible interval against truth
    final Map<CopyRatioParameter, PosteriorSummary> globalSummaries =
            modeller.getGlobalParameterPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);

    final PosteriorSummary varianceSummary = globalSummaries.get(CopyRatioParameter.VARIANCE);
    final double varianceCenter = varianceSummary.getCenter();
    final double varianceSD = (varianceSummary.getUpper() - varianceSummary.getLower()) / 2;
    Assert.assertEquals(Math.abs(varianceCenter - VARIANCE_TRUTH), 0.,
            MULTIPLES_OF_SD_THRESHOLD * VARIANCE_POSTERIOR_STANDARD_DEVIATION_TRUTH);
    Assert.assertEquals(relativeError(varianceSD, VARIANCE_POSTERIOR_STANDARD_DEVIATION_TRUTH), 0.,
            RELATIVE_ERROR_THRESHOLD);

    final PosteriorSummary outlierProbabilitySummary = globalSummaries.get(CopyRatioParameter.OUTLIER_PROBABILITY);
    final double outlierProbabilityCenter = outlierProbabilitySummary.getCenter();
    final double outlierProbabilitySD = (outlierProbabilitySummary.getUpper() - outlierProbabilitySummary.getLower()) / 2;
    Assert.assertEquals(Math.abs(outlierProbabilityCenter - OUTLIER_PROBABILITY_TRUTH), 0.,
            MULTIPLES_OF_SD_THRESHOLD * OUTLIER_PROBABILITY_POSTERIOR_STANDARD_DEVIATION_TRUTH);
    Assert.assertEquals(relativeError(outlierProbabilitySD, OUTLIER_PROBABILITY_POSTERIOR_STANDARD_DEVIATION_TRUTH), 0.,
            RELATIVE_ERROR_THRESHOLD);

    //segment-level means: count how many truth values fall outside the 1-, 2-, and 3-sigma intervals
    final List<Double> meansTruth = loadList(MEANS_TRUTH_FILE, Double::parseDouble);
    final int numSegments = meansTruth.size();
    //segment-mean posteriors are expected to be Gaussian, so half the width of the credible interval
    //returned for CREDIBLE_INTERVAL_ALPHA is used as the posterior standard deviation
    //(this presumes CREDIBLE_INTERVAL_ALPHA selects a 1-sigma, i.e. ~68%, interval)
    final List<PosteriorSummary> segmentSummaries = modeller.getSegmentMeansPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);
    final double[] segmentSDs = new double[numSegments];
    int outsideOneSigma = 0;
    int outsideTwoSigma = 0;
    int outsideThreeSigma = 0;
    for (int i = 0; i < numSegments; i++) {
        final PosteriorSummary summary = segmentSummaries.get(i);
        final double center = summary.getCenter();
        final double sd = (summary.getUpper() - summary.getLower()) / 2.;
        segmentSDs[i] = sd;
        final double deviation = Math.abs(center - meansTruth.get(i));
        outsideOneSigma += deviation > sd ? 1 : 0;
        outsideTwoSigma += deviation > 2 * sd ? 1 : 0;
        outsideThreeSigma += deviation > 3 * sd ? 1 : 0;
    }
    final double averageSegmentSD = new Mean().evaluate(segmentSDs);
    Assert.assertEquals(outsideOneSigma, 100 - 68, DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_1_SIGMA);
    Assert.assertEquals(outsideTwoSigma, 100 - 95, DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_2_SIGMA);
    Assert.assertTrue(outsideThreeSigma <= DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_3_SIGMA);
    Assert.assertEquals(relativeError(averageSegmentSD, MEAN_POSTERIOR_STANDARD_DEVIATION_MEAN_TRUTH), 0.,
            RELATIVE_ERROR_THRESHOLD);

    //latent outlier indicators: call each indicator by majority vote over the posterior samples
    final List<CopyRatioState.OutlierIndicators> indicatorSamples = modeller.getOutlierIndicatorsSamples();
    final int numIndicatorSamples = indicatorSamples.size();
    final List<Integer> truthAsInt = loadList(OUTLIER_INDICATORS_TRUTH_FILE, Integer::parseInt);
    final List<Boolean> truth = truthAsInt.stream().map(i -> i == 1).collect(Collectors.toList());
    final int numTargets = coverage.targets().size();
    int numCorrect = 0;
    for (int target = 0; target < numTargets; target++) {
        final int targetIndex = target;
        final long outlierVotes = indicatorSamples.stream().filter(sample -> sample.get(targetIndex)).count();
        //a tie (exactly half of the samples) is called as an outlier
        final boolean calledOutlier = outlierVotes >= numIndicatorSamples / 2.;
        if (calledOutlier == truth.get(target)) {
            numCorrect++;
        }
    }
    final double fractionCorrect = (double) numCorrect / numTargets;
    Assert.assertTrue(fractionCorrect >= FRACTION_OF_OUTLIER_INDICATORS_CORRECT_THRESHOLD);
}
use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk by broadinstitute.
the class PosteriorSummaryUtils method calculatePosteriorMode.
/**
 * Given a list of posterior samples, returns an estimate of the posterior mode (using
 * mllib kernel density estimation in {@link KernelDensity} and {@link BrentOptimizer}).
 * Note that estimate may be poor if number of samples is small (resulting in poor kernel density estimation),
 * or if posterior is not unimodal (or is sufficiently pathological otherwise). If the samples contain
 * {@link Double#NaN}, {@link Double#NaN} will be returned. If all samples are identical
 * (i.e., the sample standard deviation is zero), the sample mean is returned directly without
 * performing any density estimation.
 * @param samples   posterior samples, cannot be {@code null} and number of samples must be greater than 0
 * @param ctx       {@link JavaSparkContext} used by {@link KernelDensity} for mllib kernel density estimation
 * @return          the sample mean if the samples are degenerate or contain {@link Double#NaN};
 *                  otherwise, the location within [sample minimum, sample maximum] at which the
 *                  kernel-density estimate of the posterior is maximized
 */
public static double calculatePosteriorMode(final List<Double> samples, final JavaSparkContext ctx) {
    Utils.nonNull(samples);
    Utils.validateArg(samples.size() > 0, "Number of samples must be greater than zero.");

    //calculate sample min, max, mean, and standard deviation
    final double sampleMin = Collections.min(samples);
    final double sampleMax = Collections.max(samples);
    //unbox the samples only once; both statistics below read from the same primitive array
    final double[] sampleArray = Doubles.toArray(samples);
    final double sampleMean = new Mean().evaluate(sampleArray);
    final double sampleStandardDeviation = new StandardDeviation().evaluate(sampleArray);

    //if samples are all the same or contain NaN, can simply return mean
    if (sampleStandardDeviation == 0. || Double.isNaN(sampleMean)) {
        return sampleMean;
    }

    //use Silverman's rule to set bandwidth for kernel density estimation from sample standard deviation
    //see https://en.wikipedia.org/wiki/Kernel_density_estimation#Practical_estimation_of_the_bandwidth
    final double bandwidth = SILVERMANS_RULE_CONSTANT * sampleStandardDeviation * Math.pow(samples.size(), SILVERMANS_RULE_EXPONENT);

    //use kernel density estimation to approximate posterior from samples
    final KernelDensity pdf = new KernelDensity().setSample(ctx.parallelize(samples, 1)).setBandwidth(bandwidth);

    //use Brent optimization to find mode (i.e., maximum) of kernel-density-estimated posterior;
    //the absolute tolerance is scaled by the sample range so that it is meaningful for the data at hand
    final BrentOptimizer optimizer = new BrentOptimizer(RELATIVE_TOLERANCE, RELATIVE_TOLERANCE * (sampleMax - sampleMin));
    final UnivariateObjectiveFunction objective = new UnivariateObjectiveFunction(x -> pdf.estimate(new double[] { x })[0]);

    //search for mode within sample range, start near sample mean
    final SearchInterval searchInterval = new SearchInterval(sampleMin, sampleMax, sampleMean);
    return optimizer.optimize(objective, GoalType.MAXIMIZE, searchInterval, BRENT_MAX_EVAL).getPoint();
}
use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk-protected by broadinstitute.
the class ReCapSegCaller method calculateT.
/**
 * Computes the standard deviation of the per-target values belonging to (likely) copy-neutral
 * segments, after discarding outlying targets.
 * <p>
 * Segments whose mean is below {@code COPY_NEUTRAL_CUTOFF} are treated as copy neutral. The values
 * of all targets falling in those segments are pooled, their mean and standard deviation are
 * computed, and only values strictly within {@code Z_THRESHOLD} standard deviations of that mean
 * are kept for the final standard deviation.
 * </p>
 *
 * @param tangentNormalizedCoverage coverage whose records supply the per-target values
 * @param segments                  segments used to select the copy-neutral targets
 * @return the standard deviation of the outlier-filtered values of the copy-neutral targets
 */
private static double calculateT(final ReadCountCollection tangentNormalizedCoverage, final List<ModeledSegment> segments) {
    // Select the segments that are likely copy neutral.
    // The absolute value of the segment mean is intentionally not taken (this mimics the python implementation).
    final List<ModeledSegment> neutralSegments = segments.stream()
            .filter(segment -> segment.getSegmentMean() < COPY_NEUTRAL_CUTOFF)
            .collect(Collectors.toList());

    // Make the single-sample records searchable by segment. Note that individual targets, due to noise,
    // can be far away from copy neutral even when their segment is not.
    final List<ReadCountRecord.SingleSampleRecord> singleSampleRecords = tangentNormalizedCoverage.records().stream()
            .map(ReadCountRecord::asSingleSampleRecord)
            .collect(Collectors.toList());
    final TargetCollection<ReadCountRecord.SingleSampleRecord> recordsByTarget = new HashedListTargetCollection<>(singleSampleRecords);

    // Pool the values of every target that overlaps a copy-neutral segment.
    final double[] neutralValues = neutralSegments.stream()
            .flatMap(segment -> recordsByTarget.targets(segment).stream())
            .mapToDouble(ReadCountRecord.SingleSampleRecord::getCount)
            .toArray();

    final double center = new Mean().evaluate(neutralValues);
    final double spread = new StandardDeviation().evaluate(neutralValues);

    // Drop outliers: keep only the values lying strictly within Z_THRESHOLD standard deviations of the mean.
    final double[] inlierValues = Arrays.stream(neutralValues)
            .filter(value -> Math.abs(value - center) < spread * Z_THRESHOLD)
            .toArray();

    return new StandardDeviation().evaluate(inlierValues);
}
use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method testCalculateReducedPanelAndPInversesUsingJollifesRule.
/**
 * Checks the reduction performed by
 * {@code HDF5PCACoveragePoNCreationUtils.calculateReducedPanelAndPInverses} when no explicit number
 * of eigensamples is requested (an empty {@link OptionalInt} is passed).
 * <p>
 * All components of the result must be non-null and dimensionally consistent with the input counts,
 * the number of retained eigensamples must equal the number of singular values that are at least
 * {@code JOLLIFES_RULE_MEAN_FACTOR} times the mean singular value, and both pseudoinverses must
 * pass {@code assertPseudoInverse}.
 * </p>
 *
 * @param readCounts read counts supplied by the "readCountOnlyWithDiverseShapeData" data provider
 */
@Test(dataProvider = "readCountOnlyWithDiverseShapeData")
public void testCalculateReducedPanelAndPInversesUsingJollifesRule(final ReadCountCollection readCounts) {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReductionResult result = HDF5PCACoveragePoNCreationUtils.calculateReducedPanelAndPInverses(readCounts, OptionalInt.empty(), NULL_LOGGER, ctx);
    final RealMatrix counts = readCounts.counts();

    // every component of the result must be present
    Assert.assertNotNull(result);
    Assert.assertNotNull(result.getPseudoInverse());
    Assert.assertNotNull(result.getReducedCounts());
    Assert.assertNotNull(result.getReducedPseudoInverse());
    final double[] singularValues = result.getAllSingularValues();
    Assert.assertNotNull(singularValues);

    // dimensions must be consistent with the input counts
    Assert.assertEquals(counts.getColumnDimension(), singularValues.length);
    Assert.assertEquals(result.getReducedCounts().getRowDimension(), counts.getRowDimension());
    final int numEigensamples = result.getReducedCounts().getColumnDimension();

    // independently count the singular values that reach the threshold of the rule
    final double cutoff = HDF5PCACoveragePoNCreationUtils.JOLLIFES_RULE_MEAN_FACTOR * new Mean().evaluate(singularValues);
    int numExpectedEigensamples = 0;
    for (final double singularValue : singularValues) {
        if (singularValue >= cutoff) {
            numExpectedEigensamples++;
        }
    }

    Assert.assertTrue(numEigensamples <= counts.getColumnDimension());
    Assert.assertEquals(numEigensamples, numExpectedEigensamples);

    assertPseudoInverse(counts, result.getPseudoInverse());
    assertPseudoInverse(result.getReducedCounts(), result.getReducedPseudoInverse());
}
use of org.apache.commons.math3.stat.descriptive.moment.Mean in project gatk by broadinstitute.
the class HDF5LibraryUnitTest method testCreateLargeMatrix.
/**
 * Round-trips a large matrix through an HDF5 file: a big matrix of junk (Gaussian) values is
 * written to a temporary file, read back, and compared with the original. This only tests that
 * large matrices can be written and read.
 */
@Test
public void testCreateLargeMatrix() {
    // Make a big, fake set of read counts.
    final int numRows = 2500000;
    final int numCols = 10;
    final double mean = 3e-7;
    final double sigma = 1e-9;
    final RealMatrix bigCounts = createMatrixOfGaussianValues(numRows, numCols, mean, sigma);

    final File tempOutputHD5 = IOUtils.createTempFile("big-ol-", ".hd5");
    final String hdf5Path = "/test/m";

    // Write the matrix; the file is closed even if the write fails so that the handle is not leaked.
    final HDF5File hdf5File = new HDF5File(tempOutputHD5, HDF5File.OpenMode.CREATE);
    try {
        hdf5File.makeDoubleMatrix(hdf5Path, bigCounts.getData());
    } finally {
        hdf5File.close();
    }

    // Read the matrix back; the read-only handle is also closed once the data has been loaded.
    final double[][] result;
    final HDF5File hdf5FileForReading = new HDF5File(tempOutputHD5, HDF5File.OpenMode.READ_ONLY);
    try {
        result = hdf5FileForReading.readDoubleMatrix(hdf5Path);
    } finally {
        hdf5FileForReading.close();
    }

    // Wrap the data once (the matrix is large, so avoid building redundant copies).
    final RealMatrix readMatrix = new Array2DRowRealMatrix(result);
    // assertEquals reports the actual and expected dimensions on failure.
    Assert.assertEquals(readMatrix.getRowDimension(), numRows);
    Assert.assertEquals(readMatrix.getColumnDimension(), numCols);
    PoNTestUtils.assertEqualsMatrix(readMatrix, bigCounts, false);
}
Aggregations