use of org.apache.commons.math3.geometry.euclidean.twod.Segment in project gatk by broadinstitute.
the class CNLOHCaller method calculateVarianceOfCopyNeutralSegmentMeans.
/**
* Attempt to get an idea of segment mean variance near copy neutral.
*
* @param segments Never {@code null}
* @return variance of segment mean (in CR space) of segments that are "close enough" to copy neutral.
* Zero if no segments are "close enough"
*/
private double calculateVarianceOfCopyNeutralSegmentMeans(final List<ACNVModeledSegment> segments, final double meanBiasInCR) {
Utils.nonNull(segments);
// Only consider values "close enough" to copy neutral (CR == 1).
final double neutralCR = 1 + meanBiasInCR;
final double[] neutralSegmentMeans = segments.stream().mapToDouble(ACNVModeledSegment::getSegmentMeanInCRSpace).filter(m -> Math.abs(m - neutralCR) < CLOSE_ENOUGH_TO_COPY_NEUTRAL_IN_CR).toArray();
return new Variance().evaluate(neutralSegmentMeans);
}
use of org.apache.commons.math3.geometry.euclidean.twod.Segment in project gatk by broadinstitute.
the class SNPSegmenter method writeSegmentFile.
/**
* Write segment file based on maximum-likelihood estimates of the minor allele fraction at SNP sites,
* assuming the specified allelic bias. These estimates are converted to target coverages,
* which are written to a temporary file and then passed to {@link RCBSSegmenter}.
* @param snps TargetCollection of allelic counts at SNP sites
* @param sampleName sample name
* @param outputFile segment file to write to and return
* @param allelicBias allelic bias to use in estimate of minor allele fraction
*/
public static void writeSegmentFile(final TargetCollection<AllelicCount> snps, final String sampleName, final File outputFile, final double allelicBias) {
Utils.validateArg(snps.totalSize() > 0, "Must have a positive number of SNPs to perform SNP segmentation.");
try {
final File targetsFromSNPCountsFile = File.createTempFile("targets-from-snps", ".tsv");
final List<Target> targets = snps.targets().stream().map(ac -> new Target(name(ac), ac.getInterval())).collect(Collectors.toList());
final RealMatrix minorAlleleFractions = new Array2DRowRealMatrix(snps.targetCount(), 1);
minorAlleleFractions.setColumn(0, snps.targets().stream().mapToDouble(ac -> ac.estimateMinorAlleleFraction(allelicBias)).toArray());
ReadCountCollectionUtils.write(targetsFromSNPCountsFile, new ReadCountCollection(targets, Collections.singletonList(sampleName), minorAlleleFractions));
//segment SNPs based on observed log_2 minor allele fraction (log_2 is applied in CBS.R)
RCBSSegmenter.writeSegmentFile(sampleName, targetsFromSNPCountsFile.getAbsolutePath(), outputFile.getAbsolutePath(), false);
} catch (final IOException e) {
throw new UserException.CouldNotCreateOutputFile("Could not create temporary output file during " + "SNP segmentation.", e);
}
}
use of org.apache.commons.math3.geometry.euclidean.twod.Segment in project gatk by broadinstitute.
the class CoverageDropoutDetector method retrieveGaussianMixtureModelForFilteredTargets.
/** <p>Produces a Gaussian mixture model based on the difference between targets and segment means.</p>
* <p>Filters targets to populations where more than the minProportion lie in a single segment.</p>
* <p>Returns null if no pass filtering. Please note that in these cases,
* in the rest of this class, we use this to assume that a GMM is not a good model.</p>
*
* @param segments -- segments with segment mean in log2 copy ratio space
* @param targets -- targets with a log2 copy ratio estimate
* @param minProportion -- minimum proportion of all targets that a given segment must have in order to be used
* in the evaluation
* @param numComponents -- number of components to use in the GMM. Usually, this is 2.
* @return never {@code null}. Fitting result with indications whether it converged or was even attempted.
*/
private MixtureMultivariateNormalFitResult retrieveGaussianMixtureModelForFilteredTargets(final List<ModeledSegment> segments, final TargetCollection<ReadCountRecord.SingleSampleRecord> targets, double minProportion, int numComponents) {
// For each target in a segment that contains enough targets, normalize the difference against the segment mean
// and collapse the filtered targets into the copy ratio estimates.
final List<Double> filteredTargetsSegDiff = getNumProbeFilteredTargetList(segments, targets, minProportion);
if (filteredTargetsSegDiff.size() < numComponents) {
return new MixtureMultivariateNormalFitResult(null, false, false);
}
// Assume that Apache Commons wants data points in the first dimension.
// Note that second dimension of length 2 (instead of 1) is to wrok around funny Apache commons API.
final double[][] filteredTargetsSegDiff2d = new double[filteredTargetsSegDiff.size()][2];
// Convert the filtered targets into 2d array (even if second dimension is length 1). The second dimension is
// uncorrelated Gaussian. This is only to get around funny API in Apache Commons, which will throw an
// exception if the length of the second dimension is < 2
final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
final NormalDistribution nd = new NormalDistribution(rng, 0, .1);
for (int i = 0; i < filteredTargetsSegDiff.size(); i++) {
filteredTargetsSegDiff2d[i][0] = filteredTargetsSegDiff.get(i);
filteredTargetsSegDiff2d[i][1] = nd.sample();
}
final MixtureMultivariateNormalDistribution estimateEM0 = MultivariateNormalMixtureExpectationMaximization.estimate(filteredTargetsSegDiff2d, numComponents);
final MultivariateNormalMixtureExpectationMaximization multivariateNormalMixtureExpectationMaximization = new MultivariateNormalMixtureExpectationMaximization(filteredTargetsSegDiff2d);
try {
multivariateNormalMixtureExpectationMaximization.fit(estimateEM0);
} catch (final MaxCountExceededException | ConvergenceException | SingularMatrixException e) {
// did not converge. Include the model as it was when the exception was thrown.
return new MixtureMultivariateNormalFitResult(multivariateNormalMixtureExpectationMaximization.getFittedModel(), false, true);
}
return new MixtureMultivariateNormalFitResult(multivariateNormalMixtureExpectationMaximization.getFittedModel(), true, true);
}
use of org.apache.commons.math3.geometry.euclidean.twod.Segment in project gatk by broadinstitute.
the class GibbsSamplerCopyRatioUnitTest method testRunMCMCOnCopyRatioSegmentedGenome.
/**
* Tests Bayesian inference of a toy copy-ratio model via MCMC.
* <p>
* Recovery of input values for the variance global parameter and the segment-level mean parameters is checked.
* In particular, the mean and standard deviation of the posterior for the variance must be recovered to within
* a relative error of 1% and 5%, respectively, in 500 samples (after 250 burn-in samples have been discarded).
* </p>
* <p>
* Furthermore, the number of truth values for the segment-level means falling outside confidence intervals of
* 1-sigma, 2-sigma, and 3-sigma given by the posteriors in each segment should be roughly consistent with
* a normal distribution (i.e., ~32, ~5, and ~0, respectively; we allow for errors of 10, 5, and 2).
* Finally, the mean of the standard deviations of the posteriors for the segment-level means should be
* recovered to within a relative error of 5%.
* </p>
* <p>
* With these specifications, this unit test is not overly brittle (i.e., it should pass for a large majority
* of randomly generated data sets), but it is still brittle enough to check for correctness of the sampling
* (for example, specifying a sufficiently incorrect likelihood will cause the test to fail).
* </p>
*/
@Test
public void testRunMCMCOnCopyRatioSegmentedGenome() {
//Create new instance of the Modeller helper class, passing all quantities needed to initialize state and data.
final CopyRatioModeller modeller = new CopyRatioModeller(VARIANCE_INITIAL, MEAN_INITIAL, COVERAGES_FILE, NUM_TARGETS_PER_SEGMENT_FILE);
//Create a GibbsSampler, passing the total number of samples (including burn-in samples)
//and the model held by the Modeller.
final GibbsSampler<CopyRatioParameter, CopyRatioState, CopyRatioDataCollection> gibbsSampler = new GibbsSampler<>(NUM_SAMPLES, modeller.model);
//Run the MCMC.
gibbsSampler.runMCMC();
//Check that the statistics---i.e., the mean and standard deviation---of the variance posterior
//agree with those found by emcee/analytically to a relative error of 1% and 5%, respectively.
final double[] varianceSamples = Doubles.toArray(gibbsSampler.getSamples(CopyRatioParameter.VARIANCE, Double.class, NUM_BURN_IN));
final double variancePosteriorCenter = new Mean().evaluate(varianceSamples);
final double variancePosteriorStandardDeviation = new StandardDeviation().evaluate(varianceSamples);
Assert.assertEquals(relativeError(variancePosteriorCenter, VARIANCE_TRUTH), 0., RELATIVE_ERROR_THRESHOLD_FOR_CENTERS);
Assert.assertEquals(relativeError(variancePosteriorStandardDeviation, VARIANCE_POSTERIOR_STANDARD_DEVIATION_TRUTH), 0., RELATIVE_ERROR_THRESHOLD_FOR_STANDARD_DEVIATIONS);
//Check statistics---i.e., the mean and standard deviation---of the segment-level mean posteriors.
//In particular, check that the number of segments where the true mean falls outside confidence intervals
//is roughly consistent with a normal distribution.
final List<Double> meansTruth = loadList(MEANS_TRUTH_FILE, Double::parseDouble);
final int numSegments = meansTruth.size();
final List<SegmentMeans> meansSamples = gibbsSampler.getSamples(CopyRatioParameter.SEGMENT_MEANS, SegmentMeans.class, NUM_BURN_IN);
int numMeansOutsideOneSigma = 0;
int numMeansOutsideTwoSigma = 0;
int numMeansOutsideThreeSigma = 0;
final List<Double> meanPosteriorStandardDeviations = new ArrayList<>();
for (int segment = 0; segment < numSegments; segment++) {
final int j = segment;
final double[] meanInSegmentSamples = Doubles.toArray(meansSamples.stream().map(s -> s.get(j)).collect(Collectors.toList()));
final double meanPosteriorCenter = new Mean().evaluate(meanInSegmentSamples);
final double meanPosteriorStandardDeviation = new StandardDeviation().evaluate(meanInSegmentSamples);
meanPosteriorStandardDeviations.add(meanPosteriorStandardDeviation);
final double absoluteDifferenceFromTruth = Math.abs(meanPosteriorCenter - meansTruth.get(segment));
if (absoluteDifferenceFromTruth > meanPosteriorStandardDeviation) {
numMeansOutsideOneSigma++;
}
if (absoluteDifferenceFromTruth > 2 * meanPosteriorStandardDeviation) {
numMeansOutsideTwoSigma++;
}
if (absoluteDifferenceFromTruth > 3 * meanPosteriorStandardDeviation) {
numMeansOutsideThreeSigma++;
}
}
final double meanPosteriorStandardDeviationsMean = new Mean().evaluate(Doubles.toArray(meanPosteriorStandardDeviations));
Assert.assertEquals(numMeansOutsideOneSigma, 100 - 68, DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_1_SIGMA);
Assert.assertEquals(numMeansOutsideTwoSigma, 100 - 95, DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_2_SIGMA);
Assert.assertTrue(numMeansOutsideThreeSigma <= DELTA_NUMBER_OF_MEANS_ALLOWED_OUTSIDE_3_SIGMA);
Assert.assertEquals(relativeError(meanPosteriorStandardDeviationsMean, MEAN_POSTERIOR_STANDARD_DEVIATION_MEAN_TRUTH), 0., RELATIVE_ERROR_THRESHOLD_FOR_STANDARD_DEVIATIONS);
}
use of org.apache.commons.math3.geometry.euclidean.twod.Segment in project gatk-protected by broadinstitute.
the class ReCapSegCallerUnitTest method testMakeCalls.
@Test
public void testMakeCalls() {
final List<Target> targets = new ArrayList<>();
final List<String> columnNames = Arrays.asList("Sample");
final List<Double> coverage = new ArrayList<>();
//add amplification targets
for (int i = 0; i < 10; i++) {
final SimpleInterval interval = new SimpleInterval("chr", 100 + 2 * i, 101 + 2 * i);
targets.add(new Target(interval));
coverage.add(ParamUtils.log2(2.0));
}
//add deletion targets
for (int i = 0; i < 10; i++) {
final SimpleInterval interval = new SimpleInterval("chr", 300 + 2 * i, 301 + 2 * i);
targets.add(new Target(interval));
coverage.add(ParamUtils.log2(0.5));
}
//add targets that don't belong to a segment
for (int i = 1; i < 10; i++) {
final SimpleInterval interval = new SimpleInterval("chr", 400 + 2 * i, 401 + 2 * i);
targets.add(new Target(interval));
coverage.add(ParamUtils.log2(1.0));
}
//add obviously neutral targets with some small spread
for (int i = -5; i < 6; i++) {
final SimpleInterval interval = new SimpleInterval("chr", 500 + 2 * i, 501 + 2 * i);
targets.add(new Target(interval));
coverage.add(ParamUtils.log2(0.01 * i + 1));
}
//add spread-out targets to a neutral segment (mean near zero)
for (int i = -5; i < 6; i++) {
final SimpleInterval interval = new SimpleInterval("chr", 700 + 2 * i, 701 + 2 * i);
targets.add(new Target(interval));
coverage.add(ParamUtils.log2(0.1 * i + 1));
}
final RealMatrix coverageMatrix = new Array2DRowRealMatrix(targets.size(), 1);
coverageMatrix.setColumn(0, coverage.stream().mapToDouble(x -> x).toArray());
final int n = targets.size();
final int m = coverageMatrix.getRowDimension();
final ReadCountCollection counts = new ReadCountCollection(targets, columnNames, coverageMatrix);
List<ModeledSegment> segments = new ArrayList<>();
//amplification
segments.add(new ModeledSegment(new SimpleInterval("chr", 100, 200), 100, ParamUtils.log2(2.0)));
//deletion
segments.add(new ModeledSegment(new SimpleInterval("chr", 300, 400), 100, ParamUtils.log2(0.5)));
//neutral
segments.add(new ModeledSegment(new SimpleInterval("chr", 450, 550), 100, ParamUtils.log2(1)));
//neutral
segments.add(new ModeledSegment(new SimpleInterval("chr", 650, 750), 100, ParamUtils.log2(1)));
List<ModeledSegment> calls = ReCapSegCaller.makeCalls(counts, segments);
Assert.assertEquals(calls.get(0).getCall(), ReCapSegCaller.AMPLIFICATION_CALL);
Assert.assertEquals(calls.get(1).getCall(), ReCapSegCaller.DELETION_CALL);
Assert.assertEquals(calls.get(2).getCall(), ReCapSegCaller.NEUTRAL_CALL);
Assert.assertEquals(calls.get(3).getCall(), ReCapSegCaller.NEUTRAL_CALL);
}
Aggregations