Example use of org.apache.commons.math3.random.RandomGenerator in the project gatk by broadinstitute.
Class JointAFCRSegmenterUnitTest, method testSegmentation.
/**
 * End-to-end check of {@link JointAFCRSegmenter} on simulated data.
 *
 * <p>A three-state joint (minor allele fraction, log2 copy ratio) HMM is used to draw a hidden-state
 * chain over 10000 positions on "chr1". Each position is then turned into either a het datum
 * (with probability {@code hetProportion}) or a target datum. The segmenter is run on the two data
 * sets and the per-position values implied by its segments are compared with the truth: the mean
 * absolute error of both the minor allele fraction and the log2 copy ratio must not exceed 0.04.</p>
 *
 * <p>The random generator is seeded, so the order in which it is consumed below is significant.</p>
 */
@Test
public void testSegmentation() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));

    // probability that a datum is a het i.e. #hets / (#hets + #targets)
    final double hetProportion = 0.25;

    // ----- true model: state k pairs trueMinorAlleleFractions[k] with trueLog2CopyRatios[k] -----
    final double[] trueMinorAlleleFractions = { 0.12, 0.32, 0.5 };
    final double[] trueLog2CopyRatios = { -2.0, 0.0, 1.7 };
    final List<Double> trueWeights = Arrays.asList(0.2, 0.5, 0.3);
    final List<AFCRHiddenState> trueJointStates = new ArrayList<>();
    for (int k = 0; k < trueLog2CopyRatios.length; k++) {
        trueJointStates.add(new AFCRHiddenState(trueMinorAlleleFractions[k], trueLog2CopyRatios[k]));
    }
    final double trueMemoryLength = 1e5;
    final double trueCauchyWidth = 0.2;
    final AlleleFractionGlobalParameters trueAFParams = new AlleleFractionGlobalParameters(1.0, 0.01, 0.01);
    final JointAFCRHMM trueJointModel = new JointAFCRHMM(trueJointStates, trueWeights, trueMemoryLength,
            trueAFParams, AllelicPanelOfNormals.EMPTY_PON, trueCauchyWidth);

    // ----- joint truth: positions, hidden-state chain, and the per-position true values -----
    final int chainLength = 10000;
    final List<SimpleInterval> positions =
            CopyRatioSegmenterUnitTest.randomPositions("chr1", chainLength, rng, trueMemoryLength / 4);
    final List<Integer> trueHiddenStates = trueJointModel.generateHiddenStateChain(positions);
    final List<AFCRHiddenState> trueAFCRSequence = trueHiddenStates.stream()
            .map(trueJointModel::getHiddenStateValue)
            .collect(Collectors.toList());
    final double[] trueCopyRatioSequence = trueAFCRSequence.stream()
            .mapToDouble(state -> state.getLog2CopyRatio())
            .toArray();
    final double[] trueAlleleFractionSequence = trueAFCRSequence.stream()
            .mapToDouble(state -> state.getMinorAlleleFraction())
            .toArray();

    // ----- separate af and cr data: every position becomes exactly one het or one target -----
    final GammaDistribution biasGenerator = AlleleFractionSegmenterUnitTest.getGammaDistribution(trueAFParams, rng);
    final double outlierProbability = trueAFParams.getOutlierProbability();
    final AllelicCountCollection afData = new AllelicCountCollection();
    final List<Target> crTargets = new ArrayList<>();
    final List<Double> crData = new ArrayList<>();
    for (int idx = 0; idx < positions.size(); idx++) {
        final SimpleInterval site = positions.get(idx);
        final AFCRHiddenState truth = trueAFCRSequence.get(idx);
        final boolean isHet = rng.nextDouble() < hetProportion;
        if (isHet) {
            afData.add(AlleleFractionSegmenterUnitTest.generateAllelicCount(
                    truth.getMinorAlleleFraction(), site, rng, biasGenerator, outlierProbability));
        } else {
            crTargets.add(new Target(site));
            crData.add(CopyRatioSegmenterUnitTest.generateData(trueCauchyWidth, truth.getLog2CopyRatio(), rng));
        }
    }
    final double[] crValues = crData.stream().mapToDouble(Double::doubleValue).toArray();
    final ReadCountCollection rcc =
            new ReadCountCollection(crTargets, Arrays.asList("SAMPLE"), new Array2DRowRealMatrix(crValues));

    // ----- run the segmenter -----
    final int initialNumCRStates = 20;
    final int initialNumAFStates = 20;
    final JointAFCRSegmenter segmenter = JointAFCRSegmenter.createJointSegmenter(
            initialNumCRStates, rcc, initialNumAFStates, afData, AllelicPanelOfNormals.EMPTY_PON);
    final TargetCollection<SimpleInterval> tc = new HashedListTargetCollection<>(positions);
    final List<Pair<SimpleInterval, AFCRHiddenState>> segmentation = segmenter.findSegments();

    // Wrap every (interval, state) pair as a modeled segment with error-free posterior summaries.
    final List<ACNVModeledSegment> jointSegments = new ArrayList<>();
    for (final Pair<SimpleInterval, AFCRHiddenState> intervalAndState : segmentation) {
        final AFCRHiddenState inferred = intervalAndState.getRight();
        final PosteriorSummary crSummary = PerformJointSegmentation.errorlessPosterior(inferred.getLog2CopyRatio());
        final PosteriorSummary afSummary = PerformJointSegmentation.errorlessPosterior(inferred.getMinorAlleleFraction());
        jointSegments.add(new ACNVModeledSegment(intervalAndState.getLeft(), crSummary, afSummary));
    }

    // Expand each segment's value once per position that tc reports inside the segment's interval.
    final List<Double> expandedCopyRatios = new ArrayList<>();
    final List<Double> expandedMinorFractions = new ArrayList<>();
    for (final ACNVModeledSegment segment : jointSegments) {
        final int positionsInSegment = tc.targetCount(segment.getInterval());
        expandedCopyRatios.addAll(
                Collections.nCopies(positionsInSegment, segment.getSegmentMeanPosteriorSummary().getCenter()));
        expandedMinorFractions.addAll(
                Collections.nCopies(positionsInSegment, segment.getMinorAlleleFractionPosteriorSummary().getCenter()));
    }
    final double[] segmentCopyRatios = expandedCopyRatios.stream().mapToDouble(Double::doubleValue).toArray();
    final double[] segmentMinorFractions = expandedMinorFractions.stream().mapToDouble(Double::doubleValue).toArray();

    // ----- mean absolute error against the truth -----
    final double[] minorFractionResiduals = MathArrays.ebeSubtract(trueAlleleFractionSequence, segmentMinorFractions);
    final double averageMinorFractionError = Arrays.stream(minorFractionResiduals).map(Math::abs).average().getAsDouble();
    final double[] copyRatioResiduals = MathArrays.ebeSubtract(trueCopyRatioSequence, segmentCopyRatios);
    final double averageCopyRatioError = Arrays.stream(copyRatioResiduals).map(Math::abs).average().getAsDouble();

    Assert.assertEquals(averageMinorFractionError, 0, 0.04);
    Assert.assertEquals(averageCopyRatioError, 0, 0.04);
}
Example use of org.apache.commons.math3.random.RandomGenerator in the project gatk-protected by broadinstitute.
Class AlleleFractionSegmenterUnitTest, method testChromosomesOnDifferentSegments.
/**
 * Verifies that the segments produced for data spread over three contigs mention all three contigs.
 *
 * <p>100 random positions are generated on each of "chr1", "chr2" and "chr3", and every position is
 * given the same true minor allele fraction, so the only thing that distinguishes the data is the
 * contig. The test passes when the modeled segments cover exactly three distinct contigs.</p>
 */
@Test
public void testChromosomesOnDifferentSegments() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));
    final double[] trueMinorAlleleFractions = new double[] { 0.12, 0.32, 0.5 };
    final double trueMemoryLength = 1e5;
    final AlleleFractionGlobalParameters trueParams = new AlleleFractionGlobalParameters(1.0, 0.01, 0.01);

    // randomly set positions, contig by contig (chr1 first, then chr2, then chr3)
    final int chainLength = 100;
    final double quarterMemoryLength = trueMemoryLength / 4;
    final List<SimpleInterval> positions =
            CopyRatioSegmenterUnitTest.randomPositions("chr1", chainLength, rng, quarterMemoryLength);
    final List<SimpleInterval> chr2Positions =
            CopyRatioSegmenterUnitTest.randomPositions("chr2", chainLength, rng, quarterMemoryLength);
    positions.addAll(chr2Positions);
    final List<SimpleInterval> chr3Positions =
            CopyRatioSegmenterUnitTest.randomPositions("chr3", chainLength, rng, quarterMemoryLength);
    positions.addAll(chr3Positions);

    // fix everything to the same state 2
    final int trueState = 2;
    final double commonMinorAlleleFraction = trueMinorAlleleFractions[trueState];
    final List<Double> minorAlleleFractionSequence = Collections.nCopies(positions.size(), commonMinorAlleleFraction);
    final AllelicCountCollection counts = generateCounts(minorAlleleFractionSequence, positions, rng, trueParams);

    final AlleleFractionSegmenter segmenter = new AlleleFractionSegmenter(10, counts, AllelicPanelOfNormals.EMPTY_PON);
    final List<ModeledSegment> segments = segmenter.getModeledSegments();

    // check that each chromosome has at least one segment
    final long distinctContigs = segments.stream()
            .map(ModeledSegment::getContig)
            .distinct()
            .count();
    final int numDifferentContigsInSegments = (int) distinctContigs;
    Assert.assertEquals(numDifferentContigsInSegments, 3);
}
Example use of org.apache.commons.math3.random.RandomGenerator in the project gatk-protected by broadinstitute.
Class AlleleFractionSegmenterUnitTest, method generateAllelicCount.
/**
 * Simulates the allelic count at a single site from 100 reads.
 *
 * <p>Random draws happen in a fixed order: the bias sample, a coin flip choosing which allele is the
 * minor one, the outlier check, (only for outliers) a uniform alt probability, and finally the
 * binomial number of alt reads.</p>
 *
 * @param minorFraction      true minor allele fraction at the site
 * @param position           interval of the site
 * @param rng                source of randomness for the coin flip, outlier check and binomial draw
 * @param biasGenerator      distribution from which the bias is sampled
 * @param outlierProbability probability that the site's alt probability is replaced by a uniform draw
 * @return the simulated allelic count at {@code position}
 */
protected static AllelicCount generateAllelicCount(final double minorFraction, final SimpleInterval position, final RandomGenerator rng, final GammaDistribution biasGenerator, final double outlierProbability) {
    final int totalReads = 100;
    final double allelicBias = biasGenerator.sample();

    // flip a coin to decide alt minor (alt fraction = minor fraction) or ref minor (alt fraction = 1 - minor fraction)
    final boolean altIsMinor = rng.nextDouble() < 0.5;
    final double altFraction = altIsMinor ? minorFraction : 1 - minorFraction;

    // the probability of an alt read is the alt fraction modified by the bias or, in the case of an outlier, random
    final boolean isOutlier = rng.nextDouble() < outlierProbability;
    final double pAlt;
    if (isOutlier) {
        pAlt = rng.nextDouble();
    } else {
        pAlt = altFraction / (altFraction + (1 - altFraction) * allelicBias);
    }

    final BinomialDistribution altReadDistribution = new BinomialDistribution(rng, totalReads, pAlt);
    final int altReads = altReadDistribution.sample();
    final int refReads = totalReads - altReads;
    // NOTE(review): counts are passed as (alt, ref); confirm this matches the AllelicCount constructor's parameter order.
    return new AllelicCount(position, altReads, refReads);
}
Example use of org.apache.commons.math3.random.RandomGenerator in the project gatk-protected by broadinstitute.
Class CoverageDropoutDetector, method retrieveGaussianMixtureModelForFilteredTargets.
/** <p>Produces a Gaussian mixture model based on the difference between targets and segment means.</p>
 * <p>Filters targets to populations where more than the minProportion lie in a single segment.</p>
 * <p>If fewer targets than {@code numComponents} pass filtering, no fit is attempted and the returned result
 * carries a {@code null} model (the result object itself is never {@code null}). Please note that in these cases,
 * in the rest of this class, we use this to assume that a GMM is not a good model.</p>
 * <p>If the EM fit throws because it did not converge, the result carries the model as it was when the
 * exception was thrown and is flagged as attempted but not converged.</p>
 *
 * @param segments -- segments with segment mean in log2 copy ratio space
 * @param targets -- targets with a log2 copy ratio estimate
 * @param minProportion -- minimum proportion of all targets that a given segment must have in order to be used
 *                      in the evaluation
 * @param numComponents -- number of components to use in the GMM.  Usually, this is 2.
 * @return never {@code null}.  Fitting result with indications whether it converged or was even attempted.
 */
private MixtureMultivariateNormalFitResult retrieveGaussianMixtureModelForFilteredTargets(final List<ModeledSegment> segments, final TargetCollection<ReadCountRecord.SingleSampleRecord> targets, final double minProportion, final int numComponents) {
    // For each target in a segment that contains enough targets, normalize the difference against the segment mean
    //  and collapse the filtered targets into the copy ratio estimates.
    final List<Double> filteredTargetsSegDiff = getNumProbeFilteredTargetList(segments, targets, minProportion);
    if (filteredTargetsSegDiff.size() < numComponents) {
        // Too few data points: no model, not converged, fit not attempted.
        return new MixtureMultivariateNormalFitResult(null, false, false);
    }

    // Assume that Apache Commons wants data points in the first dimension.
    // The second dimension has length 2 (instead of 1) only to work around the Apache Commons API, which will
    //  throw an exception if the length of the second dimension is < 2.  Column 0 holds the real data; column 1
    //  is filled with uncorrelated Gaussian noise (mean 0, sd 0.1) drawn from a generator seeded with RANDOM_SEED.
    final double[][] filteredTargetsSegDiff2d = new double[filteredTargetsSegDiff.size()][2];
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    final NormalDistribution nd = new NormalDistribution(rng, 0, .1);
    for (int i = 0; i < filteredTargetsSegDiff.size(); i++) {
        filteredTargetsSegDiff2d[i][0] = filteredTargetsSegDiff.get(i);
        filteredTargetsSegDiff2d[i][1] = nd.sample();
    }

    // NOTE(review): the initial estimate is built outside the try block, so any exception it throws
    //  propagates to the caller instead of producing a "not converged" result -- confirm this is intended.
    final MixtureMultivariateNormalDistribution estimateEM0 = MultivariateNormalMixtureExpectationMaximization.estimate(filteredTargetsSegDiff2d, numComponents);
    final MultivariateNormalMixtureExpectationMaximization multivariateNormalMixtureExpectationMaximization = new MultivariateNormalMixtureExpectationMaximization(filteredTargetsSegDiff2d);

    try {
        multivariateNormalMixtureExpectationMaximization.fit(estimateEM0);
    } catch (final MaxCountExceededException | ConvergenceException | SingularMatrixException e) {
        // did not converge.  Include the model as it was when the exception was thrown.
        return new MixtureMultivariateNormalFitResult(multivariateNormalMixtureExpectationMaximization.getFittedModel(), false, true);
    }
    // Fit completed without throwing: report the fitted model as converged and attempted.
    return new MixtureMultivariateNormalFitResult(multivariateNormalMixtureExpectationMaximization.getFittedModel(), true, true);
}
Example use of org.apache.commons.math3.random.RandomGenerator in the project gatk-protected by broadinstitute.
Class AdaptiveMetropolisSamplerUnitTest, method testBeta.
/**
 * Samples from Beta(a, b) for every combination of a, b in {10, 20, 30} using
 * {@link AdaptiveMetropolisSampler} restricted to [0, 1], and compares the sample mean and
 * (unbiased) sample variance with the closed-form mean a / (a + b) and
 * variance ab / ((a + b)^2 (a + b + 1)).
 *
 * <p>A single seeded generator is shared by all nine runs, so the loop order matters.</p>
 */
@Test
public void testBeta() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    final double[] shapeParameters = { 10, 20, 30 };
    for (final double a : shapeParameters) {
        for (final double b : shapeParameters) {
            final double shapeSum = a + b;
            final double theoreticalMean = a / shapeSum;
            final double theoreticalVariance = a * b / (shapeSum * shapeSum * (shapeSum + 1));

            //Note: this is the theoretical standard deviation of the sample mean given uncorrelated
            //samples.  The sample mean will have a greater variance here because samples are correlated.
            final double standardDeviationOfMean = Math.sqrt(theoreticalVariance / NUM_SAMPLES);

            // unnormalized log density of Beta(a, b)
            final Function<Double, Double> logPDF = x -> {
                return (a - 1) * Math.log(x) + (b - 1) * Math.log(1 - x);
            };
            final AdaptiveMetropolisSampler sampler = new AdaptiveMetropolisSampler(INITIAL_BETA_GUESS, INITIAL_STEP_SIZE, 0, 1);
            final List<Double> samples = sampler.sample(rng, logPDF, NUM_SAMPLES, NUM_BURN_IN_STEPS);

            final double sampleMean = samples.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
            final double sampleMeanSquare = samples.stream().mapToDouble(s -> s * s).average().getAsDouble();
            // population variance rescaled by n / (n - 1) to obtain the unbiased estimate
            final double populationVariance = sampleMeanSquare - sampleMean * sampleMean;
            final double sampleVariance = populationVariance * NUM_SAMPLES / (NUM_SAMPLES - 1);

            Assert.assertEquals(sampleMean, theoreticalMean, 10 * standardDeviationOfMean);
            Assert.assertEquals(sampleVariance, theoreticalVariance, 10e-4);
        }
    }
}
Aggregations