Use of org.apache.commons.math3.stat.Frequency in project GDSC-SMLM by aherbert.
Class EMGainAnalysis, method fit.
/**
 * Fit the EM-gain distribution (Gaussian * Gamma).
 *
 * @param h The distribution
 */
private void fit(int[] h) {
    final int[] limits = limits(h);
    final double[] x = getX(limits);
    final double[] y = getY(h, limits);
    Plot2 plot = new Plot2(TITLE, "ADU", "Frequency");
    double yMax = Maths.max(y);
    plot.setLimits(limits[0], limits[1], 0, yMax);
    plot.setColor(Color.black);
    plot.addPoints(x, y, Plot2.DOT);
    Utils.display(TITLE, plot);

    // Estimate remaining parameters.
    // Assuming a gamma_distribution(shape, scale) then mean = shape * scale:
    //   scale = gain
    //   shape = photons = mean / gain
    double mean = getMean(h) - bias;
    // Note: if the bias is too high then the mean will be negative. Just move the bias.
    while (mean < 0) {
        bias -= 1;
        mean += 1;
    }
    double photons = mean / gain;
    if (simulate)
        Utils.log("Simulated bias=%d, gain=%s, noise=%s, photons=%s", (int) _bias, Utils.rounded(_gain),
                Utils.rounded(_noise), Utils.rounded(_photons));
    Utils.log("Estimate bias=%d, gain=%s, noise=%s, photons=%s", (int) bias, Utils.rounded(gain),
            Utils.rounded(noise), Utils.rounded(photons));

    final int max = (int) x[x.length - 1];
    double[] g = pdf(max, photons, gain, noise, (int) bias);
    plot.setColor(Color.blue);
    plot.addPoints(x, g, Plot2.LINE);
    Utils.display(TITLE, plot);

    // Perform a fit
    CustomPowellOptimizer o = new CustomPowellOptimizer(1e-6, 1e-16, 1e-6, 1e-16);
    double[] startPoint = new double[] { photons, gain, noise, bias };
    int maxEval = 3000;
    String[] paramNames = { "Photons", "Gain", "Noise", "Bias" };
    // Set bounds
    double[] lower = new double[] { 0, 0.5 * gain, 0, bias - noise };
    double[] upper = new double[] { 2 * photons, 2 * gain, gain, bias + noise };

    // Restart until converged.
    // TODO - Maybe fix this with a better optimiser. This needs to be tested on real data.
    PointValuePair solution = null;
    for (int iter = 0; iter < 3; iter++) {
        IJ.showStatus("Fitting histogram ... Iteration " + iter);
        try {
            // Basic Powell optimiser
            MultivariateFunction fun = getFunction(limits, y, max, maxEval);
            PointValuePair optimum = o.optimize(new MaxEval(maxEval), new ObjectiveFunction(fun),
                    GoalType.MINIMIZE,
                    new InitialGuess((solution == null) ? startPoint : solution.getPointRef()));
            if (solution == null || optimum.getValue() < solution.getValue()) {
                double[] point = optimum.getPointRef();
                // Check the bounds
                for (int i = 0; i < point.length; i++) {
                    if (point[i] < lower[i] || point[i] > upper[i]) {
                        throw new RuntimeException(
                                String.format("Fit out of estimated range: %s %f", paramNames[i], point[i]));
                    }
                }
                solution = optimum;
            }
        } catch (Exception e) {
            IJ.log("Powell error: " + e.getMessage());
            if (e instanceof TooManyEvaluationsException) {
                maxEval = (int) (maxEval * 1.5);
            }
        }
        try {
            // Bounded Powell optimiser
            MultivariateFunction fun = getFunction(limits, y, max, maxEval);
            MultivariateFunctionMappingAdapter adapter = new MultivariateFunctionMappingAdapter(fun, lower, upper);
            PointValuePair optimum = o.optimize(new MaxEval(maxEval), new ObjectiveFunction(adapter),
                    GoalType.MINIMIZE,
                    new InitialGuess(adapter.boundedToUnbounded((solution == null) ? startPoint : solution.getPointRef())));
            double[] point = adapter.unboundedToBounded(optimum.getPointRef());
            optimum = new PointValuePair(point, optimum.getValue());
            if (solution == null || optimum.getValue() < solution.getValue()) {
                solution = optimum;
            }
        } catch (Exception e) {
            IJ.log("Bounded Powell error: " + e.getMessage());
            if (e instanceof TooManyEvaluationsException) {
                maxEval = (int) (maxEval * 1.5);
            }
        }
    }
    IJ.showStatus("");
    IJ.showProgress(1);

    if (solution == null) {
        Utils.log("Failed to fit the distribution");
        return;
    }
    double[] point = solution.getPointRef();
    photons = point[0];
    gain = point[1];
    noise = point[2];
    bias = (int) Math.round(point[3]);
    String label = String.format("Fitted bias=%d, gain=%s, noise=%s, photons=%s", (int) bias,
            Utils.rounded(gain), Utils.rounded(noise), Utils.rounded(photons));
    Utils.log(label);

    if (simulate) {
        Utils.log("Relative Error bias=%s, gain=%s, noise=%s, photons=%s",
                Utils.rounded(relativeError(bias, _bias)), Utils.rounded(relativeError(gain, _gain)),
                Utils.rounded(relativeError(noise, _noise)), Utils.rounded(relativeError(photons, _photons)));
    }

    // Show the PoissonGammaGaussian approximation
    double[] f = null;
    if (showApproximation) {
        f = new double[x.length];
        PoissonGammaGaussianFunction fun = new PoissonGammaGaussianFunction(1.0 / gain, noise);
        final double expected = photons * gain;
        for (int i = 0; i < f.length; i++) {
            f[i] = fun.likelihood(x[i] - bias, expected);
            //System.out.printf("x=%d, g=%f, f=%f, error=%f\n", (int) x[i], g[i], f[i],
            //        gdsc.smlm.fitting.utils.DoubleEquality.relativeError(g[i], f[i]));
        }
        yMax = Maths.maxDefault(yMax, f);
    }

    // Replot
    g = pdf(max, photons, gain, noise, (int) bias);
    plot = new Plot2(TITLE, "ADU", "Frequency");
    plot.setLimits(limits[0], limits[1], 0, yMax * 1.05);
    plot.setColor(Color.black);
    plot.addPoints(x, y, Plot2.DOT);
    plot.setColor(Color.red);
    plot.addPoints(x, g, Plot2.LINE);
    plot.addLabel(0, 0, label);
    if (showApproximation) {
        plot.setColor(Color.blue);
        plot.addPoints(x, f, Plot2.LINE);
    }
    Utils.display(TITLE, plot);
}
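The "bounded Powell" branch above is the stock Commons Math pattern: wrap the objective in a MultivariateFunctionMappingAdapter so an unconstrained optimiser can search a box-constrained space, then map the result back into bounded coordinates. Below is a minimal, self-contained sketch of that pattern, using the standard PowellOptimizer in place of GDSC's CustomPowellOptimizer and a toy quadratic in place of the plugin's histogram objective (both substitutions are mine, for illustration only):

import org.apache.commons.math3.analysis.MultivariateFunction;
import org.apache.commons.math3.optim.InitialGuess;
import org.apache.commons.math3.optim.MaxEval;
import org.apache.commons.math3.optim.PointValuePair;
import org.apache.commons.math3.optim.nonlinear.scalar.GoalType;
import org.apache.commons.math3.optim.nonlinear.scalar.MultivariateFunctionMappingAdapter;
import org.apache.commons.math3.optim.nonlinear.scalar.ObjectiveFunction;
import org.apache.commons.math3.optim.nonlinear.scalar.noderiv.PowellOptimizer;

public class BoundedPowellSketch {
    public static void main(String[] args) {
        // Toy objective: a quadratic bowl centred at (3, 0.5). This stands in for
        // the histogram score returned by getFunction(...) in the plugin.
        MultivariateFunction fun = p -> Math.pow(p[0] - 3, 2) + Math.pow(p[1] - 0.5, 2);

        double[] lower = { 0, 0 };
        double[] upper = { 10, 1 };

        // The adapter maps the bounded parameter box onto an unbounded space so a
        // plain Powell optimiser can search it without violating the bounds.
        MultivariateFunctionMappingAdapter adapter = new MultivariateFunctionMappingAdapter(fun, lower, upper);

        PowellOptimizer o = new PowellOptimizer(1e-6, 1e-16);
        double[] start = { 5, 0.5 };
        PointValuePair optimum = o.optimize(new MaxEval(3000), new ObjectiveFunction(adapter),
                GoalType.MINIMIZE, new InitialGuess(adapter.boundedToUnbounded(start)));

        // The optimiser works in unbounded coordinates; map the result back.
        double[] point = adapter.unboundedToBounded(optimum.getPoint());
        System.out.printf("min at (%.3f, %.3f), value %.3g%n", point[0], point[1], optimum.getValue());
    }
}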
Use of org.apache.commons.math3.stat.Frequency in project GDSC-SMLM by aherbert.
Class PCPALMMolecules, method performDistanceAnalysis.
private void performDistanceAnalysis(double[][] intraHist, int p99) {
    // We want to know the fraction of distances between molecules at the 99th percentile
    // that are intra- rather than inter-molecule.
    // Do single linkage clustering of the closest pairs at this distance and count the
    // number of links that are inter and intra.

    // Convert molecules for clustering
    ArrayList<ClusterPoint> points = new ArrayList<ClusterPoint>(molecules.size());
    for (Molecule m : molecules)
        // Precision was used to store the molecule ID
        points.add(ClusterPoint.newClusterPoint((int) m.precision, m.x, m.y, m.photons));
    ClusteringEngine engine = new ClusteringEngine(Prefs.getThreads(),
            ClusteringAlgorithm.PARTICLE_SINGLE_LINKAGE, new IJTrackProgress());
    IJ.showStatus("Clustering to check inter-molecule distances");
    engine.setTrackJoins(true);
    ArrayList<Cluster> clusters = engine.findClusters(points, intraHist[0][p99]);
    IJ.showStatus("");
    if (clusters != null) {
        double[] intraIdDistances = engine.getIntraIdDistances();
        double[] interIdDistances = engine.getInterIdDistances();
        int all = interIdDistances.length + intraIdDistances.length;
        log(" * Fraction of inter-molecule particle linkage @ %s nm = %s %%",
                Utils.rounded(intraHist[0][p99], 4),
                (all > 0) ? Utils.rounded(100.0 * interIdDistances.length / all, 4) : "0");

        // Show a double cumulative histogram plot
        double[][] intraIdHist = Maths.cumulativeHistogram(intraIdDistances, false);
        double[][] interIdHist = Maths.cumulativeHistogram(interIdDistances, false);

        // Plot
        String title = TITLE + " molecule linkage distance";
        Plot2 plot = new Plot2(title, "Distance", "Frequency", intraIdHist[0], intraIdHist[1]);
        double max = (intraIdHist[1].length > 0) ? intraIdHist[1][intraIdHist[1].length - 1] : 0;
        if (interIdHist[1].length > 0)
            max = FastMath.max(max, interIdHist[1][interIdHist[1].length - 1]);
        plot.setLimits(0, intraIdHist[0][intraIdHist[0].length - 1], 0, max);
        plot.setColor(Color.blue);
        plot.addPoints(interIdHist[0], interIdHist[1], Plot2.LINE);
        plot.setColor(Color.black);
        Utils.display(title, plot);
    } else {
        log("Aborted clustering to check inter-molecule distances");
    }
}
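Maths.cumulativeHistogram is a GDSC utility. As a rough sketch, under the assumption that it returns {sorted values, cumulative counts} with the boolean toggling normalisation to [0, 1], the equivalent computation is just a sort plus a running count (ties get successive counts, which is adequate for plotting):

import java.util.Arrays;

public class CumulativeHistogramSketch {
    // Returns {sortedValues, cumulativeCounts}: for each distance, the number of
    // distances less than or equal to it. This is an assumed approximation of the
    // plugin's Maths.cumulativeHistogram(values, false), not the library code itself.
    static double[][] cumulativeHistogram(double[] values) {
        double[] x = values.clone();
        Arrays.sort(x);
        double[] y = new double[x.length];
        for (int i = 0; i < x.length; i++)
            y[i] = i + 1; // count of values <= x[i]
        return new double[][] { x, y };
    }

    public static void main(String[] args) {
        double[][] h = cumulativeHistogram(new double[] { 3.2, 1.1, 2.0, 2.0, 5.7 });
        for (int i = 0; i < h[0].length; i++)
            System.out.printf("%.1f -> %.0f%n", h[0][i], h[1][i]);
    }
}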
Use of org.apache.commons.math3.stat.Frequency in project GDSC-SMLM by aherbert.
Class PCPALMClusters, method run.
/*
 * (non-Javadoc)
 *
 * @see ij.plugin.PlugIn#run(java.lang.String)
 */
public void run(String arg) {
    SMLMUsageTracker.recordPlugin(this.getClass(), arg);
    if (!showDialog())
        return;
    PCPALMMolecules.logSpacer();
    Utils.log(TITLE);
    PCPALMMolecules.logSpacer();
    long start = System.currentTimeMillis();

    HistogramData histogramData;
    if (fileInput) {
        histogramData = loadHistogram(histogramFile);
    } else {
        histogramData = doClustering();
    }
    if (histogramData == null)
        return;
    float[][] hist = histogramData.histogram;

    // Create a histogram of the cluster sizes
    String title = TITLE + " Molecules/cluster";
    String xTitle = "Molecules/cluster";
    String yTitle = "Frequency";

    // Create the data required for fitting and plotting
    float[] xValues = Utils.createHistogramAxis(hist[0]);
    float[] yValues = Utils.createHistogramValues(hist[1]);

    // Plot the histogram
    float yMax = Maths.max(yValues);
    Plot2 plot = new Plot2(title, xTitle, yTitle, xValues, yValues);
    if (xValues.length > 0) {
        double xPadding = 0.05 * (xValues[xValues.length - 1] - xValues[0]);
        plot.setLimits(xValues[0] - xPadding, xValues[xValues.length - 1] + xPadding, 0, yMax * 1.05);
    }
    Utils.display(title, plot);

    HistogramData noiseData = loadNoiseHistogram(histogramData);
    if (noiseData != null) {
        if (subtractNoise(histogramData, noiseData)) {
            // Update the histogram
            title += " (noise subtracted)";
            xValues = Utils.createHistogramAxis(hist[0]);
            yValues = Utils.createHistogramValues(hist[1]);
            yMax = Maths.max(yValues);
            plot = new Plot2(title, xTitle, yTitle, xValues, yValues);
            if (xValues.length > 0) {
                double xPadding = 0.05 * (xValues[xValues.length - 1] - xValues[0]);
                plot.setLimits(xValues[0] - xPadding, xValues[xValues.length - 1] + xPadding, 0, yMax * 1.05);
            }
            Utils.display(title, plot);

            // Automatically save
            if (autoSave) {
                String newFilename = Utils.replaceExtension(histogramData.filename, ".noise.tsv");
                if (saveHistogram(histogramData, newFilename)) {
                    Utils.log("Saved noise-subtracted histogram to " + newFilename);
                }
            }
        }
    }

    // Fit the histogram
    double[] fitParameters = fitBinomial(histogramData);
    if (fitParameters != null) {
        // Add the binomial to the histogram
        int n = (int) fitParameters[0];
        double p = fitParameters[1];
        Utils.log("Optimal fit : N=%d, p=%s", n, Utils.rounded(p));
        BinomialDistribution dist = new BinomialDistribution(n, p);

        // A zero-truncated binomial was fitted.
        // pi is the adjustment factor for the probability density.
        double pi = 1 / (1 - dist.probability(0));

        if (!fileInput) {
            // Calculate the estimated number of clusters from the observed molecules:
            // Actual = (Observed / p-value) / N
            final double actual = (nMolecules / p) / n;
            Utils.log("Estimated number of clusters : (%d / %s) / %d = %s", nMolecules, Utils.rounded(p), n,
                    Utils.rounded(actual));
        }

        double[] x = new double[n + 2];
        double[] y = new double[n + 2];
        // Scale the values to match those on the histogram
        final double normalisingFactor = count * pi;
        for (int i = 0; i <= n; i++) {
            x[i] = i + 0.5;
            y[i] = dist.probability(i) * normalisingFactor;
        }
        x[n + 1] = n + 1.5;
        y[n + 1] = 0;

        // Redraw the plot since the limits may have changed
        plot = new Plot2(title, xTitle, yTitle, xValues, yValues);
        double xPadding = 0.05 * (xValues[xValues.length - 1] - xValues[0]);
        plot.setLimits(xValues[0] - xPadding, xValues[xValues.length - 1] + xPadding, 0,
                Maths.maxDefault(yMax, y) * 1.05);
        plot.setColor(Color.magenta);
        plot.addPoints(x, y, Plot2.LINE);
        plot.addPoints(x, y, Plot2.CIRCLE);
        plot.setColor(Color.black);
        Utils.display(title, plot);
    }

    double seconds = (System.currentTimeMillis() - start) / 1000.0;
    String msg = TITLE + " complete : " + seconds + "s";
    IJ.showStatus(msg);
    Utils.log(msg);
    return;
}
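The pi factor above is the standard zero-truncated binomial correction: clusters of size zero cannot be observed, so the fitted probabilities P(k) for k >= 1 are rescaled by pi = 1 / (1 - P(0)) to sum to one. A small runnable sketch using the same Commons Math BinomialDistribution; the n, p and count values are hypothetical stand-ins, not outputs of the plugin:

import org.apache.commons.math3.distribution.BinomialDistribution;

public class TruncatedBinomialSketch {
    public static void main(String[] args) {
        int n = 6;          // hypothetical cluster size from the fit
        double p = 0.3;     // hypothetical per-molecule probability from the fit
        long count = 1000;  // hypothetical total number of observed clusters

        BinomialDistribution dist = new BinomialDistribution(n, p);

        // Clusters of size zero are unobservable, so the fit is to a
        // zero-truncated binomial; pi rescales the remaining mass back to 1.
        double pi = 1 / (1 - dist.probability(0));

        double sum = 0;
        for (int k = 1; k <= n; k++) {
            double expected = dist.probability(k) * count * pi;
            System.out.printf("k=%d expected=%.1f%n", k, expected);
            sum += dist.probability(k) * pi;
        }
        System.out.printf("truncated probabilities sum to %.6f%n", sum); // ~1.0
    }
}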
Use of org.apache.commons.math3.stat.Frequency in project gatk by broadinstitute.
Class AlleleFrequencyCalculator, method getLog10PNonRef.
//TODO: this should be a class of static methods once the old AFCalculator is gone.
/**
 * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
 *
 * @param vc the VariantContext holding the alleles and sample information. The VariantContext
 *           must have at least 1 alternative allele
 * @param refSnpIndelPseudocounts a total hack. A length-3 vector containing Dirichlet prior pseudocounts to
 *           be given to ref, alt SNP, and alt indel alleles. Hack won't be necessary when we destroy the old AF calculators
 * @return result (for programming convenience)
 */
@Override
public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int defaultPloidy,
        final int maximumAlternativeAlleles, final double[] refSnpIndelPseudocounts) {
    Utils.nonNull(vc, "VariantContext cannot be null");
    final int numAlleles = vc.getNAlleles();
    final List<Allele> alleles = vc.getAlleles();
    Utils.validateArg(numAlleles > 1,
            () -> "VariantContext has only a single reference allele, but getLog10PNonRef requires at least one alt allele: " + vc);
    final double[] priorPseudocounts = alleles.stream()
            .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() > 1 ? snpPseudocount : indelPseudocount))
            .toArray();

    double[] alleleCounts = new double[numAlleles];
    // log10(1/numAlleles)
    final double flatLog10AlleleFrequency = -MathUtils.log10(numAlleles);
    double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);

    double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY;
    while (alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE) {
        final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies);
        alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts))
                .map(Math::abs).max().getAsDouble();
        alleleCounts = newAlleleCounts;
        final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts);

        // The first iteration uses a flat prior in order to avoid a local minimum where the prior plus
        // no pseudocounts gives such a low effective allele frequency that it overwhelms the genotype
        // likelihood of a real variant. Basically, we want a chance to get non-zero pseudocounts before
        // using a prior that's biased against a variant.
        log10AlleleFrequencies = new Dirichlet(posteriorPseudocounts).log10MeanWeights();
    }

    double[] log10POfZeroCountsByAllele = new double[numAlleles];
    double log10PNoVariant = 0;

    for (final Genotype g : vc.getGenotypes()) {
        if (!g.hasLikelihoods()) {
            continue;
        }
        final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);

        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

        // the total probability
        log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];

        // per allele non-log space probabilities of zero counts for this sample:
        // for each allele calculate the total probability of genotypes containing at least one copy of the allele
        final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles];
        Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY);

        for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
            final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
            glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) ->
                    log10ProbabilityOfNonZeroAltAlleles[alleleIndex] = MathUtils.log10SumLog10(
                            log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior));
        }

        for (int allele = 0; allele < numAlleles; allele++) {
            // if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN
            if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) {
                log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY;
            } else {
                log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]);
            }
        }
    }

    // Unfortunately AFCalculationResult expects integers for the MLE. We really should emit the
    // non-integer EM values, which are valuable (e.g. in CombineGVCFs) as the sufficient statistics
    // of the Dirichlet posterior on allele frequencies.
    final int[] integerAlleleCounts = Arrays.stream(alleleCounts).mapToInt(x -> (int) Math.round(x)).toArray();
    final int[] integerAltAlleleCounts = Arrays.copyOfRange(integerAlleleCounts, 1, numAlleles);

    // skip the ref allele (index 0)
    final Map<Allele, Double> log10PRefByAllele = IntStream.range(1, numAlleles).boxed()
            .collect(Collectors.toMap(alleles::get, a -> log10POfZeroCountsByAllele[a]));

    // We compute posteriors here and don't have the same prior that AFCalculationResult expects.
    // Therefore, we give it our posterior as its "likelihood" along with a flat dummy prior.
    //TODO: HACK must be negative for AFCalcResult
    final double[] dummyFlatPrior = { -1e-10, -1e-10 };
    final double[] log10PosteriorOfNoVariantYesVariant = { log10PNoVariant, MathUtils.log10OneMinusPow10(log10PNoVariant) };

    return new AFCalculationResult(integerAltAlleleCounts, alleles, log10PosteriorOfNoVariantYesVariant,
            dummyFlatPrior, log10PRefByAllele);
}
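The heart of the convergence loop is Dirichlet(posteriorPseudocounts).log10MeanWeights(). Assuming GATK's Dirichlet helper returns the log10 of the distribution's mean weights, E[f_i] = alpha_i / sum_j alpha_j, each iteration's frequency update reduces to the few lines below; the pseudocount values are made up for illustration:

import java.util.Arrays;

public class DirichletMeanSketch {
    // log10 of the mean weights E[f_i] = alpha_i / sum(alpha) of a Dirichlet(alpha).
    static double[] log10MeanWeights(double[] alpha) {
        double sum = Arrays.stream(alpha).sum();
        return Arrays.stream(alpha).map(a -> Math.log10(a / sum)).toArray();
    }

    public static void main(String[] args) {
        // Hypothetical posterior pseudocounts: prior pseudocounts plus the
        // effective allele counts from the current EM iteration (ref, alt).
        double[] posterior = { 9.0 + 85.0, 1.0 + 15.0 };
        System.out.println(Arrays.toString(log10MeanWeights(posterior)));
        // The loop feeds these log10 frequencies back into the next round of
        // effectiveAlleleCounts(...) until the counts stop changing.
    }
}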
Use of org.apache.commons.math3.stat.Frequency in project narchy by automenta.
Class RouletteTest, method testDecideRouletteTriangular.
@Test
public void testDecideRouletteTriangular() {
    XorShift128PlusRandom rng = new XorShift128PlusRandom(1);
    int uniques = 10;
    int samples = 5000;
    Frequency f = new Frequency();
    for (int i = 0; i < samples; i++)
        f.addValue(Roulette.decideRoulette(uniques, (k) -> (k + 1f) / (uniques), rng));
    System.out.println(f);
    assertEquals(f.getUniqueCount(), uniques);
    float total = f.getSumFreq();
    for (int i = 0; i < uniques; i++)
        assertEquals(f.getCount(i) / total, (i + 1f) / (uniques * uniques / 2), 1f / (4 * uniques));
}
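This test is the only snippet on this page that actually exercises org.apache.commons.math3.stat.Frequency: addValue accumulates observations, while getUniqueCount, getSumFreq and getCount summarise them. Here is a standalone sketch of the same bookkeeping, substituting java.util.Random and a uniform draw for the project-specific XorShift128PlusRandom and Roulette.decideRoulette:

import java.util.Random;
import org.apache.commons.math3.stat.Frequency;

public class FrequencySketch {
    public static void main(String[] args) {
        Frequency f = new Frequency();
        Random rng = new Random(1); // plain java.util.Random in place of XorShift128PlusRandom
        for (int i = 0; i < 5000; i++)
            f.addValue(rng.nextInt(10)); // uniform draw in place of the roulette selection

        System.out.println("distinct values: " + f.getUniqueCount());
        System.out.println("total samples:   " + f.getSumFreq());
        for (int k = 0; k < 10; k++)
            System.out.printf("value %d: count=%d, pct=%.3f%n", k, f.getCount(k), f.getPct(k));
    }
}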