Use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project GDSC-SMLM by aherbert.
The class DoubletAnalysis, method summariseResults.
/**
* Summarise results.
*
* @param results the results
* @param density the density
* @param runTime the run time
*/
private void summariseResults(ArrayList<DoubletResult> results, double density, long runTime) {
// If we are only assessing results with no neighbour candidates
// TODO - Count the number of actual results that have no neighbours
final int numberOfMolecules = this.results.size() - ignored.get();
final FitConfiguration fitConfig = config.getFitConfiguration();
// Store details we want in the analysis table
final StringBuilder sb = new StringBuilder();
sb.append(MathUtils.rounded(density)).append('\t');
sb.append(MathUtils.rounded(getSa())).append('\t');
sb.append(config.getFittingWidth()).append('\t');
sb.append(PsfProtosHelper.getName(fitConfig.getPsfType()));
sb.append(":").append(PeakFit.getSolverName(fitConfig));
if (fitConfig.isModelCameraMle()) {
sb.append(":Camera\t");
// Add details of the noise model for the MLE
final CalibrationReader r = new CalibrationReader(fitConfig.getCalibration());
sb.append("EM=").append(r.isEmCcd());
sb.append(":A=").append(MathUtils.rounded(r.getCountPerElectron()));
sb.append(":N=").append(MathUtils.rounded(r.getReadNoise()));
sb.append('\t');
} else {
sb.append("\t\t");
}
final String analysisPrefix = sb.toString();
// -=-=-=-=-
showResults(results, settings.showResults);
sb.setLength(0);
final int n = countN(results);
// Create the benchmark settings and the fitting settings
sb.append(numberOfMolecules).append('\t');
sb.append(n).append('\t');
sb.append(MathUtils.rounded(density)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.minSignal)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.maxSignal)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.averageSignal)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.sd)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.pixelPitch)).append('\t');
sb.append(MathUtils.rounded(getSa() * simulationParameters.pixelPitch)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.gain)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.readNoise)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.background)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.noise)).append('\t');
sb.append(MathUtils.rounded(simulationParameters.averageSignal / simulationParameters.noise)).append('\t');
sb.append(config.getFittingWidth()).append('\t');
sb.append(PsfProtosHelper.getName(fitConfig.getPsfType()));
sb.append(":").append(PeakFit.getSolverName(fitConfig));
if (fitConfig.isModelCameraMle()) {
sb.append(":Camera\t");
// Add details of the noise model for the MLE
final CalibrationReader r = new CalibrationReader(fitConfig.getCalibration());
sb.append("EM=").append(r.isEmCcd());
sb.append(":A=").append(MathUtils.rounded(r.getCountPerElectron()));
sb.append(":N=").append(MathUtils.rounded(r.getReadNoise()));
sb.append('\t');
} else {
sb.append("\t\t");
}
// Now output the actual results ...
// Show histograms as cumulative to avoid problems with bin width
// Residuals scores
// Iterations and evaluations where fit was OK
final StoredDataStatistics[] stats = new StoredDataStatistics[Settings.NAMES2.length];
for (int i = 0; i < stats.length; i++) {
stats[i] = new StoredDataStatistics();
}
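// stats[0..2] collect the residuals score by match class; stats[3..4] collect
// iterations/evaluations for good single fits; stats[5..6] for good doublet fits.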
// For Jaccard scoring we need the score with no residuals threshold,
// i.e. accumulate the score accepting all doublets that were fit.
double tp = 0;
double fp = 0;
double bestTp = 0;
double bestFp = 0;
final ArrayList<DoubletBonus> data = new ArrayList<>(results.size());
for (final DoubletResult result : results) {
final double score = result.getMaxScore();
// Filter the singles that would be accepted
if (result.good1) {
// Filter the doublets that would be accepted
if (result.good2) {
final double tp2 = result.tp2a + result.tp2b;
final double fp2 = result.fp2a + result.fp2b;
tp += tp2;
fp += fp2;
if (result.tp2a > 0.5) {
bestTp += result.tp2a;
bestFp += result.fp2a;
}
if (result.tp2b > 0.5) {
bestTp += result.tp2b;
bestFp += result.fp2b;
}
// Store this as a doublet bonus
data.add(new DoubletBonus(score, result.getAvScore(), tp2 - result.tp1, fp2 - result.fp1));
} else {
// No doublet fit so this will always be the single fit result
tp += result.tp1;
fp += result.fp1;
if (result.tp1 > 0.5) {
bestTp += result.tp1;
bestFp += result.fp1;
}
}
}
// Build statistics
final int c = result.matchClass;
// True results, i.e. where there was a choice between single or doublet
if (result.valid) {
stats[c].add(score);
}
// Of those where the fit was good, summarise the iterations and evaluations
if (result.good1) {
stats[3].add(result.iter1);
stats[4].add(result.eval1);
// Note: only record the doublet stats when the spot is not a pure single;
// we do not care about the iteration increase for singles that are not doublets.
if (c != 0 && result.good2) {
stats[5].add(result.iter2);
stats[6].add(result.eval2);
}
}
}
// Debug the counts
// double tpSingle = 0;
// double fpSingle = 0;
// double tpDoublet = 0;
// double fpDoublet = 0;
// int nSingle = 0, nDoublet = 0;
// for (DoubletResult result : results) {
// if (result.good1) {
// if (result.good2) {
// tpDoublet += result.tp2a + result.tp2b;
// fpDoublet += result.fp2a + result.fp2b;
// nDoublet++;
// }
// tpSingle += result.tp1;
// fpSingle += result.fp1;
// nSingle++;
// }
// }
// System.out.printf("Single %.1f,%.1f (%d) : Doublet %.1f,%.1f (%d)\n", tpSingle, fpSingle,
// nSingle, tpDoublet, fpDoublet, nDoublet * 2);
// Summarise score for true results
final Percentile p = new Percentile(99);
for (int c = 0; c < stats.length; c++) {
final double[] values = stats[c].getValues();
// Sorting is needed for the percentile and the cumulative histogram so do it once
Arrays.sort(values);
sb.append(MathUtils.rounded(stats[c].getMean())).append("+/-")
    .append(MathUtils.rounded(stats[c].getStandardDeviation()))
    .append(" (").append(stats[c].getN()).append(") ")
    .append(MathUtils.rounded(p.evaluate(values))).append('\t');
if (settings.showHistograms && settings.displayHistograms[c + Settings.NAMES.length]) {
showCumulativeHistogram(values, Settings.NAMES2[c]);
}
}
sb.append(Settings.MATCHING_METHODS[settings.matchingMethod]).append('\t');
// Plot a graph of the additional results we would fit at all score thresholds.
// This assumes we just pick the doublet if we fit it (NO FILTERING at all!)
// Initialise the score for residuals 0
// Store this as it serves as a baseline for the filtering analysis
final ResidualsScore residualsScoreMax = computeScores(data, tp, fp, numberOfMolecules, true);
final ResidualsScore residualsScoreAv = computeScores(data, tp, fp, numberOfMolecules, false);
residualsScore = (settings.useMaxResiduals) ? residualsScoreMax : residualsScoreAv;
if (settings.showJaccardPlot) {
plotJaccard(residualsScore, null);
}
final String bestJaccard = MathUtils.rounded(bestTp / (bestFp + numberOfMolecules)) + '\t';
final String analysisPrefix2 = analysisPrefix + bestJaccard;
sb.append(bestJaccard);
addJaccardScores(sb);
sb.append('\t').append(TextUtils.nanosToString(runTime));
createSummaryTable().append(sb.toString());
// Store results in memory for later analysis
referenceResults.set(new ReferenceResults(results, residualsScoreMax, residualsScoreAv, numberOfMolecules, analysisPrefix2));
}
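For reference, a minimal self-contained sketch of the Percentile pattern used above: the quantile is fixed in the constructor and evaluate is called once per data set. The class name and data are hypothetical; only commons-math3 on the classpath is assumed.
import java.util.Arrays;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class PercentileSketch {
    public static void main(String[] args) {
        // Hypothetical data standing in for a StoredDataStatistics values array.
        double[] values = {3.2, 7.1, 1.4, 9.8, 5.5, 2.2, 8.0};
        // Percentile does not require sorted input (it works on an internal copy);
        // summariseResults sorts once so the same array can also feed the
        // cumulative histogram.
        Arrays.sort(values);
        // Quantile fixed in the constructor, as with new Percentile(99) above.
        Percentile p = new Percentile(99);
        System.out.println("99th percentile: " + p.evaluate(values));
    }
}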
Use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project GDSC-SMLM by aherbert.
The class DataEstimator, method getPercentile.
/**
* Get the percentile value of the data.
*
* @param percentile The percentile
* @return the percentile value
*/
public float getPercentile(double percentile) {
// Check the input
if (percentile <= 0) {
percentile = Double.MIN_NORMAL;
}
if (percentile > 100) {
percentile = 100;
}
// The data should not contain NaN values, so skip NaN screening for speed.
final Percentile p = new Percentile(percentile).withNaNStrategy(NaNStrategy.FIXED);
final int size = width * height;
final double[] values = new double[size];
for (int i = 0; i < size; i++) {
values[i] = data[i];
}
return (float) p.evaluate(values);
}
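As a hedged illustration of the NaN handling above: commons-math3's default strategy for Percentile (NaNStrategy.REMOVED) makes a pass to strip NaNs, while FIXED leaves them in place, which is cheaper when the data is known to be NaN-free. A minimal sketch with hypothetical data:
import org.apache.commons.math3.stat.descriptive.rank.Percentile;
import org.apache.commons.math3.stat.ranking.NaNStrategy;

public class NaNStrategySketch {
    public static void main(String[] args) {
        double[] values = {1.0, 2.0, 3.0, 4.0, 5.0};
        // FIXED skips the NaN-removal pass, matching the speed choice
        // that getPercentile makes above for NaN-free data.
        Percentile p = new Percentile(50.0).withNaNStrategy(NaNStrategy.FIXED);
        System.out.println("median: " + p.evaluate(values)); // 3.0
    }
}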
Use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project metron by apache.
The class HLLPMeasurement, method main.
public static void main(String[] args) {
Options options = new Options();
try {
CommandLineParser parser = new PosixParser();
CommandLine cmd = null;
try {
cmd = ParserOptions.parse(parser, args);
} catch (ParseException pe) {
pe.printStackTrace();
final HelpFormatter usageFormatter = new HelpFormatter();
usageFormatter.printHelp("HLLPMeasurement", null, options, null, true);
System.exit(-1);
}
if (cmd.hasOption("h")) {
final HelpFormatter usageFormatter = new HelpFormatter();
usageFormatter.printHelp("HLLPMeasurement", null, options, null, true);
System.exit(0);
}
final String chartDelim = ParserOptions.CHART_DELIM.get(cmd, "|");
final int numTrials = Integer.parseInt(ParserOptions.NUM_TRIALS.get(cmd, "5000"));
final int cardMin = Integer.parseInt(ParserOptions.CARD_MIN.get(cmd, "200"));
final int cardMax = Integer.parseInt(ParserOptions.CARD_MAX.get(cmd, "1000"));
final int cardStep = Integer.parseInt(ParserOptions.CARD_STEP.get(cmd, "200"));
final int cardStart = (((cardMin - 1) / cardStep) * cardStep) + cardStep;
final int spMin = Integer.parseInt(ParserOptions.SP_MIN.get(cmd, "4"));
final int spMax = Integer.parseInt(ParserOptions.SP_MAX.get(cmd, "32"));
final int spStep = Integer.parseInt(ParserOptions.SP_STEP.get(cmd, "4"));
final int pMin = Integer.parseInt(ParserOptions.P_MIN.get(cmd, "4"));
final int pMax = Integer.parseInt(ParserOptions.P_MAX.get(cmd, "32"));
final int pStep = Integer.parseInt(ParserOptions.P_STEP.get(cmd, "4"));
final double errorPercentile = Double.parseDouble(ParserOptions.ERR_PERCENTILE.get(cmd, "50"));
final double timePercentile = Double.parseDouble(ParserOptions.TIME_PERCENTILE.get(cmd, "50"));
final double sizePercentile = Double.parseDouble(ParserOptions.SIZE_PERCENTILE.get(cmd, "50"));
final boolean formatErrPercent = Boolean.parseBoolean(ParserOptions.ERR_FORMAT_PERCENT.get(cmd, "true"));
final int errMultiplier = formatErrPercent ? 100 : 1;
final Function<Double, String> errorFormatter = (v -> ERR_FORMAT.format(v * errMultiplier));
final Function<Double, String> timeFormatter = (v -> TIME_FORMAT.format(v / NANO_TO_MILLIS));
final Function<Double, String> sizeFormatter = (v -> SIZE_FORMAT.format(v));
final String[] chartKey = new String[] {
    "card: cardinality",
    "sp: sparse precision value",
    "p: normal precision value",
    "err: error as a percent of the expected cardinality; ",
    "time: total time to add all values to the hllp estimator and calculate a cardinality estimate",
    "size: size of the hllp set in bytes once all values have been added for the specified cardinality",
    "l=low, m=mid(based on percentile chosen), h=high, std=standard deviation" };
final String[] chartHeader = new String[] { "card", "sp", "p", "err l/m/h/std (% of actual)", "time l/m/h/std (ms)", "size l/m/h/std (b)" };
final int[] chartPadding = new int[] { 10, 10, 10, 40, 40, 30 };
if (spMin < pMin) {
throw new IllegalArgumentException("p must be <= sp");
}
if (spMax < pMax) {
throw new IllegalArgumentException("p must be <= sp");
}
println("Options Used");
println("------------");
println("num trials: " + numTrials);
println("card min: " + cardMin);
println("card max: " + cardMax);
println("card step: " + cardStep);
println("card start: " + cardStart);
println("sp min: " + spMin);
println("sp max: " + spMax);
println("sp step: " + spStep);
println("p min: " + pMin);
println("p max: " + pMax);
println("p step: " + pStep);
println("error percentile: " + errorPercentile);
println("time percentile: " + timePercentile);
println("size percentile: " + sizePercentile);
println("format err as %: " + formatErrPercent);
println("");
printHeading(chartKey, chartHeader, chartPadding, chartDelim);
for (int c = cardStart; c <= cardMax; c += cardStep) {
for (int sp = spMin; sp <= spMax; sp += spStep) {
for (int p = pMin; p <= pMax; p += pStep) {
DescriptiveStatistics errorStats = new DescriptiveStatistics();
DescriptiveStatistics timeStats = new DescriptiveStatistics();
DescriptiveStatistics sizeStats = new DescriptiveStatistics();
for (int i = 0; i < numTrials; i++) {
List<Object> trialSet = buildTrialSet(c);
Set<Object> unique = new HashSet<>(trialSet);
long distinctVals = unique.size();
HyperLogLogPlus hllp = new HyperLogLogPlus(p, sp);
long timeStart = System.nanoTime();
hllp.addAll(trialSet);
long dvEstimate = hllp.cardinality();
long timeEnd = System.nanoTime();
long timeElapsed = timeEnd - timeStart;
double rawError = Math.abs(dvEstimate - distinctVals) / (double) distinctVals;
errorStats.addValue(rawError);
timeStats.addValue(timeElapsed);
sizeStats.addValue(SerDeUtils.toBytes(hllp).length);
}
MeasureResultFormatter errorRF = new MeasureResultFormatter(errorStats, errorFormatter, errorPercentile);
MeasureResultFormatter timeRF = new MeasureResultFormatter(timeStats, timeFormatter, timePercentile);
MeasureResultFormatter sizeRF = new MeasureResultFormatter(sizeStats, sizeFormatter, sizePercentile);
println(formatWithPadding(new String[] { "" + c, "" + sp, "" + p,
    errorRF.getFormattedResults(), timeRF.getFormattedResults(),
    sizeRF.getFormattedResults() }, chartPadding, chartDelim));
}
}
}
} catch (Exception e) {
e.printStackTrace();
System.exit(-1);
}
}
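MeasureResultFormatter is not shown on this page; the sketch below is an assumption about how the l/m/h/std chart columns could be derived from a DescriptiveStatistics, reading low and high as min and max and mid as the chosen percentile. The class name and values are made up:
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;

public class TrialSummarySketch {
    public static void main(String[] args) {
        DescriptiveStatistics errorStats = new DescriptiveStatistics();
        for (double err : new double[] {0.012, 0.034, 0.021, 0.009, 0.028}) {
            errorStats.addValue(err);
        }
        double errorPercentile = 50.0; // the errorPercentile option above
        double low = errorStats.getMin();
        double mid = errorStats.getPercentile(errorPercentile);
        double high = errorStats.getMax();
        double std = errorStats.getStandardDeviation();
        System.out.printf("err l/m/h/std = %.3f/%.3f/%.3f/%.3f%n", low, mid, high, std);
    }
}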
Use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project GDSC-SMLM by aherbert.
The class BenchmarkSpotFit, method showDoubleHistogram.
private double[] showDoubleHistogram(StoredDataStatistics[][] stats, final int i, WindowOrganiser wo, double[][] matchScores, double nPredicted) {
String xLabel = filterCriteria[i].name;
LowerLimit lower = filterCriteria[i].lower;
UpperLimit upper = filterCriteria[i].upper;
double[] j = null;
double[] metric = null;
double maxJ = 0;
if (i <= FILTER_PRECISION && (showFilterScoreHistograms || upper.requiresJaccard || lower.requiresJaccard)) {
// Jaccard score versus the range of the metric
// Sort ascending by the metric at index i.
Arrays.sort(matchScores, (o1, o2) -> Double.compare(o1[i], o2[i]));
final int scoreIndex = FILTER_PRECISION + 1;
int n = results.size();
double tp = 0;
double fp = 0;
j = new double[matchScores.length + 1];
metric = new double[j.length];
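// j[k + 1] is the Jaccard score tp / (tp + fp + fn) after accepting the k + 1
// lowest-metric candidates; since n = tp + fn over all actual results, this is tp / (fp + n).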
for (int k = 0; k < matchScores.length; k++) {
final double score = matchScores[k][scoreIndex];
tp += score;
fp += (1 - score);
j[k + 1] = tp / (fp + n);
metric[k + 1] = matchScores[k][i];
}
metric[0] = metric[1];
maxJ = Maths.max(j);
if (showFilterScoreHistograms) {
String title = TITLE + " Jaccard " + xLabel;
Plot plot = new Plot(title, xLabel, "Jaccard", metric, j);
// Remove outliers
double[] limitsx = Maths.limits(metric);
Percentile p = new Percentile();
double l = p.evaluate(metric, 25);
double u = p.evaluate(metric, 75);
double iqr = 1.5 * (u - l);
limitsx[1] = Math.min(limitsx[1], u + iqr);
plot.setLimits(limitsx[0], limitsx[1], 0, Maths.max(j));
PlotWindow pw = Utils.display(title, plot);
if (Utils.isNewWindow())
wo.add(pw);
}
}
// [0] is all
// [1] is matches
// [2] is no match
StoredDataStatistics s1 = stats[0][i];
StoredDataStatistics s2 = stats[1][i];
StoredDataStatistics s3 = stats[2][i];
if (s1.getN() == 0)
return new double[4];
DescriptiveStatistics d = s1.getStatistics();
double median = 0;
Plot2 plot = null;
String title = null;
if (showFilterScoreHistograms) {
median = d.getPercentile(50);
String label = String.format("n = %d. Median = %s nm", s1.getN(), Utils.rounded(median));
int id = Utils.showHistogram(TITLE, s1, xLabel, filterCriteria[i].minBinWidth, (filterCriteria[i].restrictRange) ? 1 : 0, 0, label);
if (id == 0) {
IJ.log("Failed to show the histogram: " + xLabel);
return new double[4];
}
if (Utils.isNewWindow())
wo.add(id);
title = WindowManager.getImage(id).getTitle();
// Reverse engineer the histogram settings
plot = Utils.plot;
double[] xValues = Utils.xValues;
int bins = xValues.length;
double yMin = xValues[0];
double binSize = xValues[1] - xValues[0];
double yMax = xValues[0] + (bins - 1) * binSize;
if (s2.getN() > 0) {
double[] values = s2.getValues();
double[][] hist = Utils.calcHistogram(values, yMin, yMax, bins);
if (hist[0].length > 0) {
plot.setColor(Color.red);
plot.addPoints(hist[0], hist[1], Plot2.BAR);
Utils.display(title, plot);
}
}
if (s3.getN() > 0) {
double[] values = s3.getValues();
double[][] hist = Utils.calcHistogram(values, yMin, yMax, bins);
if (hist[0].length > 0) {
plot.setColor(Color.blue);
plot.addPoints(hist[0], hist[1], Plot2.BAR);
Utils.display(title, plot);
}
}
}
// Do cumulative histogram
double[][] h1 = Maths.cumulativeHistogram(s1.getValues(), true);
double[][] h2 = Maths.cumulativeHistogram(s2.getValues(), true);
double[][] h3 = Maths.cumulativeHistogram(s3.getValues(), true);
if (showFilterScoreHistograms) {
title = TITLE + " Cumul " + xLabel;
plot = new Plot2(title, xLabel, "Frequency");
// Find limits
double[] xlimit = Maths.limits(h1[0]);
xlimit = Maths.limits(xlimit, h2[0]);
xlimit = Maths.limits(xlimit, h3[0]);
// Restrict using the inter-quartile range
if (filterCriteria[i].restrictRange) {
double q1 = d.getPercentile(25);
double q2 = d.getPercentile(75);
double iqr = (q2 - q1) * 2.5;
xlimit[0] = Maths.max(xlimit[0], median - iqr);
xlimit[1] = Maths.min(xlimit[1], median + iqr);
}
plot.setLimits(xlimit[0], xlimit[1], 0, 1.05);
plot.addPoints(h1[0], h1[1], Plot.LINE);
plot.setColor(Color.red);
plot.addPoints(h2[0], h2[1], Plot.LINE);
plot.setColor(Color.blue);
plot.addPoints(h3[0], h3[1], Plot.LINE);
}
// Determine the maximum difference between the TP and FP
double maxx1 = 0;
double maxx2 = 0;
double max1 = 0;
double max2 = 0;
// We cannot compute the delta histogram, or use percentiles
if (s2.getN() == 0) {
upper = UpperLimit.ZERO;
lower = LowerLimit.ZERO;
}
final boolean requireLabel = (showFilterScoreHistograms && filterCriteria[i].requireLabel);
if (requireLabel || upper.requiresDeltaHistogram() || lower.requiresDeltaHistogram()) {
if (s2.getN() != 0 && s3.getN() != 0) {
LinearInterpolator li = new LinearInterpolator();
PolynomialSplineFunction f1 = li.interpolate(h2[0], h2[1]);
PolynomialSplineFunction f2 = li.interpolate(h3[0], h3[1]);
for (double x : h1[0]) {
if (x < h2[0][0] || x < h3[0][0])
continue;
try {
double v1 = f1.value(x);
double v2 = f2.value(x);
double diff = v2 - v1;
if (diff > 0) {
if (max1 < diff) {
max1 = diff;
maxx1 = x;
}
} else {
if (max2 > diff) {
max2 = diff;
maxx2 = x;
}
}
} catch (OutOfRangeException e) {
// Because we reached the end
break;
}
}
} else {
// Switch to percentiles if we have no delta histogram
if (upper.requiresDeltaHistogram())
upper = UpperLimit.NINETY_NINE_PERCENT;
if (lower.requiresDeltaHistogram())
lower = LowerLimit.ONE_PERCENT;
}
// System.out.printf("Bounds %s : %s, pos %s, neg %s, %s\n", xLabel, Utils.rounded(getPercentile(h2, 0.01)),
// Utils.rounded(maxx1), Utils.rounded(maxx2), Utils.rounded(getPercentile(h1, 0.99)));
}
if (showFilterScoreHistograms) {
// We use bins=1 on charts where we do not need a label
if (requireLabel) {
String label = String.format("Max+ %s @ %s, Max- %s @ %s", Utils.rounded(max1), Utils.rounded(maxx1), Utils.rounded(max2), Utils.rounded(maxx2));
plot.setColor(Color.black);
plot.addLabel(0, 0, label);
}
PlotWindow pw = Utils.display(title, plot);
if (Utils.isNewWindow())
wo.add(pw.getImagePlus().getID());
}
// Now compute the bounds using the desired limit
double l, u;
switch(lower) {
case ONE_PERCENT:
l = getPercentile(h2, 0.01);
break;
case MAX_NEGATIVE_CUMUL_DELTA:
l = maxx2;
break;
case ZERO:
l = 0;
break;
case HALF_MAX_JACCARD_VALUE:
l = getValue(metric, j, maxJ * 0.5);
break;
default:
throw new RuntimeException("Missing lower limit method");
}
switch(upper) {
case MAX_POSITIVE_CUMUL_DELTA:
u = maxx1;
break;
case NINETY_NINE_PERCENT:
u = getPercentile(h2, 0.99);
break;
case NINETY_NINE_NINE_PERCENT:
u = getPercentile(h2, 0.999);
break;
case ZERO:
u = 0;
break;
case MAX_JACCARD2:
u = getValue(metric, j, maxJ) * 2;
//System.out.printf("MaxJ = %.4f @ %.3f\n", maxJ, u / 2);
break;
default:
throw new RuntimeException("Missing upper limit method");
}
double min = getPercentile(h1, 0);
double max = getPercentile(h1, 1);
return new double[] { l, u, min, max };
}
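The inter-quartile trimming used for the plot limits above can be isolated into a small helper. A sketch under the assumption that the metric contains no NaNs; the class and method names are hypothetical:
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class IqrLimitSketch {
    /** Upper plot limit trimmed to Q3 + 1.5 * IQR, as in the Jaccard plot above. */
    static double upperLimit(double[] metric, double currentMax) {
        final Percentile p = new Percentile();
        final double q1 = p.evaluate(metric, 25);
        final double q3 = p.evaluate(metric, 75);
        final double iqr = 1.5 * (q3 - q1);
        return Math.min(currentMax, q3 + iqr);
    }

    public static void main(String[] args) {
        double[] metric = {1, 2, 2, 3, 3, 3, 4, 4, 5, 40}; // 40 is an outlier
        System.out.println(upperLimit(metric, 40.0)); // well below 40
    }
}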
Use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.
The class ReadCountCollectionUtils, method removeColumnsWithExtremeMedianCounts.
/**
* Creates a new read-count collection that is a copy of the input but dropping columns with extreme median coverage.
*
* @param readCounts the input read-counts.
* @param extremeColumnMedianCountPercentileThreshold bottom percentile to use as an exclusion threshold.
* @return never {@code null}. It might be a reference to the input read-counts if
* there are no columns to be dropped.
*/
public static ReadCountCollection removeColumnsWithExtremeMedianCounts(final ReadCountCollection readCounts, final double extremeColumnMedianCountPercentileThreshold, final Logger logger) {
final List<String> columnNames = readCounts.columnNames();
final RealMatrix counts = readCounts.counts();
final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
// Calculate thresholds:
final double bottomExtremeThreshold = new Percentile(extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
final double topExtremeThreshold = new Percentile(100 - extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
// Determine kept and dropped column sets.
final Set<String> columnsToKeep = new LinkedHashSet<>(readCounts.columnNames().size());
// Capacity hint: roughly the expected number of dropped columns (both tails).
// The multiplication must happen before the int cast, otherwise the fraction truncates to zero.
final int initialMapSize = (int) (2.0 * extremeColumnMedianCountPercentileThreshold / 100.0 * readCounts.columnNames().size());
final Map<String, Double> columnsToDrop = new LinkedHashMap<>(initialMapSize);
for (int i = 0; i < columnMedians.length; i++) {
if (columnMedians[i] >= bottomExtremeThreshold && columnMedians[i] <= topExtremeThreshold) {
columnsToKeep.add(columnNames.get(i));
} else {
columnsToDrop.put(columnNames.get(i), columnMedians[i]);
}
}
// log and drop columns if it applies
if (columnsToKeep.isEmpty()) {
throw new UserException.BadInput("No column count left after applying the extreme counts outlier filter");
} else if (columnsToKeep.size() == columnNames.size()) {
logger.info(String.format("No column dropped due to extreme counts outside [%.10f, %.10f]", bottomExtremeThreshold, topExtremeThreshold));
return readCounts;
} else {
final double droppedPercentage = ((double) (columnsToDrop.size()) / columnNames.size()) * 100;
logger.info(String.format(
    "Some columns dropped (%d out of %d, %.2f%%) as they are classified as having extreme "
        + "median counts across targets outside [%.10f, %.10f]: e.g. %s",
    columnsToDrop.size(), columnNames.size(), droppedPercentage, bottomExtremeThreshold,
    topExtremeThreshold, columnsToDrop.entrySet().stream().limit(10)
        .map(kv -> kv.getKey() + " (" + kv.getValue() + ")")
        .collect(Collectors.joining(", "))));
return readCounts.subsetColumns(columnsToKeep);
}
}
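A minimal sketch of the symmetric two-sided thresholding used above, with hypothetical medians and threshold; only commons-math3 is assumed:
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class ExtremeMedianSketch {
    public static void main(String[] args) {
        // Hypothetical column medians; 1 and 95 are the extremes.
        double[] columnMedians = {5, 7, 8, 9, 10, 11, 12, 13, 95, 1};
        double threshold = 10.0; // stand-in for extremeColumnMedianCountPercentileThreshold
        double bottom = new Percentile(threshold).evaluate(columnMedians);
        double top = new Percentile(100 - threshold).evaluate(columnMedians);
        for (double m : columnMedians) {
            String verdict = (m >= bottom && m <= top) ? "keep" : "drop";
            System.out.printf("median %.1f -> %s%n", m, verdict);
        }
    }
}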