use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.
the class ReadCountCollectionUtilsUnitTest method testExtremeMedianColumnsData.
/**
 * Checks {@code ReadCountCollectionUtils.removeColumnsWithExtremeMedianCounts} against an
 * independently computed expectation.
 *
 * <p>A column is expected to survive when its median count lies, inclusively, between the
 * {@code percentile} and the {@code (100 - percentile)} percentiles of all column medians.
 * Surviving columns must keep their relative order and their count values; all other columns
 * must be absent from the result.</p>
 *
 * @param readCount the read-count collection to filter.
 * @param percentile the extreme-median percentile threshold passed to the method under test.
 */
@Test(dataProvider = "readCountAndPercentileData")
public void testExtremeMedianColumnsData(final ReadCountCollection readCount, final double percentile) {
    final RealMatrix counts = readCount.counts();
    final int columnCount = counts.getColumnDimension();

    // Median count of every column, in column order.
    final Median medianEvaluator = new Median();
    final double[] columnMedians = new double[columnCount];
    for (int col = 0; col < columnCount; col++) {
        columnMedians[col] = medianEvaluator.evaluate(counts.getColumn(col));
    }

    // Inclusive range of medians that is considered non-extreme.
    final double bottom = new Percentile(percentile).evaluate(columnMedians);
    final double top = new Percentile(100 - percentile).evaluate(columnMedians);

    final boolean[] expectedKept = new boolean[columnCount];
    int expectedKeptCount = 0;
    for (int col = 0; col < columnCount; col++) {
        final double columnMedian = columnMedians[col];
        expectedKept[col] = columnMedian >= bottom && columnMedian <= top;
        if (expectedKept[col]) {
            expectedKeptCount++;
        }
    }

    final ReadCountCollection result = ReadCountCollectionUtils.removeColumnsWithExtremeMedianCounts(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.columnNames().size(), expectedKeptCount);

    // Position that the next surviving column must occupy in the result.
    int expectedResultIndex = 0;
    for (int col = 0; col < columnCount; col++) {
        final int resultIndex = result.columnNames().indexOf(readCount.columnNames().get(col));
        if (!expectedKept[col]) {
            Assert.assertEquals(resultIndex, -1);
            continue;
        }
        Assert.assertEquals(resultIndex, expectedResultIndex++);
        Assert.assertEquals(counts.getColumn(col), result.counts().getColumn(resultIndex));
    }
}
use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method readCountAndPercentileData.
/**
 * Provides one (read-count collection, percentile) pair per tested percentile.
 *
 * <p>This is duplicated from ReadCountCollectionUtilsUnitTest.</p>
 *
 * <p>Each collection is filled with uniform pseudo-random values drawn from a single generator
 * with a fixed seed, so the data is reproducible between runs.</p>
 *
 * @return an array whose elements are {@code { ReadCountCollection, Double }} pairs.
 */
@DataProvider(name = "readCountAndPercentileData")
public Object[][] readCountAndPercentileData() {
    final double[] percentiles = new double[] { 1.0, 2.5, 5.0, 10.0, 25.0 };
    final List<Object[]> result = new ArrayList<>(percentiles.length);
    final Random rdn = new Random(13);
    final int columnCount = 100;
    final int targetCount = 100;
    final List<String> columnNames = IntStream.range(0, columnCount).mapToObj(i -> "sample_" + (i + 1)).collect(Collectors.toList());
    final List<Target> targets = IntStream.range(0, targetCount).mapToObj(i -> new Target("target_" + (i + 1))).collect(Collectors.toList());
    for (final double percentile : percentiles) {
        // One row per target and one column per sample, so that the matrix dimensions agree with
        // the target and column-name lists even if the two counts above are made different.
        final double[][] counts = new double[targetCount][columnCount];
        for (final double[] targetCounts : counts) {
            for (int j = 0; j < targetCounts.length; j++) {
                targetCounts[j] = rdn.nextDouble();
            }
        }
        // copyArray = false: the matrix wraps the array directly rather than copying it.
        final ReadCountCollection readCounts = new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(counts, false));
        result.add(new Object[] { readCounts, percentile });
    }
    return result.toArray(new Object[result.size()][]);
}
use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method readCountAndPercentileData.
/**
 * Provides one (read-count collection, percentile) pair per tested percentile.
 *
 * <p>This is duplicated from ReadCountCollectionUtilsUnitTest.</p>
 *
 * <p>Each collection is filled with uniform pseudo-random values drawn from a single generator
 * with a fixed seed, so the data is reproducible between runs.</p>
 *
 * @return an array whose elements are {@code { ReadCountCollection, Double }} pairs.
 */
@DataProvider(name = "readCountAndPercentileData")
public Object[][] readCountAndPercentileData() {
    final double[] percentiles = new double[] { 1.0, 2.5, 5.0, 10.0, 25.0 };
    final List<Object[]> result = new ArrayList<>(percentiles.length);
    final Random rdn = new Random(13);
    final int columnCount = 100;
    final int targetCount = 100;
    final List<String> columnNames = IntStream.range(0, columnCount).mapToObj(i -> "sample_" + (i + 1)).collect(Collectors.toList());
    final List<Target> targets = IntStream.range(0, targetCount).mapToObj(i -> new Target("target_" + (i + 1))).collect(Collectors.toList());
    for (final double percentile : percentiles) {
        // One row per target and one column per sample, so that the matrix dimensions agree with
        // the target and column-name lists even if the two counts above are made different.
        final double[][] counts = new double[targetCount][columnCount];
        for (final double[] targetCounts : counts) {
            for (int j = 0; j < targetCounts.length; j++) {
                targetCounts[j] = rdn.nextDouble();
            }
        }
        // copyArray = false: the matrix wraps the array directly rather than copying it.
        final ReadCountCollection readCounts = new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(counts, false));
        result.add(new Object[] { readCounts, percentile });
    }
    return result.toArray(new Object[result.size()][]);
}
use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.
the class ReadCountCollectionUtils method truncateExtremeCounts.
/**
 * Truncates the extreme count values in the input read-count collection.
 *
 * <p>Two thresholds are computed over all the counts in the collection: the {@code percentile}
 * percentile (bottom) and the complementary {@code (100 - percentile)} percentile (top). Values
 * under the bottom threshold or over the top threshold are set to the corresponding threshold
 * value.</p>
 *
 * <p>The truncation is done in-place, thus the input matrix is modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 * @param percentile the percentile that defines the bottom threshold; it must be in the range
 *                   [0 .. 50.0]. With 0 the thresholds are the minimum and maximum counts, so no
 *                   value is truncated.
 * @param logger receives, at info level, a summary of how many counts were truncated.
 * @throws IllegalArgumentException if {@code percentile} is NaN or outside the range [0 .. 50.0].
 */
public static void truncateExtremeCounts(final ReadCountCollection readCounts, final double percentile, final Logger logger) {
    // Written as a negated conjunction so that NaN is rejected too. Values above 50 must not get
    // through: they would invert the thresholds and overwrite every count in the matrix.
    if (!(percentile >= 0.0 && percentile <= 50.0)) {
        throw new IllegalArgumentException("the percentile must be in the range [0, 50.0] but was " + percentile);
    }
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final int columnCount = counts.getColumnDimension();
    // Create a row major array of the counts.
    final double[] values = Doubles.concat(counts.getData());
    // Percentile only accepts quantiles in (0, 100], so the 0th percentile is taken as the minimum.
    final double bottomPercentileThreshold = percentile == 0.0
            ? Doubles.min(values)
            : new Percentile(percentile).evaluate(values);
    final double topPercentileThreshold = new Percentile(100.0 - percentile).evaluate(values);
    final long totalCounts = (long) targetCount * columnCount;
    long bottomTruncatedCounts = 0;
    long topTruncatedCounts = 0;
    for (int i = 0; i < targetCount; i++) {
        // getRow returns a copy, so writing to the matrix below does not affect this iteration.
        final double[] rowCounts = counts.getRow(i);
        for (int j = 0; j < columnCount; j++) {
            final double count = rowCounts[j];
            if (count < bottomPercentileThreshold) {
                counts.setEntry(i, j, bottomPercentileThreshold);
                bottomTruncatedCounts++;
            } else if (count > topPercentileThreshold) {
                counts.setEntry(i, j, topPercentileThreshold);
                topTruncatedCounts++;
            }
        }
    }
    final long truncatedCounts = topTruncatedCounts + bottomTruncatedCounts;
    if (truncatedCounts == 0) {
        logger.info(String.format("None of the %d counts were truncated as they all fall in the non-extreme range " + "[%.2f, %.2f]", totalCounts, bottomPercentileThreshold, topPercentileThreshold));
    } else {
        final double truncatedPercentage = ((double) truncatedCounts / totalCounts) * 100;
        logger.info(String.format("Some counts (%d out of %d, %.2f%%) were truncated as they fall out of the " + "non-extreme range [%.2f, %.2f]", truncatedCounts, totalCounts, truncatedPercentage, bottomPercentileThreshold, topPercentileThreshold));
    }
}
use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk-protected by broadinstitute.
the class ReadCountCollectionUtils method truncateExtremeCounts.
/**
 * Truncates the extreme count values in the input read-count collection.
 *
 * <p>Two thresholds are computed over all the counts in the collection: the {@code percentile}
 * percentile (bottom) and the complementary {@code (100 - percentile)} percentile (top). Values
 * under the bottom threshold or over the top threshold are set to the corresponding threshold
 * value.</p>
 *
 * <p>The truncation is done in-place, thus the input matrix is modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 * @param percentile the percentile that defines the bottom threshold; it must be in the range
 *                   [0 .. 50.0]. With 0 the thresholds are the minimum and maximum counts, so no
 *                   value is truncated.
 * @param logger receives, at info level, a summary of how many counts were truncated.
 * @throws IllegalArgumentException if {@code percentile} is NaN or outside the range [0 .. 50.0].
 */
public static void truncateExtremeCounts(final ReadCountCollection readCounts, final double percentile, final Logger logger) {
    // Written as a negated conjunction so that NaN is rejected too. Values above 50 must not get
    // through: they would invert the thresholds and overwrite every count in the matrix.
    if (!(percentile >= 0.0 && percentile <= 50.0)) {
        throw new IllegalArgumentException("the percentile must be in the range [0, 50.0] but was " + percentile);
    }
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final int columnCount = counts.getColumnDimension();
    // Create a row major array of the counts.
    final double[] values = Doubles.concat(counts.getData());
    // Percentile only accepts quantiles in (0, 100], so the 0th percentile is taken as the minimum.
    final double bottomPercentileThreshold = percentile == 0.0
            ? Doubles.min(values)
            : new Percentile(percentile).evaluate(values);
    final double topPercentileThreshold = new Percentile(100.0 - percentile).evaluate(values);
    final long totalCounts = (long) targetCount * columnCount;
    long bottomTruncatedCounts = 0;
    long topTruncatedCounts = 0;
    for (int i = 0; i < targetCount; i++) {
        // getRow returns a copy, so writing to the matrix below does not affect this iteration.
        final double[] rowCounts = counts.getRow(i);
        for (int j = 0; j < columnCount; j++) {
            final double count = rowCounts[j];
            if (count < bottomPercentileThreshold) {
                counts.setEntry(i, j, bottomPercentileThreshold);
                bottomTruncatedCounts++;
            } else if (count > topPercentileThreshold) {
                counts.setEntry(i, j, topPercentileThreshold);
                topTruncatedCounts++;
            }
        }
    }
    final long truncatedCounts = topTruncatedCounts + bottomTruncatedCounts;
    if (truncatedCounts == 0) {
        logger.info(String.format("None of the %d counts were truncated as they all fall in the non-extreme range " + "[%.2f, %.2f]", totalCounts, bottomPercentileThreshold, topPercentileThreshold));
    } else {
        final double truncatedPercentage = ((double) truncatedCounts / totalCounts) * 100;
        logger.info(String.format("Some counts (%d out of %d, %.2f%%) were truncated as they fall out of the " + "non-extreme range [%.2f, %.2f]", truncatedCounts, totalCounts, truncatedPercentage, bottomPercentileThreshold, topPercentileThreshold));
    }
}
Aggregations