use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk-protected by broadinstitute.
the class ReadCountCollectionUtilsUnitTest method readCountAndPercentileData.
@DataProvider(name = "readCountAndPercentileData")
public Object[][] readCountAndPercentileData() {
    final double[] percentiles = new double[] { 1.0, 2.5, 5.0, 10.0, 25.0 };
    final List<Object[]> result = new ArrayList<>();
    final Random rdn = new Random(13);
    final int columnCount = 100;
    final int targetCount = 100;
    final List<String> columnNames = IntStream.range(0, columnCount)
            .mapToObj(i -> "sample_" + (i + 1))
            .collect(Collectors.toList());
    final List<Target> targets = IntStream.range(0, targetCount)
            .mapToObj(i -> new Target("target_" + (i + 1)))
            .collect(Collectors.toList());
    for (final double percentile : percentiles) {
        final double[][] counts = new double[columnCount][targetCount];
        for (int i = 0; i < counts.length; i++) {
            for (int j = 0; j < counts[0].length; j++) {
                counts[i][j] = rdn.nextDouble();
            }
        }
        final ReadCountCollection readCounts =
                new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(counts, false));
        result.add(new Object[] { readCounts, percentile });
    }
    return result.toArray(new Object[result.size()][]);
}
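For readers unfamiliar with the Commons Math API used throughout these snippets, here is a minimal, self-contained sketch (the class name, array contents, and percentile level are illustrative, not taken from GATK) showing how Percentile evaluates a level over a plain double array:

import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class PercentileSketch {
    public static void main(String[] args) {
        // Hypothetical counts; in the data provider above they come from Random(13).
        final double[] counts = { 0.1, 0.5, 0.9, 0.3, 0.7 };
        // Percentile takes the level (in the range (0, 100]) in its constructor
        // and evaluates it over the supplied array.
        final double p25 = new Percentile(25.0).evaluate(counts);
        System.out.println("25th percentile = " + p25);
    }
}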
use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.
the class ReadCountCollectionUtilsUnitTest method testTruncateExtremeCounts.
@Test(dataProvider = "readCountAndPercentileData")
public void testTruncateExtremeCounts(final ReadCountCollection readCount, final double percentile) {
    final RealMatrix counts = readCount.counts();
    final double[] allCounts = Stream.of(counts.getData())
            .flatMap(row -> DoubleStream.of(row).boxed())
            .mapToDouble(Double::doubleValue)
            .toArray();
    final double bottom = new Percentile(percentile).evaluate(allCounts);
    final double top = new Percentile(100 - percentile).evaluate(allCounts);
    // Expected result: every count clamped into [bottom, top].
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = DoubleStream.of(counts.getRow(i))
                .map(d -> d < bottom ? bottom : (d > top) ? top : d)
                .toArray();
    }
    ReadCountCollectionUtils.truncateExtremeCounts(readCount, percentile, NULL_LOGGER);
    final RealMatrix newCounts = readCount.counts();
    // Dimensions must be unchanged by truncation.
    Assert.assertEquals(newCounts.getRowDimension(), counts.getRowDimension());
    Assert.assertEquals(newCounts.getColumnDimension(), counts.getColumnDimension());
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j]);
        }
    }
}
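The expected values above come from clamping every count into the [bottom, top] percentile band. Below is a standalone, hedged sketch of that clamping step; the truncate helper is hypothetical and only mirrors the expected-value computation in the test, not the actual ReadCountCollectionUtils.truncateExtremeCounts implementation:

import java.util.Arrays;
import java.util.stream.DoubleStream;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class TruncateSketch {
    // Clamp each value into [p-th percentile, (100 - p)-th percentile] of the input.
    static double[] truncate(final double[] values, final double percentile) {
        final double bottom = new Percentile(percentile).evaluate(values);
        final double top = new Percentile(100 - percentile).evaluate(values);
        return DoubleStream.of(values)
                .map(d -> Math.min(Math.max(d, bottom), top))
                .toArray();
    }

    public static void main(String[] args) {
        // Illustrative values only; the extremes get pulled inside the 25th/75th percentile band.
        final double[] truncated = truncate(new double[] { 0.01, 0.2, 0.5, 0.8, 0.99 }, 25.0);
        System.out.println(Arrays.toString(truncated));
    }
}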
use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method testSubsetTargetToUsableOnes.
@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] targetMedians = IntStream.range(0, counts.getRowDimension())
            .mapToDouble(i -> median.evaluate(counts.getRow(i)))
            .toArray();
    final double threshold = new Percentile(percentile).evaluate(targetMedians);
    final Boolean[] toBeKept = DoubleStream.of(targetMedians)
            .mapToObj(d -> d >= threshold)
            .toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final Pair<ReadCountCollection, double[]> result =
            HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
    Assert.assertEquals(result.getRight().length, toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            final int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
        }
    }
}
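The test mirrors the logic it verifies: compute a median per target (matrix row), take the requested percentile of those medians as a threshold, and keep only the rows at or above it. The following is a self-contained sketch of that row-median filter, assuming nothing beyond Commons Math; the class and method names are illustrative, not GATK's:

import java.util.Arrays;
import java.util.stream.IntStream;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class RowMedianFilterSketch {
    // Return the indices of rows whose median is at or above the given
    // percentile of all row medians.
    static int[] rowsAtOrAboveMedianPercentile(final RealMatrix counts, final double percentile) {
        final Median median = new Median();
        final double[] rowMedians = IntStream.range(0, counts.getRowDimension())
                .mapToDouble(i -> median.evaluate(counts.getRow(i)))
                .toArray();
        final double threshold = new Percentile(percentile).evaluate(rowMedians);
        return IntStream.range(0, rowMedians.length)
                .filter(i -> rowMedians[i] >= threshold)
                .toArray();
    }

    public static void main(String[] args) {
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][] {
                { 0.1, 0.2, 0.3 }, { 0.7, 0.8, 0.9 }, { 0.4, 0.5, 0.6 } });
        System.out.println(Arrays.toString(rowsAtOrAboveMedianPercentile(counts, 25.0)));
    }
}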
use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk-protected by broadinstitute.
the class ReadCountCollectionUtils method removeColumnsWithExtremeMedianCounts.
/**
 * Creates a new read-count collection that is a copy of the input but drops columns with extreme median coverage.
 *
 * @param readCounts the input read-counts.
 * @param extremeColumnMedianCountPercentileThreshold percentile used symmetrically as the exclusion threshold:
 *                                                    columns whose median falls below this percentile or above the
 *                                                    (100 - threshold) percentile are dropped.
 * @param logger logger used to report how many columns were dropped.
 * @return never {@code null}. It might be a reference to the input read-counts if
 * there are no columns to be dropped.
 */
public static ReadCountCollection removeColumnsWithExtremeMedianCounts(final ReadCountCollection readCounts, final double extremeColumnMedianCountPercentileThreshold, final Logger logger) {
    final List<String> columnNames = readCounts.columnNames();
    final RealMatrix counts = readCounts.counts();
    final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);

    // Calculate the bottom and top exclusion thresholds:
    final double bottomExtremeThreshold = new Percentile(extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
    final double topExtremeThreshold = new Percentile(100 - extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);

    // Determine kept and dropped column sets.
    final Set<String> columnsToKeep = new LinkedHashSet<>(readCounts.columnNames().size());
    // Expect roughly 2 * threshold% of the columns to be dropped; cast after multiplying by the
    // column count so the initial capacity is not truncated to zero.
    final int initialMapSize = (int) (2.0 * extremeColumnMedianCountPercentileThreshold / 100.0 * readCounts.columnNames().size());
    final Map<String, Double> columnsToDrop = new LinkedHashMap<>(initialMapSize);
    for (int i = 0; i < columnMedians.length; i++) {
        if (columnMedians[i] >= bottomExtremeThreshold && columnMedians[i] <= topExtremeThreshold) {
            columnsToKeep.add(columnNames.get(i));
        } else {
            columnsToDrop.put(columnNames.get(i), columnMedians[i]);
        }
    }

    // Log and, if applicable, drop the extreme columns.
    if (columnsToKeep.isEmpty()) {
        throw new UserException.BadInput("No column count left after applying the extreme counts outlier filter");
    } else if (columnsToKeep.size() == columnNames.size()) {
        logger.info(String.format("No column dropped due to extreme counts outside [%.10f, %.10f]", bottomExtremeThreshold, topExtremeThreshold));
        return readCounts;
    } else {
        final double droppedPercentage = ((double) (columnsToDrop.size()) / columnNames.size()) * 100;
        logger.info(String.format("Some columns dropped (%d out of %d, %.2f%%) as they are classified as having extreme "
                + "median counts across targets outside [%.10f, %.10f]: e.g. %s",
                columnsToDrop.size(), columnNames.size(), droppedPercentage, bottomExtremeThreshold, topExtremeThreshold,
                columnsToDrop.entrySet().stream().limit(10).map(kv -> kv.getKey() + " (" + kv.getValue() + ")").collect(Collectors.joining(", "))));
        return readCounts.subsetColumns(columnsToKeep);
    }
}
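A hedged usage sketch of the method above, assuming a ReadCountCollection named readCounts and an org.apache.logging.log4j.Logger named logger are already in scope; the 2.5 threshold is only illustrative:

// Drops columns whose median count falls below the 2.5th or above the 97.5th
// percentile of all column medians; readCounts, logger, and the 2.5 value are
// assumptions for this example, not values taken from GATK.
final ReadCountCollection filtered =
        ReadCountCollectionUtils.removeColumnsWithExtremeMedianCounts(readCounts, 2.5, logger);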