use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
the class ReadCountCollectionUtils method imputeZeroCountsAsTargetMedians.
/**
* Impute zero counts to the median of non-zero values in the enclosing target row.
*
* <p>The imputation is done in-place, thus the input matrix is well be modified as a result of this call.</p>
*
* @param readCounts the input and output read-count matrix.
*/
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
final RealMatrix counts = readCounts.counts();
final int targetCount = counts.getRowDimension();
final Median medianCalculator = new Median();
int totalCounts = counts.getColumnDimension() * counts.getRowDimension();
// Get the number of zeroes contained in the counts.
final long totalZeroCounts = IntStream.range(0, targetCount).mapToLong(t -> DoubleStream.of(counts.getRow(t)).filter(c -> c == 0.0).count()).sum();
// Get the median of each row, not including any entries that are zero.
final double[] medians = IntStream.range(0, targetCount).mapToDouble(t -> medianCalculator.evaluate(DoubleStream.of(counts.getRow(t)).filter(c -> c != 0.0).toArray())).toArray();
// Change any zeros in the counts to the median for the row.
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return value != 0 ? value : medians[row];
}
});
if (totalZeroCounts > 0) {
final double totalZeroCountsPercentage = 100.0 * (totalZeroCounts / totalCounts);
logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target " + "non-zero median", totalZeroCounts, totalZeroCounts, totalZeroCountsPercentage));
} else {
logger.info("No count is 0.0 thus no count needed to be imputed.");
}
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method testNormalizeAndLogReadCounts.
@Test(dataProvider = "readCountOnlyData")
public void testNormalizeAndLogReadCounts(final ReadCountCollection readCounts) {
final RealMatrix counts = readCounts.counts();
final Median median = new Median();
final double[] columnMedians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(i -> median.evaluate(counts.getColumn(i))).toArray();
final double epsilon = HDF5PCACoveragePoNCreationUtils.EPSILON;
final double[][] expected = new double[counts.getRowDimension()][];
for (int i = 0; i < expected.length; i++) {
expected[i] = counts.getRow(i).clone();
for (int j = 0; j < expected[i].length; j++) {
expected[i][j] /= columnMedians[j];
if (expected[i][j] < epsilon) {
expected[i][j] = epsilon;
}
expected[i][j] = Math.log(expected[i][j]) / Math.log(2);
}
}
HDF5PCACoveragePoNCreationUtils.normalizeAndLogReadCounts(readCounts, NULL_LOGGER);
final RealMatrix newCounts = readCounts.counts();
Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
Assert.assertEquals(newCounts.getRowDimension(), expected.length);
for (int i = 0; i < expected.length; i++) {
for (int j = 0; j < expected[i].length; j++) {
Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
}
}
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method testSubsetTargetToUsableOnes.
@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
final Median median = new Median();
final RealMatrix counts = readCount.counts();
final double[] targetMedians = IntStream.range(0, counts.getRowDimension()).mapToDouble(i -> median.evaluate(counts.getRow(i))).toArray();
final double threshold = new Percentile(percentile).evaluate(targetMedians);
final Boolean[] toBeKept = DoubleStream.of(targetMedians).mapToObj(d -> d >= threshold).toArray(Boolean[]::new);
final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
Assert.assertEquals(result.getRight().length, toBeKeptCount);
int nextIndex = 0;
for (int i = 0; i < toBeKept.length; i++) {
if (toBeKept[i]) {
int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
Assert.assertEquals(index, nextIndex++);
Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
Assert.assertEquals(result.getRight()[index], targetMedians[i]);
} else {
Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
}
}
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtils method normalizeAndLogReadCounts.
/**
* Final pre-panel normalization that consists of dividing all counts by the median of
* its column and log it with base 2.
* <p>
* The normalization occurs in-place.
* </p>
*
* @param readCounts the input counts to normalize.
*/
@VisibleForTesting
static void normalizeAndLogReadCounts(final ReadCountCollection readCounts, final Logger logger) {
final RealMatrix counts = readCounts.counts();
final Median medianCalculator = new Median();
final double[] medians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(col -> medianCalculator.evaluate(counts.getColumn(col))).toArray();
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return Math.log(Math.max(EPSILON, value / medians[column])) * INV_LN_2;
}
});
logger.info("Counts normalized by the column median and log2'd.");
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtils method subtractMedianOfMedians.
/**
* Calculates the median of column medians and subtract it from all counts.
* @param readCounts the input counts to center.
* @return the median of medians that has been subtracted from all counts.
*/
@VisibleForTesting
static double subtractMedianOfMedians(final ReadCountCollection readCounts, final Logger logger) {
final RealMatrix counts = readCounts.counts();
final Median medianCalculator = new Median();
final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
final double medianOfMedians = medianCalculator.evaluate(columnMedians);
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return value - medianOfMedians;
}
});
logger.info(String.format("Counts centered around the median of medians %.2f", medianOfMedians));
return medianOfMedians;
}
Aggregations