Example usage of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute, from class HDF5PCACoveragePoNCreationUtilsUnitTest, method testNormalizeAndLogReadCounts.
@Test(dataProvider = "readCountOnlyData")
public void testNormalizeAndLogReadCounts(final ReadCountCollection readCounts) {
    final RealMatrix originalCounts = readCounts.counts();
    final int numRows = originalCounts.getRowDimension();
    final int numColumns = originalCounts.getColumnDimension();
    // Per-column medians used as the normalization factors.
    final Median medianCalculator = new Median();
    final double[] medians = new double[numColumns];
    for (int col = 0; col < numColumns; col++) {
        medians[col] = medianCalculator.evaluate(originalCounts.getColumn(col));
    }
    final double floor = HDF5PCACoveragePoNCreationUtils.EPSILON;
    // Expected result: divide by the column median, clamp below at epsilon, then log2-transform.
    final double[][] expected = new double[numRows][numColumns];
    for (int row = 0; row < numRows; row++) {
        for (int col = 0; col < numColumns; col++) {
            final double normalized = Math.max(originalCounts.getEntry(row, col) / medians[col], floor);
            expected[row][col] = Math.log(normalized) / Math.log(2);
        }
    }
    HDF5PCACoveragePoNCreationUtils.normalizeAndLogReadCounts(readCounts, NULL_LOGGER);
    // The collection must have been transformed in place to match the expected matrix.
    final RealMatrix actual = readCounts.counts();
    Assert.assertEquals(actual.getColumnDimension(), numColumns);
    Assert.assertEquals(actual.getRowDimension(), numRows);
    for (int row = 0; row < numRows; row++) {
        for (int col = 0; col < numColumns; col++) {
            Assert.assertEquals(actual.getEntry(row, col), expected[row][col], 0.000001);
        }
    }
}
Example usage of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute, from class HDF5PCACoveragePoNCreationUtilsUnitTest, method testSubsetTargetToUsableOnes.
@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final RealMatrix counts = readCount.counts();
    final int targetCount = counts.getRowDimension();
    // Per-target (row) medians across all count columns.
    final Median medianCalculator = new Median();
    final double[] rowMedians = new double[targetCount];
    for (int t = 0; t < targetCount; t++) {
        rowMedians[t] = medianCalculator.evaluate(counts.getRow(t));
    }
    // Targets whose median is at or above the percentile threshold are expected to survive.
    final double threshold = new Percentile(percentile).evaluate(rowMedians);
    final boolean[] keep = new boolean[targetCount];
    int keptCount = 0;
    for (int t = 0; t < targetCount; t++) {
        keep[t] = rowMedians[t] >= threshold;
        if (keep[t]) {
            keptCount++;
        }
    }
    final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), keptCount);
    Assert.assertEquals(result.getRight().length, keptCount);
    // Kept targets must appear in their original relative order; dropped targets must be absent.
    int expectedIndex = 0;
    for (int t = 0; t < targetCount; t++) {
        final int index = result.getLeft().targets().indexOf(readCount.targets().get(t));
        if (keep[t]) {
            Assert.assertEquals(index, expectedIndex++);
            Assert.assertEquals(counts.getRow(t), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], rowMedians[t]);
        } else {
            Assert.assertEquals(index, -1);
        }
    }
}
Example usage of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute, from class PCATangentNormalizationUtils, method composeTangentNormalizationInputMatrix.
/**
 * Prepares the data to perform tangent normalization.
 * <p>
 * This is done by count group or column:
 * <ol>
 * <li>we divide counts by the column mean,</li>
 * <li>then we transform value to their log_2,</li>
 * <li>and finally we center them around the median.</li>
 * </ol>
 * </p>
 *
 * @param matrix input matrix; it is not modified.
 * @return never {@code null}.
 */
private static RealMatrix composeTangentNormalizationInputMatrix(final RealMatrix matrix) {
    final RealMatrix result = matrix.copy();
    // step 1: divide by column means and log_2 transform (truncated at the epsilon floor)
    final double[] columnMeans = GATKProtectedMathUtils.columnMeans(matrix);
    result.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return truncatedLog2(value / columnMeans[column]);
        }
    });
    // step 2: subtract column medians; a single Median instance is reused across columns
    // instead of allocating a fresh evaluator per column.
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, matrix.getColumnDimension())
            .mapToDouble(c -> median.evaluate(result.getColumn(c)))
            .toArray();
    result.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value - columnMedians[column];
        }
    });
    return result;
}
Example usage of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute, from class TargetCoverageSexGenotypeCalculator, method getSampleReadDepthFromAutosomalTargets.
/**
 * Estimates read depth per target per homolog for a given sample index in the collection.
 *
 * @param sampleIndex integer index of the sample in the read count collection
 * @return read depth per target per homolog
 */
private double getSampleReadDepthFromAutosomalTargets(final int sampleIndex) {
    final double[] autosomalCounts = processedReadCounts.getColumnOnSpecifiedTargets(sampleIndex, autosomalTargetList, false);
    // Normalize each autosomal target's count by its ploidy before taking the median.
    final double[] perHomologCounts = new double[autosomalCounts.length];
    for (int t = 0; t < perHomologCounts.length; t++) {
        perHomologCounts[t] = autosomalCounts[t] / (double) autosomalTargetPloidies[t];
    }
    return new Median().evaluate(perHomologCounts);
}
Example usage of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute, from class ReadCountCollectionUtils, method removeColumnsWithExtremeMedianCounts.
/**
 * Creates a new read-count collection that is a copy of the input but dropping columns with extreme median coverage.
 *
 * @param readCounts the input read-counts.
 * @param extremeColumnMedianCountPercentileThreshold bottom percentile to use as an exclusion threshold.
 * @param logger destination for the summary of kept/dropped columns.
 * @return never {@code null}. It might be a reference to the input read-counts if
 * there are no columns to be dropped.
 * @throws UserException.BadInput if every column is classified as an extreme outlier.
 */
public static ReadCountCollection removeColumnsWithExtremeMedianCounts(final ReadCountCollection readCounts, final double extremeColumnMedianCountPercentileThreshold, final Logger logger) {
    final List<String> columnNames = readCounts.columnNames();
    final RealMatrix counts = readCounts.counts();
    final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
    // Calculate inclusive thresholds; medians outside [bottom, top] are extreme.
    final double bottomExtremeThreshold = new Percentile(extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
    final double topExtremeThreshold = new Percentile(100 - extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
    // Determine kept and dropped column sets.
    final Set<String> columnsToKeep = new LinkedHashSet<>(columnNames.size());
    // Expected drop count is roughly 2 * threshold% of the columns. The cast must happen
    // AFTER multiplying by the column count: the previous form ((int)(2.0 * t / 100.0)) * n
    // truncated the fraction to 0 for any threshold below 50, making the capacity hint useless.
    final int initialMapSize = (int) (2.0 * extremeColumnMedianCountPercentileThreshold / 100.0 * columnNames.size());
    final Map<String, Double> columnsToDrop = new LinkedHashMap<>(initialMapSize);
    for (int i = 0; i < columnMedians.length; i++) {
        if (columnMedians[i] >= bottomExtremeThreshold && columnMedians[i] <= topExtremeThreshold) {
            columnsToKeep.add(columnNames.get(i));
        } else {
            columnsToDrop.put(columnNames.get(i), columnMedians[i]);
        }
    }
    // log and drop columns if it applies
    if (columnsToKeep.isEmpty()) {
        throw new UserException.BadInput("No column count left after applying the extreme counts outlier filter");
    } else if (columnsToKeep.size() == columnNames.size()) {
        logger.info(String.format("No column dropped due to extreme counts outside [%.10f, %.10f]", bottomExtremeThreshold, topExtremeThreshold));
        return readCounts;
    } else {
        final double droppedPercentage = ((double) (columnsToDrop.size()) / columnNames.size()) * 100;
        logger.info(String.format("Some columns dropped (%d out of %d, %.2f%%) as they are classified as having extreme " + "median counts across targets outside [%.10f, %.10f]: e.g. %s", columnsToDrop.size(), columnNames.size(), droppedPercentage, bottomExtremeThreshold, topExtremeThreshold, columnsToDrop.entrySet().stream().limit(10).map(kv -> kv.getKey() + " (" + kv.getValue() + ")").collect(Collectors.joining(", "))));
        return readCounts.subsetColumns(columnsToKeep);
    }
}
Aggregations