use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class MatrixSummaryUtils method getColumnMedians.
/**
* Return an array containing the median for each column in the given matrix.
* @param m Not {@code null}. Size MxN, where neither dimension is zero. If any entry is NaN, it is disregarded
* in the calculation.
* @return array of size N. Never {@code null}
*/
public static double[] getColumnMedians(final RealMatrix m) {
Utils.nonNull(m, "Cannot calculate medians on a null matrix.");
final Median medianCalculator = new Median();
return IntStream.range(0, m.getColumnDimension()).boxed().mapToDouble(i -> medianCalculator.evaluate(m.getColumn(i))).toArray();
}
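A minimal usage sketch, not part of the GATK sources: it assumes MatrixSummaryUtils is on the classpath and builds a small matrix with the standard Commons Math Array2DRowRealMatrix constructor.
import java.util.Arrays;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

public class ColumnMediansExample {
    public static void main(String[] args) {
        // A 3x2 matrix: the column medians are 2.0 and 20.0.
        final RealMatrix m = new Array2DRowRealMatrix(new double[][] {
                {1.0, 10.0},
                {2.0, 20.0},
                {3.0, 30.0}
        });
        final double[] medians = MatrixSummaryUtils.getColumnMedians(m);
        System.out.println(Arrays.toString(medians)); // [2.0, 20.0]
    }
}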
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class ReadCountCollectionUtils method removeColumnsWithExtremeMedianCounts.
/**
* Creates a new read-count collection that is a copy of the input but dropping columns with extreme median coverage.
*
* @param readCounts the input read-counts.
* @param extremeColumnMedianCountPercentileThreshold bottom percentile to use as an exclusion threshold.
* @return never {@code null}. It might be a reference to the input read-counts if
* there are no columns to be dropped.
*/
public static ReadCountCollection removeColumnsWithExtremeMedianCounts(final ReadCountCollection readCounts, final double extremeColumnMedianCountPercentileThreshold, final Logger logger) {
final List<String> columnNames = readCounts.columnNames();
final RealMatrix counts = readCounts.counts();
final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
// Calculate thresholds:
final double bottomExtremeThreshold = new Percentile(extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
final double topExtremeThreshold = new Percentile(100 - extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
// Determine kept and dropped column sets.
final Set<String> columnsToKeep = new LinkedHashSet<>(readCounts.columnNames().size());
// Size hint: roughly the expected number of dropped columns (a fraction from each tail).
final int initialMapSize = (int) (2.0 * extremeColumnMedianCountPercentileThreshold / 100.0 * readCounts.columnNames().size());
final Map<String, Double> columnsToDrop = new LinkedHashMap<>(initialMapSize);
for (int i = 0; i < columnMedians.length; i++) {
if (columnMedians[i] >= bottomExtremeThreshold && columnMedians[i] <= topExtremeThreshold) {
columnsToKeep.add(columnNames.get(i));
} else {
columnsToDrop.put(columnNames.get(i), columnMedians[i]);
}
}
// Log and drop columns, if applicable.
if (columnsToKeep.isEmpty()) {
throw new UserException.BadInput("No column count left after applying the extreme counts outlier filter");
} else if (columnsToKeep.size() == columnNames.size()) {
logger.info(String.format("No column dropped due to extreme counts outside [%.10f, %.10f]", bottomExtremeThreshold, topExtremeThreshold));
return readCounts;
} else {
final double droppedPercentage = ((double) (columnsToDrop.size()) / columnNames.size()) * 100;
logger.info(String.format("Some columns dropped (%d out of %d, %.2f%%) as they are classified as having extreme " + "median counts across targets outside [%.10f, %.10f]: e.g. %s", columnsToDrop.size(), columnNames.size(), droppedPercentage, bottomExtremeThreshold, topExtremeThreshold, columnsToDrop.entrySet().stream().limit(10).map(kv -> kv.getKey() + " (" + kv.getValue() + ")").collect(Collectors.joining(", "))));
return readCounts.subsetColumns(columnsToKeep);
}
}
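To make the threshold computation concrete, here is a standalone sketch (toy numbers, not from GATK) of the symmetric Percentile cut-offs used above, with a hypothetical threshold of 10: with Commons Math's default (legacy) estimation the bottom and top thresholds evaluate to 1.1 and 90.9, so the columns whose medians are 1.0 and 100.0 would be dropped.
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class ExtremeMedianThresholdExample {
    public static void main(String[] args) {
        final double[] columnMedians = {1, 2, 3, 4, 5, 6, 7, 8, 9, 100};
        final double p = 10.0; // illustrative percentile threshold
        final double bottom = new Percentile(p).evaluate(columnMedians); // 1.1
        final double top = new Percentile(100 - p).evaluate(columnMedians); // 90.9
        // Columns whose median falls outside [bottom, top] would be dropped:
        // here the columns with medians 1.0 and 100.0.
        System.out.printf("keep medians in [%.1f, %.1f]%n", bottom, top);
    }
}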
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class ReadCountCollectionUtils method imputeZeroCountsAsTargetMedians.
/**
* Impute zero counts to the median of non-zero values in the enclosing target row.
*
* <p>The imputation is done in-place; thus, the input matrix will be modified as a result of this call.</p>
*
* @param readCounts the input and output read-count matrix.
*/
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
final RealMatrix counts = readCounts.counts();
final int targetCount = counts.getRowDimension();
final Median medianCalculator = new Median();
final long totalCounts = (long) counts.getColumnDimension() * counts.getRowDimension();
// Get the number of zeroes contained in the counts.
final long totalZeroCounts = IntStream.range(0, targetCount).mapToLong(t -> DoubleStream.of(counts.getRow(t)).filter(c -> c == 0.0).count()).sum();
// Get the median of each row, not including any entries that are zero.
final double[] medians = IntStream.range(0, targetCount).mapToDouble(t -> medianCalculator.evaluate(DoubleStream.of(counts.getRow(t)).filter(c -> c != 0.0).toArray())).toArray();
// Change any zeros in the counts to the median for the row.
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return value != 0 ? value : medians[row];
}
});
if (totalZeroCounts > 0) {
final double totalZeroCountsPercentage = 100.0 * totalZeroCounts / totalCounts;
logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target " + "non-zero median", totalZeroCounts, totalCounts, totalZeroCountsPercentage));
} else {
logger.info("No count is 0.0 thus no count needed to be imputed.");
}
}
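The per-row imputation boils down to one line of Commons Math plus a scan; a minimal sketch on a single toy row (not from GATK):
import java.util.stream.DoubleStream;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class ZeroImputationExample {
    public static void main(String[] args) {
        final double[] row = {0.0, 4.0, 6.0, 0.0, 10.0};
        // Median of the non-zero entries {4.0, 6.0, 10.0} is 6.0.
        final double nonZeroMedian = new Median().evaluate(DoubleStream.of(row).filter(c -> c != 0.0).toArray());
        for (int i = 0; i < row.length; i++) {
            if (row[i] == 0.0) {
                row[i] = nonZeroMedian;
            }
        }
        // row is now {6.0, 4.0, 6.0, 6.0, 10.0}
    }
}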
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
the class NormalizeSomaticReadCountsIntegrationTest method assertPreTangentNormalizedValues.
/**
* This code reconstructs the expected pre-tangent normalization counts given the input,
* and then compares against the actual pre-tangent output.
* <p>
* This method does not use any of the components that do the actual computation
* in production, so that the calculation is independent.
* </p>
* <p>
* This code also uses an alternative way to calculate the median, to reduce overlap even further.
* </p>
* @param preTangentNormalized actual output.
* @param factorNormalized input.
*/
private void assertPreTangentNormalizedValues(final ReadCountCollection factorNormalized, final ReadCountCollection preTangentNormalized) {
final double epsilon = PCATangentNormalizationUtils.EPSILON;
final RealMatrix outCounts = preTangentNormalized.counts();
final RealMatrix inCounts = factorNormalized.counts();
final double[] columnMeans = GATKProtectedMathUtils.columnMeans(inCounts);
Assert.assertTrue(DoubleStream.of(columnMeans).allMatch(d -> d < 0.5));
final double[][] expected = new double[inCounts.getRowDimension()][inCounts.getColumnDimension()];
final double[] columnValues = new double[inCounts.getRowDimension()];
for (int i = 0; i < columnMeans.length; i++) {
for (int j = 0; j < inCounts.getRowDimension(); j++) {
final double inValue = inCounts.getEntry(j, i);
final double lowBoundedInValue = Math.max(epsilon, inValue / columnMeans[i]);
final double outValue = Math.log(lowBoundedInValue) / Math.log(2);
expected[j][i] = outValue;
columnValues[j] = outValue;
}
Arrays.sort(columnValues);
final int midIndex = columnValues.length >> 1;
final double median = columnValues.length % 2 == 1 ? columnValues[midIndex] : (columnValues[midIndex - 1] + columnValues[midIndex]) * .5;
for (int j = 0; j < inCounts.getRowDimension(); j++) {
expected[j][i] -= median;
Assert.assertEquals(outCounts.getEntry(j, i), expected[j][i], 0.000001, " Row " + j + " col " + i);
}
}
}
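As a sanity check on the hand-rolled sort-and-pick-middle median above, the following sketch (not part of the test) verifies that it agrees with Commons Math's Median on an even-length array, where Median's default legacy estimation also averages the two middle order statistics:
import java.util.Arrays;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class MedianAgreementCheck {
    public static void main(String[] args) {
        final double[] values = {3.0, 1.0, 4.0, 1.5}; // even-length case
        final double[] sorted = values.clone();
        Arrays.sort(sorted);
        final int midIndex = sorted.length >> 1;
        final double manual = sorted.length % 2 == 1 ? sorted[midIndex] : (sorted[midIndex - 1] + sorted[midIndex]) * .5; // (1.5 + 3.0) / 2 = 2.25
        final double reference = new Median().evaluate(values); // also 2.25
        System.out.println(manual == reference); // true
    }
}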
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class CalculateContamination method findConfidentHomAltSites.
private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
if (sites.isEmpty()) {
return new ArrayList<>();
}
final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
final List<Double> smoothedCopyRatios = new ArrayList<>();
final List<Double> hetRatios = new ArrayList<>();
for (final PileupSummary site : sites) {
final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
final List<PileupSummary> nearbySites = tc.targets(nearbySpan);
final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount() / averageCoverage).average().orElseGet(() -> 0);
smoothedCopyRatios.add(averageNearbyCopyRatio);
final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2 * x * (1 - x)).sum();
final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
hetRatios.add(hetRatio);
}
final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size()).filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio).boxed().collect(Collectors.toList());
final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size()).filter(n -> hetRatios.get(n) < meanHetRatio * 0.5).boxed().collect(Collectors.toList());
//TODO: as extra security, filter out sites that are near too many hom alts
logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity", indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));
final Set<Integer> badSites = new TreeSet<>();
badSites.addAll(indicesWithAnomalousCopyRatio);
badSites.addAll(indicesWithLossOfHeterozygosity);
return IntStream.range(0, sites.size()).filter(n -> !badSites.contains(n)).mapToObj(sites::get).filter(s -> s.getAltFraction() > 0.8).collect(Collectors.toList());
}
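The median-based copy-ratio filter can be isolated into a few lines; a sketch with toy numbers (not from GATK) that mirrors the indicesWithAnomalousCopyRatio computation above:
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class CopyRatioOutlierExample {
    public static void main(String[] args) {
        final List<Double> smoothedCopyRatios = Arrays.asList(1.0, 1.1, 0.2, 0.9, 3.5);
        final double median = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray()); // 1.0
        // Flag indices whose smoothed copy ratio falls outside [0.8 * median, 2 * median].
        final List<Integer> anomalous = IntStream.range(0, smoothedCopyRatios.size())
                .filter(n -> smoothedCopyRatios.get(n) < 0.8 * median || smoothedCopyRatios.get(n) > 2 * median)
                .boxed()
                .collect(Collectors.toList());
        System.out.println(anomalous); // [2, 4]
    }
}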