Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
The class HDF5PCACoveragePoNCreationUtilsUnitTest, method testSubtractMedianOfMedians.
@Test(dataProvider = "readCountOnlyData")
public void testSubtractMedianOfMedians(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
            .mapToDouble(i -> median.evaluate(counts.getColumn(i)))
            .toArray();
    final double center = median.evaluate(columnMedians);
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] -= center;
        }
    }
    HDF5PCACoveragePoNCreationUtils.subtractMedianOfMedians(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}
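For readers new to the Median API used in this test, here is a minimal, self-contained sketch of the same median-of-medians centering on a small matrix. The class name, matrix values, and printed output are illustrative only, not part of GATK:

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import java.util.stream.IntStream;

public class MedianOfMediansDemo {
    public static void main(String[] args) {
        // Illustrative 3x3 count matrix; values are made up for the demo.
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][] {
                {1.0, 4.0, 7.0},
                {2.0, 5.0, 8.0},
                {3.0, 6.0, 9.0}
        });
        final Median median = new Median();
        // Median of each column, then the median of those column medians.
        final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
                .mapToDouble(c -> median.evaluate(counts.getColumn(c)))
                .toArray();
        final double center = median.evaluate(columnMedians); // here: median of {2, 5, 8} = 5
        // Subtract the global center from every entry, as the test above expects.
        final RealMatrix centered = counts.scalarAdd(-center);
        System.out.println(centered);
    }
}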
Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
The class NormalizeSomaticReadCountsIntegrationTest, method assertPreTangentNormalizedValues.
/**
 * This code reconstructs the expected pre-tangent-normalization counts given the input,
 * and then compares them against the actual pre-tangent output.
 * <p>
 * This method does not use any of the components that do the actual computation
 * in production, so the calculation is independent.
 * </p>
 * <p>
 * It also uses an alternative way of calculating the result, to reduce overlap even further.
 * </p>
 * @param factorNormalized the input (factor-normalized counts).
 * @param preTangentNormalized the actual pre-tangent-normalized output.
 */
private void assertPreTangentNormalizedValues(final ReadCountCollection factorNormalized,
                                              final ReadCountCollection preTangentNormalized) {
    final double epsilon = PCATangentNormalizationUtils.EPSILON;
    final RealMatrix outCounts = preTangentNormalized.counts();
    final RealMatrix inCounts = factorNormalized.counts();
    final double[] columnMeans = GATKProtectedMathUtils.columnMeans(inCounts);
    Assert.assertTrue(DoubleStream.of(columnMeans).allMatch(d -> d < 0.5));
    final double[][] expected = new double[inCounts.getRowDimension()][inCounts.getColumnDimension()];
    final double[] columnValues = new double[inCounts.getRowDimension()];
    for (int i = 0; i < columnMeans.length; i++) {
        for (int j = 0; j < inCounts.getRowDimension(); j++) {
            final double inValue = inCounts.getEntry(j, i);
            final double lowBoundedInValue = Math.max(epsilon, inValue / columnMeans[i]);
            final double outValue = Math.log(lowBoundedInValue) / Math.log(2);
            expected[j][i] = outValue;
            columnValues[j] = outValue;
        }
        Arrays.sort(columnValues);
        final int midIndex = columnValues.length >> 1;
        // For an even count, the median averages the two middle elements of the
        // sorted array, at indices midIndex - 1 and midIndex.
        final double median = columnValues.length % 2 == 1
                ? columnValues[midIndex]
                : (columnValues[midIndex - 1] + columnValues[midIndex]) * .5;
        for (int j = 0; j < inCounts.getRowDimension(); j++) {
            expected[j][i] -= median;
            Assert.assertEquals(outCounts.getEntry(j, i), expected[j][i], 0.000001, " Row " + j + " col " + i);
        }
    }
}
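The sort-based median above can be cross-checked against Commons Math's Median class, which agrees with it for both odd and even lengths. A minimal sketch; the class name and values are illustrative:

import java.util.Arrays;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class MedianCheck {
    // The same sort-based median used in the test above.
    static double sortMedian(double[] values) {
        final double[] sorted = values.clone();
        Arrays.sort(sorted);
        final int midIndex = sorted.length >> 1;
        return sorted.length % 2 == 1
                ? sorted[midIndex]
                : (sorted[midIndex - 1] + sorted[midIndex]) * .5;
    }

    public static void main(String[] args) {
        final Median median = new Median();
        final double[] odd = {3.0, 1.0, 2.0};       // median 2.0
        final double[] even = {4.0, 1.0, 3.0, 2.0}; // median 2.5
        System.out.println(sortMedian(odd) + " == " + median.evaluate(odd));
        System.out.println(sortMedian(even) + " == " + median.evaluate(even));
    }
}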
Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
The class CalculateContamination, method findConfidentHomAltSites.
private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }
    final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
    final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();
    for (final PileupSummary site : sites) {
        final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(),
                Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
        final List<PileupSummary> nearbySites = tc.targets(nearbySpan);
        final double averageNearbyCopyRatio = nearbySites.stream()
                .mapToDouble(s -> s.getTotalCount() / averageCoverage)
                .average().orElseGet(() -> 0);
        smoothedCopyRatios.add(averageNearbyCopyRatio);
        final double expectedNumberOfNearbyHets = nearbySites.stream()
                .mapToDouble(PileupSummary::getAlleleFrequency)
                .map(x -> 2 * x * (1 - x))
                .sum();
        final long numberOfNearbyHets = nearbySites.stream()
                .mapToDouble(PileupSummary::getAltFraction)
                .filter(x -> 0.4 < x && x < 0.6)
                .count();
        final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
        hetRatios.add(hetRatio);
    }
    final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
    final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size())
            .filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio
                    || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio)
            .boxed().collect(Collectors.toList());
    final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size())
            .filter(n -> hetRatios.get(n) < meanHetRatio * 0.5)
            .boxed().collect(Collectors.toList());
    //TODO: as extra security, filter out sites that are near too many hom alts
    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity",
            indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f",
            CNV_SCALE, meanHetRatio));
    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(indicesWithAnomalousCopyRatio);
    badSites.addAll(indicesWithLossOfHeterozygosity);
    return IntStream.range(0, sites.size())
            .filter(n -> !badSites.contains(n))
            .mapToObj(sites::get)
            .filter(s -> s.getAltFraction() > 0.8)
            .collect(Collectors.toList());
}
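The median-based outlier filter at the heart of this method can be shown in isolation: compute the median of the smoothed copy ratios with Median, then flag indices outside the 0.8x to 2x band around it. A minimal sketch; the class name and copy-ratio values are illustrative:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class CopyRatioFilterDemo {
    public static void main(String[] args) {
        // Illustrative smoothed copy ratios; values are made up for the demo.
        final List<Double> smoothedCopyRatios = Arrays.asList(0.95, 1.0, 1.05, 0.4, 2.5);
        final double medianRatio = new Median()
                .evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
        // Flag indices outside [0.8 * median, 2 * median], as in findConfidentHomAltSites.
        final List<Integer> anomalous = IntStream.range(0, smoothedCopyRatios.size())
                .filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianRatio
                        || smoothedCopyRatios.get(n) > 2 * medianRatio)
                .boxed()
                .collect(Collectors.toList());
        System.out.println("median = " + medianRatio + ", anomalous indices = " + anomalous);
    }
}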
Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.
The class HDF5PCACoveragePoNCreationUtils, method subsetReadCountsToUsableTargets.
/**
 * Subsets the targets in the input read counts to the usable ones, based on the percentile
 * threshold indicated by the user.
 *
 * <p>
 * It returns a pair of objects: the left is the updated read counts containing only the usable
 * targets, and the right is the corresponding array of target factors.
 * </p>
 *
 * @param readCounts the input read counts.
 * @param targetFactorPercentileThreshold the percentile of the target factors below which a target is considered unusable and discarded.
 * @return never {@code null}.
 */
@VisibleForTesting
static Pair<ReadCountCollection, double[]> subsetReadCountsToUsableTargets(final ReadCountCollection readCounts,
                                                                           final double targetFactorPercentileThreshold,
                                                                           final Logger logger) {
    final double[] targetFactors = calculateTargetFactors(readCounts);
    final double threshold = new Percentile(targetFactorPercentileThreshold).evaluate(targetFactors);
    final List<Target> targetByIndex = readCounts.targets();
    final Set<Target> result = IntStream.range(0, targetFactors.length)
            .filter(i -> targetFactors[i] >= threshold)
            .mapToObj(targetByIndex::get)
            .collect(Collectors.toCollection(LinkedHashSet::new));
    if (result.size() == targetByIndex.size()) {
        logger.info(String.format("All %d targets are kept", targetByIndex.size()));
        return new ImmutablePair<>(readCounts, targetFactors);
    } else {
        final int discardedCount = targetFactors.length - result.size();
        logger.info(String.format("Discarded %d target(s) out of %d with factors below %.2g (%.2f percentile)",
                discardedCount, targetFactors.length, threshold, targetFactorPercentileThreshold));
        final double[] targetFactorSubset = DoubleStream.of(targetFactors).filter(f -> f >= threshold).toArray();
        return new ImmutablePair<>(readCounts.subsetTargets(result), targetFactorSubset);
    }
}
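The Percentile usage here generalizes the Median pattern: new Percentile(p).evaluate(values) returns the value at the p-th percentile of the array, which the method then uses as a filtering cutoff. A minimal sketch; the class name, factor values, and the 25.0 threshold are illustrative:

import java.util.stream.DoubleStream;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class PercentileFilterDemo {
    public static void main(String[] args) {
        // Illustrative target factors (e.g., per-target median coverage); values are made up.
        final double[] targetFactors = {0.1, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
        final double percentileThreshold = 25.0; // keep targets at or above the 25th percentile
        final double threshold = new Percentile(percentileThreshold).evaluate(targetFactors);
        // Same filter shape as subsetReadCountsToUsableTargets: keep factors >= cutoff.
        final double[] usable = DoubleStream.of(targetFactors).filter(f -> f >= threshold).toArray();
        System.out.printf("Cutoff %.2f keeps %d of %d targets%n", threshold, usable.length, targetFactors.length);
    }
}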