use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class CoveragePoNQCUtils method hasSuspiciousContigs.
/**
* Given a single sample tangent normalization (or other coverage profile), determine whether any contig looks like
* it has an arm level event (defined as 25% (or more) of the contig amplified/deleted)
*
* @param singleSampleTangentNormalized Tangent normalized data for a single sample.
* @return never {@code null}
*/
private static Boolean hasSuspiciousContigs(final ReadCountCollection singleSampleTangentNormalized, final Map<String, Double> contigToMedian) {
final List<String> allContigsPresent = retrieveAllContigsPresent(singleSampleTangentNormalized);
for (String contig : allContigsPresent) {
final ReadCountCollection oneContigReadCountCollection = singleSampleTangentNormalized.subsetTargets(singleSampleTangentNormalized.targets().stream().filter(t -> t.getContig().equals(contig)).collect(Collectors.toSet()));
final RealVector counts = oneContigReadCountCollection.counts().getColumnVector(0);
for (int i = 0; i < 4; i++) {
final RealVector partitionCounts = counts.getSubVector(i * counts.getDimension() / 4, counts.getDimension() / 4);
final double[] partitionArray = DoubleStream.of(partitionCounts.toArray()).map(d -> Math.pow(2, d)).sorted().toArray();
double median = new Median().evaluate(partitionArray);
final double medianShiftInCRSpace = contigToMedian.getOrDefault(contig, 1.0) - 1.0;
median -= medianShiftInCRSpace;
if ((median > AMP_THRESHOLD) || (median < DEL_THRESHOLD)) {
logger.info("Suspicious contig: " + singleSampleTangentNormalized.columnNames().get(0) + " " + contig + " (" + median + " -- " + i + ")");
return true;
}
}
}
return false;
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class CoveragePoNQCUtils method getContigToMedianCRMap.
@VisibleForTesting
static Map<String, Double> getContigToMedianCRMap(final ReadCountCollection readCountCollection) {
final List<String> allContigsPresent = retrieveAllContigsPresent(readCountCollection);
final Map<String, Double> contigToMedian = new LinkedHashMap<>();
for (String contig : allContigsPresent) {
final ReadCountCollection oneContigReadCountCollection = readCountCollection.subsetTargets(readCountCollection.targets().stream().filter(t -> t.getContig().equals(contig)).collect(Collectors.toSet()));
final double[] flatCounts = Doubles.concat(oneContigReadCountCollection.counts().getData());
// Put into CRSpace
final double[] flatCountsInCRSpace = DoubleStream.of(flatCounts).map(d -> Math.pow(2, d)).toArray();
contigToMedian.put(contig, new Median().evaluate(flatCountsInCRSpace));
}
return contigToMedian;
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class HDF5PCACoveragePoNCreationUtils method subsetReadCountsToUsableTargets.
/**
* Subsets targets in the input count to the usable ones based on the percentile threshold indicated
* by the user.
*
* <p>
* It returns a pair of object, where the left one is the updated read-counts with only the usable
* targets, and the right one is the corresponding target factors.
* </p>
*
* @param readCounts the input read-counts.
* @param targetFactorPercentileThreshold the minimum median count percentile under which targets are not considered useful.
* @return never {@code null}.
*/
@VisibleForTesting
static Pair<ReadCountCollection, double[]> subsetReadCountsToUsableTargets(final ReadCountCollection readCounts, final double targetFactorPercentileThreshold, final Logger logger) {
final double[] targetFactors = calculateTargetFactors(readCounts);
final double threshold = new Percentile(targetFactorPercentileThreshold).evaluate(targetFactors);
final List<Target> targetByIndex = readCounts.targets();
final Set<Target> result = IntStream.range(0, targetFactors.length).filter(i -> targetFactors[i] >= threshold).mapToObj(targetByIndex::get).collect(Collectors.toCollection(LinkedHashSet::new));
if (result.size() == targetByIndex.size()) {
logger.info(String.format("All %d targets are kept", targetByIndex.size()));
return new ImmutablePair<>(readCounts, targetFactors);
} else {
final int discardedCount = targetFactors.length - result.size();
logger.info(String.format("Discarded %d target(s) out of %d with factors below %.2g (%.2f percentile)", discardedCount, targetFactors.length, threshold, targetFactorPercentileThreshold));
final double[] targetFactorSubset = DoubleStream.of(targetFactors).filter(i -> i >= threshold).toArray();
return new ImmutablePair<>(readCounts.subsetTargets(result), targetFactorSubset);
}
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class HDF5PCACoveragePoNCreationUtils method normalizeAndLogReadCounts.
/**
* Final pre-panel normalization that consists of dividing all counts by the median of
* its column and log it with base 2.
* <p>
* The normalization occurs in-place.
* </p>
*
* @param readCounts the input counts to normalize.
*/
@VisibleForTesting
static void normalizeAndLogReadCounts(final ReadCountCollection readCounts, final Logger logger) {
final RealMatrix counts = readCounts.counts();
final Median medianCalculator = new Median();
final double[] medians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(col -> medianCalculator.evaluate(counts.getColumn(col))).toArray();
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return Math.log(Math.max(EPSILON, value / medians[column])) * INV_LN_2;
}
});
logger.info("Counts normalized by the column median and log2'd.");
}
use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.
the class HDF5PCACoveragePoNCreationUtils method subtractMedianOfMedians.
/**
* Calculates the median of column medians and subtract it from all counts.
* @param readCounts the input counts to center.
* @return the median of medians that has been subtracted from all counts.
*/
@VisibleForTesting
static double subtractMedianOfMedians(final ReadCountCollection readCounts, final Logger logger) {
final RealMatrix counts = readCounts.counts();
final Median medianCalculator = new Median();
final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
final double medianOfMedians = medianCalculator.evaluate(columnMedians);
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int row, final int column, final double value) {
return value - medianOfMedians;
}
});
logger.info(String.format("Counts centered around the median of medians %.2f", medianOfMedians));
return medianOfMedians;
}
Aggregations