Use of org.apache.commons.math3.stat.descriptive.summary.Sum in the gatk-protected project by the Broad Institute: class IntegerCopyNumberTransitionProbabilityCacheUnitTest, method testStationaryProbabilities.
/**
 * Verifies that the cached stationary distribution agrees with the result of applying a
 * very large number of transition steps, and that it is a proper probability vector
 * (sums to 1 within {@code EPSILON}).
 */
@Test
public void testStationaryProbabilities() {
    for (final int padding : PADDINGS) {
        for (final RealMatrix transitionMatrix : TRANSITION_MATRICES) {
            final IntegerCopyNumberTransitionMatrix paddedMatrix =
                    new IntegerCopyNumberTransitionMatrix(transitionMatrix, padding);
            final IntegerCopyNumberTransitionProbabilityCache cache =
                    new IntegerCopyNumberTransitionProbabilityCache(paddedMatrix);
            // Stationary vector as computed directly by the cache.
            final double[] direct = cache.getStationaryProbabilityVector().toArray();
            // Stationary vector obtained by taking effectively infinitely many transition steps.
            final double[] viaPower = cache.getTransitionProbabilityMatrix(Integer.MAX_VALUE).getColumn(0);
            ArrayAsserts.assertArrayEquals(direct, viaPower, EPSILON);
            // A stationary distribution must be normalized.
            Assert.assertEquals(Arrays.stream(direct).sum(), 1.0, EPSILON);
        }
    }
}
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in the gatk-protected project by the Broad Institute: class TestHMM, method fromPhredProbabilities.
/**
 * Builds a {@link TestHMM} from a flat array of Phred-scaled probabilities laid out as
 * [priors | transition matrix (row-major) | emission matrix (row-major)], where each
 * dimension equals the number of {@code State} values.
 *
 * <p>Each group of probabilities is converted to natural-log space and normalized so that
 * the probabilities in each row sum to 1.</p>
 *
 * @param phred Phred-scaled (-10 * log10) probabilities; must contain at least
 *              {@code S + S*S + S*S} entries for {@code S = State.values().length}.
 * @return a new {@link TestHMM} with normalized log-space priors, emissions and transitions.
 */
static TestHMM fromPhredProbabilities(final double... phred) {
    final int numStates = State.values().length;
    // Slice the flat input into its three conceptual sections.
    final double[] priors = Arrays.copyOfRange(phred, 0, numStates);
    final double[][] transition = reshape(
            Arrays.copyOfRange(phred, numStates, numStates * (numStates + 1)), numStates);
    final double[][] emission = reshape(
            Arrays.copyOfRange(phred, (numStates + 1) * numStates, phred.length), numStates);
    // The original code repeated the same phred -> normalized-log conversion three times;
    // it is factored into a single helper below.
    final double[] logPriors = phredToNormalizedLog(priors);
    final double[][] logEmissionProbs = Stream.of(emission)
            .map(TestHMM::phredToNormalizedLog)
            .toArray(double[][]::new);
    final double[][] logTransitionProbs = Stream.of(transition)
            .map(TestHMM::phredToNormalizedLog)
            .toArray(double[][]::new);
    return new TestHMM(logPriors, logEmissionProbs, logTransitionProbs);
}

/**
 * Converts one row of Phred-scaled probabilities to natural-log probabilities normalized
 * to sum to 1 (i.e. whose log-sum-exp is 0).
 */
private static double[] phredToNormalizedLog(final double[] phredRow) {
    // Phred = -10 * log10(p), hence ln(p) = phred * -0.1 * ln(10).
    final double[] logRaw = DoubleStream.of(phredRow).map(d -> d * -.1 * LN_OF_10).toArray();
    final double logSum = GATKProtectedMathUtils.logSumExp(logRaw);
    return DoubleStream.of(logRaw).map(d -> d - logSum).toArray();
}
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in the gatk project by the Broad Institute: class ReadCountCollectionUtils, method imputeZeroCountsAsTargetMedians.
/**
 * Impute zero counts to the median of non-zero values in the enclosing target row.
 *
 * <p>The imputation is done in-place, thus the input matrix will be modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 * @param logger logger used to report how many counts were imputed.
 */
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final Median medianCalculator = new Median();
    // Long arithmetic avoids int overflow for very large matrices.
    final long totalCounts = (long) counts.getColumnDimension() * counts.getRowDimension();
    // Get the number of zeroes contained in the counts.
    final long totalZeroCounts = IntStream.range(0, targetCount)
            .mapToLong(t -> DoubleStream.of(counts.getRow(t)).filter(c -> c == 0.0).count())
            .sum();
    // Get the median of each row, not including any entries that are zero.
    final double[] medians = IntStream.range(0, targetCount)
            .mapToDouble(t -> medianCalculator.evaluate(
                    DoubleStream.of(counts.getRow(t)).filter(c -> c != 0.0).toArray()))
            .toArray();
    // Change any zeros in the counts to the median for the row.
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value != 0 ? value : medians[row];
        }
    });
    if (totalZeroCounts > 0) {
        // BUGFIX: the original computed 100.0 * (long / long) with integer division,
        // which truncated the percentage to 0 in almost all cases.
        final double totalZeroCountsPercentage = 100.0 * totalZeroCounts / totalCounts;
        // BUGFIX: the original passed totalZeroCounts twice; the second %d is the total count.
        logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target "
                + "non-zero median", totalZeroCounts, totalCounts, totalZeroCountsPercentage));
    } else {
        logger.info("No count is 0.0 thus no count needed to be imputed.");
    }
}
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in the gatk project by the Broad Institute: class CalculateContamination, method findConfidentHomAltSites.
/**
 * Identifies sites that are confidently homozygous for the alternate allele.
 *
 * <p>Sites are excluded when the locally smoothed copy ratio around them is anomalous
 * (below 0.8x or above 2x the median) or when the neighborhood shows a deficit of observed
 * hets relative to the theoretically expected number (suggesting loss of heterozygosity).
 * Of the surviving sites, only those with alt fraction above 0.8 are returned.</p>
 */
private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }
    final TargetCollection<PileupSummary> siteCollection = new HashedListTargetCollection<>(sites);
    final double meanCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();

    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();
    for (final PileupSummary site : sites) {
        // Window of +/- CNV_SCALE bases around the site, clipped at position 1.
        final int windowStart = Math.max(1, site.getStart() - CNV_SCALE);
        final SimpleInterval window = new SimpleInterval(site.getContig(), windowStart, site.getEnd() + CNV_SCALE);
        final List<PileupSummary> neighbors = siteCollection.targets(window);

        double copyRatioSum = 0;
        double expectedHets = 0;
        long observedHets = 0;
        for (final PileupSummary neighbor : neighbors) {
            copyRatioSum += neighbor.getTotalCount() / meanCoverage;
            final double af = neighbor.getAlleleFrequency();
            // Hardy-Weinberg probability of a het at this allele frequency.
            expectedHets += 2 * af * (1 - af);
            final double altFraction = neighbor.getAltFraction();
            if (0.4 < altFraction && altFraction < 0.6) {
                observedHets++;
            }
        }
        smoothedCopyRatios.add(neighbors.isEmpty() ? 0 : copyRatioSum / neighbors.size());
        hetRatios.add(observedHets / expectedHets);
    }

    final double medianSmoothedCopyRatio =
            new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(Double::doubleValue).toArray());
    final double meanHetRatio = hetRatios.stream().mapToDouble(Double::doubleValue).average().getAsDouble();

    final List<Integer> anomalousCopyRatioIndices = new ArrayList<>();
    final List<Integer> lossOfHetIndices = new ArrayList<>();
    for (int n = 0; n < sites.size(); n++) {
        final double ratio = smoothedCopyRatios.get(n);
        if (ratio < 0.8 * medianSmoothedCopyRatio || ratio > 2 * medianSmoothedCopyRatio) {
            anomalousCopyRatioIndices.add(n);
        }
        if (hetRatios.get(n) < meanHetRatio * 0.5) {
            lossOfHetIndices.add(n);
        }
    }
    //TODO: as extra security, filter out sites that are near too many hom alts
    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity", anomalousCopyRatioIndices.size(), lossOfHetIndices.size()));
    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));

    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(anomalousCopyRatioIndices);
    badSites.addAll(lossOfHetIndices);

    final List<PileupSummary> confidentHomAlts = new ArrayList<>();
    for (int n = 0; n < sites.size(); n++) {
        final PileupSummary site = sites.get(n);
        if (!badSites.contains(n) && site.getAltFraction() > 0.8) {
            confidentHomAlts.add(site);
        }
    }
    return confidentHomAlts;
}
Use of org.apache.commons.math3.stat.descriptive.summary.Sum in the gatk project by the Broad Institute: class AlleleFrequencyCalculator, method effectiveAlleleCounts.
// effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g]
// for numerical stability we will do this in log space:
// count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
// thanks to the log-sum-exp trick this lets us work with log posteriors alone
/**
 * Computes the posterior-expected number of copies of each allele, summed over all genotypes
 * of every sample in the variant context (see the derivation in the comment above).
 *
 * @param vc the variant context supplying the genotypes and the allele count.
 * @param log10AlleleFrequencies log10 allele frequencies; length must equal {@code vc.getNAlleles()}.
 * @return an array of length {@code vc.getNAlleles()} holding the effective (linear-space) count of each allele.
 */
private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
    final int numAlleles = vc.getNAlleles();
    Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
    // Accumulate in log10 space; -Infinity is log10(0), the additive identity for log10SumLog10.
    final double[] log10Result = new double[numAlleles];
    Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
    for (final Genotype g : vc.getGenotypes()) {
        // Genotypes without likelihoods contribute nothing to the counts.
        if (!g.hasLikelihoods()) {
            continue;
        }
        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);
        final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
        // For each genotype index, fold (posterior * copy count) into each allele it carries:
        // log10(n_g * p_g) = log10(p_g) + log10(n_g), combined with the running total via log-sum-exp.
        new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex -> glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) -> log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.log10(count))));
    }
    // Convert the accumulated log10 counts back to linear space, in place.
    return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
}
Aggregations