
Example 31 with Median

Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.

The class MatrixSummaryUtils, method getColumnMedians.

/**
 * Return an array containing the median for each column in the given matrix.
 *
 * @param m Not {@code null}. Size MxN, where neither dimension is zero. If any entry is NaN, it is disregarded
 *          in the calculation.
 * @return array of size N. Never {@code null}.
 */
public static double[] getColumnMedians(final RealMatrix m) {
    Utils.nonNull(m, "Cannot calculate medians on a null matrix.");
    final Median medianCalculator = new Median();
    return IntStream.range(0, m.getColumnDimension()).boxed()
            .mapToDouble(i -> medianCalculator.evaluate(m.getColumn(i)))
            .toArray();
}
Also used : IntStream(java.util.stream.IntStream) Median(org.apache.commons.math3.stat.descriptive.rank.Median) StandardDeviation(org.apache.commons.math3.stat.descriptive.moment.StandardDeviation) RealMatrix(org.apache.commons.math3.linear.RealMatrix)
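
For reference, here is a minimal, self-contained sketch of the same per-column median idea using only Commons Math, without the GATK Utils helper. The matrix values and the class name ColumnMedianSketch are invented for illustration.

import java.util.Arrays;
import java.util.stream.IntStream;

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class ColumnMedianSketch {
    public static void main(String[] args) {
        // 3x2 matrix: column 0 holds {1, 3, 100}, column 1 holds {2, 4, 6}.
        final RealMatrix m = new Array2DRowRealMatrix(new double[][] {
                {1.0, 2.0},
                {3.0, 4.0},
                {100.0, 6.0}
        });
        final Median median = new Median();
        // Same idea as getColumnMedians: one Median evaluation per column.
        final double[] columnMedians = IntStream.range(0, m.getColumnDimension())
                .mapToDouble(i -> median.evaluate(m.getColumn(i)))
                .toArray();
        System.out.println(Arrays.toString(columnMedians));
    }
}

Running this prints [3.0, 4.0], the medians of the two columns.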

Example 32 with Median

Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.

The class ReadCountCollectionUtils, method removeColumnsWithExtremeMedianCounts.

/**
 * Creates a new read-count collection that is a copy of the input but drops columns with extreme median coverage.
 *
 * @param readCounts the input read-counts.
 * @param extremeColumnMedianCountPercentileThreshold bottom percentile to use as an exclusion threshold.
 * @return never {@code null}. It might be a reference to the input read-counts if
 *         there are no columns to be dropped.
 */
public static ReadCountCollection removeColumnsWithExtremeMedianCounts(final ReadCountCollection readCounts, final double extremeColumnMedianCountPercentileThreshold, final Logger logger) {
    final List<String> columnNames = readCounts.columnNames();
    final RealMatrix counts = readCounts.counts();
    final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
    // Calculate thresholds:
    final double bottomExtremeThreshold = new Percentile(extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
    final double topExtremeThreshold = new Percentile(100 - extremeColumnMedianCountPercentileThreshold).evaluate(columnMedians);
    // Determine kept and dropped column sets.
    final Set<String> columnsToKeep = new LinkedHashSet<>(readCounts.columnNames().size());
    final int initialMapSize = (int) (2.0 * extremeColumnMedianCountPercentileThreshold / 100.0 * readCounts.columnNames().size());
    final Map<String, Double> columnsToDrop = new LinkedHashMap<>(initialMapSize);
    for (int i = 0; i < columnMedians.length; i++) {
        if (columnMedians[i] >= bottomExtremeThreshold && columnMedians[i] <= topExtremeThreshold) {
            columnsToKeep.add(columnNames.get(i));
        } else {
            columnsToDrop.put(columnNames.get(i), columnMedians[i]);
        }
    }
    // log and drop columns if it applies
    if (columnsToKeep.isEmpty()) {
        throw new UserException.BadInput("No column count left after applying the extreme counts outlier filter");
    } else if (columnsToKeep.size() == columnNames.size()) {
        logger.info(String.format("No column dropped due to extreme counts outside [%.10f, %.10f]", bottomExtremeThreshold, topExtremeThreshold));
        return readCounts;
    } else {
        final double droppedPercentage = ((double) (columnsToDrop.size()) / columnNames.size()) * 100;
        logger.info(String.format("Some columns dropped (%d out of %d, %.2f%%) as they are classified as having extreme " + "median counts across targets outside [%.10f, %.10f]: e.g. %s", columnsToDrop.size(), columnNames.size(), droppedPercentage, bottomExtremeThreshold, topExtremeThreshold, columnsToDrop.entrySet().stream().limit(10).map(kv -> kv.getKey() + " (" + kv.getValue() + ")").collect(Collectors.joining(", "))));
        return readCounts.subsetColumns(columnsToKeep);
    }
}
Also used : Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix)
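
The thresholding step above can be exercised in isolation. The sketch below mirrors the Percentile-based bottom/top cutoffs without the ReadCountCollection plumbing; the median values and the class name ExtremeMedianFilterSketch are invented for illustration.

import java.util.Arrays;
import java.util.stream.IntStream;

import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class ExtremeMedianFilterSketch {
    public static void main(String[] args) {
        // Pretend per-column medians, two of which are extreme.
        final double[] columnMedians = {0.5, 95.0, 100.0, 101.0, 99.0, 98.0, 102.0, 97.0, 5000.0, 100.5};
        final double percentile = 10.0; // plays the role of extremeColumnMedianCountPercentileThreshold

        final double bottom = new Percentile(percentile).evaluate(columnMedians);
        final double top = new Percentile(100 - percentile).evaluate(columnMedians);

        // Keep indices whose median lies inside [bottom, top]; drop the rest.
        final int[] kept = IntStream.range(0, columnMedians.length)
                .filter(i -> columnMedians[i] >= bottom && columnMedians[i] <= top)
                .toArray();
        System.out.printf("thresholds [%.2f, %.2f], kept columns %s%n",
                bottom, top, Arrays.toString(kept));
    }
}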

Example 33 with Median

Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.

The class ReadCountCollectionUtils, method imputeZeroCountsAsTargetMedians.

/**
 * Impute zero counts to the median of non-zero values in the enclosing target row.
 *
 * <p>The imputation is done in-place; the input matrix will be modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 */
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final Median medianCalculator = new Median();
    final int totalCounts = counts.getColumnDimension() * counts.getRowDimension();
    // Get the number of zeroes contained in the counts.
    final long totalZeroCounts = IntStream.range(0, targetCount)
            .mapToLong(t -> DoubleStream.of(counts.getRow(t)).filter(c -> c == 0.0).count())
            .sum();
    // Get the median of each row, not including any entries that are zero.
    final double[] medians = IntStream.range(0, targetCount)
            .mapToDouble(t -> medianCalculator.evaluate(
                    DoubleStream.of(counts.getRow(t)).filter(c -> c != 0.0).toArray()))
            .toArray();
    // Change any zeros in the counts to the median for the row.
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(final int row, final int column, final double value) {
            return value != 0 ? value : medians[row];
        }
    });
    if (totalZeroCounts > 0) {
        final double totalZeroCountsPercentage = 100.0 * ((double) totalZeroCounts / totalCounts);
        logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target "
                + "non-zero median", totalZeroCounts, totalCounts, totalZeroCountsPercentage));
    } else {
        logger.info("No count is 0.0 thus no count needed to be imputed.");
    }
}
Also used : IntStream(java.util.stream.IntStream) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) java.util(java.util) TableReader(org.broadinstitute.hellbender.utils.tsv.TableReader) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) StringUtils(org.apache.commons.lang3.StringUtils) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) SampleNameFinder(org.broadinstitute.hellbender.tools.exome.samplenamefinder.SampleNameFinder) DataLine(org.broadinstitute.hellbender.utils.tsv.DataLine) Median(org.apache.commons.math3.stat.descriptive.rank.Median) TableColumnCollection(org.broadinstitute.hellbender.utils.tsv.TableColumnCollection) Nonnull(javax.annotation.Nonnull) Locatable(htsjdk.samtools.util.Locatable) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) Predicate(java.util.function.Predicate) FastMath(org.apache.commons.math3.util.FastMath) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) DoubleStream(java.util.stream.DoubleStream) TableWriter(org.broadinstitute.hellbender.utils.tsv.TableWriter) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) java.io(java.io) SetUniqueList(org.apache.commons.collections4.list.SetUniqueList) Doubles(com.google.common.primitives.Doubles) Utils(org.broadinstitute.hellbender.utils.Utils) RealMatrix(org.apache.commons.math3.linear.RealMatrix) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
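
A compact, runnable sketch of the same zero-imputation pattern (per-row median of the non-zero entries, then an in-place walkInOptimizedOrder pass) is shown below. The 2x3 example matrix and the class name ZeroImputationSketch are assumptions for illustration.

import java.util.stream.DoubleStream;
import java.util.stream.IntStream;

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class ZeroImputationSketch {
    public static void main(String[] args) {
        // Rows play the role of targets; zeros are the entries to impute.
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][] {
                {0.0, 10.0, 20.0},   // non-zero median of row 0 is 15.0
                {5.0, 0.0, 7.0}      // non-zero median of row 1 is 6.0
        });
        final Median median = new Median();
        // Per-row medians over the non-zero entries only.
        final double[] rowMedians = IntStream.range(0, counts.getRowDimension())
                .mapToDouble(r -> median.evaluate(
                        DoubleStream.of(counts.getRow(r)).filter(c -> c != 0.0).toArray()))
                .toArray();
        // In-place replacement of zeros with the enclosing row's median.
        counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(final int row, final int column, final double value) {
                return value != 0.0 ? value : rowMedians[row];
            }
        });
        System.out.println(counts); // entry (0,0) becomes 15.0 and (1,1) becomes 6.0
    }
}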

Example 34 with Median

Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.

The class NormalizeSomaticReadCountsIntegrationTest, method assertPreTangentNormalizedValues.

/**
 * This code reconstructs the expected pre-tangent normalization counts given the input,
 * and then compares against the actual pre-tangent output.
 * <p>
 * This method does not use any of the components that do the actual computation
 * in production, so that the calculation is independent.
 * </p>
 * <p>
 * This code also uses an alternative way to calculate the result, to reduce overlap even further.
 * </p>
 * @param preTangentNormalized actual output.
 * @param factorNormalized input.
 */
private void assertPreTangentNormalizedValues(final ReadCountCollection factorNormalized, final ReadCountCollection preTangentNormalized) {
    final double epsilon = PCATangentNormalizationUtils.EPSILON;
    final RealMatrix outCounts = preTangentNormalized.counts();
    final RealMatrix inCounts = factorNormalized.counts();
    final double[] columnMeans = GATKProtectedMathUtils.columnMeans(inCounts);
    Assert.assertTrue(DoubleStream.of(columnMeans).allMatch(d -> d < 0.5));
    final double[][] expected = new double[inCounts.getRowDimension()][inCounts.getColumnDimension()];
    final double[] columnValues = new double[inCounts.getRowDimension()];
    for (int i = 0; i < columnMeans.length; i++) {
        for (int j = 0; j < inCounts.getRowDimension(); j++) {
            final double inValue = inCounts.getEntry(j, i);
            final double lowBoundedInValue = Math.max(epsilon, inValue / columnMeans[i]);
            final double outValue = Math.log(lowBoundedInValue) / Math.log(2);
            expected[j][i] = outValue;
            columnValues[j] = outValue;
        }
        Arrays.sort(columnValues);
        final int midIndex = columnValues.length >> 1;
        final double median = columnValues.length % 2 == 1
                ? columnValues[midIndex]
                : (columnValues[midIndex - 1] + columnValues[midIndex]) * .5;
        for (int j = 0; j < inCounts.getRowDimension(); j++) {
            expected[j][i] -= median;
            Assert.assertEquals(outCounts.getEntry(j, i), expected[j][i], 0.000001, " Row " + j + " col " + i);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) java.util(java.util) TableReader(org.broadinstitute.hellbender.utils.tsv.TableReader) DataProvider(org.testng.annotations.DataProvider) DefaultRealMatrixPreservingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixPreservingVisitor) Test(org.testng.annotations.Test) Function(java.util.function.Function) PCACoveragePoN(org.broadinstitute.hellbender.tools.pon.coverage.pca.PCACoveragePoN) Assert(org.testng.Assert) TableUtils(org.broadinstitute.hellbender.utils.tsv.TableUtils) TableColumnCollection(org.broadinstitute.hellbender.utils.tsv.TableColumnCollection) HDF5File(org.broadinstitute.hdf5.HDF5File) ExomeStandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.ExomeStandardArgumentDefinitions) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) GATKProtectedMathUtils(org.broadinstitute.hellbender.utils.GATKProtectedMathUtils) PCATangentNormalizationUtils(org.broadinstitute.hellbender.tools.pon.coverage.pca.PCATangentNormalizationUtils) IOException(java.io.IOException) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) PCATangentNormalizationResult(org.broadinstitute.hellbender.tools.pon.coverage.pca.PCATangentNormalizationResult) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) HDF5PCACoveragePoN(org.broadinstitute.hellbender.tools.pon.coverage.pca.HDF5PCACoveragePoN) UserException(org.broadinstitute.hellbender.exceptions.UserException) RealMatrix(org.apache.commons.math3.linear.RealMatrix)
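
The reference calculation the test reconstructs by hand (divide by the column mean, floor at epsilon, take log2, subtract the column median) can be sketched compactly with Commons Math's Median in place of the manual sort-and-pick. The matrix values, the EPSILON constant, and the class name PreTangentReferenceSketch below are assumptions for illustration.

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class PreTangentReferenceSketch {
    private static final double EPSILON = 1e-9; // stand-in for PCATangentNormalizationUtils.EPSILON

    public static void main(String[] args) {
        final RealMatrix in = new Array2DRowRealMatrix(new double[][] {
                {0.10, 0.20},
                {0.30, 0.10},
                {0.20, 0.30}
        });
        final Median median = new Median();
        final RealMatrix out = in.copy();
        for (int col = 0; col < in.getColumnDimension(); col++) {
            // Column mean, then log2 of the mean-normalized value floored at EPSILON.
            double mean = 0;
            for (int row = 0; row < in.getRowDimension(); row++) {
                mean += in.getEntry(row, col);
            }
            mean /= in.getRowDimension();
            final double[] logValues = new double[in.getRowDimension()];
            for (int row = 0; row < in.getRowDimension(); row++) {
                logValues[row] = Math.log(Math.max(EPSILON, in.getEntry(row, col) / mean)) / Math.log(2);
            }
            // Center each column on its median, as the test above reconstructs by hand.
            final double columnMedian = median.evaluate(logValues);
            for (int row = 0; row < in.getRowDimension(); row++) {
                out.setEntry(row, col, logValues[row] - columnMedian);
            }
        }
        System.out.println(out);
    }
}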

Example 35 with Median

Use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk by broadinstitute.

The class CalculateContamination, method findConfidentHomAltSites.

private static List<PileupSummary> findConfidentHomAltSites(List<PileupSummary> sites) {
    if (sites.isEmpty()) {
        return new ArrayList<>();
    }
    final TargetCollection<PileupSummary> tc = new HashedListTargetCollection<>(sites);
    final double averageCoverage = sites.stream().mapToInt(PileupSummary::getTotalCount).average().getAsDouble();
    final List<Double> smoothedCopyRatios = new ArrayList<>();
    final List<Double> hetRatios = new ArrayList<>();
    for (final PileupSummary site : sites) {
        final SimpleInterval nearbySpan = new SimpleInterval(site.getContig(), Math.max(1, site.getStart() - CNV_SCALE), site.getEnd() + CNV_SCALE);
        final List<PileupSummary> nearbySites = tc.targets(nearbySpan);
        final double averageNearbyCopyRatio = nearbySites.stream().mapToDouble(s -> s.getTotalCount() / averageCoverage).average().orElseGet(() -> 0);
        smoothedCopyRatios.add(averageNearbyCopyRatio);
        final double expectedNumberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAlleleFrequency).map(x -> 2 * x * (1 - x)).sum();
        final long numberOfNearbyHets = nearbySites.stream().mapToDouble(PileupSummary::getAltFraction).filter(x -> 0.4 < x && x < 0.6).count();
        final double hetRatio = numberOfNearbyHets / expectedNumberOfNearbyHets;
        hetRatios.add(hetRatio);
    }
    final double medianSmoothedCopyRatio = new Median().evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
    final List<Integer> indicesWithAnomalousCopyRatio = IntStream.range(0, sites.size())
            .filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianSmoothedCopyRatio
                    || smoothedCopyRatios.get(n) > 2 * medianSmoothedCopyRatio)
            .boxed()
            .collect(Collectors.toList());
    final double meanHetRatio = hetRatios.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Integer> indicesWithLossOfHeterozygosity = IntStream.range(0, sites.size())
            .filter(n -> hetRatios.get(n) < meanHetRatio * 0.5)
            .boxed()
            .collect(Collectors.toList());
    //TODO: as extra security, filter out sites that are near too many hom alts
    logger.info(String.format("Excluding %d sites with low or high copy ratio and %d sites with potential loss of heterozygosity", indicesWithAnomalousCopyRatio.size(), indicesWithLossOfHeterozygosity.size()));
    logger.info(String.format("The average ratio of hets within distance %d to theoretically expected number of hets is %.3f", CNV_SCALE, meanHetRatio));
    final Set<Integer> badSites = new TreeSet<>();
    badSites.addAll(indicesWithAnomalousCopyRatio);
    badSites.addAll(indicesWithLossOfHeterozygosity);
    return IntStream.range(0, sites.size()).filter(n -> !badSites.contains(n)).mapToObj(sites::get).filter(s -> s.getAltFraction() > 0.8).collect(Collectors.toList());
}
Also used : IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) CommandLineProgram(org.broadinstitute.hellbender.cmdline.CommandLineProgram) Argument(org.broadinstitute.barclay.argparser.Argument) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) VariantProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup) File(java.io.File) Logger(org.apache.logging.log4j.Logger) HashedListTargetCollection(org.broadinstitute.hellbender.tools.exome.HashedListTargetCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) LogManager(org.apache.logging.log4j.LogManager) TargetCollection(org.broadinstitute.hellbender.tools.exome.TargetCollection)
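
The median-relative outlier test on the smoothed copy ratios can be demonstrated on its own. The sketch below uses invented ratios and the hypothetical class name MedianOutlierSketch, keeping the same 0.8x lower and 2x upper cutoffs as the snippet above.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.commons.math3.stat.descriptive.rank.Median;

public class MedianOutlierSketch {
    public static void main(String[] args) {
        // Pretend smoothed copy ratios for a handful of sites.
        final List<Double> smoothedCopyRatios = Arrays.asList(1.0, 1.05, 0.95, 0.3, 1.1, 2.6);
        final double medianRatio = new Median()
                .evaluate(smoothedCopyRatios.stream().mapToDouble(x -> x).toArray());
        // Flag sites whose smoothed ratio is far below or above the median.
        final List<Integer> anomalous = IntStream.range(0, smoothedCopyRatios.size())
                .filter(n -> smoothedCopyRatios.get(n) < 0.8 * medianRatio
                        || smoothedCopyRatios.get(n) > 2 * medianRatio)
                .boxed()
                .collect(Collectors.toList());
        System.out.println("median = " + medianRatio + ", anomalous indices = " + anomalous);
    }
}

With these values the median is 1.025, so indices 3 (ratio 0.3) and 5 (ratio 2.6) are flagged.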

Aggregations

Median (org.apache.commons.math3.stat.descriptive.rank.Median): 35
RealMatrix (org.apache.commons.math3.linear.RealMatrix): 29
IntStream (java.util.stream.IntStream): 28
Collectors (java.util.stream.Collectors): 24
Logger (org.apache.logging.log4j.Logger): 24
Percentile (org.apache.commons.math3.stat.descriptive.rank.Percentile): 22
DoubleStream (java.util.stream.DoubleStream): 20
File (java.io.File): 18
Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix): 17
ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils): 16
List (java.util.List): 15
ArrayList (java.util.ArrayList): 14
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 14
UserException (org.broadinstitute.hellbender.exceptions.UserException): 14
ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection): 14
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 14
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 13
java.util (java.util): 13
DefaultRealMatrixChangingVisitor (org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor): 12
LogManager (org.apache.logging.log4j.LogManager): 12