Search in sources :

Example 41 with Median

use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.

the class ReadCountCollectionUtils method imputeZeroCountsAsTargetMedians.

/**
     * Impute zero counts to the median of non-zero values in the enclosing target row.
     *
     * <p>The imputation is done in-place, thus the input matrix is well be modified as a result of this call.</p>
     *
     * @param readCounts the input and output read-count matrix.
     */
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final Median medianCalculator = new Median();
    int totalCounts = counts.getColumnDimension() * counts.getRowDimension();
    // Get the number of zeroes contained in the counts.
    final long totalZeroCounts = IntStream.range(0, targetCount).mapToLong(t -> DoubleStream.of(counts.getRow(t)).filter(c -> c == 0.0).count()).sum();
    // Get the median of each row, not including any entries that are zero.
    final double[] medians = IntStream.range(0, targetCount).mapToDouble(t -> medianCalculator.evaluate(DoubleStream.of(counts.getRow(t)).filter(c -> c != 0.0).toArray())).toArray();
    // Change any zeros in the counts to the median for the row.
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(final int row, final int column, final double value) {
            return value != 0 ? value : medians[row];
        }
    });
    if (totalZeroCounts > 0) {
        final double totalZeroCountsPercentage = 100.0 * (totalZeroCounts / totalCounts);
        logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target " + "non-zero median", totalZeroCounts, totalZeroCounts, totalZeroCountsPercentage));
    } else {
        logger.info("No count is 0.0 thus no count needed to be imputed.");
    }
}
Also used : IntStream(java.util.stream.IntStream) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) java.util(java.util) TableReader(org.broadinstitute.hellbender.utils.tsv.TableReader) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) StringUtils(org.apache.commons.lang3.StringUtils) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) SampleNameFinder(org.broadinstitute.hellbender.tools.exome.samplenamefinder.SampleNameFinder) DataLine(org.broadinstitute.hellbender.utils.tsv.DataLine) Median(org.apache.commons.math3.stat.descriptive.rank.Median) TableColumnCollection(org.broadinstitute.hellbender.utils.tsv.TableColumnCollection) Nonnull(javax.annotation.Nonnull) Locatable(htsjdk.samtools.util.Locatable) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) Predicate(java.util.function.Predicate) FastMath(org.apache.commons.math3.util.FastMath) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) DoubleStream(java.util.stream.DoubleStream) TableWriter(org.broadinstitute.hellbender.utils.tsv.TableWriter) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) java.io(java.io) SetUniqueList(org.apache.commons.collections4.list.SetUniqueList) Doubles(com.google.common.primitives.Doubles) Utils(org.broadinstitute.hellbender.utils.Utils) RealMatrix(org.apache.commons.math3.linear.RealMatrix) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) Median(org.apache.commons.math3.stat.descriptive.rank.Median)

Example 42 with Median

use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method testNormalizeAndLogReadCounts.

@Test(dataProvider = "readCountOnlyData")
public void testNormalizeAndLogReadCounts(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(i -> median.evaluate(counts.getColumn(i))).toArray();
    final double epsilon = HDF5PCACoveragePoNCreationUtils.EPSILON;
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] /= columnMedians[j];
            if (expected[i][j] < epsilon) {
                expected[i][j] = epsilon;
            }
            expected[i][j] = Math.log(expected[i][j]) / Math.log(2);
        }
    }
    HDF5PCACoveragePoNCreationUtils.normalizeAndLogReadCounts(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Median(org.apache.commons.math3.stat.descriptive.rank.Median) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 43 with Median

use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method testSubsetTargetToUsableOnes.

@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] targetMedians = IntStream.range(0, counts.getRowDimension()).mapToDouble(i -> median.evaluate(counts.getRow(i))).toArray();
    final double threshold = new Percentile(percentile).evaluate(targetMedians);
    final Boolean[] toBeKept = DoubleStream.of(targetMedians).mapToObj(d -> d >= threshold).toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
    Assert.assertEquals(result.getRight().length, toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 44 with Median

use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.

the class HDF5PCACoveragePoNCreationUtils method normalizeAndLogReadCounts.

/**
     * Final pre-panel normalization that consists of dividing all counts by the median of
     * its column and log it with base 2.
     * <p>
     *     The normalization occurs in-place.
     * </p>
     *
     * @param readCounts the input counts to normalize.
     */
@VisibleForTesting
static void normalizeAndLogReadCounts(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final Median medianCalculator = new Median();
    final double[] medians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(col -> medianCalculator.evaluate(counts.getColumn(col))).toArray();
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(final int row, final int column, final double value) {
            return Math.log(Math.max(EPSILON, value / medians[column])) * INV_LN_2;
        }
    });
    logger.info("Counts normalized by the column median and log2'd.");
}
Also used : IntStream(java.util.stream.IntStream) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) SVD(org.broadinstitute.hellbender.utils.svd.SVD) java.util(java.util) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) Pair(org.apache.commons.lang3.tuple.Pair) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) org.broadinstitute.hellbender.tools.exome(org.broadinstitute.hellbender.tools.exome) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) MathUtils(org.broadinstitute.hellbender.utils.MathUtils) UserException(org.broadinstitute.hellbender.exceptions.UserException) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) Utils(org.broadinstitute.hellbender.utils.Utils) RealMatrix(org.apache.commons.math3.linear.RealMatrix) VisibleForTesting(com.google.common.annotations.VisibleForTesting) LogManager(org.apache.logging.log4j.LogManager) RealMatrix(org.apache.commons.math3.linear.RealMatrix) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) Median(org.apache.commons.math3.stat.descriptive.rank.Median) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 45 with Median

use of org.apache.commons.math3.stat.descriptive.rank.Median in project gatk-protected by broadinstitute.

the class HDF5PCACoveragePoNCreationUtils method subtractMedianOfMedians.

/**
     * Calculates the median of column medians and subtract it from all counts.
     * @param readCounts the input counts to center.
     * @return the median of medians that has been subtracted from all counts.
     */
@VisibleForTesting
static double subtractMedianOfMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final Median medianCalculator = new Median();
    final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
    final double medianOfMedians = medianCalculator.evaluate(columnMedians);
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(final int row, final int column, final double value) {
            return value - medianOfMedians;
        }
    });
    logger.info(String.format("Counts centered around the median of medians %.2f", medianOfMedians));
    return medianOfMedians;
}
Also used : RealMatrix(org.apache.commons.math3.linear.RealMatrix) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) Median(org.apache.commons.math3.stat.descriptive.rank.Median) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

Median (org.apache.commons.math3.stat.descriptive.rank.Median)35 RealMatrix (org.apache.commons.math3.linear.RealMatrix)29 IntStream (java.util.stream.IntStream)28 Collectors (java.util.stream.Collectors)24 Logger (org.apache.logging.log4j.Logger)24 Percentile (org.apache.commons.math3.stat.descriptive.rank.Percentile)22 DoubleStream (java.util.stream.DoubleStream)20 File (java.io.File)18 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)17 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)16 List (java.util.List)15 ArrayList (java.util.ArrayList)14 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)14 UserException (org.broadinstitute.hellbender.exceptions.UserException)14 ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection)14 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)14 VisibleForTesting (com.google.common.annotations.VisibleForTesting)13 java.util (java.util)13 DefaultRealMatrixChangingVisitor (org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor)12 LogManager (org.apache.logging.log4j.LogManager)12