Search in sources :

Example 6 with Percentile

use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.

the class ReadCountCollectionUtilsUnitTest method testExtremeMedianColumnsData.

/**
 * Checks that {@code removeColumnsWithExtremeMedianCounts} keeps exactly the columns whose median
 * lies within the [percentile, 100 - percentile] band of all column medians, that the surviving
 * columns keep their relative order, and that their counts are unchanged.
 *
 * @param readCount the read-count collection to filter.
 * @param percentile the extreme-median percentile handed to the method under test.
 */
@Test(dataProvider = "readCountAndPercentileData")
public void testExtremeMedianColumnsData(final ReadCountCollection readCount, final double percentile) {
    final RealMatrix counts = readCount.counts();
    final int columnCount = counts.getColumnDimension();

    // Independently compute the median of every column.
    final Median medianEvaluator = new Median();
    final double[] columnMedians = new double[columnCount];
    for (int column = 0; column < columnCount; column++) {
        columnMedians[column] = medianEvaluator.evaluate(counts.getColumn(column));
    }

    // Band of acceptable medians.
    final double top = new Percentile(100 - percentile).evaluate(columnMedians);
    final double bottom = new Percentile(percentile).evaluate(columnMedians);

    // Mark the columns expected to survive and count them.
    final boolean[] expectedKept = new boolean[columnCount];
    int expectedKeptCount = 0;
    for (int column = 0; column < columnCount; column++) {
        final double columnMedian = columnMedians[column];
        expectedKept[column] = columnMedian <= top && columnMedian >= bottom;
        if (expectedKept[column]) {
            expectedKeptCount++;
        }
    }

    final ReadCountCollection result = ReadCountCollectionUtils.removeColumnsWithExtremeMedianCounts(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.columnNames().size(), expectedKeptCount);

    // Kept columns must appear consecutively, in input order; dropped columns must be absent.
    int expectedPosition = 0;
    for (int column = 0; column < columnCount; column++) {
        final int actualPosition = result.columnNames().indexOf(readCount.columnNames().get(column));
        if (!expectedKept[column]) {
            Assert.assertEquals(actualPosition, -1);
            continue;
        }
        Assert.assertEquals(actualPosition, expectedPosition++);
        Assert.assertEquals(counts.getColumn(column), result.counts().getColumn(actualPosition));
    }
}
Also used : IntStream(java.util.stream.IntStream) Arrays(java.util.Arrays) DataProvider(org.testng.annotations.DataProvider) Level(org.apache.logging.log4j.Level) Test(org.testng.annotations.Test) Random(java.util.Random) ArrayList(java.util.ArrayList) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) PrintWriter(java.io.PrintWriter) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOException(java.io.IOException) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) Stream(java.util.stream.Stream) UserException(org.broadinstitute.hellbender.exceptions.UserException) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Test(org.testng.annotations.Test)

Example 7 with Percentile

use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method readCountAndPercentileData.

// this is duplicated from ReadCountCollectionUtilsUnitTest
/**
 * Supplies one test case per percentile: a read-count collection filled with pseudo-random
 * counts (fixed seed, so the data is reproducible) paired with the percentile itself.
 *
 * @return an array of {@code { ReadCountCollection, Double }} argument pairs.
 */
@DataProvider(name = "readCountAndPercentileData")
public Object[][] readCountAndPercentileData() {
    final double[] percentiles = new double[] { 1.0, 2.5, 5.0, 10.0, 25.0 };
    final List<Object[]> result = new ArrayList<>();
    final Random rdn = new Random(13);
    final int columnCount = 100;
    final int targetCount = 100;
    final List<String> columnNames = IntStream.range(0, columnCount).mapToObj(i -> "sample_" + (i + 1)).collect(Collectors.toList());
    final List<Target> targets = IntStream.range(0, targetCount).mapToObj(i -> new Target("target_" + (i + 1))).collect(Collectors.toList());
    for (final double percentile : percentiles) {
        // The count matrix has one row per target and one column per sample, so the backing
        // array must be [targetCount][columnCount]; the reverse only works while both are equal.
        final double[][] counts = new double[targetCount][columnCount];
        for (int target = 0; target < targetCount; target++) {
            for (int column = 0; column < columnCount; column++) {
                counts[target][column] = rdn.nextDouble();
            }
        }
        // copyArray = false: the matrix wraps the array directly instead of copying it.
        final ReadCountCollection readCounts = new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(counts, false));
        result.add(new Object[] { readCounts, percentile });
    }
    return result.toArray(new Object[result.size()][]);
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) ArrayList(java.util.ArrayList) Target(org.broadinstitute.hellbender.tools.exome.Target) 
Random(java.util.Random) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) DataProvider(org.testng.annotations.DataProvider)

Example 8 with Percentile

use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk-protected by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method readCountAndPercentileData.

// this is duplicated from ReadCountCollectionUtilsUnitTest
/**
 * Supplies one test case per percentile: a read-count collection filled with pseudo-random
 * counts (fixed seed, so the data is reproducible) paired with the percentile itself.
 *
 * @return an array of {@code { ReadCountCollection, Double }} argument pairs.
 */
@DataProvider(name = "readCountAndPercentileData")
public Object[][] readCountAndPercentileData() {
    final double[] percentiles = new double[] { 1.0, 2.5, 5.0, 10.0, 25.0 };
    final List<Object[]> result = new ArrayList<>();
    final Random rdn = new Random(13);
    final int columnCount = 100;
    final int targetCount = 100;
    final List<String> columnNames = IntStream.range(0, columnCount).mapToObj(i -> "sample_" + (i + 1)).collect(Collectors.toList());
    final List<Target> targets = IntStream.range(0, targetCount).mapToObj(i -> new Target("target_" + (i + 1))).collect(Collectors.toList());
    for (final double percentile : percentiles) {
        // The count matrix has one row per target and one column per sample, so the backing
        // array must be [targetCount][columnCount]; the reverse only works while both are equal.
        final double[][] counts = new double[targetCount][columnCount];
        for (int target = 0; target < targetCount; target++) {
            for (int column = 0; column < columnCount; column++) {
                counts[target][column] = rdn.nextDouble();
            }
        }
        // copyArray = false: the matrix wraps the array directly instead of copying it.
        final ReadCountCollection readCounts = new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(counts, false));
        result.add(new Object[] { readCounts, percentile });
    }
    return result.toArray(new Object[result.size()][]);
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) ArrayList(java.util.ArrayList) Target(org.broadinstitute.hellbender.tools.exome.Target) 
Random(java.util.Random) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) DataProvider(org.testng.annotations.DataProvider)

Example 9 with Percentile

use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk by broadinstitute.

the class ReadCountCollectionUtils method truncateExtremeCounts.

/**
 * Truncates the extreme count values in the input read-count collection.
 *
 * <p>Two thresholds are computed over all the counts in the collection: the value at the
 * {@code percentile} percentile and the value at the complementary {@code (100 - percentile)}
 * percentile. Counts under the bottom threshold are raised to it and counts over the top
 * threshold are lowered to it.</p>
 *
 * <p>The truncation is done in-place, thus the input matrix is modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 * @param percentile the truncation percentile, which must be in the range [0 .. 50.0].
 *                   A value of {@code 0} leaves every count unchanged.
 * @param logger receives an info-level summary of how many counts were truncated.
 * @throws IllegalArgumentException if {@code percentile} is NaN or outside [0 .. 50.0].
 */
public static void truncateExtremeCounts(final ReadCountCollection readCounts, final double percentile, final Logger logger) {
    // Negated form so that NaN is rejected as well. Values above 50 would make the bottom
    // threshold exceed the top one and scramble the counts instead of truncating them.
    if (!(percentile >= 0.0 && percentile <= 50.0)) {
        throw new IllegalArgumentException("the percentile must be in the range [0 .. 50.0] but was " + percentile);
    }
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final int columnCount = counts.getColumnDimension();
    final long totalCounts = (long) targetCount * columnCount;
    if (percentile == 0.0) {
        // Commons Math's Percentile only accepts quantiles in (0, 100], so 0 cannot be evaluated;
        // a zero percentile excludes no values, hence there is nothing to truncate.
        logger.info(String.format("None of the %d counts were truncated as the truncation percentile is 0", totalCounts));
        return;
    }
    // Create a row major array of the counts.
    final double[] values = Doubles.concat(counts.getData());
    final double bottomPercentileThreshold = new Percentile(percentile).evaluate(values);
    final double topPercentileThreshold = new Percentile(100.0 - percentile).evaluate(values);
    long bottomTruncatedCounts = 0;
    long topTruncatedCounts = 0;
    for (int i = 0; i < targetCount; i++) {
        final double[] rowCounts = counts.getRow(i);
        for (int j = 0; j < columnCount; j++) {
            final double count = rowCounts[j];
            if (count < bottomPercentileThreshold) {
                counts.setEntry(i, j, bottomPercentileThreshold);
                bottomTruncatedCounts++;
            } else if (count > topPercentileThreshold) {
                counts.setEntry(i, j, topPercentileThreshold);
                topTruncatedCounts++;
            }
        }
    }
    final long truncatedCounts = topTruncatedCounts + bottomTruncatedCounts;
    if (truncatedCounts == 0) {
        logger.info(String.format("None of the %d counts were truncated as they all fall in the non-extreme range " + "[%.2f, %.2f]", totalCounts, bottomPercentileThreshold, topPercentileThreshold));
    } else {
        final double truncatedPercentage = ((double) truncatedCounts / totalCounts) * 100;
        logger.info(String.format("Some counts (%d out of %d, %.2f%%) were truncated as they fall out of the " + "non-extreme range [%.2f, %.2f]", truncatedCounts, totalCounts, truncatedPercentage, bottomPercentileThreshold, topPercentileThreshold));
    }
}
Also used : Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix)

Example 10 with Percentile

use of org.apache.commons.math3.stat.descriptive.rank.Percentile in project gatk-protected by broadinstitute.

the class ReadCountCollectionUtils method truncateExtremeCounts.

/**
 * Truncates the extreme count values in the input read-count collection.
 *
 * <p>Two thresholds are computed over all the counts in the collection: the value at the
 * {@code percentile} percentile and the value at the complementary {@code (100 - percentile)}
 * percentile. Counts under the bottom threshold are raised to it and counts over the top
 * threshold are lowered to it.</p>
 *
 * <p>The truncation is done in-place, thus the input matrix is modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 * @param percentile the truncation percentile, which must be in the range [0 .. 50.0].
 *                   A value of {@code 0} leaves every count unchanged.
 * @param logger receives an info-level summary of how many counts were truncated.
 * @throws IllegalArgumentException if {@code percentile} is NaN or outside [0 .. 50.0].
 */
public static void truncateExtremeCounts(final ReadCountCollection readCounts, final double percentile, final Logger logger) {
    // Negated form so that NaN is rejected as well. Values above 50 would make the bottom
    // threshold exceed the top one and scramble the counts instead of truncating them.
    if (!(percentile >= 0.0 && percentile <= 50.0)) {
        throw new IllegalArgumentException("the percentile must be in the range [0 .. 50.0] but was " + percentile);
    }
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final int columnCount = counts.getColumnDimension();
    final long totalCounts = (long) targetCount * columnCount;
    if (percentile == 0.0) {
        // Commons Math's Percentile only accepts quantiles in (0, 100], so 0 cannot be evaluated;
        // a zero percentile excludes no values, hence there is nothing to truncate.
        logger.info(String.format("None of the %d counts were truncated as the truncation percentile is 0", totalCounts));
        return;
    }
    // Create a row major array of the counts.
    final double[] values = Doubles.concat(counts.getData());
    final double bottomPercentileThreshold = new Percentile(percentile).evaluate(values);
    final double topPercentileThreshold = new Percentile(100.0 - percentile).evaluate(values);
    long bottomTruncatedCounts = 0;
    long topTruncatedCounts = 0;
    for (int i = 0; i < targetCount; i++) {
        final double[] rowCounts = counts.getRow(i);
        for (int j = 0; j < columnCount; j++) {
            final double count = rowCounts[j];
            if (count < bottomPercentileThreshold) {
                counts.setEntry(i, j, bottomPercentileThreshold);
                bottomTruncatedCounts++;
            } else if (count > topPercentileThreshold) {
                counts.setEntry(i, j, topPercentileThreshold);
                topTruncatedCounts++;
            }
        }
    }
    final long truncatedCounts = topTruncatedCounts + bottomTruncatedCounts;
    if (truncatedCounts == 0) {
        logger.info(String.format("None of the %d counts were truncated as they all fall in the non-extreme range " + "[%.2f, %.2f]", totalCounts, bottomPercentileThreshold, topPercentileThreshold));
    } else {
        final double truncatedPercentage = ((double) truncatedCounts / totalCounts) * 100;
        logger.info(String.format("Some counts (%d out of %d, %.2f%%) were truncated as they fall out of the " + "non-extreme range [%.2f, %.2f]", truncatedCounts, totalCounts, truncatedPercentage, bottomPercentileThreshold, topPercentileThreshold));
    }
}
Also used : Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix)

Aggregations

Percentile (org.apache.commons.math3.stat.descriptive.rank.Percentile)31 ArrayList (java.util.ArrayList)16 RealMatrix (org.apache.commons.math3.linear.RealMatrix)16 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)14 List (java.util.List)11 Collectors (java.util.stream.Collectors)11 IntStream (java.util.stream.IntStream)11 File (java.io.File)10 DoubleStream (java.util.stream.DoubleStream)10 Median (org.apache.commons.math3.stat.descriptive.rank.Median)10 Logger (org.apache.logging.log4j.Logger)10 Test (org.testng.annotations.Test)10 Random (java.util.Random)9 Stream (java.util.stream.Stream)9 DescriptiveStatistics (org.apache.commons.math3.stat.descriptive.DescriptiveStatistics)9 Level (org.apache.logging.log4j.Level)8 Marker (org.apache.logging.log4j.Marker)8 Message (org.apache.logging.log4j.message.Message)8 AbstractLogger (org.apache.logging.log4j.spi.AbstractLogger)8 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)8