Search in sources :

Example 51 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class CoveragePoNQCUtilsUnitTest method testIdentifySamplesWithSuspiciousContigsAmpsWithSpark.

@Test
public void testIdentifySamplesWithSuspiciousContigsAmpsWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Set<String> gtBlacklistSamples = new HashSet<>();
    gtBlacklistSamples.add("sample_1");
    gtBlacklistSamples.add("sample_2");
    gtBlacklistSamples.add("sample_3");
    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_AMP);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_AMP, ioe);
    }
    final JavaRDD<ReadCountCollection> allSampleTangentNormalizedReadCounts = CoveragePoNQCUtils.createParallelIndividualReadCountCollections(allCoverageProfiles, ctx);
    // By the time we are here, input is assumed to have been tangent normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(allSampleTangentNormalizedReadCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));
    final Set<String> resultSamples = new HashSet<>(blacklistSamples);
    Assert.assertEquals(resultSamples.size(), gtBlacklistSamples.size());
    Assert.assertEquals(Sets.difference(resultSamples, gtBlacklistSamples).size(), 0);
}
Also used : ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) IOException(java.io.IOException) HashSet(java.util.HashSet) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 52 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class CoveragePoNQCUtilsUnitTest method testIdentifySamplesWithSuspiciousContigsDels.

@Test
public void testIdentifySamplesWithSuspiciousContigsDels() {
    final Set<String> gtBlacklistSamples = new HashSet<>();
    gtBlacklistSamples.add("sample_1");
    gtBlacklistSamples.add("sample_2");
    gtBlacklistSamples.add("sample_3");
    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_DEL);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_DEL, ioe);
    }
    final List<ReadCountCollection> singleSampleTangentNormalizedReadCounts = CoveragePoNQCUtils.createIndividualReadCountCollections(allCoverageProfiles);
    // By the time we are here, input is assumed to have been tangent normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(singleSampleTangentNormalizedReadCounts, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));
    final Set<String> resultSamples = new HashSet<>(blacklistSamples);
    Assert.assertEquals(resultSamples.size(), gtBlacklistSamples.size());
    Assert.assertEquals(Sets.difference(resultSamples, gtBlacklistSamples).size(), 0);
}
Also used : ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) IOException(java.io.IOException) HashSet(java.util.HashSet) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 53 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method testNormalizeAndLogReadCounts.

@Test(dataProvider = "readCountOnlyData")
public void testNormalizeAndLogReadCounts(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(i -> median.evaluate(counts.getColumn(i))).toArray();
    final double epsilon = HDF5PCACoveragePoNCreationUtils.EPSILON;
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] /= columnMedians[j];
            if (expected[i][j] < epsilon) {
                expected[i][j] = epsilon;
            }
            expected[i][j] = Math.log(expected[i][j]) / Math.log(2);
        }
    }
    HDF5PCACoveragePoNCreationUtils.normalizeAndLogReadCounts(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Median(org.apache.commons.math3.stat.descriptive.rank.Median) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 54 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method simpleEigensampleData.

@DataProvider(name = "singleEigensample")
public Object[][] simpleEigensampleData() {
    final List<Object[]> result = new ArrayList<>();
    final int NUM_TARGETS = 10;
    final int NUM_SAMPLES = 5;
    final List<Target> targets = IntStream.range(0, NUM_TARGETS).boxed().map(i -> new Target("target_" + i, new SimpleInterval("1", 100 * i + 1, 100 * i + 5))).collect(Collectors.toList());
    final List<String> columnNames = IntStream.range(0, NUM_SAMPLES).boxed().map(i -> "sample_" + i).collect(Collectors.toList());
    double[][] countsArray = new double[NUM_TARGETS][NUM_SAMPLES];
    final RealMatrix counts = new Array2DRowRealMatrix(countsArray);
    // All row data is the same (0,1,2,3,4...)
    final double[] rowData = IntStream.range(0, NUM_SAMPLES).boxed().mapToDouble(i -> i).toArray();
    for (int i = 0; i < NUM_TARGETS; i++) {
        counts.setRow(i, rowData);
    }
    new ReadCountCollection(targets, columnNames, counts);
    result.add(new Object[] { new ReadCountCollection(targets, columnNames, counts) });
    return result.toArray(new Object[result.size()][]);
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) ArrayList(java.util.ArrayList) Target(org.broadinstitute.hellbender.tools.exome.Target) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) DataProvider(org.testng.annotations.DataProvider)

Example 55 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method testSubsetTargetToUsableOnes.

@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] targetMedians = IntStream.range(0, counts.getRowDimension()).mapToDouble(i -> median.evaluate(counts.getRow(i))).toArray();
    final double threshold = new Percentile(percentile).evaluate(targetMedians);
    final Boolean[] toBeKept = DoubleStream.of(targetMedians).mapToObj(d -> d >= threshold).toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
    Assert.assertEquals(result.getRight().length, toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Aggregations

ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection)74 Test (org.testng.annotations.Test)48 Target (org.broadinstitute.hellbender.tools.exome.Target)40 File (java.io.File)30 IOException (java.io.IOException)30 Collectors (java.util.stream.Collectors)30 List (java.util.List)28 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)28 IntStream (java.util.stream.IntStream)26 Assert (org.testng.Assert)26 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)24 RealMatrix (org.apache.commons.math3.linear.RealMatrix)22 Median (org.apache.commons.math3.stat.descriptive.rank.Median)22 ArrayList (java.util.ArrayList)20 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)20 Logger (org.apache.logging.log4j.Logger)20 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)20 Mean (org.apache.commons.math3.stat.descriptive.moment.Mean)18 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)18 DoubleStream (java.util.stream.DoubleStream)16