
Example 66 with ReadCountCollection

Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

The class HDF5PCACoveragePoNCreationUtilsUnitTest, method readCountOnlyWithDiverseShapeData.

@DataProvider(name = "readCountOnlyWithDiverseShapeData")
public Object[][] readCountOnlyWithDiverseShapeData() {
    final List<Object[]> result = new ArrayList<>(4);
    final Random rdn = new Random(31);
    // Four matrix shapes (targets x samples): 100x10, 100x100, 200x100 and 200x200.
    final int[] columnCounts = new int[] { 10, 100, 100, 200 };
    final int[] targetCounts = new int[] { 100, 100, 200, 200 };
    for (int k = 0; k < columnCounts.length; k++) {
        final List<String> columnNames = IntStream.range(0, columnCounts[k]).mapToObj(i -> "sample_" + (i + 1)).collect(Collectors.toList());
        final List<Target> targets = IntStream.range(0, targetCounts[k]).mapToObj(i -> new Target("target_" + (i + 1))).collect(Collectors.toList());
        final double[][] counts = new double[targetCounts[k]][columnCounts[k]];
        for (int i = 0; i < counts.length; i++) {
            for (int j = 0; j < counts[0].length; j++) {
                counts[i][j] = rdn.nextDouble();
            }
        }
        final ReadCountCollection readCounts = new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(counts, false));
        result.add(new Object[] { readCounts });
    }
    return result.toArray(new Object[result.size()][]);
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils)
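
For orientation, a minimal sketch of a test that could consume this provider (the test name is hypothetical; counts(), targets() and columnNames() are the ReadCountCollection accessors exercised in the other examples on this page):

@Test(dataProvider = "readCountOnlyWithDiverseShapeData")
public void testReadCountShape(final ReadCountCollection readCounts) {
    // One matrix row per target, one column per sample.
    Assert.assertEquals(readCounts.counts().getRowDimension(), readCounts.targets().size());
    Assert.assertEquals(readCounts.counts().getColumnDimension(), readCounts.columnNames().size());
}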

Example 67 with ReadCountCollection

Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

The class SparkGenomeReadCountsIntegrationTest, method testSparkGenomeReadCountsInterval.

@Test
public void testSparkGenomeReadCountsInterval() {
    final File outputFile = createTempFile(BAM_FILE.getName(), ".cov");
    final String[] arguments = {
        "--disableSequenceDictionaryValidation",
        "-" + StandardArgumentDefinitions.REFERENCE_SHORT_NAME, REFERENCE_FILE.getAbsolutePath(),
        "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, BAM_FILE.getAbsolutePath(),
        "-" + SparkGenomeReadCounts.OUTPUT_FILE_SHORT_NAME, outputFile.getAbsolutePath(),
        "-" + SparkGenomeReadCounts.BINSIZE_SHORT_NAME, "10000",
        "-L", "1"
    };
    runCommandLine(arguments);
    // proportional coverage: the -L interval restricts output to contig 1
    final ReadCountCollection proportionalCoverage = loadReadCountCollection(outputFile);
    Assert.assertTrue(proportionalCoverage.records().stream().noneMatch(t -> t.getContig().equals("2") || t.getContig().equals("3")));
    // raw coverage
    final ReadCountCollection rawCoverage = loadReadCountCollection(new File(outputFile.getAbsolutePath() + SparkGenomeReadCounts.RAW_COV_OUTPUT_EXTENSION));
    Assert.assertTrue(rawCoverage.records().stream().noneMatch(t -> t.getContig().equals("2") || t.getContig().equals("3")));
    final File targetsFile = new File(outputFile.getAbsolutePath() + ".targets.tsv");
    final List<Target> targets = TargetTableReader.readTargetFile(targetsFile);
    Assert.assertTrue(targets.stream().allMatch(t -> t.getContig().equals("1")));
}
Also used : List(java.util.List) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Assert(org.testng.Assert) Target(org.broadinstitute.hellbender.tools.exome.Target) TargetTableReader(org.broadinstitute.hellbender.tools.exome.TargetTableReader) ReadCountCollectionUtils(org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) Test(org.testng.annotations.Test) IOException(java.io.IOException) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) File(java.io.File)
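
The loadReadCountCollection helper is not shown on this page. A plausible minimal implementation, assuming the ReadCountCollectionUtils.parse(File) reader from the imports above (wrapping the checked IOException in a RuntimeException is an assumption, not the project's necessarily chosen exception type):

private static ReadCountCollection loadReadCountCollection(final File file) {
    try {
        // Parse the tab-separated coverage table written by SparkGenomeReadCounts.
        return ReadCountCollectionUtils.parse(file);
    } catch (final IOException e) {
        throw new RuntimeException("could not read " + file.getAbsolutePath(), e);
    }
}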

Example 68 with ReadCountCollection

Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

The class HDF5PCACoveragePoNCreationUtilsUnitTest, method testSubsetTargetToUsableOnes.

@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    // Independently recompute the expected filter: per-target row medians,
    // thresholded at the given percentile.
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] targetMedians = IntStream.range(0, counts.getRowDimension()).mapToDouble(i -> median.evaluate(counts.getRow(i))).toArray();
    final double threshold = new Percentile(percentile).evaluate(targetMedians);
    final Boolean[] toBeKept = DoubleStream.of(targetMedians).mapToObj(d -> d >= threshold).toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
    Assert.assertEquals(result.getRight().length, toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils)
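
A minimal standalone usage sketch of the utility under test, with concrete data (NULL_LOGGER is this test class's no-op logger; java.util.Arrays is assumed imported):

// Three targets, two samples; keep targets whose row median is at or above
// the 50th percentile of all per-target medians.
final List<Target> targets = Arrays.asList(new Target("t1"), new Target("t2"), new Target("t3"));
final List<String> samples = Arrays.asList("s1", "s2");
final RealMatrix counts = new Array2DRowRealMatrix(
        new double[][] { { 1.0, 2.0 }, { 10.0, 20.0 }, { 100.0, 200.0 } }, false);
final ReadCountCollection readCounts = new ReadCountCollection(targets, samples, counts);
final Pair<ReadCountCollection, double[]> subset =
        HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCounts, 50.0, NULL_LOGGER);
// Row medians are 1.5, 15 and 150; the 50th-percentile threshold is 15,
// so t2 and t3 survive and subset.getRight() holds { 15.0, 150.0 }.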

Example 69 with ReadCountCollection

Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

The class PCATangentNormalizationUtilsUnitTest, method normalizeReadCountByTargetFactorsData.

@DataProvider(name = "normalizeReadCountByTargetFactorsData")
public Object[][] normalizeReadCountByTargetFactorsData() {
    final List<Object[]> result = new ArrayList<>(1);
    @SuppressWarnings("serial") final List<Target> targets = new ArrayList<Target>() {

        {
            add(new Target("A"));
            add(new Target("B"));
            add(new Target("C"));
        }
    };
    @SuppressWarnings("serial") final List<String> columnNames = new ArrayList<String>() {

        {
            add("1");
            add("2");
            add("3");
        }
    };
    result.add(new Object[] { new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(new double[][] { new double[] { 1.1, 2.2, 3.3 }, new double[] { 0.1, 0.2, 0.3 }, new double[] { 11.1, 22.2, 33.3 } }, false)), new double[] { 100.0, 200.0, 300.0 } });
    return result.toArray(new Object[1][]);
}
Also used : Target(org.broadinstitute.hellbender.tools.exome.Target) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) ArrayList(java.util.ArrayList) Arrays(java.util.Arrays) List(java.util.List) DataProvider(org.testng.annotations.DataProvider)
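
The provider pairs a 3x3 count matrix with one factor per target. A sketch of the arithmetic the consuming test presumably verifies, under the assumption (suggested by the provider name) that normalization divides each target's row element-wise by its factor:

// Expected result of target-factor normalization for the provider's data.
final double[][] counts = { { 1.1, 2.2, 3.3 }, { 0.1, 0.2, 0.3 }, { 11.1, 22.2, 33.3 } };
final double[] factors = { 100.0, 200.0, 300.0 };
final double[][] expected = new double[counts.length][];
for (int i = 0; i < counts.length; i++) {
    final double factor = factors[i];
    expected[i] = DoubleStream.of(counts[i]).map(c -> c / factor).toArray();
}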

Example 70 with ReadCountCollection

Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

The class PCATangentNormalizationUtils, method tangentNormalizeSpark.

/**
     * Tangent normalize the case counts against the raw PoN data using Spark; the code here is a little more complex for optimization purposes.
     *
     *  Please see notes in docs/PoN ...
     *
     *  Ahat^T = (C^T P^T) A^T
     *  Therefore, C^T is the RowMatrix
     *
     *  pinv: P
     *  panel: A
     *  projection: Ahat
     *  cases: C
     *  betahat: C^T P^T
     *  tangentNormalizedCounts: C - Ahat
     */
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts, final RealMatrix reducedPanelCounts, final RealMatrix reducedPanelPInvCounts, final CaseToPoNTargetMapper targetMapper, final RealMatrix tangentNormalizationInputCounts, final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix)
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);
    // Spark local matrices (transposed)
    final Matrix pinvTLocalMat = new DenseMatrix(reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(), Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(), Doubles.concat(reducedPanelCounts.getData()), true).transpose();
    // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons matrix (not transposed)
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(projectionTDistMat, tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();
    // Subtract the projection from the cases
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);
    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(betahatDistMat, tangentNormalizedCounts.getColumnDimension());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
Also used : RowMatrix(org.apache.spark.mllib.linalg.distributed.RowMatrix) DenseMatrix(org.apache.spark.mllib.linalg.DenseMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Matrix(org.apache.spark.mllib.linalg.Matrix) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection)
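
For reference, the algebra in the javadoc collapses to three lines without Spark. A minimal local sketch using the same Commons Math RealMatrix type, assuming the targets-by-samples orientation above (the helper name is hypothetical; this mirrors the math, not necessarily the project's non-Spark code path):

private static RealMatrix tangentNormalizeLocal(final RealMatrix panel, final RealMatrix pinv, final RealMatrix cases) {
    // betahat = P * C: regression coefficients of the cases against the panel.
    final RealMatrix betahat = pinv.multiply(cases);
    // Ahat = A * betahat: the component of the cases explained by the panel.
    final RealMatrix projection = panel.multiply(betahat);
    // Tangent-normalized counts: C - Ahat.
    return cases.subtract(projection);
}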

Aggregations

ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection): 74 usages
Test (org.testng.annotations.Test): 48 usages
Target (org.broadinstitute.hellbender.tools.exome.Target): 40 usages
File (java.io.File): 30 usages
IOException (java.io.IOException): 30 usages
Collectors (java.util.stream.Collectors): 30 usages
List (java.util.List): 28 usages
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 28 usages
IntStream (java.util.stream.IntStream): 26 usages
Assert (org.testng.Assert): 26 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 24 usages
RealMatrix (org.apache.commons.math3.linear.RealMatrix): 22 usages
Median (org.apache.commons.math3.stat.descriptive.rank.Median): 22 usages
ArrayList (java.util.ArrayList): 20 usages
Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix): 20 usages
Logger (org.apache.logging.log4j.Logger): 20 usages
ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils): 20 usages
Mean (org.apache.commons.math3.stat.descriptive.moment.Mean): 18 usages
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 18 usages
DoubleStream (java.util.stream.DoubleStream): 16 usages