Search in sources :

Example 1 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class CoveragePoNQCUtils method hasSuspiciousContigs.

/**
     *  Given a single sample tangent normalization (or other coverage profile), determine whether any contig looks like
     *   it has an arm level event (defined as 25% (or more) of the contig amplified/deleted)
     *
     * @param singleSampleTangentNormalized Tangent normalized data for a single sample.
     * @return never {@code null}
     */
private static Boolean hasSuspiciousContigs(final ReadCountCollection singleSampleTangentNormalized, final Map<String, Double> contigToMedian) {
    final List<String> allContigsPresent = retrieveAllContigsPresent(singleSampleTangentNormalized);
    for (String contig : allContigsPresent) {
        final ReadCountCollection oneContigReadCountCollection = singleSampleTangentNormalized.subsetTargets(singleSampleTangentNormalized.targets().stream().filter(t -> t.getContig().equals(contig)).collect(Collectors.toSet()));
        final RealVector counts = oneContigReadCountCollection.counts().getColumnVector(0);
        for (int i = 0; i < 4; i++) {
            final RealVector partitionCounts = counts.getSubVector(i * counts.getDimension() / 4, counts.getDimension() / 4);
            final double[] partitionArray = DoubleStream.of(partitionCounts.toArray()).map(d -> Math.pow(2, d)).sorted().toArray();
            double median = new Median().evaluate(partitionArray);
            final double medianShiftInCRSpace = contigToMedian.getOrDefault(contig, 1.0) - 1.0;
            median -= medianShiftInCRSpace;
            if ((median > AMP_THRESHOLD) || (median < DEL_THRESHOLD)) {
                logger.info("Suspicious contig: " + singleSampleTangentNormalized.columnNames().get(0) + " " + contig + " (" + median + " -- " + i + ")");
                return true;
            }
        }
    }
    return false;
}
Also used : RealVector(org.apache.commons.math3.linear.RealVector) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median)

Example 2 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class CoveragePoNQCUtils method getContigToMedianCRMap.

@VisibleForTesting
static Map<String, Double> getContigToMedianCRMap(final ReadCountCollection readCountCollection) {
    final List<String> allContigsPresent = retrieveAllContigsPresent(readCountCollection);
    final Map<String, Double> contigToMedian = new LinkedHashMap<>();
    for (String contig : allContigsPresent) {
        final ReadCountCollection oneContigReadCountCollection = readCountCollection.subsetTargets(readCountCollection.targets().stream().filter(t -> t.getContig().equals(contig)).collect(Collectors.toSet()));
        final double[] flatCounts = Doubles.concat(oneContigReadCountCollection.counts().getData());
        // Put into CRSpace
        final double[] flatCountsInCRSpace = DoubleStream.of(flatCounts).map(d -> Math.pow(2, d)).toArray();
        contigToMedian.put(contig, new Median().evaluate(flatCountsInCRSpace));
    }
    return contigToMedian;
}
Also used : Broadcast(org.apache.spark.broadcast.Broadcast) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) RealVector(org.apache.commons.math3.linear.RealVector) Collectors(java.util.stream.Collectors) DoubleStream(java.util.stream.DoubleStream) LinkedHashMap(java.util.LinkedHashMap) List(java.util.List) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Target(org.broadinstitute.hellbender.tools.exome.Target) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Map(java.util.Map) Doubles(com.google.common.primitives.Doubles) Utils(org.broadinstitute.hellbender.utils.Utils) VisibleForTesting(com.google.common.annotations.VisibleForTesting) LogManager(org.apache.logging.log4j.LogManager) Collections(java.util.Collections) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) LinkedHashMap(java.util.LinkedHashMap) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class PCATangentNormalizationUtils method tangentNormalizeSpark.

/**
     * Tangent normalize given the raw PoN data using Spark:  the code here is a little more complex for optimization purposes.
     *
     *  Please see notes in docs/PoN ...
     *
     *  Ahat^T = (C^T P^T) A^T
     *  Therefore, C^T is the RowMatrix
     *
     *  pinv: P
     *  panel: A
     *  projection: Ahat
     *  cases: C
     *  betahat: C^T P^T
     *  tangentNormalizedCounts: C - Ahat
     */
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts, final RealMatrix reducedPanelCounts, final RealMatrix reducedPanelPInvCounts, final CaseToPoNTargetMapper targetMapper, final RealMatrix tangentNormalizationInputCounts, final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix)
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);
    // Spark local matrices (transposed)
    final Matrix pinvTLocalMat = new DenseMatrix(reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(), Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(), Doubles.concat(reducedPanelCounts.getData()), true).transpose();
    // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons matrix (not transposed)
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(projectionTDistMat, tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();
    // Subtract the projection from the cases
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);
    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(betahatDistMat, tangentNormalizedCounts.getColumnDimension());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
Also used : RowMatrix(org.apache.spark.mllib.linalg.distributed.RowMatrix) DenseMatrix(org.apache.spark.mllib.linalg.DenseMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Matrix(org.apache.spark.mllib.linalg.Matrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) RowMatrix(org.apache.spark.mllib.linalg.distributed.RowMatrix) DenseMatrix(org.apache.spark.mllib.linalg.DenseMatrix)

Example 4 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class PCATangentNormalizationUtils method tangentNormalizeNonSpark.

/**
     * Tangent normalize given the raw PoN data without using Spark.
     */
private static PCATangentNormalizationResult tangentNormalizeNonSpark(final ReadCountCollection targetFactorNormalizedCounts, final RealMatrix reducedPanelCounts, final RealMatrix reducedPanelPInvCounts, final CaseToPoNTargetMapper targetMapper, final RealMatrix tangentNormalizationInputCounts) {
    // Calculate the beta-hats for the input read count columns (samples).
    logger.info("Calculating beta hats...");
    final RealMatrix tangentBetaHats = calculateBetaHats(reducedPanelPInvCounts, tangentNormalizationInputCounts, EPSILON);
    // Actual tangent normalization step.
    logger.info("Performing actual tangent normalization (" + tangentNormalizationInputCounts.getColumnDimension() + " columns)...");
    final RealMatrix tangentNormalizedCounts = tangentNormalize(reducedPanelCounts, tangentNormalizationInputCounts, tangentBetaHats);
    // Output the tangent normalized counts.
    logger.info("Post-processing tangent normalization results...");
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts);
}
Also used : RealMatrix(org.apache.commons.math3.linear.RealMatrix) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection)

Example 5 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

the class HDF5PCACoveragePoNCreationUtilsUnitTest method testSubtractMedianOfMedians.

@Test(dataProvider = "readCountOnlyData")
public void testSubtractMedianOfMedians(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension()).mapToDouble(i -> median.evaluate(counts.getColumn(i))).toArray();
    final double center = median.evaluate(columnMedians);
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] -= center;
        }
    }
    HDF5PCACoveragePoNCreationUtils.subtractMedianOfMedians(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) SVD(org.broadinstitute.hellbender.utils.svd.SVD) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Level(org.apache.logging.log4j.Level) MatrixSummaryUtils(org.broadinstitute.hellbender.utils.MatrixSummaryUtils) Test(org.testng.annotations.Test) Random(java.util.Random) OptionalInt(java.util.OptionalInt) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) ArrayList(java.util.ArrayList) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) Pair(org.apache.commons.lang3.tuple.Pair) Message(org.apache.logging.log4j.message.Message) Assert(org.testng.Assert) Median(org.apache.commons.math3.stat.descriptive.rank.Median) HDF5File(org.broadinstitute.hdf5.HDF5File) Marker(org.apache.logging.log4j.Marker) AbstractLogger(org.apache.logging.log4j.spi.AbstractLogger) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Stream(java.util.stream.Stream) Target(org.broadinstitute.hellbender.tools.exome.Target) SVDFactory(org.broadinstitute.hellbender.utils.svd.SVDFactory) RealMatrix(org.apache.commons.math3.linear.RealMatrix) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) PoNTestUtils(org.broadinstitute.hellbender.tools.pon.PoNTestUtils) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) RealMatrix(org.apache.commons.math3.linear.RealMatrix) Median(org.apache.commons.math3.stat.descriptive.rank.Median) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Aggregations

ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection)74 Test (org.testng.annotations.Test)48 Target (org.broadinstitute.hellbender.tools.exome.Target)40 File (java.io.File)30 IOException (java.io.IOException)30 Collectors (java.util.stream.Collectors)30 List (java.util.List)28 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)28 IntStream (java.util.stream.IntStream)26 Assert (org.testng.Assert)26 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)24 RealMatrix (org.apache.commons.math3.linear.RealMatrix)22 Median (org.apache.commons.math3.stat.descriptive.rank.Median)22 ArrayList (java.util.ArrayList)20 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)20 Logger (org.apache.logging.log4j.Logger)20 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)20 Mean (org.apache.commons.math3.stat.descriptive.moment.Mean)18 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)18 DoubleStream (java.util.stream.DoubleStream)16