Search in sources :

Example 36 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

the class CoveragePoNQCUtils method getContigToMedianCRMap.

@VisibleForTesting
static Map<String, Double> getContigToMedianCRMap(final ReadCountCollection readCountCollection) {
    final List<String> allContigsPresent = retrieveAllContigsPresent(readCountCollection);
    final Map<String, Double> contigToMedian = new LinkedHashMap<>();
    for (String contig : allContigsPresent) {
        final ReadCountCollection oneContigReadCountCollection = readCountCollection.subsetTargets(readCountCollection.targets().stream().filter(t -> t.getContig().equals(contig)).collect(Collectors.toSet()));
        final double[] flatCounts = Doubles.concat(oneContigReadCountCollection.counts().getData());
        // Put into CRSpace
        final double[] flatCountsInCRSpace = DoubleStream.of(flatCounts).map(d -> Math.pow(2, d)).toArray();
        contigToMedian.put(contig, new Median().evaluate(flatCountsInCRSpace));
    }
    return contigToMedian;
}
Also used : Broadcast(org.apache.spark.broadcast.Broadcast) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) RealVector(org.apache.commons.math3.linear.RealVector) Collectors(java.util.stream.Collectors) DoubleStream(java.util.stream.DoubleStream) LinkedHashMap(java.util.LinkedHashMap) List(java.util.List) Logger(org.apache.logging.log4j.Logger) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Target(org.broadinstitute.hellbender.tools.exome.Target) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Map(java.util.Map) Doubles(com.google.common.primitives.Doubles) Utils(org.broadinstitute.hellbender.utils.Utils) VisibleForTesting(com.google.common.annotations.VisibleForTesting) LogManager(org.apache.logging.log4j.LogManager) Collections(java.util.Collections) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) LinkedHashMap(java.util.LinkedHashMap) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 37 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.

the class PCATangentNormalizationUtils method tangentNormalizeNonSpark.

/**
     * Tangent normalize given the raw PoN data without using Spark.
     */
private static PCATangentNormalizationResult tangentNormalizeNonSpark(final ReadCountCollection targetFactorNormalizedCounts, final RealMatrix reducedPanelCounts, final RealMatrix reducedPanelPInvCounts, final CaseToPoNTargetMapper targetMapper, final RealMatrix tangentNormalizationInputCounts) {
    // Calculate the beta-hats for the input read count columns (samples).
    logger.info("Calculating beta hats...");
    final RealMatrix tangentBetaHats = calculateBetaHats(reducedPanelPInvCounts, tangentNormalizationInputCounts, EPSILON);
    // Actual tangent normalization step.
    logger.info("Performing actual tangent normalization (" + tangentNormalizationInputCounts.getColumnDimension() + " columns)...");
    final RealMatrix tangentNormalizedCounts = tangentNormalize(reducedPanelCounts, tangentNormalizationInputCounts, tangentBetaHats);
    // Output the tangent normalized counts.
    logger.info("Post-processing tangent normalization results...");
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts);
}
Also used : RealMatrix(org.apache.commons.math3.linear.RealMatrix) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection)

Example 38 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class GCCorrector method correctCoverage.

/**
     *
     * @param inputCounts raw coverage before GC correction
     * @param gcContentByTarget      array of gc contents, one per target of the input
     * @return              GC-corrected coverage
     */
public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts, final double[] gcContentByTarget) {
    // each column (sample) has its own GC bias curve, hence its own GC corrector
    final List<GCCorrector> gcCorrectors = IntStream.range(0, inputCounts.columnNames().size()).mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n))).collect(Collectors.toList());
    // gc correct a copy of the input counts in-place
    final RealMatrix correctedCounts = inputCounts.counts().copy();
    correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(int target, int column, double coverage) {
            return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]);
        }
    });
    // we would like the average correction factor to be 1.0 in the sense that average coverage before and after
    // correction should be equal
    final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size()).mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm() / correctedCounts.getColumnVector(c).getL1Norm()).toArray();
    correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {

        @Override
        public double visit(int target, int column, double coverage) {
            return coverage * columnNormalizationFactors[column];
        }
    });
    return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts);
}
Also used : IntStream(java.util.stream.IntStream) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) List(java.util.List) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Utils(org.broadinstitute.hellbender.utils.Utils) RealMatrix(org.apache.commons.math3.linear.RealMatrix) RealVector(org.apache.commons.math3.linear.RealVector) ArrayRealVector(org.apache.commons.math3.linear.ArrayRealVector) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) RealMatrix(org.apache.commons.math3.linear.RealMatrix) DefaultRealMatrixChangingVisitor(org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection)

Example 39 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class CopyRatioSegmenterUnitTest method testSegmentation.

@Test
public void testSegmentation() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));
    final List<Double> trueWeights = Arrays.asList(0.2, 0.5, 0.3);
    final List<Double> trueLog2CopyRatios = Arrays.asList(-2.0, 0.0, 1.4);
    final double trueMemoryLength = 1e5;
    final double trueStandardDeviation = 0.2;
    final CopyRatioHMM trueModel = new CopyRatioHMM(trueLog2CopyRatios, trueWeights, trueMemoryLength, trueStandardDeviation);
    final int chainLength = 10000;
    final List<SimpleInterval> positions = randomPositions("chr1", chainLength, rng, trueMemoryLength / 4);
    final List<Integer> trueStates = trueModel.generateHiddenStateChain(positions);
    final List<Double> trueLog2CopyRatioSequence = trueStates.stream().map(n -> trueLog2CopyRatios.get(n)).collect(Collectors.toList());
    final List<Double> data = trueLog2CopyRatioSequence.stream().map(cr -> generateData(trueStandardDeviation, cr, rng)).collect(Collectors.toList());
    final List<Target> targets = positions.stream().map(Target::new).collect(Collectors.toList());
    final ReadCountCollection rcc = new ReadCountCollection(targets, Arrays.asList("SAMPLE"), new Array2DRowRealMatrix(data.stream().mapToDouble(x -> x).toArray()));
    final CopyRatioSegmenter segmenter = new CopyRatioSegmenter(10, rcc);
    final List<ModeledSegment> segments = segmenter.getModeledSegments();
    final double[] segmentCopyRatios = segments.stream().flatMap(s -> Collections.nCopies((int) s.getTargetCount(), s.getSegmentMeanInLog2CRSpace()).stream()).mapToDouble(x -> x).toArray();
    final double averageCopyRatioError = IntStream.range(0, trueLog2CopyRatioSequence.size()).mapToDouble(n -> Math.abs(segmentCopyRatios[n] - trueLog2CopyRatioSequence.get(n))).average().getAsDouble();
    Assert.assertEquals(averageCopyRatioError, 0, 0.025);
}
Also used : IntStream(java.util.stream.IntStream) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) java.util(java.util) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) ModeledSegment(org.broadinstitute.hellbender.tools.exome.ModeledSegment) Assert(org.testng.Assert) RandomGenerator(org.apache.commons.math3.random.RandomGenerator) RandomGeneratorFactory(org.apache.commons.math3.random.RandomGeneratorFactory) Target(org.broadinstitute.hellbender.tools.exome.Target) Test(org.testng.annotations.Test) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) RandomGenerator(org.apache.commons.math3.random.RandomGenerator) Target(org.broadinstitute.hellbender.tools.exome.Target) Array2DRowRealMatrix(org.apache.commons.math3.linear.Array2DRowRealMatrix) ModeledSegment(org.broadinstitute.hellbender.tools.exome.ModeledSegment) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Test(org.testng.annotations.Test)

Example 40 with ReadCountCollection

use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.

the class SparkGenomeReadCountsIntegrationTest method testSparkGenomeReadCountsSmallBins.

@Test
public void testSparkGenomeReadCountsSmallBins() throws IOException {
    final File outputFile = createTempFile(BAM_FILE.getName(), ".cov");
    final String[] arguments = { "--disableSequenceDictionaryValidation", "-" + StandardArgumentDefinitions.REFERENCE_SHORT_NAME, REFERENCE_FILE.getAbsolutePath(), "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, BAM_FILE.getAbsolutePath(), "-" + SparkGenomeReadCounts.OUTPUT_FILE_SHORT_NAME, outputFile.getAbsolutePath(), "-" + SparkGenomeReadCounts.BINSIZE_SHORT_NAME, "2000" };
    runCommandLine(arguments);
    Assert.assertTrue(outputFile.exists());
    Assert.assertTrue(outputFile.length() > 0);
    // Proportional Coverage
    final ReadCountCollection proportionalCoverage = ReadCountCollectionUtils.parse(outputFile);
    Assert.assertTrue(proportionalCoverage.records().stream().anyMatch(t -> Math.abs(t.getDouble(0)) > 1e-10));
    // The reads are all in three bins of contig 3 with values {.5, .25, .25}
    Assert.assertTrue(proportionalCoverage.records().stream().filter(t -> t.getContig().equals("3")).anyMatch(t -> Math.abs(t.getDouble(0)) > .2));
    Assert.assertTrue(Math.abs(proportionalCoverage.records().stream().filter(t -> t.getContig().equals("3")).mapToDouble(t -> t.getDouble(0)).sum() - 1.0) < 1e-10);
    // raw coverage
    final ReadCountCollection coverage = ReadCountCollectionUtils.parse(new File(outputFile.getAbsolutePath() + SparkGenomeReadCounts.RAW_COV_OUTPUT_EXTENSION));
    Assert.assertTrue(coverage.records().stream().anyMatch(t -> Math.abs(t.getDouble(0)) > 1e-10));
    // The reads are all in three bins of contig 3 with values
    Assert.assertEquals(coverage.records().stream().filter(t -> t.getContig().equals("3")).filter(t -> Math.abs(t.getDouble(0)) >= 1).count(), 3);
    final File targetsFile = new File(outputFile.getAbsolutePath() + ".targets.tsv");
    Assert.assertTrue(targetsFile.exists());
    Assert.assertTrue(targetsFile.length() > 0);
    final List<Target> targets = TargetTableReader.readTargetFile(targetsFile);
    // 4 is the number of contigs in the fasta file
    Assert.assertEquals(targets.size(), 16000 / 2000 * 4);
    Assert.assertEquals(targets.get(1).getEnd(), 4000);
    Assert.assertEquals(targets.get(2).getName(), "target_1_4001_6000");
    Assert.assertEquals(targets.get(8).getName(), "target_2_1_2000");
    Assert.assertEquals(targets.get(17).getName(), "target_3_2001_4000");
    Assert.assertEquals(proportionalCoverage.targets().size(), targets.size());
}
Also used : List(java.util.List) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) Assert(org.testng.Assert) Target(org.broadinstitute.hellbender.tools.exome.Target) TargetTableReader(org.broadinstitute.hellbender.tools.exome.TargetTableReader) ReadCountCollectionUtils(org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) Test(org.testng.annotations.Test) IOException(java.io.IOException) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) File(java.io.File) Target(org.broadinstitute.hellbender.tools.exome.Target) ReadCountCollection(org.broadinstitute.hellbender.tools.exome.ReadCountCollection) File(java.io.File) Test(org.testng.annotations.Test) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest)

Aggregations

ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection)74 Test (org.testng.annotations.Test)48 Target (org.broadinstitute.hellbender.tools.exome.Target)40 File (java.io.File)30 IOException (java.io.IOException)30 Collectors (java.util.stream.Collectors)30 List (java.util.List)28 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)28 IntStream (java.util.stream.IntStream)26 Assert (org.testng.Assert)26 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)24 RealMatrix (org.apache.commons.math3.linear.RealMatrix)22 Median (org.apache.commons.math3.stat.descriptive.rank.Median)22 ArrayList (java.util.ArrayList)20 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)20 Logger (org.apache.logging.log4j.Logger)20 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)20 Mean (org.apache.commons.math3.stat.descriptive.moment.Mean)18 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)18 DoubleStream (java.util.stream.DoubleStream)16