
Example 91 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.

From the class ReadsSparkSourceUnitTest, method doLoadReads:

private void doLoadReads(String bam, String referencePath, ValidationStringency validationStringency) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    ReadsSparkSource readSource = new ReadsSparkSource(ctx, validationStringency);
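    // getSerialReads is a helper defined elsewhere in this test class (not shown here);
    // going by its name, it loads the same BAM without Spark input splitting, for comparison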
    JavaRDD<GATKRead> rddSerialReads = getSerialReads(ctx, bam, referencePath, validationStringency);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(bam, referencePath);
    List<GATKRead> serialReads = rddSerialReads.collect();
    List<GATKRead> parallelReads = rddParallelReads.collect();
    Assert.assertEquals(serialReads.size(), parallelReads.size());
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
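SparkContextFactory.getTestSparkContext() is GATK's factory for a shared local-mode context. A minimal sketch of an equivalent hand-rolled test context using only the stock Spark API (the app name and master string here are illustrative, not GATK's actual settings):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class LocalContextSketch {
    public static void main(String[] args) {
        // local[2] runs Spark in-process with two worker threads; no cluster needed
        SparkConf conf = new SparkConf()
                .setAppName("reads-source-test")   // illustrative name
                .setMaster("local[2]");
        JavaSparkContext ctx = new JavaSparkContext(conf);
        try {
            long n = ctx.parallelize(Arrays.asList(1, 2, 3)).count();
            System.out.println("count = " + n);    // 3
        } finally {
            ctx.stop();                            // always release the context
        }
    }
}

A shared factory like GATK's avoids paying context start-up cost in every test and sidesteps Spark's one-context-per-JVM restriction.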

Example 92 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.

From the class ReadsSparkSourceUnitTest, method testCRAMReferenceFromHDFS:

@Test(groups = "spark")
public void testCRAMReferenceFromHDFS() throws Exception {
    final File cram = new File(NA12878_chr17_1k_CRAM);
    final File reference = new File(v37_chr17_1Mb_Reference);
    final File referenceIndex = new File(v37_chr17_1Mb_Reference + ".fai");
    MiniClusterUtils.runOnIsolatedMiniCluster(cluster -> {
        final Path workingDirectory = MiniClusterUtils.getWorkingDir(cluster);
        final Path cramHDFSPath = new Path(workingDirectory, "hdfs.cram");
        final Path refHDFSPath = new Path(workingDirectory, "hdfs.fasta");
        final Path refIndexHDFSPath = new Path(workingDirectory, "hdfs.fasta.fai");
        cluster.getFileSystem().copyFromLocalFile(new Path(cram.toURI()), cramHDFSPath);
        cluster.getFileSystem().copyFromLocalFile(new Path(reference.toURI()), refHDFSPath);
        cluster.getFileSystem().copyFromLocalFile(new Path(referenceIndex.toURI()), refIndexHDFSPath);
        final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
        final ReadsSparkSource readsSparkSource = new ReadsSparkSource(ctx);
        final List<GATKRead> localReads = readsSparkSource.getParallelReads(cram.toURI().toString(), reference.toURI().toString()).collect();
        final List<GATKRead> hdfsReads = readsSparkSource.getParallelReads(cramHDFSPath.toUri().toString(), refHDFSPath.toUri().toString()).collect();
        Assert.assertFalse(localReads.isEmpty());
        Assert.assertEquals(localReads, hdfsReads);
    });
}
Also used : Path(org.apache.hadoop.fs.Path) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) File(java.io.File) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
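The staging pattern above (copy local files into the cluster, then address them by HDFS URI) is plain Hadoop FileSystem API. A minimal standalone sketch, assuming a reachable HDFS at the default fs.defaultFS; the file names are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsStagingSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();             // picks up fs.defaultFS from core-site.xml
        FileSystem fs = FileSystem.get(conf);
        Path local = new Path("file:///tmp/example.cram");    // illustrative local file
        Path remote = new Path("/staging/example.cram");      // illustrative HDFS destination
        fs.copyFromLocalFile(local, remote);
        System.out.println("staged: " + fs.exists(remote));
        System.out.println("uri: " + remote.toUri());         // the form of URI handed to getParallelReads above
    }
}

Note that the test copies the .fai index alongside the FASTA: CRAM decoding needs the reference, and the reference reader expects its index next to it under the same base name.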

Example 93 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.

From the class RangePartitionCoalescerUnitTest, method setup:

@BeforeTest
public void setup() {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    rdd = ctx.parallelize(ImmutableList.of("a", "b", "c"), 3);
    partitions = rdd.rdd().partitions();
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BeforeTest(org.testng.annotations.BeforeTest)
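parallelize takes an explicit slice count, so the three-element list above lands in exactly three partitions, which is what the coalescer test needs to inspect. A self-contained sketch of the same idea:

import java.util.Arrays;
import org.apache.spark.Partition;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class PartitionCountSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("partitions").setMaster("local[2]");
        // JavaSparkContext is Closeable, so try-with-resources stops it for us
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            // the second argument is the number of slices (partitions) to create
            JavaRDD<String> rdd = ctx.parallelize(Arrays.asList("a", "b", "c"), 3);
            Partition[] partitions = rdd.rdd().partitions();  // drop to the Scala RDD for the raw partitions
            System.out.println(partitions.length);            // 3
        }
    }
}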

Example 94 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.

From the class AlleleFractionModellerUnitTest, method testMCMC:

private void testMCMC(final double meanBiasSimulated, final double biasVarianceSimulated, final double meanBiasExpected, final double biasVarianceExpected, final AllelicPanelOfNormals allelicPoN) {
    LoggingUtils.setLoggingLevel(Log.LogLevel.INFO);
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final int numSamples = 150;
    final int numBurnIn = 50;
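    // only post-burn-in draws are kept, so every sample list asserted below
    // should have numSamples - numBurnIn = 100 entries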
    final double averageHetsPerSegment = 50;
    final int numSegments = 100;
    final int averageDepth = 50;
    final double outlierProbability = 0.02;
    // note: the following tolerances could actually be made much smaller if we used more segments and/or
    // more hets -- most of the error is the sampling error of a finite simulated data set, not numerical error of MCMC
    final double minorFractionTolerance = 0.02;
    final double meanBiasTolerance = 0.02;
    final double biasVarianceTolerance = 0.01;
    final double outlierProbabilityTolerance = 0.02;
    final AlleleFractionSimulatedData simulatedData = new AlleleFractionSimulatedData(averageHetsPerSegment, numSegments, averageDepth, meanBiasSimulated, biasVarianceSimulated, outlierProbability);
    final AlleleFractionModeller modeller = new AlleleFractionModeller(simulatedData.getSegmentedGenome(), allelicPoN);
    modeller.fitMCMC(numSamples, numBurnIn);
    final List<Double> meanBiasSamples = modeller.getmeanBiasSamples();
    Assert.assertEquals(meanBiasSamples.size(), numSamples - numBurnIn);
    final List<Double> biasVarianceSamples = modeller.getBiasVarianceSamples();
    Assert.assertEquals(biasVarianceSamples.size(), numSamples - numBurnIn);
    final List<Double> outlierProbabilitySamples = modeller.getOutlierProbabilitySamples();
    Assert.assertEquals(outlierProbabilitySamples.size(), numSamples - numBurnIn);
    final List<AlleleFractionState.MinorFractions> minorFractionsSamples = modeller.getMinorFractionsSamples();
    Assert.assertEquals(minorFractionsSamples.size(), numSamples - numBurnIn);
    for (final AlleleFractionState.MinorFractions sample : minorFractionsSamples) {
        Assert.assertEquals(sample.size(), numSegments);
    }
    final List<List<Double>> minorFractionsSamplesBySegment = modeller.getMinorFractionSamplesBySegment();
    final double mcmcMeanBias = meanBiasSamples.stream().mapToDouble(x -> x).average().getAsDouble();
    final double mcmcBiasVariance = biasVarianceSamples.stream().mapToDouble(x -> x).average().getAsDouble();
    final double mcmcOutlierProbability = outlierProbabilitySamples.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Double> mcmcMinorFractions = minorFractionsSamplesBySegment.stream().map(list -> list.stream().mapToDouble(x -> x).average().getAsDouble()).collect(Collectors.toList());
    double totalSegmentError = 0.0;
    for (int segment = 0; segment < numSegments; segment++) {
        totalSegmentError += Math.abs(mcmcMinorFractions.get(segment) - simulatedData.getTrueState().segmentMinorFraction(segment));
    }
    Assert.assertEquals(mcmcMeanBias, meanBiasExpected, meanBiasTolerance);
    Assert.assertEquals(mcmcBiasVariance, biasVarianceExpected, biasVarianceTolerance);
    Assert.assertEquals(mcmcOutlierProbability, outlierProbability, outlierProbabilityTolerance);
    Assert.assertEquals(totalSegmentError / numSegments, 0.0, minorFractionTolerance);
    // test posterior summaries
    final Map<AlleleFractionParameter, PosteriorSummary> globalParameterPosteriorSummaries = modeller.getGlobalParameterPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);
    final PosteriorSummary meanBiasPosteriorSummary = globalParameterPosteriorSummaries.get(AlleleFractionParameter.MEAN_BIAS);
    final double meanBiasPosteriorCenter = meanBiasPosteriorSummary.getCenter();
    Assert.assertEquals(meanBiasPosteriorCenter, meanBiasExpected, meanBiasTolerance);
    final PosteriorSummary biasVariancePosteriorSummary = globalParameterPosteriorSummaries.get(AlleleFractionParameter.BIAS_VARIANCE);
    final double biasVariancePosteriorCenter = biasVariancePosteriorSummary.getCenter();
    Assert.assertEquals(biasVariancePosteriorCenter, biasVarianceExpected, biasVarianceTolerance);
    final PosteriorSummary outlierProbabilityPosteriorSummary = globalParameterPosteriorSummaries.get(AlleleFractionParameter.OUTLIER_PROBABILITY);
    final double outlierProbabilityPosteriorCenter = outlierProbabilityPosteriorSummary.getCenter();
    Assert.assertEquals(outlierProbabilityPosteriorCenter, outlierProbability, outlierProbabilityTolerance);
    final List<PosteriorSummary> minorAlleleFractionPosteriorSummaries = modeller.getMinorAlleleFractionsPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);
    final List<Double> minorFractionsPosteriorCenters = minorAlleleFractionPosteriorSummaries.stream().map(PosteriorSummary::getCenter).collect(Collectors.toList());
    double totalPosteriorCentersSegmentError = 0.0;
    for (int segment = 0; segment < numSegments; segment++) {
        totalPosteriorCentersSegmentError += Math.abs(minorFractionsPosteriorCenters.get(segment) - simulatedData.getTrueState().segmentMinorFraction(segment));
    }
    Assert.assertEquals(totalPosteriorCentersSegmentError / numSegments, 0.0, minorFractionTolerance);
}
Also used : Arrays(java.util.Arrays) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Genome(org.broadinstitute.hellbender.tools.exome.Genome) Test(org.testng.annotations.Test) SegmentUtils(org.broadinstitute.hellbender.tools.exome.SegmentUtils) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) Log(htsjdk.samtools.util.Log) Assert(org.testng.Assert) PosteriorSummary(org.broadinstitute.hellbender.utils.mcmc.PosteriorSummary) Map(java.util.Map) AllelicPanelOfNormals(org.broadinstitute.hellbender.tools.pon.allelic.AllelicPanelOfNormals) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) SegmentedGenome(org.broadinstitute.hellbender.tools.exome.SegmentedGenome) AllelicCountCollection(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCountCollection) LoggingUtils(org.broadinstitute.hellbender.utils.LoggingUtils)
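The posterior point estimates above are plain sample means taken with the Stream API, and the per-segment check is a mean absolute error against the simulated truth. The same two idioms in isolation (all values here are made up for illustration):

import java.util.Arrays;
import java.util.List;

public class PosteriorMeanSketch {
    public static void main(String[] args) {
        List<Double> samples = Arrays.asList(1.05, 1.10, 1.08, 1.11);  // made-up MCMC draws
        // posterior mean as used for mcmcMeanBias and friends above
        double posteriorMean = samples.stream().mapToDouble(x -> x).average().getAsDouble();

        double[] estimated = {0.48, 0.33, 0.51};   // made-up per-segment estimates
        double[] truth     = {0.50, 0.33, 0.50};   // made-up simulated truth
        double totalError = 0.0;
        for (int i = 0; i < estimated.length; i++) {
            totalError += Math.abs(estimated[i] - truth[i]);
        }
        // asserting this against 0.0 with a tolerance, as the test does,
        // bounds the average per-segment error rather than the worst case
        double meanAbsoluteError = totalError / estimated.length;
        System.out.printf("mean = %.4f, MAE = %.4f%n", posteriorMean, meanAbsoluteError);
    }
}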

Example 95 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.

From the class AlleleFractionModellerUnitTest, method testBiasCorrection:

/**
     * Tests that the allelic PoN is appropriately used to correct reference bias.  The basic setup for the test data is
     * simulated hets at 1000 sites (1:1-1000) across 3 segments.  The outer two segments are balanced with
     * minor-allele fraction = 0.5; however, in the middle segment consisting of 100 sites (1:451-550), all of the sites
     *
     * <p>
     *     1) are balanced and have biases identical to the sites in the other two segments,
     *     which are drawn from a gamma distribution with alpha = 65, beta = 60 -> mean bias = 1.083 ("SAMPLE_NORMAL")
     * </p>
     *
     * <p>
     *     2) are balanced and have relatively high biases,
     *     which are drawn from a gamma distribution with alpha = 9, beta = 6 -> mean bias = 1.5 ("SAMPLE_WITH_BAD_SNPS")
     * </p>
     *
     * <p>
     *     3) have minor-allele fraction = 0.33, copy ratio = 1.5, and biases identical to the sites in the other two segments,
     *     which are drawn from a gamma distribution with alpha = 65, beta = 60 -> mean bias = 1.083 ("SAMPLE_EVENT").
     * </p>
     *
     * In this segment, using a PoN that doesn't know about the high reference bias at these sites ("ALLELIC_PON_NORMAL"),
     * we should infer a minor-allele fraction of 6 / (6 + 9) = 0.40 in scenario 2; however, with a PoN that does know
     * about the high bias at these sites ("ALLELIC_PON_WITH_BAD_SNPS"), we correctly infer that all of the segments are balanced.
     *
     * <p>
     *     Note that alpha and beta are not actually correctly recovered in this PoN via MLE because the biases are
     *     drawn from a mixture of gamma distributions (as opposed to a single gamma distribution as assumed in the model).
     *     TODO https://github.com/broadinstitute/gatk-protected/issues/421
     * </p>
     */
@Test(dataProvider = "biasCorrection")
public void testBiasCorrection(final AllelicCountCollection sample, final AllelicPanelOfNormals allelicPoN, final double minorFractionExpectedInMiddleSegment) {
    LoggingUtils.setLoggingLevel(Log.LogLevel.INFO);
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final double minorFractionTolerance = 0.025;
    final Genome genome = new Genome(AlleleFractionSimulatedData.TRIVIAL_TARGETS, sample.getCounts());
    final List<SimpleInterval> segments = SegmentUtils.readIntervalsFromSegmentFile(SEGMENTS_FILE);
    final SegmentedGenome segmentedGenome = new SegmentedGenome(segments, genome);
    final int numSamples = 150;
    final int numBurnIn = 50;
    final AlleleFractionModeller modeller = new AlleleFractionModeller(segmentedGenome, allelicPoN);
    modeller.fitMCMC(numSamples, numBurnIn);
    final List<PosteriorSummary> minorAlleleFractionPosteriorSummaries = modeller.getMinorAlleleFractionsPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);
    final List<Double> minorFractionsResult = minorAlleleFractionPosteriorSummaries.stream().map(PosteriorSummary::getCenter).collect(Collectors.toList());
    final double minorFractionBalanced = 0.5;
    final List<Double> minorFractionsExpected = Arrays.asList(minorFractionBalanced, minorFractionExpectedInMiddleSegment, minorFractionBalanced);
    for (int segment = 0; segment < 3; segment++) {
        Assert.assertEquals(minorFractionsResult.get(segment), minorFractionsExpected.get(segment), minorFractionTolerance);
    }
}
Also used : PosteriorSummary(org.broadinstitute.hellbender.utils.mcmc.PosteriorSummary) SegmentedGenome(org.broadinstitute.hellbender.tools.exome.SegmentedGenome) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Genome(org.broadinstitute.hellbender.tools.exome.Genome) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
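The 6 / (6 + 9) = 0.40 figure in the javadoc is the apparent minor-allele fraction when a reference bias b goes uncorrected: reads split in the ratio f : (1 - f) * b, giving f / (f + (1 - f) * b). A quick check of that arithmetic; the formula is restated here as a sketch consistent with the javadoc's numbers, not quoted from the GATK source:

public class BiasArithmeticSketch {
    public static void main(String[] args) {
        double f = 0.5;                  // true (balanced) minor-allele fraction
        double alpha = 9.0, beta = 6.0;  // gamma parameters from the javadoc
        double bias = alpha / beta;      // mean reference bias = 1.5
        // apparent minor-allele fraction when the PoN does not correct the bias
        double apparent = f / (f + (1 - f) * bias);
        System.out.println(apparent);    // 0.4, i.e. 6 / (6 + 9)
    }
}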

Aggregations

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 260 usages
Test (org.testng.annotations.Test): 65 usages
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 64 usages
SparkConf (org.apache.spark.SparkConf): 49 usages
Tuple2 (scala.Tuple2): 48 usages
ArrayList (java.util.ArrayList): 45 usages
Test (org.junit.Test): 43 usages
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 32 usages
List (java.util.List): 28 usages
Configuration (org.apache.hadoop.conf.Configuration): 24 usages
JavaRDD (org.apache.spark.api.java.JavaRDD): 24 usages
File (java.io.File): 23 usages
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 20 usages
Collectors (java.util.stream.Collectors): 16 usages
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline): 15 usages
DataSet (org.nd4j.linalg.dataset.DataSet): 15 usages
IOException (java.io.IOException): 14 usages
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 12 usages
HashSet (java.util.HashSet): 12 usages
RealMatrix (org.apache.commons.math3.linear.RealMatrix): 12 usages