Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
The class ReadsSparkSourceUnitTest, method doLoadReads.
private void doLoadReads(String bam, String referencePath, ValidationStringency validationStringency) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    ReadsSparkSource readSource = new ReadsSparkSource(ctx, validationStringency);
    JavaRDD<GATKRead> rddSerialReads = getSerialReads(ctx, bam, referencePath, validationStringency);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(bam, referencePath);
    List<GATKRead> serialReads = rddSerialReads.collect();
    List<GATKRead> parallelReads = rddParallelReads.collect();
    Assert.assertEquals(serialReads.size(), parallelReads.size());
}
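The helper getSerialReads is referenced above but not shown in this snippet. A minimal sketch of what such a single-threaded loader could look like, built on htsjdk's SamReaderFactory and GATK's SAMRecordToGATKReadAdapter; the body below is an assumption for illustration, not the actual GATK implementation.

// Hypothetical sketch: read the BAM/CRAM serially with htsjdk, then parallelize.
// Assumes htsjdk.samtools.*, java.io.*, and java.util.ArrayList are imported;
// details may differ from the real GATK helper.
private JavaRDD<GATKRead> getSerialReads(final JavaSparkContext ctx, final String bam,
                                         final String referencePath,
                                         final ValidationStringency validationStringency) {
    SamReaderFactory factory = SamReaderFactory.makeDefault().validationStringency(validationStringency);
    if (referencePath != null) {
        // a reference is required to decode CRAM inputs
        factory = factory.referenceSequence(new File(referencePath));
    }
    final List<GATKRead> reads = new ArrayList<>();
    try (final SamReader reader = factory.open(new File(bam))) {
        for (final SAMRecord record : reader) {
            reads.add(new SAMRecordToGATKReadAdapter(record));
        }
    } catch (final IOException e) {
        throw new RuntimeException("Failed to read " + bam, e);
    }
    return ctx.parallelize(reads);
}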
Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
The class ReadsSparkSourceUnitTest, method testCRAMReferenceFromHDFS.
@Test(groups = "spark")
public void testCRAMReferenceFromHDFS() throws Exception {
    final File cram = new File(NA12878_chr17_1k_CRAM);
    final File reference = new File(v37_chr17_1Mb_Reference);
    final File referenceIndex = new File(v37_chr17_1Mb_Reference + ".fai");
    MiniClusterUtils.runOnIsolatedMiniCluster(cluster -> {
        final Path workingDirectory = MiniClusterUtils.getWorkingDir(cluster);
        final Path cramHDFSPath = new Path(workingDirectory, "hdfs.cram");
        final Path refHDFSPath = new Path(workingDirectory, "hdfs.fasta");
        final Path refIndexHDFSPath = new Path(workingDirectory, "hdfs.fasta.fai");
        cluster.getFileSystem().copyFromLocalFile(new Path(cram.toURI()), cramHDFSPath);
        cluster.getFileSystem().copyFromLocalFile(new Path(reference.toURI()), refHDFSPath);
        cluster.getFileSystem().copyFromLocalFile(new Path(referenceIndex.toURI()), refIndexHDFSPath);
        final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
        final ReadsSparkSource readsSparkSource = new ReadsSparkSource(ctx);
        final List<GATKRead> localReads = readsSparkSource.getParallelReads(cram.toURI().toString(), reference.toURI().toString()).collect();
        final List<GATKRead> hdfsReads = readsSparkSource.getParallelReads(cramHDFSPath.toUri().toString(), refHDFSPath.toUri().toString()).collect();
        Assert.assertFalse(localReads.isEmpty());
        Assert.assertEquals(localReads, hdfsReads);
    });
}
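MiniClusterUtils.runOnIsolatedMiniCluster hides the cluster lifecycle from the test. A rough sketch of the underlying pattern, using Hadoop's MiniDFSCluster directly; this is an assumption about what the utility wraps, not its actual source.

// Hypothetical sketch of the mini-cluster lifecycle that MiniClusterUtils presumably
// wraps. Assumes org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.*,
// org.apache.hadoop.hdfs.MiniDFSCluster, and java.nio.file.Files are imported.
static void runOnMiniDfs() throws Exception {
    final Configuration conf = new Configuration();
    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR,
            Files.createTempDirectory("minicluster").toString());
    final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
    try {
        final FileSystem fs = cluster.getFileSystem();
        final Path workingDir = fs.getWorkingDirectory();
        // copy local inputs onto the isolated HDFS instance, as in the test above
        fs.copyFromLocalFile(new Path("local.cram"), new Path(workingDir, "hdfs.cram"));
    } finally {
        cluster.shutdown();
    }
}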
Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
The class RangePartitionCoalescerUnitTest, method setup.
@BeforeTest
public void setup() {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    rdd = ctx.parallelize(ImmutableList.of("a", "b", "c"), 3);
    partitions = rdd.rdd().partitions();
}
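With rdd and partitions in place, a follow-on test would exercise partition coalescing. Since the RangePartitionCoalescer API itself is not shown in this snippet, the sketch below uses Spark's built-in shuffle-free coalesce, which the custom coalescer presumably generalizes.

// Hypothetical test sketch using Spark's built-in coalesce (shuffle = false);
// the custom RangePartitionCoalescer is assumed to plug into similar machinery.
@Test
public void testCoalesce() {
    final JavaRDD<String> coalesced = rdd.coalesce(2, false);
    Assert.assertEquals(coalesced.getNumPartitions(), 2);
    // coalescing without a shuffle preserves element order
    Assert.assertEquals(coalesced.collect(), ImmutableList.of("a", "b", "c"));
}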
Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
The class AlleleFractionModellerUnitTest, method testMCMC.
private void testMCMC(final double meanBiasSimulated, final double biasVarianceSimulated, final double meanBiasExpected, final double biasVarianceExpected, final AllelicPanelOfNormals allelicPoN) {
    LoggingUtils.setLoggingLevel(Log.LogLevel.INFO);
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final int numSamples = 150;
    final int numBurnIn = 50;
    final double averageHetsPerSegment = 50;
    final int numSegments = 100;
    final int averageDepth = 50;
    final double outlierProbability = 0.02;
    // note: the following tolerances could actually be made much smaller if we used more segments and/or
    // more hets -- most of the error is the sampling error of a finite simulated data set, not numerical error of MCMC
    final double minorFractionTolerance = 0.02;
    final double meanBiasTolerance = 0.02;
    final double biasVarianceTolerance = 0.01;
    final double outlierProbabilityTolerance = 0.02;
    final AlleleFractionSimulatedData simulatedData = new AlleleFractionSimulatedData(averageHetsPerSegment, numSegments, averageDepth, meanBiasSimulated, biasVarianceSimulated, outlierProbability);
    final AlleleFractionModeller modeller = new AlleleFractionModeller(simulatedData.getSegmentedGenome(), allelicPoN);
    modeller.fitMCMC(numSamples, numBurnIn);
    final List<Double> meanBiasSamples = modeller.getmeanBiasSamples();
    Assert.assertEquals(meanBiasSamples.size(), numSamples - numBurnIn);
    final List<Double> biasVarianceSamples = modeller.getBiasVarianceSamples();
    Assert.assertEquals(biasVarianceSamples.size(), numSamples - numBurnIn);
    final List<Double> outlierProbabilitySamples = modeller.getOutlierProbabilitySamples();
    Assert.assertEquals(outlierProbabilitySamples.size(), numSamples - numBurnIn);
    final List<AlleleFractionState.MinorFractions> minorFractionsSamples = modeller.getMinorFractionsSamples();
    Assert.assertEquals(minorFractionsSamples.size(), numSamples - numBurnIn);
    for (final AlleleFractionState.MinorFractions sample : minorFractionsSamples) {
        Assert.assertEquals(sample.size(), numSegments);
    }
    final List<List<Double>> minorFractionsSamplesBySegment = modeller.getMinorFractionSamplesBySegment();
    final double mcmcMeanBias = meanBiasSamples.stream().mapToDouble(x -> x).average().getAsDouble();
    final double mcmcBiasVariance = biasVarianceSamples.stream().mapToDouble(x -> x).average().getAsDouble();
    final double mcmcOutlierProbability = outlierProbabilitySamples.stream().mapToDouble(x -> x).average().getAsDouble();
    final List<Double> mcmcMinorFractions = minorFractionsSamplesBySegment.stream().map(list -> list.stream().mapToDouble(x -> x).average().getAsDouble()).collect(Collectors.toList());
    double totalSegmentError = 0.0;
    for (int segment = 0; segment < numSegments; segment++) {
        totalSegmentError += Math.abs(mcmcMinorFractions.get(segment) - simulatedData.getTrueState().segmentMinorFraction(segment));
    }
    Assert.assertEquals(mcmcMeanBias, meanBiasExpected, meanBiasTolerance);
    Assert.assertEquals(mcmcBiasVariance, biasVarianceExpected, biasVarianceTolerance);
    Assert.assertEquals(mcmcOutlierProbability, outlierProbability, outlierProbabilityTolerance);
    Assert.assertEquals(totalSegmentError / numSegments, 0.0, minorFractionTolerance);
    //test posterior summaries
    final Map<AlleleFractionParameter, PosteriorSummary> globalParameterPosteriorSummaries = modeller.getGlobalParameterPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);
    final PosteriorSummary meanBiasPosteriorSummary = globalParameterPosteriorSummaries.get(AlleleFractionParameter.MEAN_BIAS);
    final double meanBiasPosteriorCenter = meanBiasPosteriorSummary.getCenter();
    Assert.assertEquals(meanBiasPosteriorCenter, meanBiasExpected, meanBiasTolerance);
    final PosteriorSummary biasVariancePosteriorSummary = globalParameterPosteriorSummaries.get(AlleleFractionParameter.BIAS_VARIANCE);
    final double biasVariancePosteriorCenter = biasVariancePosteriorSummary.getCenter();
    Assert.assertEquals(biasVariancePosteriorCenter, biasVarianceExpected, biasVarianceTolerance);
    final PosteriorSummary outlierProbabilityPosteriorSummary = globalParameterPosteriorSummaries.get(AlleleFractionParameter.OUTLIER_PROBABILITY);
    final double outlierProbabilityPosteriorCenter = outlierProbabilityPosteriorSummary.getCenter();
    Assert.assertEquals(outlierProbabilityPosteriorCenter, outlierProbability, outlierProbabilityTolerance);
    final List<PosteriorSummary> minorAlleleFractionPosteriorSummaries = modeller.getMinorAlleleFractionsPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);
    final List<Double> minorFractionsPosteriorCenters = minorAlleleFractionPosteriorSummaries.stream().map(PosteriorSummary::getCenter).collect(Collectors.toList());
    double totalPosteriorCentersSegmentError = 0.0;
    for (int segment = 0; segment < numSegments; segment++) {
        totalPosteriorCentersSegmentError += Math.abs(minorFractionsPosteriorCenters.get(segment) - simulatedData.getTrueState().segmentMinorFraction(segment));
    }
    Assert.assertEquals(totalPosteriorCentersSegmentError / numSegments, 0.0, minorFractionTolerance);
}
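Since testMCMC is private, it is presumably driven by @Test wrappers that supply the simulated and expected hyperparameters. A hypothetical caller; the method name, parameter values, and the EMPTY_PON field are assumptions for illustration, not confirmed GATK code.

// Hypothetical @Test wrapper around the private testMCMC above; with no PoN
// correction, the expected hyperparameters equal the simulated ones.
@Test
public void testMCMCWithoutAllelicPoN() {
    final double meanBias = 1.1;       // assumed simulated mean reference bias
    final double biasVariance = 0.01;  // assumed simulated bias variance
    testMCMC(meanBias, biasVariance, meanBias, biasVariance, AllelicPanelOfNormals.EMPTY_PON);
}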
Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
The class AlleleFractionModellerUnitTest, method testBiasCorrection.
/**
 * Tests that the allelic PoN is appropriately used to correct reference bias. The basic setup for the test data is
 * simulated hets at 1000 sites (1:1-1000) across 3 segments. The outer two segments are balanced with
 * minor-allele fraction = 0.5; however, in the middle segment consisting of 100 sites (1:451-550), all of the sites
 * fall into one of three scenarios, depending on the simulated sample:
 *
 * <p>
 * 1) are balanced and have biases identical to the sites in the other two segments,
 * which are drawn from a gamma distribution with alpha = 65, beta = 60 -> mean bias = 1.083 ("SAMPLE_NORMAL")
 * </p>
 *
 * <p>
 * 2) are balanced and have relatively high biases,
 * which are drawn from a gamma distribution with alpha = 9, beta = 6 -> mean bias = 1.5 ("SAMPLE_WITH_BAD_SNPS")
 * </p>
 *
 * <p>
 * 3) have minor-allele fraction = 0.33, copy ratio = 1.5, and biases identical to the sites in the other two segments,
 * which are drawn from a gamma distribution with alpha = 65, beta = 60 -> mean bias = 1.083 ("SAMPLE_EVENT").
 * </p>
 *
 * In this segment, using a PoN that doesn't know about the high reference bias of these sites ("ALLELIC_PON_NORMAL"),
 * we should infer a minor-allele fraction of 6 / (6 + 9) = 0.40 in scenario 2; however, with a PoN that does know
 * about the high bias at these sites ("ALLELIC_PON_WITH_BAD_SNPS"), we correctly infer that all of the segments are balanced.
 *
 * <p>
 * Note that alpha and beta are not actually correctly recovered in this PoN via MLE because the biases are
 * drawn from a mixture of gamma distributions (as opposed to a single gamma distribution as assumed in the model).
 * TODO https://github.com/broadinstitute/gatk-protected/issues/421
 * </p>
 */
@Test(dataProvider = "biasCorrection")
public void testBiasCorrection(final AllelicCountCollection sample, final AllelicPanelOfNormals allelicPoN, final double minorFractionExpectedInMiddleSegment) {
    LoggingUtils.setLoggingLevel(Log.LogLevel.INFO);
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final double minorFractionTolerance = 0.025;
    final Genome genome = new Genome(AlleleFractionSimulatedData.TRIVIAL_TARGETS, sample.getCounts());
    final List<SimpleInterval> segments = SegmentUtils.readIntervalsFromSegmentFile(SEGMENTS_FILE);
    final SegmentedGenome segmentedGenome = new SegmentedGenome(segments, genome);
    final int numSamples = 150;
    final int numBurnIn = 50;
    final AlleleFractionModeller modeller = new AlleleFractionModeller(segmentedGenome, allelicPoN);
    modeller.fitMCMC(numSamples, numBurnIn);
    final List<PosteriorSummary> minorAlleleFractionPosteriorSummaries = modeller.getMinorAlleleFractionsPosteriorSummaries(CREDIBLE_INTERVAL_ALPHA, ctx);
    final List<Double> minorFractionsResult = minorAlleleFractionPosteriorSummaries.stream().map(PosteriorSummary::getCenter).collect(Collectors.toList());
    final double minorFractionBalanced = 0.5;
    final List<Double> minorFractionsExpected = Arrays.asList(minorFractionBalanced, minorFractionExpectedInMiddleSegment, minorFractionBalanced);
    for (int segment = 0; segment < 3; segment++) {
        Assert.assertEquals(minorFractionsResult.get(segment), minorFractionsExpected.get(segment), minorFractionTolerance);
    }
}
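As a quick check of the 6 / (6 + 9) = 0.40 figure in the javadoc: assuming the bias b multiplies the reference counts as in the model, a truly balanced site (minor-allele fraction f = 0.5) with an uncorrected mean bias b = 1.5 has apparent minor-allele fraction

    f / (f + (1 - f) * b) = 0.5 / (0.5 + 0.5 * 1.5) = 1 / 2.5 = 6 / (6 + 9) = 0.40

which is exactly what the test expects in scenario 2 without the bias-aware PoN; a PoN that records the high site-level bias restores the balanced estimate of 0.5.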