Use of org.apache.spark.api.java.JavaSparkContext in project gatk-protected by broadinstitute.
From the class HDF5PCACoveragePoNCreationUtilsUnitTest, method testCalculateReducedPanelAndPInversesKeepingAllColumns.
@Test(dataProvider = "readCountOnlyWithDiverseShapeData")
public void testCalculateReducedPanelAndPInversesKeepingAllColumns(final ReadCountCollection readCounts) {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReductionResult result = HDF5PCACoveragePoNCreationUtils.calculateReducedPanelAndPInverses(
            readCounts, OptionalInt.of(readCounts.columnNames().size()), NULL_LOGGER, ctx);
    final RealMatrix counts = readCounts.counts();
    Assert.assertNotNull(result);
    Assert.assertNotNull(result.getPseudoInverse());
    Assert.assertNotNull(result.getReducedCounts());
    Assert.assertNotNull(result.getReducedPseudoInverse());
    Assert.assertNotNull(result.getAllSingularValues());
    Assert.assertEquals(counts.getColumnDimension(), result.getAllSingularValues().length);
    Assert.assertEquals(result.getReducedCounts().getRowDimension(), counts.getRowDimension());
    Assert.assertEquals(result.getReducedCounts().getColumnDimension(), readCounts.columnNames().size());
    final int eigensamples = result.getReducedCounts().getColumnDimension();
    Assert.assertEquals(eigensamples, readCounts.columnNames().size());
    assertPseudoInverse(counts, result.getPseudoInverse());
    assertPseudoInverse(result.getReducedCounts(), result.getReducedPseudoInverse());
}
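The test obtains its context from SparkContextFactory.getTestSparkContext(), a GATK helper that returns a shared local-mode JavaSparkContext for unit tests. As a rough idea of what such a helper hides from the caller, here is a minimal sketch in plain Spark; it is not GATK code, and the class name and configuration options are illustrative assumptions.

    // A minimal sketch, not GATK code: building a local JavaSparkContext for unit tests.
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaSparkContext;

    public final class LocalSparkTestSupport {
        private LocalSparkTestSupport() {}

        public static JavaSparkContext createLocalContext(final String appName) {
            final SparkConf conf = new SparkConf()
                    .setAppName(appName)
                    .setMaster("local[2]")               // run with two local threads
                    .set("spark.ui.enabled", "false");   // no web UI needed for unit tests
            return new JavaSparkContext(conf);
        }
    }

A test suite would typically create one such context, reuse it across test cases (as the GATK factory does), and close() it when the suite finishes.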
Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
From the class FindBreakpointEvidenceSpark, method getKmerIntervals.
/** find kmers for each interval */
@VisibleForTesting
static Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> getKmerIntervals(
        final Params params, final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
        final int nIntervals, final Set<SVKmer> kmerKillSet,
        final JavaRDD<GATKRead> reads, final Locations locations) {
    final Broadcast<Set<SVKmer>> broadcastKmerKillSet = ctx.broadcast(kmerKillSet);
    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNameAndIntervalsMultiMap =
            ctx.broadcast(qNamesMultiMap);
    // given a set of template names with interval IDs and a kill set of ubiquitous kmers,
    // produce a set of interesting kmers for each interval ID
    final int kmersPerPartitionGuess = params.cleanerKmersPerPartitionGuess;
    final int minKmers = params.cleanerMinKmerCount;
    final int maxKmers = params.cleanerMaxKmerCount;
    final int maxIntervals = params.cleanerMaxIntervals;
    final int kSize = params.kSize;
    final int maxDUSTScore = params.maxDUSTScore;
    final List<KmerAndInterval> kmerIntervals = reads
            .mapPartitionsToPair(readItr ->
                    new MapPartitioner<>(readItr,
                            new QNameKmerizer(broadcastQNameAndIntervalsMultiMap.value(),
                                    broadcastKmerKillSet.value(), kSize, maxDUSTScore)).iterator(), false)
            .reduceByKey(Integer::sum)
            .mapPartitions(itr ->
                    new KmerCleaner(itr, kmersPerPartitionGuess, minKmers, maxKmers, maxIntervals).iterator())
            .collect();
    broadcastQNameAndIntervalsMultiMap.destroy();
    broadcastKmerKillSet.destroy();
    final int[] intervalKmerCounts = new int[nIntervals];
    for (final KmerAndInterval kmerAndInterval : kmerIntervals) {
        intervalKmerCounts[kmerAndInterval.getIntervalId()] += 1;
    }
    final Set<Integer> intervalsToKill = new HashSet<>();
    final List<AlignedAssemblyOrExcuse> intervalDispositions = new ArrayList<>();
    for (int idx = 0; idx != nIntervals; ++idx) {
        if (intervalKmerCounts[idx] < params.minKmersPerInterval) {
            intervalsToKill.add(idx);
            intervalDispositions.add(new AlignedAssemblyOrExcuse(idx, "FASTQ not written -- too few kmers"));
        }
    }
    qNamesMultiMap.removeIf(qNameAndInterval -> intervalsToKill.contains(qNameAndInterval.getIntervalId()));
    final List<KmerAndInterval> filteredKmerIntervals = kmerIntervals.stream()
            .filter(kmerAndInterval -> !intervalsToKill.contains(kmerAndInterval.getIntervalId()))
            .collect(SVUtils.arrayListCollector(kmerIntervals.size()));
    // record the kmers with their interval IDs
    if (locations.kmerFile != null) {
        try (final OutputStreamWriter writer = new OutputStreamWriter(
                new BufferedOutputStream(BucketUtils.createFile(locations.kmerFile)))) {
            for (final KmerAndInterval kmerAndInterval : filteredKmerIntervals) {
                writer.write(kmerAndInterval.toString(kSize) + " " + kmerAndInterval.getIntervalId() + "\n");
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't write kmer intervals file " + locations.kmerFile, ioe);
        }
    }
    return new Tuple2<>(intervalDispositions, filteredKmerIntervals);
}
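The method above follows a common JavaSparkContext pattern: broadcast read-only lookup structures (the kmer kill set and the qname/interval multimap), reference them inside mapPartitionsToPair, collect the result on the driver, and then destroy the broadcasts. The sketch below isolates that pattern on toy data; it is not GATK code, and all names are illustrative.

    // A minimal sketch, not GATK code: the broadcast / use / destroy pattern,
    // reduced to counting occurrences of tokens that are NOT in a broadcast "kill set".
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.broadcast.Broadcast;
    import scala.Tuple2;

    public final class BroadcastKillSetExample {
        public static Map<String, Integer> countSurvivors(final JavaSparkContext ctx,
                                                          final JavaRDD<String> tokens,
                                                          final Set<String> killSet) {
            // ship the read-only kill set to every executor exactly once
            final Broadcast<Set<String>> broadcastKillSet = ctx.broadcast(new HashSet<>(killSet));
            final Map<String, Integer> counts = tokens
                    .filter(token -> !broadcastKillSet.value().contains(token))
                    .mapToPair(token -> new Tuple2<>(token, 1))
                    .reduceByKey(Integer::sum)
                    .collectAsMap();
            // once the action has completed, release the executor-side copies
            broadcastKillSet.destroy();
            return counts;
        }
    }

Destroying the broadcast only after the terminal action mirrors the destroy() calls above, which release the broadcast data once the kmer list has been collected back to the driver.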
Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
From the class PosteriorSummaryUtilsUnitTest, method testCalculatePosteriorMode.
@Test(dataProvider = "dataKernelDensityEstimation")
public void testCalculatePosteriorMode(final List<Double> samples, final double credibleIntervalAlpha,
                                       final double relativeError, final PosteriorSummary expected) {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final double result = PosteriorSummaryUtils.calculatePosteriorMode(samples, ctx);
    Assert.assertTrue(withinRelativeError(result, expected.getCenter(), relativeError));
}
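withinRelativeError is a private helper of the test class and is not shown in this listing. Purely as an assumption about what such a check usually looks like inside the test class (not the GATK source), it could be:

    // Hypothetical stand-in for the withinRelativeError helper used above;
    // the actual GATK implementation may differ.
    private static boolean withinRelativeError(final double actual, final double expected, final double relativeError) {
        return Math.abs(actual - expected) <= relativeError * Math.abs(expected);
    }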
Use of org.apache.spark.api.java.JavaSparkContext in project gatk-protected by broadinstitute.
From the class CNLOHCaller, method calcNewRhos.
private double[] calcNewRhos(final List<ACNVModeledSegment> segments,
                             final List<double[][][]> responsibilitiesBySeg,
                             final double lambda, final double[] rhos, final int[] mVals, final int[] nVals,
                             final JavaSparkContext ctx) {
    // Since we pass in the entire responsibilities matrix, we need the correct index for each rho.  That, and the
    // fact that this is a univariate objective function, means we need to create an instance for each rho.  And
    // then we blast across Spark.
    final List<Pair<? extends Function<Double, Double>, SearchInterval>> objectives = IntStream.range(0, rhos.length)
            .mapToObj(i -> new Pair<>(new Function<Double, Double>() {

                @Override
                public Double apply(Double rho) {
                    return calculateESmnObjective(rho, segments, responsibilitiesBySeg, mVals, nVals, lambda, i);
                }
            }, new SearchInterval(0.0, 1.0, rhos[i])))
            .collect(Collectors.toList());
    final JavaRDD<Pair<? extends Function<Double, Double>, SearchInterval>> objectivesRDD = ctx.parallelize(objectives);
    final List<Double> resultsAsDouble = objectivesRDD
            .map(objective -> optimizeIt(objective.getFirst(), objective.getSecond()))
            .collect();
    return resultsAsDouble.stream().mapToDouble(Double::doubleValue).toArray();
}
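optimizeIt is not shown in this listing; it takes one univariate objective and its SearchInterval and solves the one-dimensional problem on an executor. As an assumption about how such a helper could be written (not the GATK implementation), Apache Commons Math's Brent optimizer fits the SearchInterval type already used above; the tolerances, evaluation budget, and the maximize-versus-minimize choice below are illustrative.

    // A minimal sketch, not the GATK implementation: a Brent-based univariate solve
    // compatible with the optimizeIt(objective, searchInterval) call used above.
    import java.util.function.Function;
    import org.apache.commons.math3.optim.MaxEval;
    import org.apache.commons.math3.optim.nonlinear.scalar.GoalType;
    import org.apache.commons.math3.optim.univariate.BrentOptimizer;
    import org.apache.commons.math3.optim.univariate.SearchInterval;
    import org.apache.commons.math3.optim.univariate.UnivariateObjectiveFunction;

    final class RhoOptimizationSketch {
        private RhoOptimizationSketch() {}

        static double optimizeIt(final Function<Double, Double> objective, final SearchInterval searchInterval) {
            final BrentOptimizer optimizer = new BrentOptimizer(1e-10, 1e-8);   // relative, absolute tolerance
            return optimizer.optimize(
                    new MaxEval(200),
                    new UnivariateObjectiveFunction(x -> objective.apply(x)),
                    GoalType.MAXIMIZE,   // assumption: the E-step objective is maximized
                    searchInterval)
                .getPoint();
        }
    }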
Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
From the class AddContextDataToReadSparkUnitTest, method addContextDataTest.
@Test(dataProvider = "bases", groups = "spark")
public void addContextDataTest(List<GATKRead> reads, List<GATKVariant> variantList,
                               List<KV<GATKRead, ReadContextData>> expectedReadContextData,
                               JoinStrategy joinStrategy) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    when(mockSource.getReferenceBases(any(PipelineOptions.class), any())).then(new ReferenceBasesAnswer());
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SAMSequenceDictionary sd = new SAMSequenceDictionary(
            Lists.newArrayList(new SAMSequenceRecord("1", 100000), new SAMSequenceRecord("2", 100000)));
    when(mockSource.getReferenceSequenceDictionary(null)).thenReturn(sd);
    JavaPairRDD<GATKRead, ReadContextData> rddActual =
            AddContextDataToReadSpark.add(ctx, rddReads, mockSource, rddVariants, joinStrategy, sd, 10000, 1000);
    Map<GATKRead, ReadContextData> actual = rddActual.collectAsMap();
    Assert.assertEquals(actual.size(), expectedReadContextData.size());
    for (KV<GATKRead, ReadContextData> kv : expectedReadContextData) {
        ReadContextData readContextData = actual.get(kv.getKey());
        Assert.assertNotNull(readContextData);
        Assert.assertTrue(CollectionUtils.isEqualCollection(
                Lists.newArrayList(readContextData.getOverlappingVariants()),
                Lists.newArrayList(kv.getValue().getOverlappingVariants())));
        SimpleInterval minimalInterval = kv.getValue().getOverlappingReferenceBases().getInterval();
        ReferenceBases subset = readContextData.getOverlappingReferenceBases().getSubset(minimalInterval);
        Assert.assertEquals(subset, kv.getValue().getOverlappingReferenceBases());
    }
}
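One detail worth noting in this test is mock(ReferenceMultiSource.class, withSettings().serializable()): the mock ends up inside Spark tasks, so it must survive Java serialization like any other closure capture. The sketch below shows the same idea on a toy interface; it is not GATK code, and all names are illustrative.

    // A minimal sketch, not GATK code: a serializable Mockito mock captured by a Spark closure.
    import static org.mockito.Mockito.mock;
    import static org.mockito.Mockito.when;
    import static org.mockito.Mockito.withSettings;

    import java.io.Serializable;
    import java.util.List;
    import org.apache.spark.api.java.JavaSparkContext;

    public final class SerializableMockExample {
        // illustrative interface standing in for a real dependency such as ReferenceMultiSource
        public interface Lookup extends Serializable {
            String describe(String contig);
        }

        public static List<String> describeAll(final JavaSparkContext ctx, final List<String> contigs) {
            final Lookup lookup = mock(Lookup.class, withSettings().serializable());
            when(lookup.describe("1")).thenReturn("chromosome 1");
            // the mock is captured by the lambda below, so it is serialized into each task
            return ctx.parallelize(contigs)
                    .map(contig -> lookup.describe(contig))
                    .collect();
        }
    }

Without serializable(), task serialization would typically fail with a NotSerializableException as soon as the closure referencing the mock is shipped to executors.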