Use of scala.Tuple2 in project gatk-protected by broadinstitute.
The class HaplotypeCallerSpark, method createReadShards.
/**
 * Create an RDD of {@link Shard} from an RDD of {@link GATKRead}
 * @param shardBoundariesBroadcast broadcast of an {@link OverlapDetector} loaded with the intervals
 *                                 that should be used for creating ReadShards
 * @param reads RDD of {@link GATKRead}
 * @return an RDD of reads grouped into potentially overlapping shards
 */
private static JavaRDD<Shard<GATKRead>> createReadShards(
        final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast,
        final JavaRDD<GATKRead> reads) {
    /* pair each read with every shard boundary it overlaps */
    final JavaPairRDD<ShardBoundary, GATKRead> paired = reads.flatMapToPair(read -> {
        final Collection<ShardBoundary> overlappingShards = shardBoundariesBroadcast.value().getOverlaps(read);
        return overlappingShards.stream().map(key -> new Tuple2<>(key, read)).iterator();
    });
    /* group the reads by shard boundary and wrap each group in a SparkReadShard */
    final JavaPairRDD<ShardBoundary, Iterable<GATKRead>> shardsWithReads = paired.groupByKey();
    return shardsWithReads.map(shard -> new SparkReadShard(shard._1(), shard._2()));
}
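A minimal driver-side sketch of how this helper might be wired up, assuming GATK's Shard.divideIntervalIntoShards and htsjdk's OverlapDetector.create; the variables ctx, intervals, shardSize, shardPadding, sequenceDictionary, and reads are illustrative, not part of the original snippet.
/* hypothetical setup: divide intervals into shard boundaries, broadcast an
 * OverlapDetector over them, then shard the reads */
final List<ShardBoundary> shardBoundaries = intervals.stream()
        .flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardSize, shardPadding, sequenceDictionary).stream())
        .collect(Collectors.toList());
final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast =
        ctx.broadcast(OverlapDetector.create(shardBoundaries));
final JavaRDD<Shard<GATKRead>> readShards = createReadShards(shardBoundariesBroadcast, reads);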
Use of scala.Tuple2 in project gatk by broadinstitute.
The class CoverageModelEMWorkspace, method updateCopyRatioPosteriorExpectationsSpark.
/**
 * The Spark implementation of the E-step update of copy ratio posteriors
 *
 * @param admixingRatio admixing ratio for blending old and new posterior expectations
 * @return a {@link SubroutineSignal} containing the update size
 */
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private SubroutineSignal updateCopyRatioPosteriorExpectationsSpark(final double admixingRatio) {
    /* local final copies of member variables for lambda capture */
    final List<LinearlySpacedIndexBlock> targetBlocks = new ArrayList<>(this.targetBlocks);
    final List<Target> targetList = new ArrayList<>(processedTargetList);
    final List<String> sampleNameList = new ArrayList<>(processedSampleNameList);
    final List<SexGenotypeData> sampleSexGenotypeData = new ArrayList<>(processedSampleSexGenotypeData);
    final int numTargetBlocks = targetBlocks.size();
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator =
            this.copyRatioExpectationsCalculator;
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    /* make an RDD of copy ratio posterior expectations */
    final JavaPairRDD<Integer, CopyRatioExpectations> copyRatioPosteriorExpectationsPairRDD =
            /* fetch copy ratio emission data from workers */
            fetchCopyRatioEmissionDataSpark().mapPartitionsToPair(it -> {
                final List<Tuple2<Integer, CopyRatioExpectations>> newPartitionData = new ArrayList<>();
                while (it.hasNext()) {
                    final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
                    final int si = prevDatum._1;
                    final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder()
                            .sampleName(sampleNameList.get(si))
                            .sampleSexGenotypeData(sampleSexGenotypeData.get(si))
                            .sampleCoverageDepth(sampleReadDepths.getDouble(si))
                            .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                            .build();
                    newPartitionData.add(new Tuple2<>(prevDatum._1,
                            calculator.getCopyRatioPosteriorExpectations(copyRatioCallingMetadata, targetList, prevDatum._2)));
                }
                return newPartitionData.iterator();
            }, true);
    /* copyRatioPosteriorExpectationsPairRDD is consumed twice below, so we cache it */
    copyRatioPosteriorExpectationsPairRDD.cache();
    /* step 1. update log chain posterior expectation on the driver node */
    final double[] newSampleLogChainPosteriors = copyRatioPosteriorExpectationsPairRDD
            .mapValues(CopyRatioExpectations::getLogChainPosteriorProbability)
            .collect().stream()
            .sorted(Comparator.comparingInt(t -> t._1))
            .mapToDouble(t -> t._2)
            .toArray();
    sampleLogChainPosteriors.assign(Nd4j.create(newSampleLogChainPosteriors, new int[] { numSamples, 1 }));
    /* step 2. repartition in target space */
    final JavaPairRDD<LinearlySpacedIndexBlock, ImmutablePair<INDArray, INDArray>> blockifiedCopyRatioPosteriorResultsPairRDD =
            copyRatioPosteriorExpectationsPairRDD
                    .flatMapToPair(dat -> targetBlocks.stream()
                            .map(tb -> new Tuple2<>(tb, new Tuple2<>(dat._1,
                                    ImmutablePair.of(dat._2.getLogCopyRatioMeans(tb), dat._2.getLogCopyRatioVariances(tb)))))
                            .iterator())
                    .combineByKey(
                            /* recipe to create a singleton list */
                            Collections::singletonList,
                            /* recipe to add an element to the list */
                            (list, element) -> Stream.concat(list.stream(), Stream.of(element)).collect(Collectors.toList()),
                            /* recipe to concatenate two lists */
                            (list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()),
                            /* repartition with respect to target space blocks */
                            new HashPartitioner(numTargetBlocks))
                    .mapValues(list -> list.stream()
                            .sorted(Comparator.comparingInt(t -> t._1))
                            .map(p -> p._2)
                            .map(t -> ImmutablePair.of(Nd4j.create(t.left), Nd4j.create(t.right)))
                            .collect(Collectors.toList()))
                    .mapValues(CoverageModelEMWorkspace::stackCopyRatioPosteriorDataForAllSamples);
    /* we do not need copy ratio expectations anymore */
    copyRatioPosteriorExpectationsPairRDD.unpersist();
    /* step 3. merge with computeRDD and update */
    computeRDD = computeRDD.join(blockifiedCopyRatioPosteriorResultsPairRDD)
            .mapValues(t -> t._1.cloneWithUpdatedCopyRatioPosteriors(t._2.left, t._2.right, admixingRatio));
    cacheWorkers("after E-step for copy ratio update");
    /* collect subroutine signals */
    final List<SubroutineSignal> sigs = mapWorkersAndCollect(CoverageModelEMComputeBlock::getLatestMStepSignal);
    final double errorNormInfinity = Collections.max(sigs.stream()
            .map(sig -> sig.<Double>get(StandardSubroutineSignals.RESIDUAL_ERROR_NORM))
            .collect(Collectors.toList()));
    return SubroutineSignal.builder()
            .put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity)
            .build();
}
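For readers unfamiliar with the combineByKey recipe in step 2, here is a stripped-down, self-contained sketch of the same pattern (singleton-list creation, append, concatenation, hash repartitioning, per-key sorting); the integer keys, demo values, and class name are illustrative and not part of GATK.
import java.util.*;
import java.util.stream.*;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.*;
import scala.Tuple2;

public class CombineByKeyDemo {
    public static void main(String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext("local[2]", "CombineByKeyDemo");
        /* (blockIndex, (sampleIndex, value)) pairs, analogous to the per-block posteriors above */
        final JavaPairRDD<Integer, Tuple2<Integer, Double>> perBlock = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>(0, new Tuple2<>(1, 0.5)),
                new Tuple2<>(0, new Tuple2<>(0, 0.1)),
                new Tuple2<>(1, new Tuple2<>(0, 0.9))));
        final JavaPairRDD<Integer, List<Tuple2<Integer, Double>>> combined = perBlock.combineByKey(
                /* recipe to create a singleton list */
                Collections::singletonList,
                /* recipe to add an element to the list */
                (list, element) -> Stream.concat(list.stream(), Stream.of(element)).collect(Collectors.toList()),
                /* recipe to concatenate two lists */
                (list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()),
                /* repartition by block index */
                new HashPartitioner(2))
                /* sort each block's entries by sample index, as in step 2 above */
                .mapValues(list -> list.stream()
                        .sorted(Comparator.comparingInt(t -> t._1))
                        .collect(Collectors.toList()));
        combined.collect().forEach(System.out::println);
        ctx.stop();
    }
}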
Use of scala.Tuple2 in project gatk by broadinstitute.
The class CoverageModelWPreconditionerSpark, method operate.
@Override
public INDArray operate(@Nonnull final INDArray W_tl) throws DimensionMismatchException {
    if (W_tl.rank() != 2 || W_tl.shape()[0] != numTargets || W_tl.shape()[1] != numLatents) {
        throw new DimensionMismatchException(W_tl.length(), numTargets * numLatents);
    }
    long startTimeRFFT = System.nanoTime();
    /* forward rfft */
    final INDArray W_kl = Nd4j.create(fftSize, numLatents);
    IntStream.range(0, numLatents).parallel().forEach(li ->
            W_kl.get(NDArrayIndex.all(), NDArrayIndex.point(li))
                    .assign(Nd4j.create(F_tt.getForwardFFT(W_tl.get(NDArrayIndex.all(), NDArrayIndex.point(li))),
                            new int[] { fftSize, 1 })));
    long endTimeRFFT = System.nanoTime();
    /* apply the preconditioner in the Fourier space */
    long startTimePrecond = System.nanoTime();
    final Map<LinearlySpacedIndexBlock, INDArray> W_kl_map = CoverageModelSparkUtils.partitionINDArrayToMap(fourierSpaceBlocks, W_kl);
    final Broadcast<Map<LinearlySpacedIndexBlock, INDArray>> W_kl_bc = ctx.broadcast(W_kl_map);
    final JavaPairRDD<LinearlySpacedIndexBlock, INDArray> preconditionedWRDD = linOpPairRDD.mapToPair(p -> {
        final INDArray W_kl_chunk = W_kl_bc.value().get(p._1);
        final INDArray linOp_chunk = p._2;
        final int blockSize = linOp_chunk.shape()[0];
        final List<INDArray> linOpWList = IntStream.range(0, blockSize).parallel()
                .mapToObj(k -> CoverageModelEMWorkspaceMathUtils.linsolve(
                        linOp_chunk.get(NDArrayIndex.point(k)),
                        W_kl_chunk.get(NDArrayIndex.point(k))))
                .collect(Collectors.toList());
        return new Tuple2<>(p._1, Nd4j.vstack(linOpWList));
    });
    W_kl.assign(CoverageModelSparkUtils.assembleINDArrayBlocksFromRDD(preconditionedWRDD, 0));
    W_kl_bc.destroy();
    /* retained alternative: a join-based implementation instead of the broadcast lookup above */
    // final JavaPairRDD<LinearlySpacedIndexBlock, INDArray> W_kl_RDD = CoverageModelSparkUtils.rddFromINDArray(W_kl,
    //         fourierSpaceBlocks, ctx, true);
    // W_kl.assign(CoverageModelSparkUtils.assembleINDArrayBlocks(linOpPairRDD.join((W_kl_RDD))
    //         .mapValues(p -> {
    //             final INDArray linOp = p._1;
    //             final INDArray W = p._2;
    //             final int blockSize = linOp.shape()[0];
    //             final List<INDArray> linOpWList = IntStream.range(0, blockSize).parallel().mapToObj(k ->
    //                     CoverageModelEMWorkspaceMathUtils.linsolve(linOp.get(NDArrayIndex.point(k)),
    //                             W.get(NDArrayIndex.point(k))))
    //                     .collect(Collectors.toList());
    //             return Nd4j.vstack(linOpWList);
    //         }), false));
    // W_kl_RDD.unpersist();
    long endTimePrecond = System.nanoTime();
    /* irfft */
    long startTimeIRFFT = System.nanoTime();
    final INDArray res = Nd4j.create(numTargets, numLatents);
    IntStream.range(0, numLatents).parallel().forEach(li ->
            res.get(NDArrayIndex.all(), NDArrayIndex.point(li))
                    .assign(F_tt.getInverseFFT(W_kl.get(NDArrayIndex.all(), NDArrayIndex.point(li)))));
    long endTimeIRFFT = System.nanoTime();
    logger.debug("Local FFT timing: " + (endTimeRFFT - startTimeRFFT + endTimeIRFFT - startTimeIRFFT) / 1000000 + " ms");
    logger.debug("Spark preconditioner application timing: " + (endTimePrecond - startTimePrecond) / 1000000 + " ms");
    return res;
}
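The broadcast-and-lookup pattern above (broadcast a map of blocks, then have each worker fetch its own block by key) avoids the shuffle of a join; the commented-out block shows the join-based variant it replaced. Below is a minimal, self-contained sketch of the same pattern with plain double arrays standing in for INDArrays; all names and data are illustrative.
import java.util.*;
import org.apache.spark.api.java.*;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class BroadcastLookupDemo {
    public static void main(String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext("local[2]", "BroadcastLookupDemo");
        /* driver-side map of per-block data, analogous to W_kl_map */
        final Map<Integer, double[]> blocks = new HashMap<>();
        blocks.put(0, new double[] { 1.0, 2.0 });
        blocks.put(1, new double[] { 3.0, 4.0 });
        final Broadcast<Map<Integer, double[]>> blocksBc = ctx.broadcast(blocks);
        /* RDD of per-block operators, analogous to linOpPairRDD; here the "operator" is a scalar */
        final JavaPairRDD<Integer, Double> linOp = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>(0, 10.0), new Tuple2<>(1, 100.0)));
        /* each worker looks up its own block in the broadcast instead of joining */
        final JavaPairRDD<Integer, double[]> result = linOp.mapToPair(p -> {
            final double[] block = blocksBc.value().get(p._1);
            final double[] scaled = new double[block.length];
            for (int i = 0; i < block.length; i++) {
                scaled[i] = p._2 * block[i];
            }
            return new Tuple2<>(p._1, scaled);
        });
        result.collect().forEach(t -> System.out.println(t._1 + " -> " + Arrays.toString(t._2)));
        blocksBc.destroy();
        ctx.stop();
    }
}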
Use of scala.Tuple2 in project gatk by broadinstitute.
The class CoverageModelEMWorkspace, method initializeWorkerPosteriors.
@UpdatesRDD
private void initializeWorkerPosteriors() {
    /* make a local copy for lambda capture (these are small, so no broadcasting is necessary) */
    final INDArray sampleMeanLogReadDepths = this.sampleMeanLogReadDepths;
    final INDArray sampleVarLogReadDepths = this.sampleVarLogReadDepths;
    final INDArray sampleUnexplainedVariance = this.sampleUnexplainedVariance;
    final INDArray sampleBiasLatentPosteriorFirstMoments = this.sampleBiasLatentPosteriorFirstMoments;
    final INDArray sampleBiasLatentPosteriorSecondMoments = this.sampleBiasLatentPosteriorSecondMoments;
    /* calculate copy ratio prior expectations */
    logger.info("Calculating copy ratio priors on the driver node...");
    final List<CopyRatioExpectations> copyRatioPriorExpectationsList = sampleIndexStream()
            .mapToObj(si -> copyRatioExpectationsCalculator.getCopyRatioPriorExpectations(
                    CopyRatioCallingMetadata.builder()
                            .sampleName(processedSampleNameList.get(si))
                            .sampleSexGenotypeData(processedSampleSexGenotypeData.get(si))
                            .sampleAverageMappingErrorProbability(config.getMappingErrorRate())
                            .build(),
                    processedTargetList))
            .collect(Collectors.toList());
    /* update log chain posterior expectation */
    sampleLogChainPosteriors.assign(Nd4j.create(
            copyRatioPriorExpectationsList.stream()
                    .mapToDouble(CopyRatioExpectations::getLogChainPosteriorProbability)
                    .toArray(),
            new int[] { numSamples, 1 }));
    /* push per-target copy ratio expectations to workers */
    final List<Tuple2<LinearlySpacedIndexBlock, Tuple2<INDArray, INDArray>>> copyRatioPriorsList = new ArrayList<>();
    for (final LinearlySpacedIndexBlock tb : targetBlocks) {
        final double[] logCopyRatioPriorMeansBlock = IntStream.range(0, tb.getNumElements())
                .mapToObj(rti -> copyRatioPriorExpectationsList.stream()
                        .mapToDouble(cre -> cre.getLogCopyRatioMeans()[rti + tb.getBegIndex()])
                        .toArray())
                .flatMapToDouble(Arrays::stream)
                .toArray();
        final double[] logCopyRatioPriorVariancesBlock = IntStream.range(0, tb.getNumElements())
                .mapToObj(rti -> copyRatioPriorExpectationsList.stream()
                        .mapToDouble(cre -> cre.getLogCopyRatioVariances()[rti + tb.getBegIndex()])
                        .toArray())
                .flatMapToDouble(Arrays::stream)
                .toArray();
        /* we do not need to take care of log copy ratio means and variances on masked targets here.
         * potential NaNs will be rectified in the compute blocks by calling the method
         * {@link CoverageModelEMComputeBlock#cloneWithInitializedData} */
        copyRatioPriorsList.add(new Tuple2<>(tb, new Tuple2<>(
                Nd4j.create(logCopyRatioPriorMeansBlock, new int[] { numSamples, tb.getNumElements() }, 'f'),
                Nd4j.create(logCopyRatioPriorVariancesBlock, new int[] { numSamples, tb.getNumElements() }, 'f'))));
    }
    /* push to compute blocks */
    logger.info("Pushing posteriors to worker(s)...");
    /* copy ratio priors */
    joinWithWorkersAndMap(copyRatioPriorsList, p -> p._1.cloneWithUpdateCopyRatioPriors(p._2._1, p._2._2));
    /* read depth and sample-specific unexplained variance */
    mapWorkers(cb -> cb
            .cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.log_d_s, sampleMeanLogReadDepths)
            .cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.var_log_d_s, sampleVarLogReadDepths)
            .cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.gamma_s, sampleUnexplainedVariance));
    /* if bias covariates are enabled, initialize E[z] and E[z z^T] as well */
    if (biasCovariatesEnabled) {
        mapWorkers(cb -> cb
                .cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.z_sl, sampleBiasLatentPosteriorFirstMoments)
                .cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.zz_sll, sampleBiasLatentPosteriorSecondMoments));
    }
}
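The per-block packing above flattens values target-by-target across samples, which matches Nd4j's 'f' (Fortran, column-major) layout for a {numSamples, numElements} matrix. A small, self-contained illustration of the same IntStream packing with plain arrays; the names and data are made up.
import java.util.Arrays;
import java.util.stream.IntStream;

public class ColumnMajorPackingDemo {
    public static void main(String[] args) {
        final double[][] perSampleMeans = {
                { 0.0, 0.1, 0.2 },   /* sample 0, targets 0..2 */
                { 1.0, 1.1, 1.2 } }; /* sample 1, targets 0..2 */
        final int begIndex = 1, numElements = 2; /* a block covering targets 1..2 */
        final double[] flat = IntStream.range(0, numElements)
                .mapToObj(rti -> Arrays.stream(perSampleMeans)
                        .mapToDouble(sample -> sample[rti + begIndex])
                        .toArray())
                .flatMapToDouble(Arrays::stream)
                .toArray();
        /* prints [0.1, 1.1, 0.2, 1.2]: target 1 across both samples, then target 2 */
        System.out.println(Arrays.toString(flat));
    }
}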
Use of scala.Tuple2 in project gatk by broadinstitute.
The class BroadcastJoinReadsWithVariants, method join.
public static JavaPairRDD<GATKRead, Iterable<GATKVariant>> join(final JavaRDD<GATKRead> reads, final JavaRDD<GATKVariant> variants) {
    final JavaSparkContext ctx = new JavaSparkContext(reads.context());
    /* collect all variants to the driver and broadcast them as an interval skip list */
    final IntervalsSkipList<GATKVariant> variantSkipList = new IntervalsSkipList<>(variants.collect());
    final Broadcast<IntervalsSkipList<GATKVariant>> variantsBroadcast = ctx.broadcast(variantSkipList);
    return reads.mapToPair(r -> {
        final IntervalsSkipList<GATKVariant> intervalsSkipList = variantsBroadcast.getValue();
        if (SimpleInterval.isValid(r.getContig(), r.getStart(), r.getEnd())) {
            return new Tuple2<>(r, intervalsSkipList.getOverlapping(new SimpleInterval(r)));
        } else {
            /* unmapped or malformed reads overlap no variants */
            return new Tuple2<>(r, Collections.emptyList());
        }
    });
}
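A hypothetical usage sketch; readsRdd and variantsRdd stand in for whatever read and variant sources the pipeline provides, and counting overlaps is just one way to consume the result.
final JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants =
        BroadcastJoinReadsWithVariants.join(readsRdd, variantsRdd);
/* e.g. count the reads that overlap at least one variant */
final long overlappingReads = readsWithVariants
        .filter(t -> t._2().iterator().hasNext())
        .count();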