Use of org.apache.spark.broadcast.Broadcast in project gatk-protected by broadinstitute.
The class HaplotypeCallerSpark, method createReadShards.
/**
 * Create an RDD of {@link Shard} from an RDD of {@link GATKRead}
 *
 * @param shardBoundariesBroadcast broadcast of an {@link OverlapDetector} loaded with the
 *                                 intervals that should be used for creating ReadShards
 * @param reads RDD of {@link GATKRead}
 * @return an RDD of reads grouped into potentially overlapping shards
 */
private static JavaRDD<Shard<GATKRead>> createReadShards(
        final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast,
        final JavaRDD<GATKRead> reads) {
    // pair each read with every shard boundary it overlaps
    final JavaPairRDD<ShardBoundary, GATKRead> paired = reads.flatMapToPair(read -> {
        final Collection<ShardBoundary> overlappingShards =
                shardBoundariesBroadcast.value().getOverlaps(read);
        return overlappingShards.stream().map(key -> new Tuple2<>(key, read)).iterator();
    });
    // group the reads that fall into each shard
    final JavaPairRDD<ShardBoundary, Iterable<GATKRead>> shardsWithReads = paired.groupByKey();
    return shardsWithReads.map(shard -> new SparkReadShard(shard._1(), shard._2()));
}
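A hedged sketch of how a caller might prepare the broadcast argument; the names shardSize, shardPadding, sequenceDictionary, intervals, and ctx (a JavaSparkContext) are assumptions for illustration, not taken from the snippet above. The idea is that the shard boundaries are computed once on the driver, loaded into an OverlapDetector, and broadcast so every executor can look up overlaps locally instead of shipping the interval list with each task.

import htsjdk.samtools.util.OverlapDetector;
import org.apache.spark.broadcast.Broadcast;

// Sketch only: the surrounding tool is assumed to supply intervals, shardSize,
// shardPadding, sequenceDictionary, ctx, and the input reads RDD.
final List<ShardBoundary> shardBoundaries = intervals.stream()
        .flatMap(interval -> Shard.divideIntervalIntoShards(
                interval, shardSize, shardPadding, sequenceDictionary).stream())
        .collect(Collectors.toList());
// load the boundaries into an OverlapDetector keyed by genomic position
final OverlapDetector<ShardBoundary> overlapDetector = OverlapDetector.create(shardBoundaries);
// broadcast once; executors read it via shardBoundariesBroadcast.value()
final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast =
        ctx.broadcast(overlapDetector);
final JavaRDD<Shard<GATKRead>> readShards = createReadShards(shardBoundariesBroadcast, reads);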
Use of org.apache.spark.broadcast.Broadcast in projects gatk-protected and gatk by broadinstitute (the method is identical in both).
The class CoverageModelEMWorkspace, method updateCopyRatioPosteriorExpectationsLocal.
/**
 * Local implementation of the E-step update of copy ratio posteriors
 *
 * @return a {@link SubroutineSignal} containing the update size (key: "error_norm")
 */
public SubroutineSignal updateCopyRatioPosteriorExpectationsLocal(final double admixingRatio) {
    /* step 1. fetch copy ratio emission data */
    final List<List<CoverageModelCopyRatioEmissionData>> copyRatioEmissionData =
            fetchCopyRatioEmissionDataLocal();
    /* step 2. run the forward-backward algorithm and calculate copy ratio posteriors */
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    final List<CopyRatioExpectations> copyRatioPosteriorResults = sampleIndexStream()
            .parallel()
            .mapToObj(si -> copyRatioExpectationsCalculator.getCopyRatioPosteriorExpectations(
                    CopyRatioCallingMetadata.builder()
                            .sampleName(processedSampleNameList.get(si))
                            .sampleSexGenotypeData(processedSampleSexGenotypeData.get(si))
                            .sampleCoverageDepth(sampleReadDepths.getDouble(si))
                            .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                            .build(),
                    processedTargetList,
                    copyRatioEmissionData.get(si)))
            .collect(Collectors.toList());
    /* update the log chain posterior expectation */
    sampleLogChainPosteriors.assign(Nd4j.create(
            copyRatioPosteriorResults.stream()
                    .mapToDouble(CopyRatioExpectations::getLogChainPosteriorProbability)
                    .toArray(),
            new int[] { numSamples, 1 }));
    /* step 3. send the results back to the workers */
    final ImmutablePair<INDArray, INDArray> copyRatioPosteriorDataPair =
            convertCopyRatioLatentPosteriorExpectationsToNDArray(copyRatioPosteriorResults);
    final INDArray log_c_st = copyRatioPosteriorDataPair.left;
    final INDArray var_log_c_st = copyRatioPosteriorDataPair.right;
    /* partition the pair (log_c_st, var_log_c_st) and send it to the workers via a broadcast hash map */
    pushToWorkers(mapINDArrayPairToBlocks(log_c_st.transpose(), var_log_c_st.transpose()),
            (p, cb) -> cb.cloneWithUpdatedCopyRatioPosteriors(
                    p.get(cb.getTargetSpaceBlock()).left.transpose(),
                    p.get(cb.getTargetSpaceBlock()).right.transpose(),
                    admixingRatio));
    cacheWorkers("after E-step update of copy ratio posteriors");
    /* collect subroutine signals */
    final List<SubroutineSignal> sigs =
            mapWorkersAndCollect(CoverageModelEMComputeBlock::getLatestMStepSignal);
    final double errorNormInfinity = Collections.max(sigs.stream()
            .map(sig -> sig.<Double>get(StandardSubroutineSignals.RESIDUAL_ERROR_NORM))
            .collect(Collectors.toList()));
    return SubroutineSignal.builder()
            .put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity).build();
}
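The pushToWorkers call is where the Spark broadcast enters: the blocked posterior arrays are broadcast as a hash map keyed by target-space block, and each compute block picks out its own slice. A minimal sketch of that pattern, assuming a workspace field computeRDD of type JavaPairRDD<LinearlySpacedIndexBlock, CoverageModelEMComputeBlock> and a Spark context ctx (both assumptions here, not the actual GATK implementation):

import java.util.Map;
import java.util.function.BiFunction;
import org.apache.spark.broadcast.Broadcast;

// Sketch of the broadcast-hash-map push pattern: broadcast the per-block payload
// once, then let each compute block look up its own slice by its block key.
private <V> void pushToWorkersSketch(final Map<LinearlySpacedIndexBlock, V> payload,
        final BiFunction<Map<LinearlySpacedIndexBlock, V>,
                CoverageModelEMComputeBlock, CoverageModelEMComputeBlock> pusher) {
    final Broadcast<Map<LinearlySpacedIndexBlock, V>> payloadBroadcast = ctx.broadcast(payload);
    computeRDD = computeRDD.mapValues(cb -> pusher.apply(payloadBroadcast.value(), cb));
}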
Use of org.apache.spark.broadcast.Broadcast in project gatk by broadinstitute.
The class CoverageModelEMWorkspace, method initializeWorkersWithPCA.
/**
 * Initialize model parameters by performing PCA.
 */
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private void initializeWorkersWithPCA() {
    logger.info("Initializing model parameters using PCA...");
    /* initially, set m_t, Psi_t and W_tl to zero to get an estimate of the read depth */
    final int numLatents = config.getNumLatents();
    mapWorkers(cb -> cb
            .cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.m_t,
                    Nd4j.zeros(new int[] { 1, cb.getTargetSpaceBlock().getNumElements() }))
            .cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.Psi_t,
                    Nd4j.zeros(new int[] { 1, cb.getTargetSpaceBlock().getNumElements() })));
    if (biasCovariatesEnabled) {
        mapWorkers(cb -> cb.cloneWithUpdatedPrimitive(
                CoverageModelEMComputeBlock.CoverageModelICGCacheNode.W_tl,
                Nd4j.zeros(new int[] { cb.getTargetSpaceBlock().getNumElements(), numLatents })));
    }
    /* update the read depth without taking into account the correction from bias covariates */
    updateReadDepthPosteriorExpectations(1.0, true);
    /* fetch the sample covariance matrix */
    final int minPCAInitializationReadCount = config.getMinPCAInitializationReadCount();
    mapWorkers(cb -> cb.cloneWithPCAInitializationData(minPCAInitializationReadCount, Integer.MAX_VALUE));
    cacheWorkers("PCA initialization");
    final INDArray targetCovarianceMatrix = mapWorkersAndReduce(
            CoverageModelEMComputeBlock::calculateTargetCovarianceMatrixForPCAInitialization,
            INDArray::add);
    /* perform eigen-decomposition on the target covariance matrix */
    final ImmutablePair<INDArray, INDArray> targetCovarianceEigensystem =
            CoverageModelEMWorkspaceMathUtils.eig(targetCovarianceMatrix, false, logger);
    /* the eigenvalues of the sample covariance matrix can be inferred immediately by scaling */
    final INDArray sampleCovarianceEigenvalues = targetCovarianceEigensystem.getLeft().div(numSamples);
    /* estimate the isotropic unexplained variance -- see Bishop 12.46 */
    final int residualDim = numTargets - numLatents;
    final double isotropicVariance = sampleCovarianceEigenvalues
            .get(NDArrayIndex.interval(numLatents, numSamples))
            .sumNumber().doubleValue() / residualDim;
    logger.info(String.format("PCA estimate of isotropic unexplained variance: %f", isotropicVariance));
    /* estimate the bias factors -- see Bishop 12.45 */
    final INDArray scaleFactors = Transforms.sqrt(
            sampleCovarianceEigenvalues.get(NDArrayIndex.interval(0, numLatents)).sub(isotropicVariance),
            false);
    final INDArray biasCovariatesPCA = Nd4j.create(new int[] { numTargets, numLatents });
    for (int li = 0; li < numLatents; li++) {
        final INDArray v = targetCovarianceEigensystem.getRight().getColumn(li);
        /* calculate [Delta_PCA_st]^T v;
         * note: we do not need to broadcast v since it is small and lambda capture is fine */
        final INDArray unnormedBiasCovariate = CoverageModelSparkUtils.assembleINDArrayBlocksFromCollection(
                mapWorkersAndCollect(cb -> ImmutablePair.of(cb.getTargetSpaceBlock(),
                        cb.getINDArrayFromCache(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.Delta_PCA_st)
                                .transpose().mmul(v))),
                0);
        final double norm = unnormedBiasCovariate.norm1Number().doubleValue();
        final INDArray normedBiasCovariate = unnormedBiasCovariate.divi(norm).muli(scaleFactors.getDouble(li));
        biasCovariatesPCA.getColumn(li).assign(normedBiasCovariate);
    }
    if (ardEnabled) {
        /* a better estimate of the ARD coefficients */
        biasCovariatesARDCoefficients.assign(Nd4j.zeros(new int[] { 1, numLatents })
                .addi(config.getInitialARDPrecisionRelativeToNoise() / isotropicVariance));
    }
    final CoverageModelParameters modelParamsFromPCA = new CoverageModelParameters(
            processedTargetList,
            Nd4j.zeros(new int[] { 1, numTargets }),
            Nd4j.zeros(new int[] { 1, numTargets }).addi(isotropicVariance),
            biasCovariatesPCA,
            biasCovariatesARDCoefficients);
    /* clear the PCA initialization data from the workers */
    mapWorkers(CoverageModelEMComputeBlock::cloneWithRemovedPCAInitializationData);
    /* push the model parameters to the workers */
    initializeWorkersWithGivenModel(modelParamsFromPCA);
    /* update the bias latent posterior expectations without admixing */
    updateBiasLatentPosteriorExpectations(1.0);
}
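The two Bishop references above are the closed-form probabilistic-PCA estimates from Pattern Recognition and Machine Learning: the isotropic variance (Eq. 12.46) is the average of the discarded eigenvalues, and each retained eigenvector is scaled by sqrt(lambda_i - sigma^2) (Eq. 12.45). A standalone ND4J illustration with toy eigenvalues (the numbers are invented; the ND4J calls mirror the ones used above):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.NDArrayIndex;
import org.nd4j.linalg.ops.transforms.Transforms;

// Toy eigenvalues of a sample covariance matrix, sorted in decreasing order.
final INDArray eigenvalues = Nd4j.create(new double[] { 4.0, 2.5, 0.6, 0.5, 0.4 });
final int numEigenvalues = 5;
final int numLatents = 2;
/* Bishop 12.46: sigma^2 is the mean of the discarded eigenvalues */
final int residualDim = numEigenvalues - numLatents;
final double isotropicVariance = eigenvalues
        .get(NDArrayIndex.interval(numLatents, numEigenvalues))
        .sumNumber().doubleValue() / residualDim;   // (0.6 + 0.5 + 0.4) / 3 = 0.5
/* Bishop 12.45: each retained direction is scaled by sqrt(lambda_i - sigma^2) */
final INDArray scaleFactors = Transforms.sqrt(
        eigenvalues.get(NDArrayIndex.interval(0, numLatents)).sub(isotropicVariance), false);
// scaleFactors is approximately [sqrt(3.5), sqrt(2.0)]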
Use of org.apache.spark.broadcast.Broadcast in project gatk by broadinstitute.
The class CoverageModelWPreconditionerSpark, method operate.
@Override
public INDArray operate(@Nonnull final INDArray W_tl) throws DimensionMismatchException {
    if (W_tl.rank() != 2 || W_tl.shape()[0] != numTargets || W_tl.shape()[1] != numLatents) {
        throw new DimensionMismatchException(W_tl.length(), numTargets * numLatents);
    }
    long startTimeRFFT = System.nanoTime();
    /* forward rfft */
    final INDArray W_kl = Nd4j.create(fftSize, numLatents);
    IntStream.range(0, numLatents).parallel().forEach(li ->
            W_kl.get(NDArrayIndex.all(), NDArrayIndex.point(li)).assign(
                    Nd4j.create(F_tt.getForwardFFT(W_tl.get(NDArrayIndex.all(), NDArrayIndex.point(li))),
                            new int[] { fftSize, 1 })));
    long endTimeRFFT = System.nanoTime();
    /* apply the preconditioner in Fourier space */
    long startTimePrecond = System.nanoTime();
    final Map<LinearlySpacedIndexBlock, INDArray> W_kl_map =
            CoverageModelSparkUtils.partitionINDArrayToMap(fourierSpaceBlocks, W_kl);
    final Broadcast<Map<LinearlySpacedIndexBlock, INDArray>> W_kl_bc = ctx.broadcast(W_kl_map);
    final JavaPairRDD<LinearlySpacedIndexBlock, INDArray> preconditionedWRDD =
            linOpPairRDD.mapToPair(p -> {
                final INDArray W_kl_chunk = W_kl_bc.value().get(p._1);
                final INDArray linOp_chunk = p._2;
                final int blockSize = linOp_chunk.shape()[0];
                final List<INDArray> linOpWList = IntStream.range(0, blockSize).parallel()
                        .mapToObj(k -> CoverageModelEMWorkspaceMathUtils.linsolve(
                                linOp_chunk.get(NDArrayIndex.point(k)),
                                W_kl_chunk.get(NDArrayIndex.point(k))))
                        .collect(Collectors.toList());
                return new Tuple2<>(p._1, Nd4j.vstack(linOpWList));
            });
    W_kl.assign(CoverageModelSparkUtils.assembleINDArrayBlocksFromRDD(preconditionedWRDD, 0));
    W_kl_bc.destroy();
    // Alternative join-based implementation, kept for reference:
    // final JavaPairRDD<LinearlySpacedIndexBlock, INDArray> W_kl_RDD =
    //         CoverageModelSparkUtils.rddFromINDArray(W_kl, fourierSpaceBlocks, ctx, true);
    // W_kl.assign(CoverageModelSparkUtils.assembleINDArrayBlocks(linOpPairRDD.join(W_kl_RDD)
    //         .mapValues(p -> {
    //             final INDArray linOp = p._1;
    //             final INDArray W = p._2;
    //             final int blockSize = linOp.shape()[0];
    //             final List<INDArray> linOpWList = IntStream.range(0, blockSize).parallel()
    //                     .mapToObj(k -> CoverageModelEMWorkspaceMathUtils.linsolve(
    //                             linOp.get(NDArrayIndex.point(k)), W.get(NDArrayIndex.point(k))))
    //                     .collect(Collectors.toList());
    //             return Nd4j.vstack(linOpWList);
    //         }), false));
    // W_kl_RDD.unpersist();
    long endTimePrecond = System.nanoTime();
    /* inverse rfft */
    long startTimeIRFFT = System.nanoTime();
    final INDArray res = Nd4j.create(numTargets, numLatents);
    IntStream.range(0, numLatents).parallel().forEach(li ->
            res.get(NDArrayIndex.all(), NDArrayIndex.point(li)).assign(
                    F_tt.getInverseFFT(W_kl.get(NDArrayIndex.all(), NDArrayIndex.point(li)))));
    long endTimeIRFFT = System.nanoTime();
    logger.debug("Local FFT timing: "
            + (endTimeRFFT - startTimeRFFT + endTimeIRFFT - startTimeIRFFT) / 1000000 + " ms");
    logger.debug("Spark preconditioner application timing: "
            + (endTimePrecond - startTimePrecond) / 1000000 + " ms");
    return res;
}
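The broadcast here is short-lived: it is created for one application of the preconditioner, consumed inside mapToPair, and destroyed as soon as assembleINDArrayBlocksFromRDD has materialized the result. A minimal self-contained sketch of that same lifecycle; the lookup-table contents and names are invented for illustration:

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

// Broadcast a small lookup table, transform an RDD against it, materialize the
// result on the driver, then destroy the broadcast (mirrors W_kl.assign(...) above).
static Map<Integer, Double> scaleByBroadcastTable(final JavaSparkContext ctx,
                                                  final JavaPairRDD<Integer, Double> data,
                                                  final Map<Integer, Double> table) {
    final Broadcast<Map<Integer, Double>> tableBroadcast = ctx.broadcast(table);
    final JavaPairRDD<Integer, Double> scaled = data.mapToPair(p ->
            new Tuple2<>(p._1, p._2 * tableBroadcast.value().getOrDefault(p._1, 1.0)));
    // collectAsMap() forces evaluation, so destroying the broadcast afterwards is safe;
    // destroying it before the result is materialized would fail on recomputation
    final Map<Integer, Double> result = new HashMap<>(scaled.collectAsMap());
    tableBroadcast.destroy();
    return result;
}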