use of org.apache.commons.math3.stat.descriptive.moment.Variance in project gatk by broadinstitute.
the class CoverageModelEMWorkspace method initializeWorkerPosteriors.
@UpdatesRDD
private void initializeWorkerPosteriors() {
/* make a local copy for lambda capture (these are small, so no broadcasting is necessary) */
final INDArray sampleMeanLogReadDepths = this.sampleMeanLogReadDepths;
final INDArray sampleVarLogReadDepths = this.sampleVarLogReadDepths;
final INDArray sampleUnexplainedVariance = this.sampleUnexplainedVariance;
final INDArray sampleBiasLatentPosteriorFirstMoments = this.sampleBiasLatentPosteriorFirstMoments;
final INDArray sampleBiasLatentPosteriorSecondMoments = this.sampleBiasLatentPosteriorSecondMoments;
/* calculate copy ratio prior expectations */
logger.info("Calculating copy ratio priors on the driver node...");
final List<CopyRatioExpectations> copyRatioPriorExpectationsList = sampleIndexStream().mapToObj(si -> copyRatioExpectationsCalculator.getCopyRatioPriorExpectations(CopyRatioCallingMetadata.builder().sampleName(processedSampleNameList.get(si)).sampleSexGenotypeData(processedSampleSexGenotypeData.get(si)).sampleAverageMappingErrorProbability(config.getMappingErrorRate()).build(), processedTargetList)).collect(Collectors.toList());
/* update log chain posterior expectation */
sampleLogChainPosteriors.assign(Nd4j.create(copyRatioPriorExpectationsList.stream().mapToDouble(CopyRatioExpectations::getLogChainPosteriorProbability).toArray(), new int[] { numSamples, 1 }));
/* push per-target copy ratio expectations to workers */
final List<Tuple2<LinearlySpacedIndexBlock, Tuple2<INDArray, INDArray>>> copyRatioPriorsList = new ArrayList<>();
for (final LinearlySpacedIndexBlock tb : targetBlocks) {
final double[] logCopyRatioPriorMeansBlock = IntStream.range(0, tb.getNumElements()).mapToObj(rti -> copyRatioPriorExpectationsList.stream().mapToDouble(cre -> cre.getLogCopyRatioMeans()[rti + tb.getBegIndex()]).toArray()).flatMapToDouble(Arrays::stream).toArray();
final double[] logCopyRatioPriorVariancesBlock = IntStream.range(0, tb.getNumElements()).mapToObj(rti -> copyRatioPriorExpectationsList.stream().mapToDouble(cre -> cre.getLogCopyRatioVariances()[rti + tb.getBegIndex()]).toArray()).flatMapToDouble(Arrays::stream).toArray();
/* we do not need to take care of log copy ratio means and variances on masked targets here.
* potential NaNs will be rectified in the compute blocks by calling the method
* {@link CoverageModelEMComputeBlock#cloneWithInitializedData} */
copyRatioPriorsList.add(new Tuple2<>(tb, new Tuple2<INDArray, INDArray>(Nd4j.create(logCopyRatioPriorMeansBlock, new int[] { numSamples, tb.getNumElements() }, 'f'), Nd4j.create(logCopyRatioPriorVariancesBlock, new int[] { numSamples, tb.getNumElements() }, 'f'))));
}
/* push to compute blocks */
logger.info("Pushing posteriors to worker(s)...");
/* copy ratio priors */
joinWithWorkersAndMap(copyRatioPriorsList, p -> p._1.cloneWithUpdateCopyRatioPriors(p._2._1, p._2._2));
/* read depth and sample-specific unexplained variance */
mapWorkers(cb -> cb.cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.log_d_s, sampleMeanLogReadDepths).cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.var_log_d_s, sampleVarLogReadDepths).cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.gamma_s, sampleUnexplainedVariance));
/* if bias covariates are enabled, initialize E[z] and E[z z^T] as well */
if (biasCovariatesEnabled) {
mapWorkers(cb -> cb.cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.z_sl, sampleBiasLatentPosteriorFirstMoments).cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.zz_sll, sampleBiasLatentPosteriorSecondMoments));
}
}
use of org.apache.commons.math3.stat.descriptive.moment.Variance in project gatk by broadinstitute.
the class CoverageModelParameters method generateRandomModel.
/**
* Generates random coverage model parameters.
*
* @param targetList list of targets
* @param numLatents number of latent variables
* @param seed random seed
* @param randomMeanLogBiasStandardDeviation std of mean log bias (mean is set to 0)
* @param randomBiasCovariatesStandardDeviation std of bias covariates (mean is set to 0)
* @param randomMaxUnexplainedVariance max value of unexplained variance (samples are taken from a uniform
* distribution [0, {@code randomMaxUnexplainedVariance}])
* @param initialBiasCovariatesARDCoefficients initial row vector of ARD coefficients
* @return an instance of {@link CoverageModelParameters}
*/
public static CoverageModelParameters generateRandomModel(final List<Target> targetList, final int numLatents, final long seed, final double randomMeanLogBiasStandardDeviation, final double randomBiasCovariatesStandardDeviation, final double randomMaxUnexplainedVariance, final INDArray initialBiasCovariatesARDCoefficients) {
Utils.validateArg(numLatents >= 0, "Dimension of the bias space must be non-negative");
Utils.validateArg(randomBiasCovariatesStandardDeviation >= 0, "Standard deviation of random bias covariates" + " must be non-negative");
Utils.validateArg(randomMeanLogBiasStandardDeviation >= 0, "Standard deviation of random mean log bias" + " must be non-negative");
Utils.validateArg(randomMaxUnexplainedVariance >= 0, "Max random unexplained variance must be non-negative");
Utils.validateArg(initialBiasCovariatesARDCoefficients == null || numLatents > 0 && initialBiasCovariatesARDCoefficients.length() == numLatents, "If ARD is enabled, the dimension" + " of the bias latent space must be positive and match the length of ARD coeffecient vector");
final boolean biasCovariatesEnabled = numLatents > 0;
final int numTargets = targetList.size();
final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(seed));
/* Gaussian random for mean log bias */
final INDArray initialMeanLogBias = Nd4j.create(getNormalRandomNumbers(numTargets, 0, randomMeanLogBiasStandardDeviation, rng), new int[] { 1, numTargets });
/* Uniform random for unexplained variance */
final INDArray initialUnexplainedVariance = Nd4j.create(getUniformRandomNumbers(numTargets, 0, randomMaxUnexplainedVariance, rng), new int[] { 1, numTargets });
final INDArray initialMeanBiasCovariates;
if (biasCovariatesEnabled) {
/* Gaussian random for bias covariates */
initialMeanBiasCovariates = Nd4j.create(getNormalRandomNumbers(numTargets * numLatents, 0, randomBiasCovariatesStandardDeviation, rng), new int[] { numTargets, numLatents });
} else {
initialMeanBiasCovariates = null;
}
return new CoverageModelParameters(targetList, initialMeanLogBias, initialUnexplainedVariance, initialMeanBiasCovariates, initialBiasCovariatesARDCoefficients);
}
use of org.apache.commons.math3.stat.descriptive.moment.Variance in project gatk by broadinstitute.
the class CoverageModelParameters method write.
/**
* Writes the model to disk.
*
* @param outputPath model output path
*/
public static void write(@Nonnull CoverageModelParameters model, @Nonnull final String outputPath) {
/* create output directory if it doesn't exist */
createOutputPath(Utils.nonNull(outputPath, "The output path string must be non-null"));
/* write targets list */
final File targetListFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_LIST_OUTPUT_FILE);
TargetWriter.writeTargetsToFile(targetListFile, model.getTargetList());
final List<String> targetNames = model.getTargetList().stream().map(Target::getName).collect(Collectors.toList());
/* write target mean bias to file */
final File targetMeanBiasFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_MEAN_LOG_BIAS_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getTargetMeanLogBias().transpose(), targetMeanBiasFile, MEAN_LOG_BIAS_MATRIX_NAME, targetNames, null);
/* write target unexplained variance to file */
final File targetUnexplainedVarianceFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_UNEXPLAINED_VARIANCE_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getTargetUnexplainedVariance().transpose(), targetUnexplainedVarianceFile, TARGET_UNEXPLAINED_VARIANCE_MATRIX_NAME, targetNames, null);
if (model.isBiasCovariatesEnabled()) {
/* write mean bias covariates to file */
final List<String> meanBiasCovariatesNames = IntStream.range(0, model.getNumLatents()).mapToObj(li -> String.format(BIAS_COVARIATE_COLUMN_NAME_FORMAT, li)).collect(Collectors.toList());
final File meanBiasCovariatesFile = new File(outputPath, CoverageModelGlobalConstants.MEAN_BIAS_COVARIATES_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getMeanBiasCovariates(), meanBiasCovariatesFile, MEAN_BIAS_COVARIATES_MATRIX_NAME, targetNames, meanBiasCovariatesNames);
/* write norm_2 of mean bias covariates to file */
final INDArray WTW = model.getMeanBiasCovariates().transpose().mmul(model.getMeanBiasCovariates());
final double[] biasCovariatesNorm2 = IntStream.range(0, model.getNumLatents()).mapToDouble(li -> WTW.getDouble(li, li)).toArray();
final File biasCovariatesNorm2File = new File(outputPath, CoverageModelGlobalConstants.MEAN_BIAS_COVARIATES_NORM2_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(Nd4j.create(biasCovariatesNorm2, new int[] { 1, model.getNumLatents() }), biasCovariatesNorm2File, MEAN_BIAS_COVARIATES_NORM_2_MATRIX_NAME, null, meanBiasCovariatesNames);
/* if ARD is enabled, write the ARD coefficients and covariance of W as well */
if (model.isARDEnabled()) {
final File biasCovariatesARDCoefficientsFile = new File(outputPath, CoverageModelGlobalConstants.BIAS_COVARIATES_ARD_COEFFICIENTS_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getBiasCovariateARDCoefficients(), biasCovariatesARDCoefficientsFile, BIAS_COVARIATES_ARD_COEFFICIENTS_MATRIX_NAME, null, meanBiasCovariatesNames);
}
}
}
use of org.apache.commons.math3.stat.descriptive.moment.Variance in project gatk-protected by broadinstitute.
the class CoverageModelEMWorkspace method initializeWorkersWithPCA.
/**
* Initialize model parameters by performing PCA.
*/
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private void initializeWorkersWithPCA() {
logger.info("Initializing model parameters using PCA...");
/* initially, set m_t, Psi_t and W_tl to zero to get an estimate of the read depth */
final int numLatents = config.getNumLatents();
mapWorkers(cb -> cb.cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.m_t, Nd4j.zeros(new int[] { 1, cb.getTargetSpaceBlock().getNumElements() })).cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.Psi_t, Nd4j.zeros(new int[] { 1, cb.getTargetSpaceBlock().getNumElements() })));
if (biasCovariatesEnabled) {
mapWorkers(cb -> cb.cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.W_tl, Nd4j.zeros(new int[] { cb.getTargetSpaceBlock().getNumElements(), numLatents })));
}
/* update read depth without taking into account correction from bias covariates */
updateReadDepthPosteriorExpectations(1.0, true);
/* fetch sample covariance matrix */
final int minPCAInitializationReadCount = config.getMinPCAInitializationReadCount();
mapWorkers(cb -> cb.cloneWithPCAInitializationData(minPCAInitializationReadCount, Integer.MAX_VALUE));
cacheWorkers("PCA initialization");
final INDArray targetCovarianceMatrix = mapWorkersAndReduce(CoverageModelEMComputeBlock::calculateTargetCovarianceMatrixForPCAInitialization, INDArray::add);
/* perform eigen-decomposition on the target covariance matrix */
final ImmutablePair<INDArray, INDArray> targetCovarianceEigensystem = CoverageModelEMWorkspaceMathUtils.eig(targetCovarianceMatrix, false, logger);
/* the eigenvalues of sample covariance matrix can be immediately inferred by scaling */
final INDArray sampleCovarianceEigenvalues = targetCovarianceEigensystem.getLeft().div(numSamples);
/* estimate the isotropic unexplained variance -- see Bishop 12.46 */
final int residualDim = numTargets - numLatents;
final double isotropicVariance = sampleCovarianceEigenvalues.get(NDArrayIndex.interval(numLatents, numSamples)).sumNumber().doubleValue() / residualDim;
logger.info(String.format("PCA estimate of isotropic unexplained variance: %f", isotropicVariance));
/* estimate bias factors -- see Bishop 12.45 */
final INDArray scaleFactors = Transforms.sqrt(sampleCovarianceEigenvalues.get(NDArrayIndex.interval(0, numLatents)).sub(isotropicVariance), false);
final INDArray biasCovariatesPCA = Nd4j.create(new int[] { numTargets, numLatents });
for (int li = 0; li < numLatents; li++) {
final INDArray v = targetCovarianceEigensystem.getRight().getColumn(li);
/* calculate [Delta_PCA_st]^T v */
/* note: we do not need to broadcast vec since it is small and lambda capture is just fine */
final INDArray unnormedBiasCovariate = CoverageModelSparkUtils.assembleINDArrayBlocksFromCollection(mapWorkersAndCollect(cb -> ImmutablePair.of(cb.getTargetSpaceBlock(), cb.getINDArrayFromCache(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.Delta_PCA_st).transpose().mmul(v))), 0);
final double norm = unnormedBiasCovariate.norm1Number().doubleValue();
final INDArray normedBiasCovariate = unnormedBiasCovariate.divi(norm).muli(scaleFactors.getDouble(li));
biasCovariatesPCA.getColumn(li).assign(normedBiasCovariate);
}
if (ardEnabled) {
/* a better estimate of ARD coefficients */
biasCovariatesARDCoefficients.assign(Nd4j.zeros(new int[] { 1, numLatents }).addi(config.getInitialARDPrecisionRelativeToNoise() / isotropicVariance));
}
final CoverageModelParameters modelParamsFromPCA = new CoverageModelParameters(processedTargetList, Nd4j.zeros(new int[] { 1, numTargets }), Nd4j.zeros(new int[] { 1, numTargets }).addi(isotropicVariance), biasCovariatesPCA, biasCovariatesARDCoefficients);
/* clear PCA initialization data from workers */
mapWorkers(CoverageModelEMComputeBlock::cloneWithRemovedPCAInitializationData);
/* push model parameters to workers */
initializeWorkersWithGivenModel(modelParamsFromPCA);
/* update bias latent posterior expectations without admixing */
updateBiasLatentPosteriorExpectations(1.0);
}
use of org.apache.commons.math3.stat.descriptive.moment.Variance in project gatk-protected by broadinstitute.
the class CoverageModelParameters method write.
/**
* Writes the model to disk.
*
* @param outputPath model output path
*/
public static void write(@Nonnull CoverageModelParameters model, @Nonnull final String outputPath) {
/* create output directory if it doesn't exist */
createOutputPath(Utils.nonNull(outputPath, "The output path string must be non-null"));
/* write targets list */
final File targetListFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_LIST_OUTPUT_FILE);
TargetWriter.writeTargetsToFile(targetListFile, model.getTargetList());
final List<String> targetNames = model.getTargetList().stream().map(Target::getName).collect(Collectors.toList());
/* write target mean bias to file */
final File targetMeanBiasFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_MEAN_LOG_BIAS_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getTargetMeanLogBias().transpose(), targetMeanBiasFile, MEAN_LOG_BIAS_MATRIX_NAME, targetNames, null);
/* write target unexplained variance to file */
final File targetUnexplainedVarianceFile = new File(outputPath, CoverageModelGlobalConstants.TARGET_UNEXPLAINED_VARIANCE_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getTargetUnexplainedVariance().transpose(), targetUnexplainedVarianceFile, TARGET_UNEXPLAINED_VARIANCE_MATRIX_NAME, targetNames, null);
if (model.isBiasCovariatesEnabled()) {
/* write mean bias covariates to file */
final List<String> meanBiasCovariatesNames = IntStream.range(0, model.getNumLatents()).mapToObj(li -> String.format(BIAS_COVARIATE_COLUMN_NAME_FORMAT, li)).collect(Collectors.toList());
final File meanBiasCovariatesFile = new File(outputPath, CoverageModelGlobalConstants.MEAN_BIAS_COVARIATES_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getMeanBiasCovariates(), meanBiasCovariatesFile, MEAN_BIAS_COVARIATES_MATRIX_NAME, targetNames, meanBiasCovariatesNames);
/* write norm_2 of mean bias covariates to file */
final INDArray WTW = model.getMeanBiasCovariates().transpose().mmul(model.getMeanBiasCovariates());
final double[] biasCovariatesNorm2 = IntStream.range(0, model.getNumLatents()).mapToDouble(li -> WTW.getDouble(li, li)).toArray();
final File biasCovariatesNorm2File = new File(outputPath, CoverageModelGlobalConstants.MEAN_BIAS_COVARIATES_NORM2_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(Nd4j.create(biasCovariatesNorm2, new int[] { 1, model.getNumLatents() }), biasCovariatesNorm2File, MEAN_BIAS_COVARIATES_NORM_2_MATRIX_NAME, null, meanBiasCovariatesNames);
/* if ARD is enabled, write the ARD coefficients and covariance of W as well */
if (model.isARDEnabled()) {
final File biasCovariatesARDCoefficientsFile = new File(outputPath, CoverageModelGlobalConstants.BIAS_COVARIATES_ARD_COEFFICIENTS_OUTPUT_FILE);
Nd4jIOUtils.writeNDArrayMatrixToTextFile(model.getBiasCovariateARDCoefficients(), biasCovariatesARDCoefficientsFile, BIAS_COVARIATES_ARD_COEFFICIENTS_MATRIX_NAME, null, meanBiasCovariatesNames);
}
}
}
Aggregations