use of org.apache.spark.mllib.linalg.distributed.RowMatrix in project gatk by broadinstitute.
the class SparkSingularValueDecomposer method createSVD.
/**
 * Create an SVD of the given matrix using the given Java Spark Context.
 *
 * @param realMat the target matrix. Never {@code null}
 * @return never {@code null}
 */
@Override
public SVD createSVD(final RealMatrix realMat) {
    Utils.nonNull(realMat, "Cannot perform Spark MLLib SVD on a null matrix.");
    final RowMatrix mat = SparkConverter.convertRealMatrixToSparkRowMatrix(sc, realMat, NUM_SLICES);

    // Compute all of the singular values and corresponding singular vectors.
    final SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD((int) mat.numCols(), true, 1.0E-9d);

    // Get our distributed results.
    final RowMatrix u = svd.U();
    final Vector s = svd.s();
    final Matrix v = svd.V().transpose();

    // Move the matrices from Spark/distributed space to Apache Commons space.
    logger.info("Converting distributed Spark matrix to local matrix...");
    final RealMatrix uReal = SparkConverter.convertSparkRowMatrixToRealMatrix(u, realMat.getRowDimension());
    logger.info("Done converting distributed Spark matrix to local matrix...");
    logger.info("Converting Spark matrix to local matrix...");
    final RealMatrix vReal = SparkConverter.convertSparkMatrixToRealMatrix(v);
    logger.info("Done converting Spark matrix to local matrix...");
    final double[] singularValues = s.toArray();

    logger.info("Calculating the pseudoinverse...");
    logger.info("Pinv: calculating tolerance...");
    // Note that the pinv of realMat is V * invS * U'.
    final double tolerance = Math.max(realMat.getColumnDimension(), realMat.getRowDimension()) * realMat.getNorm() * EPS;
    logger.info("Pinv: inverting the singular values (with tolerance) and creating a diagonal matrix...");
    final double[] invS = Arrays.stream(singularValues).map(sv -> invertSVWithTolerance(sv, tolerance)).toArray();
    final Matrix invSMat = Matrices.diag(Vectors.dense(invS));
    logger.info("Pinv: Multiplying V * invS * U' to get the pinv (using pinv transpose = U * invS' * V') ...");
    final RowMatrix pinvT = u.multiply(invSMat).multiply(v);
    logger.info("Pinv: Converting back to local matrix ...");
    final RealMatrix pinv = SparkConverter.convertSparkRowMatrixToRealMatrix(pinvT, realMat.getRowDimension()).transpose();
    logger.info("Done calculating the pseudoinverse and converting it...");
    return new SimpleSVD(uReal, singularValues, vReal, pinv);
}
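For comparison, the pseudoinverse assembled by hand above (pinv = V * invS * U') can also be computed locally with Apache Commons Math. A minimal sketch, separate from the GATK code, with an arbitrary example matrix:

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.SingularValueDecomposition;

public class LocalPinvSketch {
    public static void main(final String[] args) {
        // Small example matrix; the values are arbitrary.
        final RealMatrix a = new Array2DRowRealMatrix(new double[][] {
                {1.0, 2.0},
                {3.0, 4.0},
                {5.0, 6.0}
        });
        // Commons Math computes A = U * S * V^T locally.
        final SingularValueDecomposition svd = new SingularValueDecomposition(a);
        // getSolver().getInverse() returns the Moore-Penrose pseudoinverse
        // V * S^-1 * U^T, the same quantity the Spark code builds by hand.
        final RealMatrix pinv = svd.getSolver().getInverse();
        System.out.println(pinv);
    }
}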
use of org.apache.spark.mllib.linalg.distributed.RowMatrix in project gatk by broadinstitute.
the class PCATangentNormalizationUtils method tangentNormalizeSpark.
/**
 * Tangent normalize the given raw PoN data using Spark: the code here is a little more complex for optimization purposes.
 *
 * Please see notes in docs/PoN ...
 *
 * Ahat^T = (C^T P^T) A^T
 * Therefore, C^T is the RowMatrix
 *
 * pinv: P
 * panel: A
 * projection: Ahat
 * cases: C
 * betahat: C^T P^T
 * tangentNormalizedCounts: C - Ahat
 */
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts,
                                                                   final RealMatrix reducedPanelCounts,
                                                                   final RealMatrix reducedPanelPInvCounts,
                                                                   final CaseToPoNTargetMapper targetMapper,
                                                                   final RealMatrix tangentNormalizationInputCounts,
                                                                   final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix).
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

    // Spark local matrices (transposed).
    final Matrix pinvTLocalMat = new DenseMatrix(reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(),
            Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(),
            Doubles.concat(reducedPanelCounts.getData()), true).transpose();

    // Calculate the projection transpose in a distributed matrix, then convert to an Apache Commons matrix (not transposed).
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(projectionTDistMat,
            tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();

    // Subtract the projection from the cases.
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);

    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(betahatDistMat, tangentNormalizedCounts.getColumnDimension());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
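The algebra spelled out in the javadoc can be checked without Spark. A minimal Apache Commons Math sketch of the same computation (dimensions and values are illustrative; betahat here is the transposed-back form the method ultimately returns):

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

// Panel A: targets x PoN samples; pinv P: PoN samples x targets; cases C: targets x case samples.
final RealMatrix a = new Array2DRowRealMatrix(new double[][] {{1.0, 0.0}, {0.0, 1.0}, {1.0, 1.0}});
final RealMatrix p = new Array2DRowRealMatrix(new double[][] {{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}});
final RealMatrix c = new Array2DRowRealMatrix(new double[][] {{2.0}, {3.0}, {4.0}});

// betahat^T = C^T P^T, so betahat = P * C.
final RealMatrix betahat = p.multiply(c);
// Ahat^T = (C^T P^T) A^T, so Ahat = A * P * C.
final RealMatrix projection = a.multiply(betahat);
// tangentNormalizedCounts = C - Ahat.
final RealMatrix tangentNormalizedCounts = c.subtract(projection);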
use of org.apache.spark.mllib.linalg.distributed.RowMatrix in project gatk-protected by broadinstitute.
the class SparkConverter method convertRealMatrixToSparkRowMatrix.
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix. Never {@code null}
 * @param numSlices the number of partitions to use when distributing the rows
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double[][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double[] row : dataArray) {
        rowsList.add(Vectors.dense(row));
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace / 100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
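A minimal usage sketch; the local-mode context setup is illustrative, assuming SparkConverter is on the classpath:

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.distributed.RowMatrix;

public class SparkConverterUsageSketch {
    public static void main(final String[] args) {
        // Local-mode context for illustration only.
        final SparkConf conf = new SparkConf().setAppName("converter-sketch").setMaster("local[2]");
        final JavaSparkContext sc = new JavaSparkContext(conf);
        final RealMatrix realMat = new Array2DRowRealMatrix(new double[][] {
                {1.0, 2.0},
                {3.0, 4.0}
        });
        // Distribute the two rows across two partitions.
        final RowMatrix distMat = SparkConverter.convertRealMatrixToSparkRowMatrix(sc, realMat, 2);
        System.out.println(distMat.numRows() + " x " + distMat.numCols());
        sc.stop();
    }
}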
use of org.apache.spark.mllib.linalg.distributed.RowMatrix in project gatk-protected by broadinstitute.
the class PCATangentNormalizationUtils method tangentNormalizeSpark.
/**
 * Tangent normalize the given raw PoN data using Spark: the code here is a little more complex for optimization purposes.
 *
 * Please see notes in docs/PoN ...
 *
 * Ahat^T = (C^T P^T) A^T
 * Therefore, C^T is the RowMatrix
 *
 * pinv: P
 * panel: A
 * projection: Ahat
 * cases: C
 * betahat: C^T P^T
 * tangentNormalizedCounts: C - Ahat
 */
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts,
                                                                   final RealMatrix reducedPanelCounts,
                                                                   final RealMatrix reducedPanelPInvCounts,
                                                                   final CaseToPoNTargetMapper targetMapper,
                                                                   final RealMatrix tangentNormalizationInputCounts,
                                                                   final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix).
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

    // Spark local matrices (transposed).
    final Matrix pinvTLocalMat = new DenseMatrix(reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(),
            Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(),
            Doubles.concat(reducedPanelCounts.getData()), true).transpose();

    // Calculate the projection transpose in a distributed matrix, then convert to an Apache Commons matrix (not transposed).
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(projectionTDistMat,
            tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();

    // Subtract the projection from the cases.
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);

    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(betahatDistMat, tangentNormalizedCounts.getColumnDimension());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
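The DenseMatrix construction above packs Apache Commons row-major data into a Spark local matrix; a minimal sketch of just that step (values are illustrative):

import com.google.common.primitives.Doubles;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.spark.mllib.linalg.DenseMatrix;
import org.apache.spark.mllib.linalg.Matrix;

final RealMatrix realMat = new Array2DRowRealMatrix(new double[][] {
        {1.0, 2.0, 3.0},
        {4.0, 5.0, 6.0}
});
// getData() is row-major (one array per row); Doubles.concat flattens it, and the
// trailing 'true' tells Spark the flat array is row-major ("transposed" relative
// to Spark's column-major default).
final Matrix localMat = new DenseMatrix(realMat.getRowDimension(), realMat.getColumnDimension(),
        Doubles.concat(realMat.getData()), true);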
use of org.apache.spark.mllib.linalg.distributed.RowMatrix in project gatk by broadinstitute.
the class SparkConverter method convertRealMatrixToSparkRowMatrix.
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix. Never {@code null}
 * @param numSlices the number of partitions to use when distributing the rows
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double[][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double[] row : dataArray) {
        rowsList.add(Vectors.dense(row));
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace / 100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
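Together with the companion convertSparkRowMatrixToRealMatrix seen in the snippets above, the conversion round-trips. A short sketch, reusing the sc and realMat from the usage example earlier:

// Apache Commons RealMatrix -> Spark RowMatrix -> Apache Commons RealMatrix.
final RowMatrix distMat = SparkConverter.convertRealMatrixToSparkRowMatrix(sc, realMat, 2);
// The second argument supplies the expected row count, matching the call sites above.
final RealMatrix roundTripped = SparkConverter.convertSparkRowMatrixToRealMatrix(distMat, realMat.getRowDimension());
// roundTripped contains the same values as realMat.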