use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.
the class PCATangentNormalizationUtils method tangentNormalizeSpark.
/**
 * Tangent normalize given the raw PoN data using Spark. The code here is a little more involved than the non-Spark version because, for performance, the projection is computed on transposed matrices.
*
* Please see notes in docs/PoN ...
*
 * Ahat = A (P C), so Ahat^T = (C^T P^T) A^T.
 * Therefore, C^T is the distributed RowMatrix, and P^T and A^T are local matrices.
*
* pinv: P
* panel: A
* projection: Ahat
* cases: C
* betahat: C^T P^T
* tangentNormalizedCounts: C - Ahat
*/
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts, final RealMatrix reducedPanelCounts, final RealMatrix reducedPanelPInvCounts, final CaseToPoNTargetMapper targetMapper, final RealMatrix tangentNormalizationInputCounts, final JavaSparkContext ctx) {
    // Make C^T a distributed matrix (RowMatrix).
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);
    // Spark local matrices (transposed).
    final Matrix pinvTLocalMat = new DenseMatrix(reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(), Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(), Doubles.concat(reducedPanelCounts.getData()), true).transpose();
    // Calculate the transposed projection as a distributed matrix, then convert back to an (untransposed) Apache Commons matrix.
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(projectionTDistMat, tangentNormalizationInputCounts.getColumnDimension()).transpose();
    // Subtract the projection from the cases.
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);
    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(betahatDistMat, tangentNormalizedCounts.getColumnDimension());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
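To make the transposed bookkeeping concrete, here is a minimal non-distributed sketch of the same projection using Apache Commons Math. All class names, matrices, and values below are illustrative assumptions, not taken from the GATK code.

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.SingularValueDecomposition;

public final class TangentProjectionSketch {
    public static void main(String[] args) {
        // Toy dimensions: 4 targets, 2 panel eigensamples, 3 case samples (all values illustrative).
        final RealMatrix panel = new Array2DRowRealMatrix(new double[][] {
                {1, 0}, {0, 1}, {1, 1}, {2, 1}});                  // A: targets x eigensamples
        final RealMatrix pinv = new SingularValueDecomposition(panel)
                .getSolver().getInverse();                         // P: pseudo-inverse of A
        final RealMatrix cases = new Array2DRowRealMatrix(new double[][] {
                {1, 2, 3}, {4, 5, 6}, {5, 7, 9}, {6, 9, 12}});     // C: targets x samples

        // Direct form: betahat = P C, Ahat = A betahat, result = C - Ahat.
        final RealMatrix projection = panel.multiply(pinv.multiply(cases));
        final RealMatrix tangentNormalized = cases.subtract(projection);

        // Transposed form used by the Spark code path: Ahat^T = (C^T P^T) A^T.
        final RealMatrix projectionT = cases.transpose()
                .multiply(pinv.transpose())
                .multiply(panel.transpose());

        // The two forms agree up to floating point (difference norm is ~0).
        System.out.println(projection.subtract(projectionT.transpose()).getNorm());
        System.out.println(tangentNormalized.getRowDimension() + " x " + tangentNormalized.getColumnDimension());
    }
}

The transposed form exists so that the large case matrix can stay distributed as a Spark RowMatrix while the (small) pseudo-inverse and panel matrices remain local.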
use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.
the class PCATangentNormalizationUtils method tangentNormalizeNonSpark.
/**
* Tangent normalize given the raw PoN data without using Spark.
*/
private static PCATangentNormalizationResult tangentNormalizeNonSpark(final ReadCountCollection targetFactorNormalizedCounts, final RealMatrix reducedPanelCounts, final RealMatrix reducedPanelPInvCounts, final CaseToPoNTargetMapper targetMapper, final RealMatrix tangentNormalizationInputCounts) {
    // Calculate the beta-hats for the input read-count columns (samples).
    logger.info("Calculating beta hats...");
    final RealMatrix tangentBetaHats = calculateBetaHats(reducedPanelPInvCounts, tangentNormalizationInputCounts, EPSILON);
    // The actual tangent-normalization step.
    logger.info("Performing actual tangent normalization (" + tangentNormalizationInputCounts.getColumnDimension() + " columns)...");
    final RealMatrix tangentNormalizedCounts = tangentNormalize(reducedPanelCounts, tangentNormalizationInputCounts, tangentBetaHats);
    // Output the tangent-normalized counts.
    logger.info("Post-processing tangent normalization results...");
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts);
}
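calculateBetaHats and tangentNormalize themselves are not shown on this page. Judging only from the equations in the Spark variant's javadoc, they should amount to a pseudo-inverse multiplication and a subtraction; the hypothetical sketch below ignores the EPSILON argument and any validation the real helpers perform.

import org.apache.commons.math3.linear.RealMatrix;

// Hypothetical re-implementations of the two helpers, inferred from the
// equations betahat = P C and Ahat = A betahat; not the actual GATK code.
final class TangentHelpersSketch {
    static RealMatrix calculateBetaHats(final RealMatrix pinv, final RealMatrix input) {
        return pinv.multiply(input);                     // P C: eigensamples x samples
    }

    static RealMatrix tangentNormalize(final RealMatrix panel, final RealMatrix input, final RealMatrix betaHats) {
        return input.subtract(panel.multiply(betaHats)); // C - A (P C)
    }
}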
use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method testSubtractMedianOfMedians.
@Test(dataProvider = "readCountOnlyData")
public void testSubtractMedianOfMedians(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
            .mapToDouble(i -> median.evaluate(counts.getColumn(i)))
            .toArray();
    final double center = median.evaluate(columnMedians);
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] -= center;
        }
    }
    HDF5PCACoveragePoNCreationUtils.subtractMedianOfMedians(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}
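For reference, the centering that this test recomputes by hand can be written as a standalone method. The sketch below is a hypothetical reconstruction of what HDF5PCACoveragePoNCreationUtils.subtractMedianOfMedians does to the count matrix, inferred from the test's expectations rather than taken from the actual implementation.

import java.util.stream.IntStream;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.rank.Median;

final class MedianOfMediansSketch {
    // Subtract the median of the per-column (per-sample) medians from every
    // entry, mirroring the expectation computed in the test above.
    static void subtractMedianOfMedians(final RealMatrix counts) {
        final Median median = new Median();
        final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
                .mapToDouble(i -> median.evaluate(counts.getColumn(i)))
                .toArray();
        final double center = median.evaluate(columnMedians);
        counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(final int row, final int column, final double value) {
                return value - center;
            }
        });
    }
}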
use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method simpleEigensampleData.
@DataProvider(name = "singleEigensample")
public Object[][] simpleEigensampleData() {
    final List<Object[]> result = new ArrayList<>();
    final int NUM_TARGETS = 10;
    final int NUM_SAMPLES = 5;
    final List<Target> targets = IntStream.range(0, NUM_TARGETS).boxed()
            .map(i -> new Target("target_" + i, new SimpleInterval("1", 100 * i + 1, 100 * i + 5)))
            .collect(Collectors.toList());
    final List<String> columnNames = IntStream.range(0, NUM_SAMPLES).boxed()
            .map(i -> "sample_" + i)
            .collect(Collectors.toList());
    final double[][] countsArray = new double[NUM_TARGETS][NUM_SAMPLES];
    final RealMatrix counts = new Array2DRowRealMatrix(countsArray);
    // Every row holds the same data (0, 1, 2, 3, 4), so the matrix has rank 1.
    final double[] rowData = IntStream.range(0, NUM_SAMPLES).asDoubleStream().toArray();
    for (int i = 0; i < NUM_TARGETS; i++) {
        counts.setRow(i, rowData);
    }
    result.add(new Object[] { new ReadCountCollection(targets, columnNames, counts) });
    return result.toArray(new Object[result.size()][]);
}
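The provider's name points at the fixture's key property: every row is the same vector, so the count matrix has rank 1 and an SVD-based PoN reduction should retain a single eigensample. An illustrative, self-contained check (class name assumed):

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.SingularValueDecomposition;

public final class SingleEigensampleCheck {
    public static void main(String[] args) {
        // Same shape as the fixture: 10 targets x 5 samples, every row (0, 1, 2, 3, 4).
        final RealMatrix counts = new Array2DRowRealMatrix(10, 5);
        final double[] rowData = {0, 1, 2, 3, 4};
        for (int i = 0; i < counts.getRowDimension(); i++) {
            counts.setRow(i, rowData);
        }
        // Rank 1 means exactly one non-zero singular value: a single eigensample.
        System.out.println(new SingularValueDecomposition(counts).getRank()); // prints 1
    }
}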
use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method testSubtractMedianOfMedians.
@Test(dataProvider = "readCountOnlyData")
public void testSubtractMedianOfMedians(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final double[] columnMedians = IntStream.range(0, counts.getColumnDimension())
            .mapToDouble(i -> median.evaluate(counts.getColumn(i)))
            .toArray();
    final double center = median.evaluate(columnMedians);
    final double[][] expected = new double[counts.getRowDimension()][];
    for (int i = 0; i < expected.length; i++) {
        expected[i] = counts.getRow(i).clone();
        for (int j = 0; j < expected[i].length; j++) {
            expected[i][j] -= center;
        }
    }
    HDF5PCACoveragePoNCreationUtils.subtractMedianOfMedians(readCounts, NULL_LOGGER);
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int i = 0; i < expected.length; i++) {
        for (int j = 0; j < expected[i].length; j++) {
            Assert.assertEquals(newCounts.getEntry(i, j), expected[i][j], 0.000001);
        }
    }
}