Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.
The class HDF5PCACoveragePoNCreationUtilsUnitTest, method readCountOnlyWithDiverseShapeData:
@DataProvider(name = "readCountOnlyWithDiverseShapeData")
public Object[][] readCountOnlyWithDiverseShapeData() {
    final List<Object[]> result = new ArrayList<>(4);
    final Random rdn = new Random(31);
    final int[] columnCounts = new int[] { 10, 100, 100, 200 };
    final int[] targetCounts = new int[] { 100, 100, 200, 200 };
    for (int k = 0; k < columnCounts.length; k++) {
        final List<String> columnNames = IntStream.range(0, columnCounts[k])
                .mapToObj(i -> "sample_" + (i + 1))
                .collect(Collectors.toList());
        final List<Target> targets = IntStream.range(0, targetCounts[k])
                .mapToObj(i -> new Target("target_" + (i + 1)))
                .collect(Collectors.toList());
        final double[][] counts = new double[targetCounts[k]][columnCounts[k]];
        for (int i = 0; i < counts.length; i++) {
            for (int j = 0; j < counts[0].length; j++) {
                counts[i][j] = rdn.nextDouble();
            }
        }
        final ReadCountCollection readCounts = new ReadCountCollection(targets, columnNames, new Array2DRowRealMatrix(counts, false));
        result.add(new Object[] { readCounts });
    }
    return result.toArray(new Object[result.size()][]);
}
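TestNG wires a provider like this to any test annotated with its name. The following is a minimal sketch of a hypothetical consuming test; the method name and its assertions are illustrative and not part of the original class:

@Test(dataProvider = "readCountOnlyWithDiverseShapeData")
public void testReadCountShape(final ReadCountCollection readCounts) {
    // Each invocation receives one of the four randomly filled collections built above.
    Assert.assertEquals(readCounts.counts().getRowDimension(), readCounts.targets().size());
    Assert.assertEquals(readCounts.counts().getColumnDimension(), readCounts.columnNames().size());
}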
Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.
The class SparkGenomeReadCountsIntegrationTest, method testSparkGenomeReadCountsInterval:
@Test
public void testSparkGenomeReadCountsInterval() {
    final File outputFile = createTempFile(BAM_FILE.getName(), ".cov");
    final String[] arguments = {
            "--disableSequenceDictionaryValidation",
            "-" + StandardArgumentDefinitions.REFERENCE_SHORT_NAME, REFERENCE_FILE.getAbsolutePath(),
            "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, BAM_FILE.getAbsolutePath(),
            "-" + SparkGenomeReadCounts.OUTPUT_FILE_SHORT_NAME, outputFile.getAbsolutePath(),
            "-" + SparkGenomeReadCounts.BINSIZE_SHORT_NAME, "10000",
            "-L", "1"
    };
    runCommandLine(arguments);
    // proportional coverage
    final ReadCountCollection proportionalCoverage = loadReadCountCollection(outputFile);
    Assert.assertTrue(proportionalCoverage.records().stream().noneMatch(t -> t.getContig().equals("2") || t.getContig().equals("3")));
    // raw coverage
    final ReadCountCollection rawCoverage = loadReadCountCollection(new File(outputFile.getAbsolutePath() + SparkGenomeReadCounts.RAW_COV_OUTPUT_EXTENSION));
    Assert.assertTrue(rawCoverage.records().stream().noneMatch(t -> t.getContig().equals("2") || t.getContig().equals("3")));
    // binned targets
    final File targetsFile = new File(outputFile.getAbsolutePath() + ".targets.tsv");
    final List<Target> targets = TargetTableReader.readTargetFile(targetsFile);
    Assert.assertTrue(targets.stream().allMatch(t -> t.getContig().equals("1")));
}
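The run above is expected to leave three files next to the requested output path. The short sketch below simply names them; the naming is derived from the assertions in this test rather than from the tool's documentation:

// Proportional coverage, written at the requested output path and read back via loadReadCountCollection above.
final File proportionalCoverageFile = outputFile;
// Raw coverage, written with the tool's raw-coverage extension appended to the output path.
final File rawCoverageFile = new File(outputFile.getAbsolutePath() + SparkGenomeReadCounts.RAW_COV_OUTPUT_EXTENSION);
// The binned targets, written as a ".targets.tsv" alongside the output and read back with TargetTableReader.
final File binnedTargetsFile = new File(outputFile.getAbsolutePath() + ".targets.tsv");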
Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.
The class HDF5PCACoveragePoNCreationUtilsUnitTest, method testSubsetTargetToUsableOnes:
@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final double[] targetMedians = IntStream.range(0, counts.getRowDimension())
            .mapToDouble(i -> median.evaluate(counts.getRow(i)))
            .toArray();
    final double threshold = new Percentile(percentile).evaluate(targetMedians);
    final Boolean[] toBeKept = DoubleStream.of(targetMedians)
            .mapToObj(d -> d >= threshold)
            .toArray(Boolean[]::new);
    final int toBeKeptCount = (int) Stream.of(toBeKept).filter(b -> b).count();
    final Pair<ReadCountCollection, double[]> result = HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), toBeKeptCount);
    Assert.assertEquals(result.getRight().length, toBeKeptCount);
    int nextIndex = 0;
    for (int i = 0; i < toBeKept.length; i++) {
        if (toBeKept[i]) {
            final int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(result.getLeft().targets().indexOf(readCount.targets().get(i)), -1);
        }
    }
}
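The assertions above pin down the contract of subsetReadCountsToUsableTargets: the left element of the returned pair holds only the targets whose median count reaches the requested percentile, in their original order, and the right element holds the matching per-target medians. A minimal usage sketch restating that contract (the local variable names are illustrative):

final Pair<ReadCountCollection, double[]> usable =
        HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
final ReadCountCollection usableCounts = usable.getLeft();   // targets at or above the percentile cutoff, original order preserved
final double[] usableTargetMedians = usable.getRight();      // per-target medians for exactly those kept targets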
Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.
The class PCATangentNormalizationUtilsUnitTest, method normalizeReadCountByTargetFactorsData:
@DataProvider(name = "normalizeReadCountByTargetFactorsData")
public Object[][] normalizeReadCountByTargetFactorsData() {
    final List<Object[]> result = new ArrayList<>(1);
    @SuppressWarnings("serial")
    final List<Target> targets = new ArrayList<Target>() {{
        add(new Target("A"));
        add(new Target("B"));
        add(new Target("C"));
    }};
    @SuppressWarnings("serial")
    final List<String> columnNames = new ArrayList<String>() {{
        add("1");
        add("2");
        add("3");
    }};
    result.add(new Object[] {
            new ReadCountCollection(targets, columnNames,
                    new Array2DRowRealMatrix(new double[][] {
                            new double[] { 1.1, 2.2, 3.3 },
                            new double[] { 0.1, 0.2, 0.3 },
                            new double[] { 11.1, 22.2, 33.3 }
                    }, false)),
            new double[] { 100.0, 200.0, 300.0 }
    });
    return result.toArray(new Object[1][]);
}
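This provider pairs a 3 x 3 count matrix with one factor per target (A, B, C). The consuming test presumably checks that each target's row is divided by its factor; the sketch below shows that assumed arithmetic, so it is an inference from the provider's data rather than the utility's verified behavior (readCounts and targetFactors stand for the two entries of the provider row above):

// Assumed normalization: each count divided by its target's factor.
// Target "A": { 1.1, 2.2, 3.3 }    / 100.0 -> { 0.011, 0.022, 0.033 }
// Target "B": { 0.1, 0.2, 0.3 }    / 200.0 -> { 0.0005, 0.001, 0.0015 }
// Target "C": { 11.1, 22.2, 33.3 } / 300.0 -> { 0.037, 0.074, 0.111 }
final RealMatrix normalized = readCounts.counts().copy();
for (int i = 0; i < normalized.getRowDimension(); i++) {
    normalized.setRowVector(i, normalized.getRowVector(i).mapDivide(targetFactors[i]));
}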
Use of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in project gatk-protected by broadinstitute.
The class PCATangentNormalizationUtils, method tangentNormalizeSpark:
/**
 * Tangent normalize given the raw PoN data using Spark: the code here is a little more complex for optimization purposes.
 *
 * Please see notes in docs/PoN ...
 *
 * Ahat^T = (C^T P^T) A^T
 * Therefore, C^T is the RowMatrix
 *
 * pinv: P
 * panel: A
 * projection: Ahat
 * cases: C
 * betahat: C^T P^T
 * tangentNormalizedCounts: C - Ahat
 */
private static PCATangentNormalizationResult tangentNormalizeSpark(final ReadCountCollection targetFactorNormalizedCounts,
                                                                   final RealMatrix reducedPanelCounts,
                                                                   final RealMatrix reducedPanelPInvCounts,
                                                                   final CaseToPoNTargetMapper targetMapper,
                                                                   final RealMatrix tangentNormalizationInputCounts,
                                                                   final JavaSparkContext ctx) {
    // Make the C^T a distributed matrix (RowMatrix)
    final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);
    // Spark local matrices (transposed)
    final Matrix pinvTLocalMat = new DenseMatrix(reducedPanelPInvCounts.getRowDimension(), reducedPanelPInvCounts.getColumnDimension(),
            Doubles.concat(reducedPanelPInvCounts.getData()), true).transpose();
    final Matrix panelTLocalMat = new DenseMatrix(reducedPanelCounts.getRowDimension(), reducedPanelCounts.getColumnDimension(),
            Doubles.concat(reducedPanelCounts.getData()), true).transpose();
    // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons matrix (not transposed)
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix(projectionTDistMat,
            tangentNormalizationInputCounts.transpose().getRowDimension()).transpose();
    // Subtract the projection from the cases
    final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);
    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection(tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix(betahatDistMat, tangentNormalizedCounts.getColumnDimension());
    return new PCATangentNormalizationResult(tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts);
}
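For reference, the algebra in the comment block reduces to two matrix products when no Spark distribution is needed. The following is a minimal non-Spark sketch using Apache Commons Math, under the assumption that reducedPanelCounts (A) is targets x eigensamples, reducedPanelPInvCounts (P) is eigensamples x targets, and tangentNormalizationInputCounts (C) is targets x cases:

// betahat^T = P * C (eigensamples x cases), matching "betahat: C^T P^T" above.
final RealMatrix betahatT = reducedPanelPInvCounts.multiply(tangentNormalizationInputCounts);
// Ahat = A * (P * C) (targets x cases), i.e. the transpose of Ahat^T = (C^T P^T) A^T.
final RealMatrix projection = reducedPanelCounts.multiply(betahatT);
// tangentNormalizedCounts = C - Ahat, exactly as in the Spark version's final subtraction.
final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection);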