Example usage of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in the Broad Institute's GATK project: method testIdentifySamplesWithSuspiciousContigsAmpsWithSpark of class CoveragePoNQCUtilsUnitTest.
/**
 * Spark variant of the suspicious-contig-amplification QC check: the flagged
 * sample list must contain exactly the known-bad samples from the AMP fixture.
 */
@Test
public void testIdentifySamplesWithSuspiciousContigsAmpsWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    // Ground truth: samples the QC check is expected to blacklist.
    final Set<String> expectedBlacklist = new HashSet<>();
    expectedBlacklist.add("sample_1");
    expectedBlacklist.add("sample_2");
    expectedBlacklist.add("sample_3");

    ReadCountCollection coverageProfiles = null;
    try {
        coverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_AMP);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_AMP, ioe);
    }

    // By the time we are here, input is assumed to have been tangent normalized.
    final JavaRDD<ReadCountCollection> perSampleCounts =
            CoveragePoNQCUtils.createParallelIndividualReadCountCollections(coverageProfiles, ctx);
    final List<String> flaggedSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(
            perSampleCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(coverageProfiles));

    // Set equality check: same size and no elements outside the ground truth.
    final Set<String> flaggedSet = new HashSet<>(flaggedSamples);
    Assert.assertEquals(flaggedSet.size(), expectedBlacklist.size());
    Assert.assertEquals(Sets.difference(flaggedSet, expectedBlacklist).size(), 0);
}
Example usage of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in the Broad Institute's GATK project: method testIdentifySamplesWithSuspiciousContigsDels of class CoveragePoNQCUtilsUnitTest.
/**
 * Non-Spark suspicious-contig-deletion QC check: the flagged sample list must
 * contain exactly the known-bad samples from the DEL fixture.
 */
@Test
public void testIdentifySamplesWithSuspiciousContigsDels() {
    // Ground truth: samples the QC check is expected to blacklist.
    final Set<String> expectedBlacklist = new HashSet<>();
    expectedBlacklist.add("sample_1");
    expectedBlacklist.add("sample_2");
    expectedBlacklist.add("sample_3");

    ReadCountCollection coverageProfiles = null;
    try {
        coverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_DEL);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_DEL, ioe);
    }

    // By the time we are here, input is assumed to have been tangent normalized.
    final List<ReadCountCollection> perSampleCounts =
            CoveragePoNQCUtils.createIndividualReadCountCollections(coverageProfiles);
    final List<String> flaggedSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(
            perSampleCounts, CoveragePoNQCUtils.getContigToMedianCRMap(coverageProfiles));

    // Set equality check: same size and no elements outside the ground truth.
    final Set<String> flaggedSet = new HashSet<>(flaggedSamples);
    Assert.assertEquals(flaggedSet.size(), expectedBlacklist.size());
    Assert.assertEquals(Sets.difference(flaggedSet, expectedBlacklist).size(), 0);
}
Example usage of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in the Broad Institute's GATK project: method testNormalizeAndLogReadCounts of class HDF5PCACoveragePoNCreationUtilsUnitTest.
/**
 * Verifies normalizeAndLogReadCounts: each entry is divided by its column
 * median, floored at EPSILON, then log2-transformed in place.
 */
@Test(dataProvider = "readCountOnlyData")
public void testNormalizeAndLogReadCounts(final ReadCountCollection readCounts) {
    final RealMatrix counts = readCounts.counts();
    final Median median = new Median();
    final int numRows = counts.getRowDimension();
    final int numColumns = counts.getColumnDimension();

    // Per-column medians used as normalization denominators.
    final double[] columnMedians = new double[numColumns];
    for (int col = 0; col < numColumns; col++) {
        columnMedians[col] = median.evaluate(counts.getColumn(col));
    }

    // Build the expected matrix: normalize, floor at epsilon, take log base 2.
    final double epsilon = HDF5PCACoveragePoNCreationUtils.EPSILON;
    final double[][] expected = new double[numRows][numColumns];
    for (int row = 0; row < numRows; row++) {
        for (int col = 0; col < numColumns; col++) {
            final double normalized = Math.max(counts.getEntry(row, col) / columnMedians[col], epsilon);
            expected[row][col] = Math.log(normalized) / Math.log(2);
        }
    }

    HDF5PCACoveragePoNCreationUtils.normalizeAndLogReadCounts(readCounts, NULL_LOGGER);

    // The collection is mutated in place; compare entry by entry.
    final RealMatrix newCounts = readCounts.counts();
    Assert.assertEquals(newCounts.getColumnDimension(), expected[0].length);
    Assert.assertEquals(newCounts.getRowDimension(), expected.length);
    for (int row = 0; row < expected.length; row++) {
        for (int col = 0; col < expected[row].length; col++) {
            Assert.assertEquals(newCounts.getEntry(row, col), expected[row][col], 0.000001);
        }
    }
}
Example usage of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in the Broad Institute's GATK project: data-provider method simpleEigensampleData of class HDF5PCACoveragePoNCreationUtilsUnitTest.
/**
 * Data provider producing a single ReadCountCollection of 10 targets x 5 samples
 * in which every row holds the same values (0, 1, 2, 3, 4), i.e. the matrix has
 * a single eigensample.
 *
 * @return a one-element array of test-case argument arrays.
 */
@DataProvider(name = "singleEigensample")
public Object[][] simpleEigensampleData() {
    final List<Object[]> result = new ArrayList<>();
    final int NUM_TARGETS = 10;
    final int NUM_SAMPLES = 5;
    final List<Target> targets = IntStream.range(0, NUM_TARGETS).boxed()
            .map(i -> new Target("target_" + i, new SimpleInterval("1", 100 * i + 1, 100 * i + 5)))
            .collect(Collectors.toList());
    final List<String> columnNames = IntStream.range(0, NUM_SAMPLES).boxed()
            .map(i -> "sample_" + i)
            .collect(Collectors.toList());
    final double[][] countsArray = new double[NUM_TARGETS][NUM_SAMPLES];
    final RealMatrix counts = new Array2DRowRealMatrix(countsArray);
    // All row data is the same (0,1,2,3,4...).
    // asDoubleStream() replaces the original boxed().mapToDouble(i -> i) round trip.
    final double[] rowData = IntStream.range(0, NUM_SAMPLES).asDoubleStream().toArray();
    for (int i = 0; i < NUM_TARGETS; i++) {
        counts.setRow(i, rowData);
    }
    // Removed a dead statement that constructed (and discarded) an extra
    // ReadCountCollection with the same arguments as the one added below.
    result.add(new Object[] { new ReadCountCollection(targets, columnNames, counts) });
    return result.toArray(new Object[result.size()][]);
}
Example usage of org.broadinstitute.hellbender.tools.exome.ReadCountCollection in the Broad Institute's GATK project: method testSubsetTargetToUsableOnes of class HDF5PCACoveragePoNCreationUtilsUnitTest.
/**
 * Verifies subsetReadCountsToUsableTargets: only targets whose row median is at
 * or above the given percentile threshold survive, row order is preserved, and
 * the returned medians align with the kept rows.
 */
@Test(dataProvider = "readCountAndPercentileData")
public void testSubsetTargetToUsableOnes(final ReadCountCollection readCount, final double percentile) {
    final Median median = new Median();
    final RealMatrix counts = readCount.counts();
    final int numTargets = counts.getRowDimension();

    // Per-target (row) medians and the percentile threshold over them.
    final double[] targetMedians = new double[numTargets];
    for (int i = 0; i < numTargets; i++) {
        targetMedians[i] = median.evaluate(counts.getRow(i));
    }
    final double threshold = new Percentile(percentile).evaluate(targetMedians);

    // Expected keep/drop decision for each target (primitive flags, no boxing).
    final boolean[] keep = new boolean[numTargets];
    int expectedKeptCount = 0;
    for (int i = 0; i < numTargets; i++) {
        keep[i] = targetMedians[i] >= threshold;
        if (keep[i]) {
            expectedKeptCount++;
        }
    }

    final Pair<ReadCountCollection, double[]> result =
            HDF5PCACoveragePoNCreationUtils.subsetReadCountsToUsableTargets(readCount, percentile, NULL_LOGGER);
    Assert.assertEquals(result.getLeft().targets().size(), expectedKeptCount);
    Assert.assertEquals(result.getRight().length, expectedKeptCount);

    // Kept targets must appear in the original relative order; dropped ones must be absent.
    int nextIndex = 0;
    for (int i = 0; i < keep.length; i++) {
        final int index = result.getLeft().targets().indexOf(readCount.targets().get(i));
        if (keep[i]) {
            Assert.assertEquals(index, nextIndex++);
            Assert.assertEquals(counts.getRow(i), result.getLeft().counts().getRow(index));
            Assert.assertEquals(result.getRight()[index], targetMedians[i]);
        } else {
            Assert.assertEquals(index, -1);
        }
    }
}
Aggregations