Example 76 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.

The class HDF5PCACoveragePoNCreationUtilsUnitTest, method testCalculateReducedPanelAndPInversesKeepingHalfOfAllColumns.

@Test(dataProvider = "readCountOnlyWithDiverseShapeData")
public void testCalculateReducedPanelAndPInversesKeepingHalfOfAllColumns(final ReadCountCollection readCounts) {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // Ask for a reduced panel that keeps half of the original columns.
    final ReductionResult result = HDF5PCACoveragePoNCreationUtils.calculateReducedPanelAndPInverses(readCounts, OptionalInt.of(readCounts.columnNames().size() / 2), NULL_LOGGER, ctx);
    final RealMatrix counts = readCounts.counts();
    Assert.assertNotNull(result);
    Assert.assertNotNull(result.getPseudoInverse());
    Assert.assertNotNull(result.getReducedCounts());
    Assert.assertNotNull(result.getReducedPseudoInverse());
    Assert.assertNotNull(result.getAllSingularValues());
    Assert.assertEquals(counts.getColumnDimension(), result.getAllSingularValues().length);
    Assert.assertEquals(result.getReducedCounts().getRowDimension(), counts.getRowDimension());
    Assert.assertEquals(result.getReducedCounts().getColumnDimension(), readCounts.columnNames().size() / 2);
    final int eigensamples = result.getReducedCounts().getColumnDimension();
    Assert.assertEquals(eigensamples, readCounts.columnNames().size() / 2);
    assertPseudoInverse(counts, result.getPseudoInverse());
    assertPseudoInverse(result.getReducedCounts(), result.getReducedPseudoInverse());
}
Also used: Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix), RealMatrix (org.apache.commons.math3.linear.RealMatrix), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
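
The helper assertPseudoInverse is defined elsewhere in the test class and is not shown in this excerpt. As a rough sketch (an assumption, not the GATK implementation), such a helper plausibly checks the defining Moore-Penrose identity A * pinv(A) * A = A entry-wise within a tolerance:

private static void assertPseudoInverse(final RealMatrix matrix, final RealMatrix pseudoInverse) {
    // Hypothetical tolerance; the real helper may use a different epsilon.
    final double epsilon = 1e-10;
    // A pseudoinverse must have transposed dimensions relative to A.
    Assert.assertEquals(pseudoInverse.getRowDimension(), matrix.getColumnDimension());
    Assert.assertEquals(pseudoInverse.getColumnDimension(), matrix.getRowDimension());
    // Moore-Penrose condition: A * A+ * A == A.
    final RealMatrix reconstructed = matrix.multiply(pseudoInverse).multiply(matrix);
    for (int row = 0; row < matrix.getRowDimension(); row++) {
        for (int col = 0; col < matrix.getColumnDimension(); col++) {
            Assert.assertEquals(reconstructed.getEntry(row, col), matrix.getEntry(row, col), epsilon);
        }
    }
}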

Example 77 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.

The class CoveragePoNQCUtilsUnitTest, method testIdentifySamplesWithSuspiciousContigsNoSuspiciousSamplesWithSpark.

@Test
public void testIdentifySamplesWithSuspiciousContigsNoSuspiciousSamplesWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_NO_SUSPICIOUS_SAMPLES_FILE);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_NO_SUSPICIOUS_SAMPLES_FILE, ioe);
    }
    final JavaRDD<ReadCountCollection> allSampleTangentNormalizedReadCounts = CoveragePoNQCUtils.createParallelIndividualReadCountCollections(allCoverageProfiles, ctx);
    // By this point, the input is assumed to have been tangent-normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(allSampleTangentNormalizedReadCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));
    Assert.assertEquals(blacklistSamples.size(), 0);
}
Also used: ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), IOException (java.io.IOException), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
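
SparkContextFactory.getTestSparkContext() hands every test the same lightweight Spark context. As an illustrative sketch only (the class name, configuration values, and caching scheme here are assumptions, not GATK's actual settings), such a factory typically creates a shared local-mode JavaSparkContext lazily:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public final class TestSparkContexts {
    private static JavaSparkContext testContext;

    private TestSparkContexts() {}

    public static synchronized JavaSparkContext getTestSparkContext() {
        if (testContext == null) {
            final SparkConf conf = new SparkConf()
                    // Run Spark in-process; "local[2]" gives two worker threads.
                    .setMaster("local[2]")
                    .setAppName("unit-tests")
                    // The web UI is unnecessary overhead for unit tests.
                    .set("spark.ui.enabled", "false");
            testContext = new JavaSparkContext(conf);
        }
        // Reused across tests so the suite pays the startup cost only once.
        return testContext;
    }
}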

Example 78 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk-protected by broadinstitute.

The class CoveragePoNQCUtilsUnitTest, method testIdentifySamplesWithSuspiciousContigsDelsWithSpark.

@Test
public void testIdentifySamplesWithSuspiciousContigsDelsWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // Ground-truth set of samples the QC check is expected to flag.
    final Set<String> gtBlacklistSamples = new HashSet<>();
    gtBlacklistSamples.add("sample_1");
    gtBlacklistSamples.add("sample_2");
    gtBlacklistSamples.add("sample_3");
    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_DEL);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_DEL, ioe);
    }
    final JavaRDD<ReadCountCollection> allSampleTangentNormalizedReadCounts = CoveragePoNQCUtils.createParallelIndividualReadCountCollections(allCoverageProfiles, ctx);
    // By this point, the input is assumed to have been tangent-normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(allSampleTangentNormalizedReadCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));
    final Set<String> resultSamples = new HashSet<>(blacklistSamples);
    Assert.assertEquals(resultSamples.size(), gtBlacklistSamples.size());
    Assert.assertEquals(Sets.difference(resultSamples, gtBlacklistSamples).size(), 0);
}
Also used: ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), IOException (java.io.IOException), HashSet (java.util.HashSet), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
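
getContigToMedianCRMap is not shown here; its name suggests it maps each contig to the median copy ratio observed across that contig's targets. A hypothetical sketch of that computation (the grouping of copy ratios by contig is assumed to happen upstream; this is not the GATK implementation):

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.math3.stat.descriptive.rank.Median;

final class ContigMedians {
    // Given copy ratios already grouped by contig name, compute the median per contig.
    static Map<String, Double> contigToMedian(final Map<String, List<Double>> copyRatiosByContig) {
        final Median median = new Median();
        return copyRatiosByContig.entrySet().stream()
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        entry -> median.evaluate(entry.getValue().stream()
                                .mapToDouble(Double::doubleValue)
                                .toArray())));
    }
}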

Example 79 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk-protected by broadinstitute.

The class CoveragePoNQCUtilsUnitTest, method testIdentifySamplesWithSuspiciousContigsNoSuspiciousSamplesWithSpark.

@Test
public void testIdentifySamplesWithSuspiciousContigsNoSuspiciousSamplesWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_NO_SUSPICIOUS_SAMPLES_FILE);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_NO_SUSPICIOUS_SAMPLES_FILE, ioe);
    }
    final JavaRDD<ReadCountCollection> allSampleTangentNormalizedReadCounts = CoveragePoNQCUtils.createParallelIndividualReadCountCollections(allCoverageProfiles, ctx);
    // By this point, the input is assumed to have been tangent-normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(allSampleTangentNormalizedReadCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));
    Assert.assertEquals(blacklistSamples.size(), 0);
}
Also used: ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), IOException (java.io.IOException), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
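
createParallelIndividualReadCountCollections fans a multi-sample collection out into one single-sample collection per column and distributes the pieces as an RDD. A minimal sketch of that pattern; the subsetColumns(...) slicing call is an assumption about the ReadCountCollection API, not a confirmed method:

import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.tools.exome.ReadCountCollection;

final class PerSampleFanOut {
    static JavaRDD<ReadCountCollection> toPerSampleRdd(final ReadCountCollection all,
                                                       final JavaSparkContext ctx) {
        // One single-column collection per sample; subsetColumns(...) is an
        // assumed slicing method, hedged as hypothetical.
        final List<ReadCountCollection> perSample = all.columnNames().stream()
                .map(sample -> all.subsetColumns(Collections.singleton(sample)))
                .collect(Collectors.toList());
        // Distribute the per-sample pieces so QC can run on them in parallel.
        return ctx.parallelize(perSample);
    }
}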

Example 80 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project gatk-protected by broadinstitute.

The class CoveragePoNQCUtilsUnitTest, method testIdentifySamplesWithSuspiciousContigsAmpsWithSpark.

@Test
public void testIdentifySamplesWithSuspiciousContigsAmpsWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // Ground-truth set of samples the QC check is expected to flag.
    final Set<String> gtBlacklistSamples = new HashSet<>();
    gtBlacklistSamples.add("sample_1");
    gtBlacklistSamples.add("sample_2");
    gtBlacklistSamples.add("sample_3");
    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_AMP);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_AMP, ioe);
    }
    final JavaRDD<ReadCountCollection> allSampleTangentNormalizedReadCounts = CoveragePoNQCUtils.createParallelIndividualReadCountCollections(allCoverageProfiles, ctx);
    // By this point, the input is assumed to have been tangent-normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(allSampleTangentNormalizedReadCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));
    final Set<String> resultSamples = new HashSet<>(blacklistSamples);
    Assert.assertEquals(resultSamples.size(), gtBlacklistSamples.size());
    Assert.assertEquals(Sets.difference(resultSamples, gtBlacklistSamples).size(), 0);
}
Also used: ReadCountCollection (org.broadinstitute.hellbender.tools.exome.ReadCountCollection), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), IOException (java.io.IOException), HashSet (java.util.HashSet), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
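
A note on the final two assertions in Examples 78 and 80: asserting equal sizes together with an empty one-way set difference implies full set equality, so the pair could be collapsed into a single, more direct assertion:

    // Equivalent to the size check plus the Sets.difference check above;
    // TestNG's assertEquals compares Set contents directly.
    Assert.assertEquals(resultSamples, gtBlacklistSamples);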

Aggregations

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 260
Test (org.testng.annotations.Test): 65
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 64
SparkConf (org.apache.spark.SparkConf): 49
Tuple2 (scala.Tuple2): 48
ArrayList (java.util.ArrayList): 45
Test (org.junit.Test): 43
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 32
List (java.util.List): 28
Configuration (org.apache.hadoop.conf.Configuration): 24
JavaRDD (org.apache.spark.api.java.JavaRDD): 24
File (java.io.File): 23
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 20
Collectors (java.util.stream.Collectors): 16
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline): 15
DataSet (org.nd4j.linalg.dataset.DataSet): 15
IOException (java.io.IOException): 14
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 12
HashSet (java.util.HashSet): 12
RealMatrix (org.apache.commons.math3.linear.RealMatrix): 12