
Example 1 with ReadsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource in project gatk by broadinstitute.

the class MarkDuplicatesSparkUnitTest method markDupesTest.

@Test(dataProvider = "md", groups = "spark")
public void markDupesTest(final String input, final long totalExpected, final long dupsExpected) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> reads = readSource.getParallelReads(input, null);
    Assert.assertEquals(reads.count(), totalExpected);
    SAMFileHeader header = readSource.getHeader(input, null);
    OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection();
    final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null
            ? new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX,
                    opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null)
            : null;
    JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(reads, header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, 1);
    Assert.assertEquals(markedReads.count(), totalExpected);
    JavaRDD<GATKRead> dupes = markedReads.filter(GATKRead::isDuplicate);
    Assert.assertEquals(dupes.count(), dupsExpected);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) OpticalDuplicatesArgumentCollection(org.broadinstitute.hellbender.cmdline.argumentcollections.OpticalDuplicatesArgumentCollection) OpticalDuplicateFinder(org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SAMFileHeader(htsjdk.samtools.SAMFileHeader) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
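
The common setup in this test, and in the other examples on this page, is: obtain a JavaSparkContext, wrap it in a ReadsSparkSource, read the header, and load the reads as a JavaRDD<GATKRead>. The class below is a minimal, self-contained sketch of just that pattern; it is not part of the GATK test suite, and the input path is a placeholder.

import htsjdk.samtools.SAMFileHeader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.engine.spark.SparkContextFactory;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
import org.broadinstitute.hellbender.utils.read.GATKRead;

public class ReadsSparkSourceSketch {
    public static void main(String[] args) {
        // Placeholder path; substitute a real BAM (or a CRAM plus a reference).
        final String input = "example.bam";
        // The test Spark context used by the GATK unit tests above.
        final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
        // The second argument to getHeader/getParallelReads is the reference path;
        // the examples above pass null when no reference is needed.
        final ReadsSparkSource source = new ReadsSparkSource(ctx);
        final SAMFileHeader header = source.getHeader(input, null);
        final JavaRDD<GATKRead> reads = source.getParallelReads(input, null);
        System.out.println(header.getSequenceDictionary().size() + " contigs, " + reads.count() + " reads");
    }
}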

Example 2 with ReadsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource in project gatk by broadinstitute.

the class BaseRecalibratorSparkSharded method runPipeline.

@Override
protected void runPipeline(JavaSparkContext ctx) {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("Sorry, we only support a single reads input for now.");
    }
    final String bam = readArguments.getReadFilesNames().get(0);
    final String referenceURL = referenceArguments.getReferenceFileName();
    auth = getAuthHolder();
    final ReferenceMultiSource rds = new ReferenceMultiSource(auth, referenceURL, BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION);
    SAMFileHeader readsHeader = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency()).getHeader(bam, referenceURL);
    final SAMSequenceDictionary readsDictionary = readsHeader.getSequenceDictionary();
    final SAMSequenceDictionary refDictionary = rds.getReferenceSequenceDictionary(readsDictionary);
    final ReadFilter readFilterToApply = ReadFilter.fromList(BaseRecalibrator.getStandardBQSRReadFilterList(), readsHeader);
    SequenceDictionaryUtils.validateDictionaries("reference", refDictionary, "reads", readsDictionary);
    Broadcast<SAMFileHeader> readsHeaderBcast = ctx.broadcast(readsHeader);
    Broadcast<SAMSequenceDictionary> refDictionaryBcast = ctx.broadcast(refDictionary);
    List<SimpleInterval> intervals = intervalArgumentCollection.intervalsSpecified() ? intervalArgumentCollection.getIntervals(readsHeader.getSequenceDictionary()) : IntervalUtils.getAllIntervalsForReference(readsHeader.getSequenceDictionary());
    List<String> localVariants = knownVariants;
    localVariants = hackilyCopyFromGCSIfNecessary(localVariants);
    List<GATKVariant> variants = VariantsSource.getVariantsList(localVariants);
    // get reads, reference, variants
    JavaRDD<ContextShard> readsWithContext = AddContextDataToReadSparkOptimized.add(ctx, intervals, bam, variants, readFilterToApply, rds);
    // run BaseRecalibratorEngine.
    BaseRecalibratorEngineSparkWrapper recal = new BaseRecalibratorEngineSparkWrapper(readsHeaderBcast, refDictionaryBcast, bqsrArgs);
    JavaRDD<RecalibrationTables> tables = readsWithContext.mapPartitions(s -> recal.apply(s));
    final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(bqsrArgs, readsHeader));
    final RecalibrationTables table = tables.treeAggregate(emptyRecalibrationTable,
            RecalibrationTables::inPlaceCombine, RecalibrationTables::inPlaceCombine,
            Math.max(1, (int) (Math.log(tables.partitions().size()) / Math.log(2))));
    BaseRecalibrationEngine.finalizeRecalibrationTables(table);
    try {
        BaseRecalibratorEngineSparkWrapper.saveTextualReport(outputTablesPath, readsHeader, table, bqsrArgs, auth);
    } catch (IOException e) {
        throw new UserException.CouldNotCreateOutputFile(new File(outputTablesPath), e);
    }
}
Also used : ContextShard(org.broadinstitute.hellbender.engine.ContextShard) ReadsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) IOException(java.io.IOException) RecalibrationTables(org.broadinstitute.hellbender.utils.recalibration.RecalibrationTables) StandardCovariateList(org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseRecalibratorEngineSparkWrapper(org.broadinstitute.hellbender.tools.spark.transforms.bqsr.BaseRecalibratorEngineSparkWrapper) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File)
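
One detail worth making concrete is the depth argument passed to treeAggregate above: it scales with the logarithm of the partition count. The helper below is an illustrative sketch only (it is not part of the GATK sources) that isolates that computation.

// Sketch: the treeAggregate depth used above, roughly log2 of the partition count,
// with a floor of 1. For example, 1 partition gives depth 1 and 100 partitions give depth 6.
static int treeAggregateDepth(final int numPartitions) {
    return Math.max(1, (int) (Math.log(numPartitions) / Math.log(2)));
}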

Example 3 with ReadsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource in project gatk by broadinstitute.

the class GATKSparkTool method initializeReads.

/**
     * Initializes our reads source (but does not yet load the reads into a {@link JavaRDD}).
     * Does nothing if no reads inputs are present.
     */
private void initializeReads(final JavaSparkContext sparkContext) {
    if (readArguments.getReadFilesNames().isEmpty()) {
        return;
    }
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("Sorry, we only support a single reads input for spark tools for now.");
    }
    readInput = readArguments.getReadFilesNames().get(0);
    readsSource = new ReadsSparkSource(sparkContext, readArguments.getReadValidationStringency());
    readsHeader = readsSource.getHeader(readInput, hasReference() ? referenceArguments.getReferenceFile().getAbsolutePath() : null);
}
Also used : ReadsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) UserException(org.broadinstitute.hellbender.exceptions.UserException)
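
Note that initializeReads only stores the source, input path, and header; nothing is loaded yet. The fragment below is a hedged sketch, not the actual GATKSparkTool implementation, of how a later step could use those stored fields to materialize the reads.

// Sketch only (not taken from GATKSparkTool): load the reads lazily from the stored
// readsSource, passing the reference path when one was provided (needed for CRAM inputs).
private JavaRDD<GATKRead> loadStoredReads() {
    final String referencePath = hasReference() ? referenceArguments.getReferenceFile().getAbsolutePath() : null;
    return readsSource.getParallelReads(readInput, referencePath);
}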

Example 4 with ReadsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource in project gatk by broadinstitute.

the class CompareDuplicatesSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    JavaRDD<GATKRead> firstReads = filteredReads(getReads(), readArguments.getReadFilesNames().get(0));
    ReadsSparkSource readsSource2 = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());
    JavaRDD<GATKRead> secondReads = filteredReads(readsSource2.getParallelReads(input2, null, getIntervals(), bamPartitionSplitSize), input2);
    // Start by verifying that we have the same number of reads and duplicates in each BAM.
    long firstBamSize = firstReads.count();
    long secondBamSize = secondReads.count();
    if (firstBamSize != secondBamSize) {
        throw new UserException("input bams have different numbers of mapped reads: " + firstBamSize + "," + secondBamSize);
    }
    System.out.println("processing bams with " + firstBamSize + " mapped reads");
    long firstDupesCount = firstReads.filter(GATKRead::isDuplicate).count();
    long secondDupesCount = secondReads.filter(GATKRead::isDuplicate).count();
    if (firstDupesCount != secondDupesCount) {
        System.out.println("BAMs have different number of total duplicates: " + firstDupesCount + "," + secondDupesCount);
    }
    System.out.println("first and second: " + firstDupesCount + "," + secondDupesCount);
    Broadcast<SAMFileHeader> bHeader = ctx.broadcast(getHeaderForReads());
    // Group the reads of each BAM by MarkDuplicates key, then pair up the reads from the two BAMs.
    JavaPairRDD<String, GATKRead> firstKeyed = firstReads.mapToPair(read -> new Tuple2<>(ReadsKey.keyForFragment(bHeader.getValue(), read), read));
    JavaPairRDD<String, GATKRead> secondKeyed = secondReads.mapToPair(read -> new Tuple2<>(ReadsKey.keyForFragment(bHeader.getValue(), read), read));
    JavaPairRDD<String, Tuple2<Iterable<GATKRead>, Iterable<GATKRead>>> cogroup = firstKeyed.cogroup(secondKeyed, getRecommendedNumReducers());
    // Produces an RDD of MatchTypes, e.g., EQUAL, DIFFERENT_REPRESENTATIVE_READ, etc. per MarkDuplicates key,
    // which is approximately start position x strand.
    JavaRDD<MatchType> tagged = cogroup.map(v1 -> {
        SAMFileHeader header = bHeader.getValue();
        Iterable<GATKRead> iFirstReads = v1._2()._1();
        Iterable<GATKRead> iSecondReads = v1._2()._2();
        return getDupes(iFirstReads, iSecondReads, header);
    });
    // TODO: We should also produce examples of reads that don't match to make debugging easier (#1263).
    Map<MatchType, Integer> tagCountMap = tagged.mapToPair(v1 -> new Tuple2<>(v1, 1)).reduceByKey((v1, v2) -> v1 + v2).collectAsMap();
    if (tagCountMap.get(MatchType.SIZE_UNEQUAL) != null) {
        throw new UserException("The number of reads by the MarkDuplicates key were unequal, indicating that the BAMs are not the same");
    }
    if (tagCountMap.get(MatchType.READ_MISMATCH) != null) {
        throw new UserException("The reads grouped by the MarkDuplicates key were not the same, indicating that the BAMs are not the same");
    }
    if (printSummary) {
        MatchType[] values = MatchType.values();
        Set<MatchType> matchTypes = Sets.newLinkedHashSet(Sets.newHashSet(values));
        System.out.println("##############################");
        matchTypes.forEach(s -> System.out.println(s + ": " + tagCountMap.getOrDefault(s, 0)));
    }
    if (throwOnDiff) {
        for (MatchType s : MatchType.values()) {
            if (s != MatchType.EQUAL) {
                if (tagCountMap.get(s) != null)
                    throw new UserException("found difference between the two BAMs: " + s + " with count " + tagCountMap.get(s));
            }
        }
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Broadcast(org.apache.spark.broadcast.Broadcast) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) TestSparkProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.TestSparkProgramGroup) Argument(org.broadinstitute.barclay.argparser.Argument) ReadsKey(org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) ReadCoordinateComparator(org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator) Set(java.util.Set) Tuple2(scala.Tuple2) SAMFileHeader(htsjdk.samtools.SAMFileHeader) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) Sets(com.google.common.collect.Sets) ReadUtils(org.broadinstitute.hellbender.utils.read.ReadUtils) List(java.util.List) Lists(com.google.common.collect.Lists) ReadsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) UserException(org.broadinstitute.hellbender.exceptions.UserException) Map(java.util.Map) Function(org.apache.spark.api.java.function.Function) JavaRDD(org.apache.spark.api.java.JavaRDD)
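
The key/cogroup comparison used here is a general Spark pattern, independent of GATK. The sketch below (plain Spark and Guava, with toy string data rather than reads) shows the same shape: key both inputs, cogroup them, and flag keys whose groups differ in size, which is the analogue of MatchType.SIZE_UNEQUAL above.

import java.util.Arrays;
import com.google.common.collect.Iterables;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CogroupCompareSketch {
    public static void main(String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext(new SparkConf().setMaster("local[2]").setAppName("cogroup-sketch"));
        // Two toy inputs, keyed the way CompareDuplicatesSpark keys reads (here, the part before the colon).
        final JavaPairRDD<String, String> first = ctx.parallelize(Arrays.asList("a:r1", "a:r2", "b:r3"))
                .mapToPair(s -> new Tuple2<>(s.split(":")[0], s.split(":")[1]));
        final JavaPairRDD<String, String> second = ctx.parallelize(Arrays.asList("a:r1", "b:r3", "b:r4"))
                .mapToPair(s -> new Tuple2<>(s.split(":")[0], s.split(":")[1]));
        // cogroup pairs up, per key, the group from each input; unequal group sizes indicate a mismatch.
        final JavaRDD<String> mismatchedKeys = first.cogroup(second)
                .filter(kv -> Iterables.size(kv._2()._1()) != Iterables.size(kv._2()._2()))
                .map(kv -> kv._1());
        System.out.println("keys with unequal group sizes: " + mismatchedKeys.collect());
        ctx.stop();
    }
}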

Example 5 with ReadsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource in project gatk by broadinstitute.

the class InsertSizeMetricsCollectorSparkUnitTest method test.

@Test(dataProvider = "metricsfiles", groups = "spark")
public void test(final String fileName, final String referenceName, final boolean allLevels, final String expectedResultsFile) throws IOException {
    final String inputPath = new File(TEST_DATA_DIR, fileName).getAbsolutePath();
    final String referencePath = referenceName != null ? new File(referenceName).getAbsolutePath() : null;
    final File outfile = BaseTest.createTempFile("test", ".insert_size_metrics");
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    ReadsSparkSource readSource = new ReadsSparkSource(ctx, ValidationStringency.DEFAULT_STRINGENCY);
    SAMFileHeader samHeader = readSource.getHeader(inputPath, referencePath);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputPath, referencePath);
    InsertSizeMetricsArgumentCollection isArgs = new InsertSizeMetricsArgumentCollection();
    isArgs.output = outfile.getAbsolutePath();
    if (allLevels) {
        isArgs.metricAccumulationLevel.accumulationLevels = new HashSet<>();
        isArgs.metricAccumulationLevel.accumulationLevels.add(MetricAccumulationLevel.ALL_READS);
        isArgs.metricAccumulationLevel.accumulationLevels.add(MetricAccumulationLevel.SAMPLE);
        isArgs.metricAccumulationLevel.accumulationLevels.add(MetricAccumulationLevel.LIBRARY);
        isArgs.metricAccumulationLevel.accumulationLevels.add(MetricAccumulationLevel.READ_GROUP);
    }
    InsertSizeMetricsCollectorSpark isSpark = new InsertSizeMetricsCollectorSpark();
    isSpark.initialize(isArgs, samHeader, null);
    // Since we're bypassing the framework in order to force this test to run on multiple partitions, we
    // need to build the read filter manually (there is no plugin descriptor to do it for us): drop the
    // default FirstOfPairReadFilter so that second-of-pair reads are kept, which these tests require.
    List<ReadFilter> readFilters = isSpark.getDefaultReadFilters();
    // Note: the original code discarded the result of this stream, so the filter was never actually
    // removed; collecting the result applies the intended removal.
    readFilters = readFilters.stream().filter(f -> !f.getClass().getSimpleName().equals(ReadFilterLibrary.FirstOfPairReadFilter.class.getSimpleName())).collect(java.util.stream.Collectors.toList());
    ReadFilter rf = ReadFilter.fromList(readFilters, samHeader);
    // Force the input RDD to be split into two partitions to ensure that the
    // reduce/combiners run
    rddParallelReads = rddParallelReads.repartition(2);
    isSpark.collectMetrics(rddParallelReads.filter(r -> rf.test(r)), samHeader);
    isSpark.saveMetrics(fileName, null);
    IntegrationTestSpec.assertEqualTextFiles(outfile, new File(TEST_DATA_DIR, expectedResultsFile), "#");
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Test(org.testng.annotations.Test) IOException(java.io.IOException) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) IntegrationTestSpec(org.broadinstitute.hellbender.utils.test.IntegrationTestSpec) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ValidationStringency(htsjdk.samtools.ValidationStringency) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) File(java.io.File) HashSet(java.util.HashSet) List(java.util.List) ReadsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) InsertSizeMetricsArgumentCollection(org.broadinstitute.hellbender.metrics.InsertSizeMetricsArgumentCollection) MetricAccumulationLevel(org.broadinstitute.hellbender.metrics.MetricAccumulationLevel) SparkContextFactory(org.broadinstitute.hellbender.engine.spark.SparkContextFactory) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadFilterLibrary(org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary)
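
The filter wiring in this test follows a small reusable shape: load the reads, combine the collector's default read filters into a single ReadFilter, repartition, and filter. The method below is a sketch of that shape only; it is not part of the GATK tests, and the context, input path, and collector are supplied by the caller.

// Sketch: load -> combine default read filters -> repartition -> filter, as in the test above.
static JavaRDD<GATKRead> loadFilteredReads(final JavaSparkContext ctx, final String inputPath,
                                           final InsertSizeMetricsCollectorSpark collector) {
    final ReadsSparkSource source = new ReadsSparkSource(ctx, ValidationStringency.DEFAULT_STRINGENCY);
    final SAMFileHeader header = source.getHeader(inputPath, null);
    final ReadFilter combined = ReadFilter.fromList(collector.getDefaultReadFilters(), header);
    return source.getParallelReads(inputPath, null)
            .repartition(2) // at least two partitions so the reduce/combiner path is exercised
            .filter(r -> combined.test(r));
}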

Aggregations

ReadsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) 5
SAMFileHeader (htsjdk.samtools.SAMFileHeader) 4
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 3
UserException (org.broadinstitute.hellbender.exceptions.UserException) 3
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead) 3
File (java.io.File) 2
IOException (java.io.IOException) 2
List (java.util.List) 2
JavaRDD (org.apache.spark.api.java.JavaRDD) 2
ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter) 2
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest) 2
Test (org.testng.annotations.Test) 2
Lists (com.google.common.collect.Lists) 1
Sets (com.google.common.collect.Sets) 1
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary) 1
ValidationStringency (htsjdk.samtools.ValidationStringency) 1
HashSet (java.util.HashSet) 1
Map (java.util.Map) 1
Set (java.util.Set) 1
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD) 1