Example 1 with ReadContextData

Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.

The class BaseRecalibratorSpark, method runTool.

@Override
protected void runTool(JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    JavaRDD<GATKRead> initialReads = getReads();
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(knownVariants, getIntervals());
    // TODO: Look into broadcasting the reference to all of the workers. This would make AddContextDataToReadSpark
    // TODO: and ApplyBQSRStub simpler (#855).
    JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, initialReads, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    // TODO: broadcast the reads header?
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    try (final PrintStream reportStream = new PrintStream(BucketUtils.createFile(outputTablesPath))) {
        RecalUtils.outputRecalibrationReport(reportStream, bqsrArgs, bqsrReport.getQuantizationInfo(), bqsrReport.getRecalibrationTables(), bqsrReport.getCovariates());
    }
}
Also used: GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData), GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant), PrintStream (java.io.PrintStream), VariantsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource), RecalibrationReport (org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)
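For orientation: the JavaPairRDD above keys every read to a ReadContextData holding its reference window and overlapping known variants. Below is a minimal non-Spark sketch of that pairing, using only the ReadContextData constructor and getters that appear in Examples 3 and 5; ReferenceBases.getBases() and GATKRead.getName() are assumed here rather than shown elsewhere on this page, and all inputs are placeholders.

import java.util.List;

import org.broadinstitute.hellbender.engine.ReadContextData;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;
import org.broadinstitute.hellbender.utils.variant.GATKVariant;

public final class ReadContextDataSketch {
    // Bundle the reference window and overlapping variants for one read,
    // as AddContextDataToReadSparkOptimized does in Example 5.
    static ReadContextData contextFor(ReferenceBases refBases, List<GATKVariant> overlapping) {
        return new ReadContextData(refBases, overlapping);
    }

    // Read the two halves back out, as BaseRecalibratorSparkFn does in Example 3.
    static void describe(GATKRead read, ReadContextData context) {
        final ReferenceBases bases = context.getOverlappingReferenceBases();
        final Iterable<GATKVariant> variants = context.getOverlappingVariants();
        System.out.printf("%s: %d reference bases at %s, variants=%s%n",
                read.getName(), bases.getBases().length, bases.getInterval(), variants);
    }
}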

Example 2 with ReadContextData

Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.

The class ReadsPipelineSpark, method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    //TODO: should this use getUnfilteredReads? getReads will apply default and command line filters
    final JavaRDD<GATKRead> initialReads = getReads();
    final JavaRDD<GATKRead> markedReadsWithOD = MarkDuplicatesSpark.mark(initialReads, getHeaderForReads(), duplicatesScoringStrategy, new OpticalDuplicateFinder(), getRecommendedNumReducers());
    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.cleanupTemporaryAttributes(markedReadsWithOD);
    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> markedFilteredReadsForBQSR = markedReads.filter(read -> bqsrReadFilter.test(read));
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, markedFilteredReadsForBQSR, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(markedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, finalReads);
}
Also used: GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData), GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant), OpticalDuplicateFinder (org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder), ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter), VariantsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource), RecalibrationReport (org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)
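The bqsrReadFilter lines above are worth isolating: ReadFilter.fromList composes the BQSR-specific filters into a single predicate, which can then be applied to anything, not just an RDD. Here is a small sketch under that reading, filtering a plain list; the walkers.bqsr import path for BaseRecalibrator is an assumption, since only the call appears above.

import java.util.List;
import java.util.stream.Collectors;

import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.tools.walkers.bqsr.BaseRecalibrator; // import path assumed
import org.broadinstitute.hellbender.utils.read.GATKRead;

public final class BqsrFilterSketch {
    // Compose BQSR's extra filters once, then use the result as an ordinary
    // predicate over reads (here a local list instead of an RDD).
    static List<GATKRead> filterForBqsr(List<GATKRead> reads, SAMFileHeader header) {
        final ReadFilter bqsrFilter =
                ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), header);
        return reads.stream().filter(bqsrFilter::test).collect(Collectors.toList());
    }
}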

Example 3 with ReadContextData

Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.

The class BaseRecalibratorSparkFn, method apply.

public static RecalibrationReport apply(final JavaPairRDD<GATKRead, ReadContextData> readsWithContext, final SAMFileHeader header, final SAMSequenceDictionary referenceDictionary, final RecalibrationArgumentCollection recalArgs) {
    JavaRDD<RecalibrationTables> unmergedTables = readsWithContext.mapPartitions(readWithContextIterator -> {
        final BaseRecalibrationEngine bqsr = new BaseRecalibrationEngine(recalArgs, header);
        bqsr.logCovariatesUsed();
        while (readWithContextIterator.hasNext()) {
            final Tuple2<GATKRead, ReadContextData> readWithData = readWithContextIterator.next();
            Iterable<GATKVariant> variants = readWithData._2().getOverlappingVariants();
            final ReferenceBases refBases = readWithData._2().getOverlappingReferenceBases();
            ReferenceDataSource refDS = new ReferenceMemorySource(refBases, referenceDictionary);
            bqsr.processRead(readWithData._1(), refDS, variants);
        }
        return Arrays.asList(bqsr.getRecalibrationTables()).iterator();
    });
    final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(recalArgs, header));
    final RecalibrationTables combinedTables = unmergedTables.treeAggregate(emptyRecalibrationTable, RecalibrationTables::inPlaceCombine, RecalibrationTables::inPlaceCombine, Math.max(1, (int) (Math.log(unmergedTables.partitions().size()) / Math.log(2))));
    BaseRecalibrationEngine.finalizeRecalibrationTables(combinedTables);
    final QuantizationInfo quantizationInfo = new QuantizationInfo(combinedTables, recalArgs.QUANTIZING_LEVELS);
    final StandardCovariateList covariates = new StandardCovariateList(recalArgs, header);
    return RecalUtils.createRecalibrationReport(recalArgs.generateReportTable(covariates.covariateNames()), quantizationInfo.generateReportTable(), RecalUtils.generateReportTables(combinedTables, covariates));
}
Also used: GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant), ReferenceDataSource (org.broadinstitute.hellbender.engine.ReferenceDataSource), ReferenceMemorySource (org.broadinstitute.hellbender.engine.ReferenceMemorySource), StandardCovariateList (org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList), ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData), ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)
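The last argument to treeAggregate above is the only non-obvious piece: it requests max(1, floor(log2(numPartitions))) levels of partial merging before the final combine, so the merge tree stays shallow even with many partitions. A standalone check of that arithmetic, no Spark required:

public final class TreeDepthSketch {
    // Same expression as in apply(): one merge level per doubling of
    // partitions, never fewer than one.
    static int treeDepth(int numPartitions) {
        return Math.max(1, (int) (Math.log(numPartitions) / Math.log(2)));
    }

    public static void main(String[] args) {
        System.out.println(treeDepth(1));   // 1 (log2(1) = 0, clamped up to 1)
        System.out.println(treeDepth(10));  // 3 (floor of ~3.32)
        System.out.println(treeDepth(100)); // 6 (floor of ~6.64)
    }
}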

Example 4 with ReadContextData

Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.

The class AddContextDataToReadSparkUnitTest, method addContextDataTest.

@Test(dataProvider = "bases", groups = "spark")
public void addContextDataTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, ReadContextData>> expectedReadContextData, JoinStrategy joinStrategy) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    when(mockSource.getReferenceBases(any(PipelineOptions.class), any())).then(new ReferenceBasesAnswer());
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SAMSequenceDictionary sd = new SAMSequenceDictionary(Lists.newArrayList(new SAMSequenceRecord("1", 100000), new SAMSequenceRecord("2", 100000)));
    when(mockSource.getReferenceSequenceDictionary(null)).thenReturn(sd);
    JavaPairRDD<GATKRead, ReadContextData> rddActual = AddContextDataToReadSpark.add(ctx, rddReads, mockSource, rddVariants, joinStrategy, sd, 10000, 1000);
    Map<GATKRead, ReadContextData> actual = rddActual.collectAsMap();
    Assert.assertEquals(actual.size(), expectedReadContextData.size());
    for (KV<GATKRead, ReadContextData> kv : expectedReadContextData) {
        ReadContextData readContextData = actual.get(kv.getKey());
        Assert.assertNotNull(readContextData);
        Assert.assertTrue(CollectionUtils.isEqualCollection(Lists.newArrayList(readContextData.getOverlappingVariants()), Lists.newArrayList(kv.getValue().getOverlappingVariants())));
        SimpleInterval minimalInterval = kv.getValue().getOverlappingReferenceBases().getInterval();
        ReferenceBases subset = readContextData.getOverlappingReferenceBases().getSubset(minimalInterval);
        Assert.assertEquals(subset, kv.getValue().getOverlappingReferenceBases());
    }
}
Also used: GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant), ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource), SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord), SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary), ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData), ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases), PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest), Test (org.testng.annotations.Test)
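One detail of this test worth calling out: the mock is created with withSettings().serializable() because Spark serializes the task closures that capture it, and a plain Mockito mock would fail that serialization round trip. Below is a reduced sketch of the pattern, keeping only stubbing confirmed by the test above; the fixture values are placeholders.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.withSettings;

import com.google.common.collect.Lists;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;

public final class SerializableMockSketch {
    // A mock captured by Spark closures must survive Java serialization,
    // hence withSettings().serializable().
    static ReferenceMultiSource mockReference() {
        final SAMSequenceDictionary dictionary = new SAMSequenceDictionary(
                Lists.newArrayList(new SAMSequenceRecord("1", 100000)));
        final ReferenceMultiSource source =
                mock(ReferenceMultiSource.class, withSettings().serializable());
        when(source.getReferenceSequenceDictionary(null)).thenReturn(dictionary);
        return source;
    }
}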

Example 5 with ReadContextData

Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.

The class AddContextDataToReadSparkOptimized, method fillContext.

/**
 * Given a shard that has reads and variants, query Google Genomics' Reference server and get reference info
 * (including an extra margin on either side), and fill that and the correct variants into readContext.
 */
public static ContextShard fillContext(ReferenceMultiSource refSource, ContextShard shard) {
    if (null == shard) {
        return null;
    }
    // use the function to make sure we get the exact correct amount of reference bases
    int start = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;
    SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction = refSource.getReferenceWindowFunction();
    for (GATKRead r : shard.reads) {
        SimpleInterval readRefs = referenceWindowFunction.apply(r);
        start = Math.min(readRefs.getStart(), start);
        end = Math.max(readRefs.getEnd(), end);
    }
    if (start == Integer.MAX_VALUE) {
        // there are no reads in this shard, so we're going to remove it
        return null;
    }
    SimpleInterval refInterval = new SimpleInterval(shard.interval.getContig(), start, end);
    ReferenceBases refBases;
    try {
        refBases = refSource.getReferenceBases(null, refInterval);
    } catch (IOException x) {
        throw new GATKException("Unable to read the reference", x);
    }
    ArrayList<ReadContextData> readContext = new ArrayList<>();
    for (GATKRead r : shard.reads) {
        SimpleInterval readInterval = new SimpleInterval(r);
        List<GATKVariant> variantsOverlappingThisRead = shard.variantsOverlapping(readInterval);
        // We pass all the bases: each ReadContextData then shares a pointer to the
        // same ReferenceBases object instead of holding its own array copy. Downstream
        // processing is fine with having extra bases (it expects a few, actually).
        readContext.add(new ReadContextData(refBases, variantsOverlappingThisRead));
    }
    return shard.withReadContext(readContext);
}
Also used: GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData), ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases), GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant), ArrayList (java.util.ArrayList), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval), IOException (java.io.IOException), GATKException (org.broadinstitute.hellbender.exceptions.GATKException)
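The min/max loop at the top of fillContext is a generic interval-union computation, pulled out below on its own. The sketch uses only the SimpleInterval constructor and accessors visible in the method above, and returns null for an empty list just as fillContext drops shards with no reads.

import java.util.List;

import org.broadinstitute.hellbender.utils.SimpleInterval;

public final class WindowUnionSketch {
    // Smallest single interval covering every window on one contig,
    // or null when there are no windows (mirrors the empty-shard case).
    static SimpleInterval union(String contig, List<SimpleInterval> windows) {
        int start = Integer.MAX_VALUE;
        int end = Integer.MIN_VALUE;
        for (SimpleInterval w : windows) {
            start = Math.min(w.getStart(), start);
            end = Math.max(w.getEnd(), end);
        }
        return start == Integer.MAX_VALUE ? null : new SimpleInterval(contig, start, end);
    }
}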

Aggregations

ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData): 7
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 7
GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant): 7
ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases): 4
VariantsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource): 3
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 3
RecalibrationReport (org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport): 3
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 2
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2
ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource): 2
ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter): 2
PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions): 1
Function (com.google.common.base.Function): 1
Iterators (com.google.common.collect.Iterators): 1
SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord): 1
IOException (java.io.IOException): 1
PrintStream (java.io.PrintStream): 1
ArrayList (java.util.ArrayList): 1
Collections (java.util.Collections): 1
Iterator (java.util.Iterator): 1