Examples with VariantsSparkSource - org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource

Example 1 with VariantsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource in project gatk by broadinstitute.

the class BaseRecalibratorSpark method runTool.

@Override
protected void runTool(JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    JavaRDD<GATKRead> initialReads = getReads();
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(knownVariants, getIntervals());
    // TODO: Look into broadcasting the reference to all of the workers. This would make AddContextDataToReadSpark
    // TODO: and ApplyBQSRStub simpler (#855).
    JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, initialReads, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    // TODO: broadcast the reads header?
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    try (final PrintStream reportStream = new PrintStream(BucketUtils.createFile(outputTablesPath))) {
        RecalUtils.outputRecalibrationReport(reportStream, bqsrArgs, bqsrReport.getQuantizationInfo(), bqsrReport.getRecalibrationTables(), bqsrReport.getCovariates());
    }
}

Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) PrintStream(java.io.PrintStream) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource) RecalibrationReport(org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)

Example 2 with VariantsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource in project gatk by broadinstitute.

the class ReadsPipelineSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    //TOOO: should this use getUnfilteredReads? getReads will apply default and command line filters
    final JavaRDD<GATKRead> initialReads = getReads();
    final JavaRDD<GATKRead> markedReadsWithOD = MarkDuplicatesSpark.mark(initialReads, getHeaderForReads(), duplicatesScoringStrategy, new OpticalDuplicateFinder(), getRecommendedNumReducers());
    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.cleanupTemporaryAttributes(markedReadsWithOD);
    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> markedFilteredReadsForBQSR = markedReads.filter(read -> bqsrReadFilter.test(read));
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, markedFilteredReadsForBQSR, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(markedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, finalReads);
}

Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) OpticalDuplicateFinder(org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource) RecalibrationReport(org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)

Example 3 with VariantsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource in project gatk by broadinstitute.

the class BQSRPipelineSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    //Should this get the getUnfilteredReads? getReads will merge default and command line filters.
    //but the code below uses other filters for other parts of the pipeline that do not honor
    //the commandline.
    final JavaRDD<GATKRead> initialReads = getReads();
    // The initial reads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that ApplyBQSR wants. BQSR itself wants additional filtering
    // performed, so we do that here.
    //NOTE: this filter doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> filteredReadsForBQSR = initialReads.filter(read -> bqsrReadFilter.test(read));
    final VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    final JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    final JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, filteredReadsForBQSR, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    //note: we use the reference dictionary from the reads themselves.
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getHeaderForReads().getSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(initialReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, finalReads);
}

Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource) RecalibrationReport(org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)

Example 4 with VariantsSparkSource

use of org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource in project gatk by broadinstitute.

the class CountVariantsSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    final VariantsSparkSource vss = new VariantsSparkSource(ctx);
    final JavaRDD<VariantContext> variants = vss.getParallelVariantContexts(input, getIntervals());
    final long count = variants.count();
    System.out.println(count);
    if (out != null) {
        try (final PrintStream ps = new PrintStream(BucketUtils.createFile(out))) {
            ps.print(count);
        }
    }
}

Also used : PrintStream(java.io.PrintStream) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource)

Aggregations

VariantsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource)4 ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData)3 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)3 RecalibrationReport (org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)3 GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant)3 PrintStream (java.io.PrintStream)2 ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter)2 VariantContext (htsjdk.variant.variantcontext.VariantContext)1 OpticalDuplicateFinder (org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder)1