Search in sources :

Example 1 with ContextShard

use of org.broadinstitute.hellbender.engine.ContextShard in project gatk by broadinstitute.

The class BaseRecalibratorSparkSharded, method runPipeline.

/**
 * Runs the sharded base-recalibration pipeline on Spark.
 *
 * Steps, in order: load the reads header and the reference dictionary and validate them
 * against each other; choose the intervals to process; load the known variants; build the
 * RDD of context shards; compute and merge per-partition recalibration tables; finalize the
 * merged table and write it out as a textual report.
 *
 * @param ctx the Spark context used for broadcasting and for building the RDDs
 * @throws UserException if the number of reads inputs is not exactly one, or
 *         (as {@code UserException.CouldNotCreateOutputFile}) if writing the report fails
 *         with an {@link IOException}
 */
@Override
protected void runPipeline(JavaSparkContext ctx) {
    // Exactly one reads input is accepted.
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("Sorry, we only support a single reads input for now.");
    }
    final String readsPath = readArguments.getReadFilesNames().get(0);
    final String referencePath = referenceArguments.getReferenceFileName();
    auth = getAuthHolder();

    final ReferenceMultiSource referenceSource =
            new ReferenceMultiSource(auth, referencePath, BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION);
    final ReadsSparkSource readsSource = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());
    final SAMFileHeader header = readsSource.getHeader(readsPath, referencePath);

    // The reads and reference sequence dictionaries are validated against each other.
    final SAMSequenceDictionary readsDict = header.getSequenceDictionary();
    final SAMSequenceDictionary referenceDict = referenceSource.getReferenceSequenceDictionary(readsDict);
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getStandardBQSRReadFilterList(), header);
    SequenceDictionaryUtils.validateDictionaries("reference", referenceDict, "reads", readsDict);

    final Broadcast<SAMFileHeader> headerBroadcast = ctx.broadcast(header);
    final Broadcast<SAMSequenceDictionary> referenceDictBroadcast = ctx.broadcast(referenceDict);

    // Use the user-specified intervals when given; otherwise cover every contig in the reads dictionary.
    final List<SimpleInterval> targetIntervals;
    if (intervalArgumentCollection.intervalsSpecified()) {
        targetIntervals = intervalArgumentCollection.getIntervals(header.getSequenceDictionary());
    } else {
        targetIntervals = IntervalUtils.getAllIntervalsForReference(header.getSequenceDictionary());
    }

    // Known-variant paths go through hackilyCopyFromGCSIfNecessary before being loaded.
    final List<GATKVariant> knownSites = VariantsSource.getVariantsList(hackilyCopyFromGCSIfNecessary(knownVariants));

    // Assemble reads + reference + variants into context shards.
    final JavaRDD<ContextShard> shards =
            AddContextDataToReadSparkOptimized.add(ctx, targetIntervals, readsPath, knownSites, bqsrReadFilter, referenceSource);

    // Compute one set of recalibration tables per partition.
    final BaseRecalibratorEngineSparkWrapper engine =
            new BaseRecalibratorEngineSparkWrapper(headerBroadcast, referenceDictBroadcast, bqsrArgs);
    final JavaRDD<RecalibrationTables> partialTables = shards.mapPartitions(partition -> engine.apply(partition));

    // Merge the per-partition tables with a tree aggregation whose depth is
    // floor(log2(number of partitions)), but never less than 1.
    final RecalibrationTables zeroTable = new RecalibrationTables(new StandardCovariateList(bqsrArgs, header));
    final int aggregationDepth = Math.max(1, (int) (Math.log(partialTables.partitions().size()) / Math.log(2)));
    final RecalibrationTables mergedTable = partialTables.treeAggregate(
            zeroTable,
            RecalibrationTables::inPlaceCombine,
            RecalibrationTables::inPlaceCombine,
            aggregationDepth);
    BaseRecalibrationEngine.finalizeRecalibrationTables(mergedTable);

    // Write the textual report; an I/O failure is surfaced as a user-facing error.
    try {
        BaseRecalibratorEngineSparkWrapper.saveTextualReport(outputTablesPath, header, mergedTable, bqsrArgs, auth);
    } catch (IOException ioe) {
        throw new UserException.CouldNotCreateOutputFile(new File(outputTablesPath), ioe);
    }
}
Also used : ContextShard(org.broadinstitute.hellbender.engine.ContextShard) ReadsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) IOException(java.io.IOException) RecalibrationTables(org.broadinstitute.hellbender.utils.recalibration.RecalibrationTables) StandardCovariateList(org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseRecalibratorEngineSparkWrapper(org.broadinstitute.hellbender.tools.spark.transforms.bqsr.BaseRecalibratorEngineSparkWrapper) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File)

Example 2 with ContextShard

use of org.broadinstitute.hellbender.engine.ContextShard in project gatk by broadinstitute.

The class AddContextDataToReadSparkOptimized, method fillVariants.

/**
 * Builds one {@link ContextShard} per input interval and attaches to it every variant that
 * overlaps that interval once it has been widened by {@code margin} on both sides.
 *
 * The work is done eagerly, at the call site.
 *
 * @param shardedIntervals the intervals to turn into shards, one shard per interval
 * @param variants the variants to distribute among the shards
 * @param margin how far to widen each interval on either side when looking up variants;
 *               the widened start is clamped so it never goes below 1
 * @return the shards, in the same order as {@code shardedIntervals}
 */
public static ArrayList<ContextShard> fillVariants(List<SimpleInterval> shardedIntervals, List<GATKVariant> variants, int margin) {
    final IntervalsSkipList<GATKVariant> variantIndex = new IntervalsSkipList<>(variants);
    final ArrayList<ContextShard> shards = new ArrayList<>();
    for (final SimpleInterval shardInterval : shardedIntervals) {
        // Widen the shard by the margin. The start is clamped to 1; the end may run past the
        // contig boundary, which is harmless because no variant will be found there.
        final int paddedStart = Math.max(1, shardInterval.getStart() - margin);
        final int paddedEnd = shardInterval.getEnd() + margin;
        final SimpleInterval paddedInterval = new SimpleInterval(shardInterval.getContig(), paddedStart, paddedEnd);

        // The shard itself keeps the unpadded interval, since it is meant to hold the reads
        // that start inside it. Variants, however, are looked up over the padded interval:
        //
        //          |---- shardInterval ----|
        // |------------------ paddedInterval ------------------|
        //                   |-- read starting in the shard --|
        //                                |--- variant overlapping the read ---|
        //
        // Provided reads are shorter than the margin, every variant overlapping a read that
        // starts in the shard also overlaps the padded interval, so none are missed.
        final ContextShard shard = new ContextShard(shardInterval)
                .withVariants(variantIndex.getOverlapping(paddedInterval));
        shards.add(shard);
    }
    return shards;
}
Also used : ContextShard(org.broadinstitute.hellbender.engine.ContextShard) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) IntervalsSkipList(org.broadinstitute.hellbender.utils.collections.IntervalsSkipList) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Aggregations

ContextShard (org.broadinstitute.hellbender.engine.ContextShard): 2, SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 2, GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant): 2, SAMFileHeader (htsjdk.samtools.SAMFileHeader): 1, SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 1, File (java.io.File): 1, IOException (java.io.IOException): 1, ArrayList (java.util.ArrayList): 1, ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource): 1, ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter): 1, ReadsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource): 1, UserException (org.broadinstitute.hellbender.exceptions.UserException): 1, BaseRecalibratorEngineSparkWrapper (org.broadinstitute.hellbender.tools.spark.transforms.bqsr.BaseRecalibratorEngineSparkWrapper): 1, IntervalsSkipList (org.broadinstitute.hellbender.utils.collections.IntervalsSkipList): 1, RecalibrationTables (org.broadinstitute.hellbender.utils.recalibration.RecalibrationTables): 1, StandardCovariateList (org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList): 1