Use of org.broadinstitute.hellbender.engine.ContextShard in project gatk by broadinstitute.
From the class BaseRecalibratorSparkSharded, method runPipeline:
@Override
protected void runPipeline(JavaSparkContext ctx) {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("Sorry, we only support a single reads input for now.");
    }
    final String bam = readArguments.getReadFilesNames().get(0);
    final String referenceURL = referenceArguments.getReferenceFileName();

    auth = getAuthHolder();
    final ReferenceMultiSource rds = new ReferenceMultiSource(auth, referenceURL, BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION);

    SAMFileHeader readsHeader = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency()).getHeader(bam, referenceURL);
    final SAMSequenceDictionary readsDictionary = readsHeader.getSequenceDictionary();
    final SAMSequenceDictionary refDictionary = rds.getReferenceSequenceDictionary(readsDictionary);
    final ReadFilter readFilterToApply = ReadFilter.fromList(BaseRecalibrator.getStandardBQSRReadFilterList(), readsHeader);

    SequenceDictionaryUtils.validateDictionaries("reference", refDictionary, "reads", readsDictionary);

    Broadcast<SAMFileHeader> readsHeaderBcast = ctx.broadcast(readsHeader);
    Broadcast<SAMSequenceDictionary> refDictionaryBcast = ctx.broadcast(refDictionary);

    List<SimpleInterval> intervals = intervalArgumentCollection.intervalsSpecified()
            ? intervalArgumentCollection.getIntervals(readsHeader.getSequenceDictionary())
            : IntervalUtils.getAllIntervalsForReference(readsHeader.getSequenceDictionary());

    List<String> localVariants = knownVariants;
    localVariants = hackilyCopyFromGCSIfNecessary(localVariants);
    List<GATKVariant> variants = VariantsSource.getVariantsList(localVariants);

    // get reads, reference, variants
    JavaRDD<ContextShard> readsWithContext = AddContextDataToReadSparkOptimized.add(ctx, intervals, bam, variants, readFilterToApply, rds);

    // run BaseRecalibratorEngine.
    BaseRecalibratorEngineSparkWrapper recal = new BaseRecalibratorEngineSparkWrapper(readsHeaderBcast, refDictionaryBcast, bqsrArgs);
    JavaRDD<RecalibrationTables> tables = readsWithContext.mapPartitions(s -> recal.apply(s));

    final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(bqsrArgs, readsHeader));
    final RecalibrationTables table = tables.treeAggregate(emptyRecalibrationTable,
            RecalibrationTables::inPlaceCombine,
            RecalibrationTables::inPlaceCombine,
            Math.max(1, (int) (Math.log(tables.partitions().size()) / Math.log(2))));

    BaseRecalibrationEngine.finalizeRecalibrationTables(table);

    try {
        BaseRecalibratorEngineSparkWrapper.saveTextualReport(outputTablesPath, readsHeader, table, bqsrArgs, auth);
    } catch (IOException e) {
        throw new UserException.CouldNotCreateOutputFile(new File(outputTablesPath), e);
    }
}
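
The interesting aggregation step here is treeAggregate: each partition produces a partial RecalibrationTables via mapPartitions, and those partials are merged in a tree of depth roughly log2 of the partition count, so results are combined in stages rather than all at once on the driver. Below is a minimal, self-contained sketch of that same pattern in plain Spark, with no GATK types; the class name, the local master, and the integer data are all invented for illustration.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TreeAggregateSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("TreeAggregateSketch").setMaster("local[*]");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            // Four partitions, mimicking one partial result per partition.
            JavaRDD<Integer> partials = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8), 4);
            // Same depth formula as above: roughly log2 of the partition count, at least 1.
            int depth = Math.max(1, (int) (Math.log(partials.partitions().size()) / Math.log(2)));
            // Integer::sum plays the role of RecalibrationTables::inPlaceCombine,
            // used in both the sequential and the combining position.
            int total = partials.treeAggregate(0, Integer::sum, Integer::sum, depth);
            System.out.println("combined = " + total + " (tree depth = " + depth + ")");
        }
    }
}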
Use of org.broadinstitute.hellbender.engine.ContextShard in project gatk by broadinstitute.
From the class AddContextDataToReadSparkOptimized, method fillVariants:
/**
 * Given a list of shards and a list of variants,
 * add each variant to every (shard+margin) that it overlaps.
 *
 * This happens immediately, at the caller.
 */
public static ArrayList<ContextShard> fillVariants(List<SimpleInterval> shardedIntervals, List<GATKVariant> variants, int margin) {
    IntervalsSkipList<GATKVariant> intervals = new IntervalsSkipList<>(variants);
    ArrayList<ContextShard> ret = new ArrayList<>();
    for (SimpleInterval s : shardedIntervals) {
        int start = Math.max(s.getStart() - margin, 1);
        int end = s.getEnd() + margin;
        // here it's OK if end is past the contig's boundary, there just won't be any variant there.
        SimpleInterval expandedInterval = new SimpleInterval(s.getContig(), start, end);
        // the next ContextShard has interval s because we want it to contain all reads that start in s.
        // We give it all variants that overlap the expanded interval in order to make sure we include
        // all the variants that overlap with the reads of interest.
        //
        // Graphically:
        //         |------- s --------|
        // |--------expandedInterval------------------|
        //              |-- a read starting in s --|
        //                       |--- a variant overlapping the read ---|
        //
        // Since the read's length is less than margin, we know that by including all the variants that overlap
        // with the expanded interval we are also including all the variants that overlap with all the reads in this shard.
        ret.add(new ContextShard(s).withVariants(intervals.getOverlapping(expandedInterval)));
    }
    return ret;
}
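
To make the expansion concrete, here is a small, self-contained sketch in plain Java, with no GATK types, of the same shard-plus-margin overlap test; the Range class stands in for SimpleInterval, and the contig-free coordinates, the margin, and the class name are all invented for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ShardMarginSketch {
    // A 1-based, closed interval standing in for SimpleInterval.
    static final class Range {
        final int start;
        final int end;
        Range(int start, int end) { this.start = start; this.end = end; }
        boolean overlaps(Range o) { return start <= o.end && o.start <= end; }
        @Override public String toString() { return "[" + start + "-" + end + "]"; }
    }

    public static void main(String[] args) {
        List<Range> shards = Arrays.asList(new Range(1, 1000), new Range(1001, 2000));
        List<Range> variants = Arrays.asList(new Range(990, 990), new Range(1005, 1005), new Range(1950, 1952));
        int margin = 250; // chosen larger than the longest read, as fillVariants' comment requires

        for (Range shard : shards) {
            // Expand the shard by margin on both sides, clamping the start at 1,
            // exactly as fillVariants does before calling getOverlapping.
            Range expanded = new Range(Math.max(shard.start - margin, 1), shard.end + margin);
            List<Range> kept = new ArrayList<>();
            for (Range v : variants) {
                if (v.overlaps(expanded)) {
                    kept.add(v);
                }
            }
            System.out.println("shard " + shard + " -> expanded " + expanded + " keeps " + kept);
        }
    }
}

Note that the variant at position 990 is collected by both shards: shard [1-1000] contains it outright, and shard [1001-2000]'s expansion [751-2250] also overlaps it. That duplication is the price fillVariants accepts in exchange for never missing a variant that overlaps a read starting in the shard.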