Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.
The class BaseRecalibratorSpark, method runTool:
@Override
protected void runTool(JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    JavaRDD<GATKRead> initialReads = getReads();
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(knownVariants, getIntervals());
    // TODO: Look into broadcasting the reference to all of the workers. This would make AddContextDataToReadSpark
    // TODO: and ApplyBQSRStub simpler (#855).
    JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, initialReads, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    // TODO: broadcast the reads header?
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    try (final PrintStream reportStream = new PrintStream(BucketUtils.createFile(outputTablesPath))) {
        RecalUtils.outputRecalibrationReport(reportStream, bqsrArgs, bqsrReport.getQuantizationInfo(), bqsrReport.getRecalibrationTables(), bqsrReport.getCovariates());
    }
}
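For orientation, each element of rddReadContext pairs a read with the reference bases and known-sites variants that overlap it. A minimal sketch of inspecting one such pair, assuming a hypothetical helper and the same types used above:

static void inspectFirstPair(JavaPairRDD<GATKRead, ReadContextData> rddReadContext) {
    // Pull one (read, context) pair back to the driver.
    Tuple2<GATKRead, ReadContextData> first = rddReadContext.first();
    ReadContextData context = first._2();
    // Reference bases spanning the read's reference window:
    ReferenceBases refBases = context.getOverlappingReferenceBases();
    // Known-sites variants overlapping the read:
    Iterable<GATKVariant> variants = context.getOverlappingVariants();
    System.out.println(first._1().getName() + " has context " + refBases.getInterval());
}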
Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.
The class ReadsPipelineSpark, method runTool:
@Override
protected void runTool(final JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    // TODO: should this use getUnfilteredReads? getReads will apply default and command line filters
    final JavaRDD<GATKRead> initialReads = getReads();
    final JavaRDD<GATKRead> markedReadsWithOD = MarkDuplicatesSpark.mark(initialReads, getHeaderForReads(), duplicatesScoringStrategy, new OpticalDuplicateFinder(), getRecommendedNumReducers());
    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.cleanupTemporaryAttributes(markedReadsWithOD);
    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDuplicates and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    // NOTE: this doesn't honor enabled/disabled command line filters.
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> markedFilteredReadsForBQSR = markedReads.filter(read -> bqsrReadFilter.test(read));
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, markedFilteredReadsForBQSR, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(markedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, finalReads);
}
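The BQSR-specific filtering step above can be read in isolation: ReadFilter.fromList combines the individual filters from BaseRecalibrator.getBQSRSpecificReadFilterList() into a single predicate, which is then applied per read. A hedged sketch of the same pattern as a standalone helper (the helper itself is hypothetical):

static JavaRDD<GATKRead> filterForBQSR(JavaRDD<GATKRead> reads, SAMFileHeader header) {
    // Compose the BQSR-specific filters into one test, as the tool does above.
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), header);
    return reads.filter(read -> bqsrReadFilter.test(read));
}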
Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.
The class BaseRecalibratorSparkFn, method apply:
public static RecalibrationReport apply(final JavaPairRDD<GATKRead, ReadContextData> readsWithContext, final SAMFileHeader header, final SAMSequenceDictionary referenceDictionary, final RecalibrationArgumentCollection recalArgs) {
    // Build one RecalibrationTables per partition by running the engine over every read in that partition.
    JavaRDD<RecalibrationTables> unmergedTables = readsWithContext.mapPartitions(readWithContextIterator -> {
        final BaseRecalibrationEngine bqsr = new BaseRecalibrationEngine(recalArgs, header);
        bqsr.logCovariatesUsed();
        while (readWithContextIterator.hasNext()) {
            final Tuple2<GATKRead, ReadContextData> readWithData = readWithContextIterator.next();
            Iterable<GATKVariant> variants = readWithData._2().getOverlappingVariants();
            final ReferenceBases refBases = readWithData._2().getOverlappingReferenceBases();
            ReferenceDataSource refDS = new ReferenceMemorySource(refBases, referenceDictionary);
            bqsr.processRead(readWithData._1(), refDS, variants);
        }
        return Arrays.asList(bqsr.getRecalibrationTables()).iterator();
    });
    // Tree-reduce the per-partition tables into a single table, then finalize and report.
    final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(recalArgs, header));
    final RecalibrationTables combinedTables = unmergedTables.treeAggregate(emptyRecalibrationTable, RecalibrationTables::inPlaceCombine, RecalibrationTables::inPlaceCombine, Math.max(1, (int) (Math.log(unmergedTables.partitions().size()) / Math.log(2))));
    BaseRecalibrationEngine.finalizeRecalibrationTables(combinedTables);
    final QuantizationInfo quantizationInfo = new QuantizationInfo(combinedTables, recalArgs.QUANTIZING_LEVELS);
    final StandardCovariateList covariates = new StandardCovariateList(recalArgs, header);
    return RecalUtils.createRecalibrationReport(recalArgs.generateReportTable(covariates.covariateNames()), quantizationInfo.generateReportTable(), RecalUtils.generateReportTables(combinedTables, covariates));
}
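The treeAggregate call above reduces the per-partition tables in a tree whose depth is log2 of the partition count, clamped to at least 1. That depth computation, shown in isolation (the helper name is hypothetical):

static int treeAggregateDepth(int numPartitions) {
    // e.g. 64 partitions -> depth 6; a single partition is clamped to depth 1.
    return Math.max(1, (int) (Math.log(numPartitions) / Math.log(2)));
}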
Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.
The class AddContextDataToReadSparkUnitTest, method addContextDataTest:
@Test(dataProvider = "bases", groups = "spark")
public void addContextDataTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, ReadContextData>> expectedReadContextData, JoinStrategy joinStrategy) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    when(mockSource.getReferenceBases(any(PipelineOptions.class), any())).then(new ReferenceBasesAnswer());
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SAMSequenceDictionary sd = new SAMSequenceDictionary(Lists.newArrayList(new SAMSequenceRecord("1", 100000), new SAMSequenceRecord("2", 100000)));
    when(mockSource.getReferenceSequenceDictionary(null)).thenReturn(sd);
    JavaPairRDD<GATKRead, ReadContextData> rddActual = AddContextDataToReadSpark.add(ctx, rddReads, mockSource, rddVariants, joinStrategy, sd, 10000, 1000);
    Map<GATKRead, ReadContextData> actual = rddActual.collectAsMap();
    Assert.assertEquals(actual.size(), expectedReadContextData.size());
    for (KV<GATKRead, ReadContextData> kv : expectedReadContextData) {
        ReadContextData readContextData = actual.get(kv.getKey());
        Assert.assertNotNull(readContextData);
        Assert.assertTrue(CollectionUtils.isEqualCollection(Lists.newArrayList(readContextData.getOverlappingVariants()), Lists.newArrayList(kv.getValue().getOverlappingVariants())));
        SimpleInterval minimalInterval = kv.getValue().getOverlappingReferenceBases().getInterval();
        ReferenceBases subset = readContextData.getOverlappingReferenceBases().getSubset(minimalInterval);
        Assert.assertEquals(subset, kv.getValue().getOverlappingReferenceBases());
    }
}
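Each expected entry consumed by this test pairs a read with the context it should receive. A minimal sketch of building one, assuming a hypothetical helper and the ReadContextData constructor used elsewhere on this page:

static KV<GATKRead, ReadContextData> expectedEntry(GATKRead read, ReferenceBases refBases, List<GATKVariant> variants) {
    // ReadContextData(refBases, variants) mirrors the constructor used in fillContext below.
    return KV.of(read, new ReadContextData(refBases, variants));
}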
Use of org.broadinstitute.hellbender.engine.ReadContextData in project gatk by broadinstitute.
The class AddContextDataToReadSparkOptimized, method fillContext:
/**
 * Given a shard that has reads and variants, query Google Genomics' Reference server and get reference info
 * (including an extra margin on either side), and fill that and the correct variants into readContext.
 */
public static ContextShard fillContext(ReferenceMultiSource refSource, ContextShard shard) {
    if (null == shard) {
        return null;
    }
    // use the window function to make sure we get exactly the right amount of reference bases
    int start = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;
    SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction = refSource.getReferenceWindowFunction();
    for (GATKRead r : shard.reads) {
        SimpleInterval readRefs = referenceWindowFunction.apply(r);
        start = Math.min(readRefs.getStart(), start);
        end = Math.max(readRefs.getEnd(), end);
    }
    if (start == Integer.MAX_VALUE) {
        // there are no reads in this shard, so we're going to remove it
        return null;
    }
    SimpleInterval refInterval = new SimpleInterval(shard.interval.getContig(), start, end);
    ReferenceBases refBases;
    try {
        refBases = refSource.getReferenceBases(null, refInterval);
    } catch (IOException x) {
        // preserve the cause instead of swallowing it
        throw new GATKException("Unable to read the reference", x);
    }
    ArrayList<ReadContextData> readContext = new ArrayList<>();
    for (GATKRead r : shard.reads) {
        SimpleInterval readInterval = new SimpleInterval(r);
        List<GATKVariant> variantsOverlappingThisRead = shard.variantsOverlapping(readInterval);
        // We pass all the bases. That's better because this way it's just a shared
        // pointer instead of an array copy. Downstream processing is fine with having
        // extra bases (it expects a few, actually).
        readContext.add(new ReadContextData(refBases, variantsOverlappingThisRead));
    }
    return shard.withReadContext(readContext);
}
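Because every ReadContextData built above shares the one refBases object, a consumer that needs only a read's exact window can trim it with getSubset, as the unit test above does. A minimal sketch (helper name and variables hypothetical):

static ReferenceBases exactWindow(GATKRead read, ReadContextData contextData) {
    // The read's own span on the reference:
    SimpleInterval window = new SimpleInterval(read);
    // Trim the shared reference block down to just that span.
    return contextData.getOverlappingReferenceBases().getSubset(window);
}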