Use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.
In class JoinReadsWithVariantsSparkUnitTest, method pairReadsAndVariantsTest:
@Test(dataProvider = "pairedReadsAndVariants", groups = "spark")
public void pairReadsAndVariantsTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, Iterable<GATKVariant>>> kvReadiVariant, JoinStrategy joinStrategy) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    JavaPairRDD<GATKRead, Iterable<GATKVariant>> actual = joinStrategy == JoinStrategy.SHUFFLE ?
            ShuffleJoinReadsWithVariants.join(rddReads, rddVariants) :
            BroadcastJoinReadsWithVariants.join(rddReads, rddVariants);
    Map<GATKRead, Iterable<GATKVariant>> gatkReadIterableMap = actual.collectAsMap();
    Assert.assertEquals(gatkReadIterableMap.size(), kvReadiVariant.size());
    for (KV<GATKRead, Iterable<GATKVariant>> kv : kvReadiVariant) {
        List<GATKVariant> variants = Lists.newArrayList(gatkReadIterableMap.get(kv.getKey()));
        Assert.assertTrue(variants.stream().noneMatch(v -> v == null));
        HashSet<GATKVariant> hashVariants = new LinkedHashSet<>(variants);
        final Iterable<GATKVariant> iVariants = kv.getValue();
        HashSet<GATKVariant> expectedHashVariants = Sets.newLinkedHashSet(iVariants);
        Assert.assertEquals(hashVariants, expectedHashVariants);
    }
}
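The two join strategies exercised by the test above differ in how the variant data reaches the reads: a shuffle join repartitions both RDDs by key, while a broadcast join ships the (typically much smaller) variant collection to every executor. Below is a minimal, self-contained Spark sketch of the broadcast pattern using plain strings in place of reads and variants; it illustrates the general technique only and is not the GATK implementation, and the class, locus strings, and variable names are invented for this example.

import java.util.*;
import org.apache.spark.api.java.*;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

// Illustrative sketch of a broadcast-style join; not GATK code.
public class BroadcastJoinSketch {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext("local[2]", "broadcast-join-sketch");
        // the "big" side: one record per read locus
        JavaRDD<String> reads = ctx.parallelize(Arrays.asList("chr1:100", "chr1:200", "chr2:50"));
        // the "small" side: variants keyed by the same loci
        Map<String, List<String>> variantsByLocus = new HashMap<>();
        variantsByLocus.put("chr1:100", Arrays.asList("rs1", "rs2"));
        variantsByLocus.put("chr2:50", Collections.singletonList("rs3"));
        // broadcast the small side to every executor instead of shuffling both sides
        Broadcast<Map<String, List<String>>> bVariants = ctx.broadcast(variantsByLocus);
        JavaPairRDD<String, List<String>> joined = reads.mapToPair(read ->
                new Tuple2<>(read, bVariants.getValue().getOrDefault(read, Collections.emptyList())));
        System.out.println(joined.collectAsMap());
        ctx.stop();
    }
}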
Use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.
In class VariantsSparkSourceUnitTest, method pairReadsAndVariantsTest:
@Test(dataProvider = "loadVariants", groups = "spark")
public void pairReadsAndVariantsTest(String vcf) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> rddParallelVariants = variantsSparkSource.getParallelVariants(vcf, null);
    List<GATKVariant> serialVariants = getSerialVariants(vcf);
    List<GATKVariant> parallelVariants = rddParallelVariants.collect();
    Assert.assertEquals(parallelVariants, serialVariants);
}
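getSerialVariants is a private helper of the test class and is not included in this excerpt; it loads the VCF without Spark so that the parallel result can be checked against a serial baseline. A hedged reconstruction of what such a helper might look like, using htsjdk's VCFFileReader and the same VariantContextVariantAdapter wrapper used elsewhere in these tests, is shown below (the method name is invented and this is an illustrative sketch, not the actual helper):

// Illustrative sketch only; assumes htsjdk.variant.vcf.VCFFileReader and
// htsjdk.variant.variantcontext.VariantContext are available on the classpath.
private static List<GATKVariant> readVariantsSerially(final String vcf) {
    final List<GATKVariant> variants = new ArrayList<>();
    try (final VCFFileReader reader = new VCFFileReader(new File(vcf), false)) {
        for (final VariantContext vc : reader) {
            variants.add(VariantContextVariantAdapter.sparkVariantAdapter(vc));
        }
    }
    return variants;
}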
Use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.
In class VariantsSparkSourceUnitTest, method getMultipleParallelVCFsTest:
@Test(dataProvider = "loadMultipleVCFs", groups = "spark")
public void getMultipleParallelVCFsTest(List<String> vcfList) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> rddParallelVariants = variantsSparkSource.getParallelVariants(vcfList, null);
    // retrieve the same set of variants, but through VariantsSource, and wrapped by
    // the same wrapper class used by VariantsSparkSource to facilitate comparison
    List<GATKVariant> variantsList = VariantsSource.getVariantsListAs(vcfList, vc -> VariantContextVariantAdapter.sparkVariantAdapter(vc));
    Assert.assertTrue(CollectionUtils.isEqualCollection(rddParallelVariants.collect(), variantsList));
}
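Unlike the single-VCF test above, which compares the parallel and serial results with assertEquals and therefore also checks ordering, this test uses CollectionUtils.isEqualCollection, which ignores element order; presumably the order in which variants come back from an RDD built over several VCFs is not guaranteed to match the order produced by VariantsSource.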
Use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.
In class BQSRPipelineSpark, method runTool:
@Override
protected void runTool(final JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    // Should this use getUnfilteredReads? getReads() merges the default and command-line filters,
    // but the code below applies other filters for parts of the pipeline that do not honor the command line.
    final JavaRDD<GATKRead> initialReads = getReads();
    // The initial reads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that ApplyBQSR wants. BQSR itself wants additional filtering
    // performed, so we do that here.
    // NOTE: this filter doesn't honor enabled/disabled command-line filters.
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> filteredReadsForBQSR = initialReads.filter(read -> bqsrReadFilter.test(read));
    final VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    final JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    final JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, filteredReadsForBQSR, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    // note: we use the sequence dictionary from the reads themselves.
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getHeaderForReads().getSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(initialReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, finalReads);
}
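The method above follows a compute-once/broadcast/apply shape: a small summary (the RecalibrationReport) is computed from a filtered view of the reads, broadcast to every executor, and then applied to the full initial read set. A minimal generic Spark sketch of that shape with plain integers is given below; it is purely illustrative, unrelated to the GATK classes, and all names in it are invented for the example.

import java.util.Arrays;
import org.apache.spark.api.java.*;
import org.apache.spark.broadcast.Broadcast;

// Illustrative sketch of the compute-once/broadcast/apply pattern; not GATK code.
public class ComputeBroadcastApplySketch {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext("local[2]", "compute-broadcast-apply");
        JavaRDD<Integer> records = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        // phase 1: derive a small "model" from a filtered view of the data
        final int model = records.filter(r -> r % 2 == 0).reduce(Integer::sum);
        // phase 2: broadcast the model and apply it to the full data set
        final Broadcast<Integer> bModel = ctx.broadcast(model);
        JavaRDD<Integer> adjusted = records.map(r -> r + bModel.getValue());
        System.out.println(adjusted.collect());
        ctx.stop();
    }
}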
Use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.
In class AddContextDataToReadSpark, method addUsingOverlapsPartitioning:
/**
 * Add context data ({@link ReadContextData}) to reads, using overlaps partitioning to avoid a shuffle.
 * @param ctx the Spark context
 * @param mappedReads the coordinate-sorted reads
 * @param referenceSource the reference source
 * @param variants the coordinate-sorted variants
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param shardSize the maximum size of each shard, in bases
 * @param shardPadding amount of extra context around each shard, in bases
 * @return an RDD of read-context pairs, in coordinate-sorted order
 */
private static JavaPairRDD<GATKRead, ReadContextData> addUsingOverlapsPartitioning(final JavaSparkContext ctx, final JavaRDD<GATKRead> mappedReads, final ReferenceMultiSource referenceSource, final JavaRDD<GATKVariant> variants, final SAMSequenceDictionary sequenceDictionary, final int shardSize, final int shardPadding) {
    final List<SimpleInterval> intervals = IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    // use unpadded shards (padding is only needed for reference bases)
    final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardSize, 0, sequenceDictionary).stream()).collect(Collectors.toList());
    final Broadcast<ReferenceMultiSource> bReferenceSource = ctx.broadcast(referenceSource);
    final IntervalsSkipList<GATKVariant> variantSkipList = new IntervalsSkipList<>(variants.collect());
    final Broadcast<IntervalsSkipList<GATKVariant>> variantsBroadcast = ctx.broadcast(variantSkipList);
    int maxLocatableSize = Math.min(shardSize, shardPadding);
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, mappedReads, GATKRead.class, sequenceDictionary, intervalShards, maxLocatableSize);
    return shardedReads.flatMapToPair(new PairFlatMapFunction<Shard<GATKRead>, GATKRead, ReadContextData>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Tuple2<GATKRead, ReadContextData>> call(Shard<GATKRead> shard) throws Exception {
            // get reference bases for this shard (padded)
            SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(shardPadding, sequenceDictionary);
            ReferenceBases referenceBases = bReferenceSource.getValue().getReferenceBases(null, paddedInterval);
            final IntervalsSkipList<GATKVariant> intervalsSkipList = variantsBroadcast.getValue();
            Iterator<Tuple2<GATKRead, ReadContextData>> transform = Iterators.transform(shard.iterator(), new Function<GATKRead, Tuple2<GATKRead, ReadContextData>>() {
                @Nullable
                @Override
                public Tuple2<GATKRead, ReadContextData> apply(@Nullable GATKRead r) {
                    List<GATKVariant> overlappingVariants;
                    if (SimpleInterval.isValid(r.getContig(), r.getStart(), r.getEnd())) {
                        overlappingVariants = intervalsSkipList.getOverlapping(new SimpleInterval(r));
                    } else {
                        // Sometimes we have reads that do not form valid intervals (reads that do not consume any
                        // reference bases, e.g. CIGAR 61S90I). In those cases, we'll just say that nothing overlaps the read.
                        overlappingVariants = Collections.emptyList();
                    }
                    return new Tuple2<>(r, new ReadContextData(referenceBases, overlappingVariants));
                }
            });
            // only include reads that start in the shard
            return Iterators.filter(transform, r -> r._1().getStart() >= shard.getStart() && r._1().getStart() <= shard.getEnd());
        }
    });
}
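The overlap queries above rely on IntervalsSkipList, which is built once from the collected variants, broadcast, and then queried per read. A small standalone illustration of that structure using SimpleInterval objects as the stored Locatables is shown below; the specific intervals are made up for the example, and the import paths are assumed from the usual GATK package layout.

import java.util.Arrays;
import java.util.List;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.collections.IntervalsSkipList;

// Illustrative fragment (e.g. inside a test method); SimpleInterval implements Locatable,
// so it can stand in for variants here.
List<SimpleInterval> loci = Arrays.asList(
        new SimpleInterval("chr1", 100, 200),
        new SimpleInterval("chr1", 150, 250),
        new SimpleInterval("chr2", 10, 20));
IntervalsSkipList<SimpleInterval> skipList = new IntervalsSkipList<>(loci);
// expected to return the two chr1 intervals that overlap [180, 190]
List<SimpleInterval> hits = skipList.getOverlapping(new SimpleInterval("chr1", 180, 190));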