Search in sources :

Example 11 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class JoinReadsWithVariantsSparkUnitTest method pairReadsAndVariantsTest.

/**
 * Checks that joining reads with variants pairs every expected read with exactly the
 * expected set of variants, using whichever join implementation {@code joinStrategy} selects.
 *
 * @param reads the reads to join
 * @param variantList the variants to join the reads against
 * @param kvReadiVariant the expected pairing of each read with its variants
 * @param joinStrategy {@code SHUFFLE} exercises {@code ShuffleJoinReadsWithVariants}; any other
 *                     value exercises {@code BroadcastJoinReadsWithVariants}
 */
@Test(dataProvider = "pairedReadsAndVariants", groups = "spark")
public void pairReadsAndVariantsTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, Iterable<GATKVariant>>> kvReadiVariant, JoinStrategy joinStrategy) {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    final JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    final JavaPairRDD<GATKRead, Iterable<GATKVariant>> actual = joinStrategy == JoinStrategy.SHUFFLE ? ShuffleJoinReadsWithVariants.join(rddReads, rddVariants) : BroadcastJoinReadsWithVariants.join(rddReads, rddVariants);
    final Map<GATKRead, Iterable<GATKVariant>> gatkReadIterableMap = actual.collectAsMap();
    Assert.assertEquals(gatkReadIterableMap.size(), kvReadiVariant.size());
    for (final KV<GATKRead, Iterable<GATKVariant>> kv : kvReadiVariant) {
        final Iterable<GATKVariant> joinedVariants = gatkReadIterableMap.get(kv.getKey());
        // Fail with a readable message (rather than an NPE inside Lists.newArrayList)
        // when an expected read is missing from the join output.
        Assert.assertNotNull(joinedVariants, "join result has no entry for expected read " + kv.getKey());
        final List<GATKVariant> variants = Lists.newArrayList(joinedVariants);
        Assert.assertTrue(variants.stream().noneMatch(v -> v == null));
        // Compare as sets: the order in which variants are attached to a read is not checked.
        final Set<GATKVariant> hashVariants = new LinkedHashSet<>(variants);
        final Set<GATKVariant> expectedHashVariants = Sets.newLinkedHashSet(kv.getValue());
        Assert.assertEquals(hashVariants, expectedHashVariants);
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) java.util(java.util) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) KV(com.google.cloud.dataflow.sdk.values.KV) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Test(org.testng.annotations.Test) Read(com.google.api.services.genomics.model.Read) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Sets(com.google.common.collect.Sets) SAMRecord(htsjdk.samtools.SAMRecord) Lists(com.google.common.collect.Lists) Assert(org.testng.Assert) JavaRDD(org.apache.spark.api.java.JavaRDD) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 12 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class VariantsSparkSourceUnitTest method pairReadsAndVariantsTest.

/**
 * Checks that the variants read in parallel through {@code VariantsSparkSource} are equal
 * to the variants returned by {@code getSerialVariants} for the same VCF.
 *
 * @param vcf the VCF to load, as supplied by the {@code loadVariants} data provider
 */
@Test(dataProvider = "loadVariants", groups = "spark")
public void pairReadsAndVariantsTest(String vcf) {
    final JavaSparkContext sparkContext = SparkContextFactory.getTestSparkContext();
    // Build the RDD first, load the serial baseline, and only then collect.
    final JavaRDD<GATKVariant> variantsRdd = new VariantsSparkSource(sparkContext).getParallelVariants(vcf, null);
    final List<GATKVariant> expected = getSerialVariants(vcf);
    final List<GATKVariant> observed = variantsRdd.collect();
    Assert.assertEquals(observed, expected);
}
Also used : GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 13 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class VariantsSparkSourceUnitTest method getMultipleParallelVCFsTest.

/**
 * Checks that loading several VCFs in parallel through {@code VariantsSparkSource} produces
 * the same collection of variants (ignoring order) as loading them through {@code VariantsSource}.
 *
 * @param vcfList the VCFs to load, as supplied by the {@code loadMultipleVCFs} data provider
 */
@Test(dataProvider = "loadMultipleVCFs", groups = "spark")
public void getMultipleParallelVCFsTest(List<String> vcfList) {
    final JavaSparkContext sparkContext = SparkContextFactory.getTestSparkContext();
    final JavaRDD<GATKVariant> variantsRdd = new VariantsSparkSource(sparkContext).getParallelVariants(vcfList, null);
    // Baseline: the same variants obtained via VariantsSource, wrapped with the adapter
    // that VariantsSparkSource uses so the two collections are directly comparable.
    final List<GATKVariant> expected = VariantsSource.getVariantsListAs(vcfList, variantContext -> VariantContextVariantAdapter.sparkVariantAdapter(variantContext));
    final List<GATKVariant> observed = variantsRdd.collect();
    final boolean sameElements = CollectionUtils.isEqualCollection(observed, expected);
    Assert.assertTrue(sameElements);
}
Also used : GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 14 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class BQSRPipelineSpark method runTool.

/**
 * Runs the pipeline: filters the input reads for BQSR, attaches reference and known-variant
 * context to them, builds a recalibration report, applies it to the initial reads, and
 * writes the result to {@code output}.
 *
 * @param ctx the Spark context to run on
 * @throws UserException.Require2BitReferenceForBroadcast if the broadcast join strategy is
 *         selected but the reference is not compatible with Spark broadcast
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final boolean broadcastJoin = joinStrategy == JoinStrategy.BROADCAST;
    if (broadcastJoin && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }

    // Open question: should this use the unfiltered reads instead? getReads merges the default
    // and command-line filters, whereas the filters applied further down for other stages of
    // the pipeline do not honor the command line.
    final JavaRDD<GATKRead> inputReads = getReads();

    // The input reads already passed the WellformedReadFilter, which is all ApplyBQSR needs.
    // BQSR itself requires extra filtering, applied here.
    // NOTE: this filter ignores filters enabled/disabled on the command line.
    final ReadFilter recalibrationFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> readsForRecalibration = inputReads.filter(r -> recalibrationFilter.test(r));

    final JavaRDD<GATKVariant> knownVariants = new VariantsSparkSource(ctx).getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    final JavaPairRDD<GATKRead, ReadContextData> readsWithContext = AddContextDataToReadSpark.add(ctx, readsForRecalibration, getReference(), knownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);

    // Note: the sequence dictionary passed here is the one from the reads' header.
    final RecalibrationReport report = BaseRecalibratorSparkFn.apply(readsWithContext, getHeaderForReads(), getHeaderForReads().getSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> broadcastReport = ctx.broadcast(report);

    // Recalibration is applied to the initial reads, not the BQSR-filtered subset.
    final JavaRDD<GATKRead> recalibratedReads = ApplyBQSRSparkFn.apply(inputReads, broadcastReport, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, recalibratedReads);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource) RecalibrationReport(org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)

Example 15 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class AddContextDataToReadSpark method addUsingOverlapsPartitioning.

/**
 * Add context data ({@link ReadContextData}) to reads, using overlaps partitioning to avoid a shuffle.
 *
 * <p>The reads are sharded over unpadded shards derived from {@code sequenceDictionary}. For each
 * shard the reference bases of the padded shard interval are fetched once, and each read in the
 * shard is paired with those bases plus the variants overlapping the read. The variants are
 * collected in full, indexed in an {@link IntervalsSkipList} and broadcast, as is the reference source.
 *
 * @param ctx the Spark context
 * @param mappedReads the coordinate-sorted reads
 * @param referenceSource the reference source
 * @param variants the coordinate-sorted variants
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param shardSize the maximum size of each shard, in bases
 * @param shardPadding amount of extra context around each shard, in bases
 * @return a RDD of read-context pairs, in coordinate-sorted order
 */
private static JavaPairRDD<GATKRead, ReadContextData> addUsingOverlapsPartitioning(final JavaSparkContext ctx, final JavaRDD<GATKRead> mappedReads, final ReferenceMultiSource referenceSource, final JavaRDD<GATKVariant> variants, final SAMSequenceDictionary sequenceDictionary, final int shardSize, final int shardPadding) {
    final List<SimpleInterval> intervals = IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    // use unpadded shards (padding is only needed for reference bases)
    final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardSize, 0, sequenceDictionary).stream()).collect(Collectors.toList());
    final Broadcast<ReferenceMultiSource> bReferenceSource = ctx.broadcast(referenceSource);
    // collect() materializes every variant in one list before the skip list is built and
    // broadcast, so the whole variant set has to fit in memory.
    final IntervalsSkipList<GATKVariant> variantSkipList = new IntervalsSkipList<>(variants.collect());
    final Broadcast<IntervalsSkipList<GATKVariant>> variantsBroadcast = ctx.broadcast(variantSkipList);
    // NOTE(review): the bound handed to SparkSharder is the smaller of shardSize and shardPadding;
    // presumably it is the maximum length of a read the sharder must handle - confirm.
    int maxLocatableSize = Math.min(shardSize, shardPadding);
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, mappedReads, GATKRead.class, sequenceDictionary, intervalShards, maxLocatableSize);
    return shardedReads.flatMapToPair(new PairFlatMapFunction<Shard<GATKRead>, GATKRead, ReadContextData>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Tuple2<GATKRead, ReadContextData>> call(Shard<GATKRead> shard) throws Exception {
            // get reference bases for this shard (padded)
            SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(shardPadding, sequenceDictionary);
            // NOTE(review): the first argument to getReferenceBases is passed as null - confirm
            // that every ReferenceMultiSource implementation tolerates that.
            ReferenceBases referenceBases = bReferenceSource.getValue().getReferenceBases(null, paddedInterval);
            final IntervalsSkipList<GATKVariant> intervalsSkipList = variantsBroadcast.getValue();
            // Pair each read of the shard with the shard-wide reference bases and the variants
            // overlapping that read. Every read in the shard shares the same ReferenceBases instance.
            Iterator<Tuple2<GATKRead, ReadContextData>> transform = Iterators.transform(shard.iterator(), new Function<GATKRead, Tuple2<GATKRead, ReadContextData>>() {

                // NOTE(review): r is annotated @Nullable (mirroring the Guava Function signature)
                // but is dereferenced without a null check; assumes the shard never yields null reads.
                @Nullable
                @Override
                public Tuple2<GATKRead, ReadContextData> apply(@Nullable GATKRead r) {
                    List<GATKVariant> overlappingVariants;
                    if (SimpleInterval.isValid(r.getContig(), r.getStart(), r.getEnd())) {
                        overlappingVariants = intervalsSkipList.getOverlapping(new SimpleInterval(r));
                    } else {
                        // Sometimes we have reads that do not form valid intervals (reads that do not
                        // consume any reference bases, e.g. CIGAR 61S90I).
                        // In those cases, we'll just say that nothing overlaps the read.
                        overlappingVariants = Collections.emptyList();
                    }
                    return new Tuple2<>(r, new ReadContextData(referenceBases, overlappingVariants));
                }
            });
            // only include reads that start in the shard
            return Iterators.filter(transform, r -> r._1().getStart() >= shard.getStart() && r._1().getStart() <= shard.getEnd());
        }
    });
}
Also used : PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Iterators(com.google.common.collect.Iterators) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) JavaRDD(org.apache.spark.api.java.JavaRDD) Nullable(javax.annotation.Nullable) Broadcast(org.apache.spark.broadcast.Broadcast) IntervalsSkipList(org.broadinstitute.hellbender.utils.collections.IntervalsSkipList) Function(com.google.common.base.Function) Iterator(java.util.Iterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) Shard(org.broadinstitute.hellbender.engine.Shard) List(java.util.List) UserException(org.broadinstitute.hellbender.exceptions.UserException) ShardBoundary(org.broadinstitute.hellbender.engine.ShardBoundary) Collections(java.util.Collections) ReadFilterLibrary(org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ShardBoundary(org.broadinstitute.hellbender.engine.ShardBoundary) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) Function(com.google.common.base.Function) IntervalsSkipList(org.broadinstitute.hellbender.utils.collections.IntervalsSkipList) Iterator(java.util.Iterator) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) 
ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) UserException(org.broadinstitute.hellbender.exceptions.UserException) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Tuple2(scala.Tuple2) Shard(org.broadinstitute.hellbender.engine.Shard) Nullable(javax.annotation.Nullable)

Aggregations

GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant)16 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)10 ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData)7 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)7 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)6 ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)5 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)4 Test (org.testng.annotations.Test)4 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)3 ArrayList (java.util.ArrayList)3 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)3 ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter)3 VariantsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource)3 IntervalsSkipList (org.broadinstitute.hellbender.utils.collections.IntervalsSkipList)3 RecalibrationReport (org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)3 Tuple2 (scala.Tuple2)3 IOException (java.io.IOException)2 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 JavaRDD (org.apache.spark.api.java.JavaRDD)2 ContextShard (org.broadinstitute.hellbender.engine.ContextShard)2