Search in sources :

Example 1 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class BaseRecalibratorSparkFn method apply.

public static RecalibrationReport apply(final JavaPairRDD<GATKRead, ReadContextData> readsWithContext, final SAMFileHeader header, final SAMSequenceDictionary referenceDictionary, final RecalibrationArgumentCollection recalArgs) {
    JavaRDD<RecalibrationTables> unmergedTables = readsWithContext.mapPartitions(readWithContextIterator -> {
        final BaseRecalibrationEngine bqsr = new BaseRecalibrationEngine(recalArgs, header);
        bqsr.logCovariatesUsed();
        while (readWithContextIterator.hasNext()) {
            final Tuple2<GATKRead, ReadContextData> readWithData = readWithContextIterator.next();
            Iterable<GATKVariant> variants = readWithData._2().getOverlappingVariants();
            final ReferenceBases refBases = readWithData._2().getOverlappingReferenceBases();
            ReferenceDataSource refDS = new ReferenceMemorySource(refBases, referenceDictionary);
            bqsr.processRead(readWithData._1(), refDS, variants);
        }
        return Arrays.asList(bqsr.getRecalibrationTables()).iterator();
    });
    final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(recalArgs, header));
    final RecalibrationTables combinedTables = unmergedTables.treeAggregate(emptyRecalibrationTable, RecalibrationTables::inPlaceCombine, RecalibrationTables::inPlaceCombine, Math.max(1, (int) (Math.log(unmergedTables.partitions().size()) / Math.log(2))));
    BaseRecalibrationEngine.finalizeRecalibrationTables(combinedTables);
    final QuantizationInfo quantizationInfo = new QuantizationInfo(combinedTables, recalArgs.QUANTIZING_LEVELS);
    final StandardCovariateList covariates = new StandardCovariateList(recalArgs, header);
    return RecalUtils.createRecalibrationReport(recalArgs.generateReportTable(covariates.covariateNames()), quantizationInfo.generateReportTable(), RecalUtils.generateReportTables(combinedTables, covariates));
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReferenceDataSource(org.broadinstitute.hellbender.engine.ReferenceDataSource) ReferenceMemorySource(org.broadinstitute.hellbender.engine.ReferenceMemorySource) StandardCovariateList(org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases)

Example 2 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class BaseRecalibratorEngineSparkWrapper method apply.

public Iterator<RecalibrationTables> apply(Iterator<ContextShard> shards) throws Exception {
    this.header = headerBcast.value();
    this.referenceSequenceDictionary = referenceSequenceDictionaryBcast.value();
    recalibrationEngine = new BaseRecalibrationEngine(recalArgs, header);
    while (shards.hasNext()) {
        ContextShard shard = shards.next();
        for (int i = 0; i < shard.reads.size(); i++) {
            final GATKRead read = shard.reads.get(i);
            // Reads are shipped without the header -- put it back in
            ReadUtils.restoreHeaderIfNecessary(read, header);
            final ReadContextData rc = shard.readContext.get(i);
            final Iterable<GATKVariant> variants = rc.getOverlappingVariants();
            final ReferenceBases refBases = rc.getOverlappingReferenceBases();
            final ReferenceDataSource refDS = new ReferenceMemorySource(refBases, referenceSequenceDictionary);
            recalibrationEngine.processRead(read, refDS, variants);
        }
    }
    ArrayList<RecalibrationTables> ret = new ArrayList<>();
    ret.add(recalibrationEngine.getRecalibrationTables());
    return ret.iterator();
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ArrayList(java.util.ArrayList) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases)

Example 3 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class AddContextDataToReadSparkOptimized method fillContext.

/**
     * Given a shard that has reads and variants, query Google Genomics' Reference server and get reference info
     * (including an extra margin on either side), and fill that and the correct variants into readContext.
     */
public static ContextShard fillContext(ReferenceMultiSource refSource, ContextShard shard) {
    if (null == shard)
        return null;
    // use the function to make sure we get the exact correct amount of reference bases
    int start = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;
    SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction = refSource.getReferenceWindowFunction();
    for (GATKRead r : shard.reads) {
        SimpleInterval readRefs = referenceWindowFunction.apply(r);
        start = Math.min(readRefs.getStart(), start);
        end = Math.max(readRefs.getEnd(), end);
    }
    if (start == Integer.MAX_VALUE) {
        // there are no reads in this shard, so we're going to remove it
        return null;
    }
    SimpleInterval refInterval = new SimpleInterval(shard.interval.getContig(), start, end);
    ReferenceBases refBases;
    try {
        refBases = refSource.getReferenceBases(null, refInterval);
    } catch (IOException x) {
        throw new GATKException("Unable to read the reference");
    }
    ArrayList<ReadContextData> readContext = new ArrayList<>();
    for (GATKRead r : shard.reads) {
        SimpleInterval readInterval = new SimpleInterval(r);
        List<GATKVariant> variantsOverlappingThisRead = shard.variantsOverlapping(readInterval);
        // we pass all the bases. That's better because this way it's just a shared
        // pointer instead of being an array copy. Downstream processing is fine with having
        // extra bases (it expects a few, actually).
        readContext.add(new ReadContextData(refBases, variantsOverlappingThisRead));
    }
    return shard.withReadContext(readContext);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) IOException(java.io.IOException) GATKException(org.broadinstitute.hellbender.exceptions.GATKException)

Example 4 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class AddContextDataToReadSparkOptimized method fillVariants.

/**
     * Given a list of shards and a list of variants,
     * add each variant to every (shard+margin) that it overlaps.
     *
     * This happens immediately, at the caller.
     */
public static ArrayList<ContextShard> fillVariants(List<SimpleInterval> shardedIntervals, List<GATKVariant> variants, int margin) {
    IntervalsSkipList<GATKVariant> intervals = new IntervalsSkipList<>(variants);
    ArrayList<ContextShard> ret = new ArrayList<>();
    for (SimpleInterval s : shardedIntervals) {
        int start = Math.max(s.getStart() - margin, 1);
        int end = s.getEnd() + margin;
        // here it's OK if end is past the contig's boundary, there just won't be any variant there.
        SimpleInterval expandedInterval = new SimpleInterval(s.getContig(), start, end);
        // the next ContextShard has interval s because we want it to contain all reads that start in s.
        // We give it all variants that overlap the expanded interval in order to make sure we include
        // all the variants that overlap with the reads of interest.
        //
        // Graphically:
        // |------- s --------|
        //--------expandedInterval------------------|
        //            |-- a read starting in s --|
        //                           |--- a variant overlapping the read ---|
        //
        // Since the read's length is less than margin, we know that by including all the variants that overlap
        // with the expanded interval we are also including all the variants that overlap with all the reads in this shard.
        ret.add(new ContextShard(s).withVariants(intervals.getOverlapping(expandedInterval)));
    }
    return ret;
}
Also used : ContextShard(org.broadinstitute.hellbender.engine.ContextShard) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) IntervalsSkipList(org.broadinstitute.hellbender.utils.collections.IntervalsSkipList) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 5 with GATKVariant

use of org.broadinstitute.hellbender.utils.variant.GATKVariant in project gatk by broadinstitute.

the class AddContextDataToReadSparkUnitTest method addContextDataTest.

@Test(dataProvider = "bases", groups = "spark")
public void addContextDataTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, ReadContextData>> expectedReadContextData, JoinStrategy joinStrategy) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    when(mockSource.getReferenceBases(any(PipelineOptions.class), any())).then(new ReferenceBasesAnswer());
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SAMSequenceDictionary sd = new SAMSequenceDictionary(Lists.newArrayList(new SAMSequenceRecord("1", 100000), new SAMSequenceRecord("2", 100000)));
    when(mockSource.getReferenceSequenceDictionary(null)).thenReturn(sd);
    JavaPairRDD<GATKRead, ReadContextData> rddActual = AddContextDataToReadSpark.add(ctx, rddReads, mockSource, rddVariants, joinStrategy, sd, 10000, 1000);
    Map<GATKRead, ReadContextData> actual = rddActual.collectAsMap();
    Assert.assertEquals(actual.size(), expectedReadContextData.size());
    for (KV<GATKRead, ReadContextData> kv : expectedReadContextData) {
        ReadContextData readContextData = actual.get(kv.getKey());
        Assert.assertNotNull(readContextData);
        Assert.assertTrue(CollectionUtils.isEqualCollection(Lists.newArrayList(readContextData.getOverlappingVariants()), Lists.newArrayList(kv.getValue().getOverlappingVariants())));
        SimpleInterval minimalInterval = kv.getValue().getOverlappingReferenceBases().getInterval();
        ReferenceBases subset = readContextData.getOverlappingReferenceBases().getSubset(minimalInterval);
        Assert.assertEquals(subset, kv.getValue().getOverlappingReferenceBases());
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Aggregations

GATKVariant (org.broadinstitute.hellbender.utils.variant.GATKVariant)16 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)10 ReadContextData (org.broadinstitute.hellbender.engine.ReadContextData)7 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)7 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)6 ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)5 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)4 Test (org.testng.annotations.Test)4 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)3 ArrayList (java.util.ArrayList)3 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)3 ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter)3 VariantsSparkSource (org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource)3 IntervalsSkipList (org.broadinstitute.hellbender.utils.collections.IntervalsSkipList)3 RecalibrationReport (org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)3 Tuple2 (scala.Tuple2)3 IOException (java.io.IOException)2 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 JavaRDD (org.apache.spark.api.java.JavaRDD)2 ContextShard (org.broadinstitute.hellbender.engine.ContextShard)2