Search in sources :

Example 11 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class PathSeqFilterSpark method doHostBWA.

private JavaRDD<GATKRead> doHostBWA(final JavaSparkContext ctx, final SAMFileHeader readsHeader, final JavaRDD<GATKRead> reads) {
    final BwaSparkEngine engine = new BwaSparkEngine(ctx, indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary());
    // null if we have no api key
    final GCSOptions gcsOptions = getAuthenticatedGCSOptions();
    final ReferenceMultiSource hostReference = new ReferenceMultiSource(gcsOptions, HOST_REF_PATH, getReferenceWindowFunction());
    final SAMSequenceDictionary hostRefDict = hostReference.getReferenceSequenceDictionary(header.getSequenceDictionary());
    readsHeader.setSequenceDictionary(hostRefDict);
    return engine.align(reads);
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) BwaSparkEngine(org.broadinstitute.hellbender.tools.spark.bwa.BwaSparkEngine) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) GCSOptions(com.google.cloud.genomics.dataflow.utils.GCSOptions)

Example 12 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk-protected by broadinstitute.

the class HaplotypeCallerSparkIntegrationTest method testReferenceAdapterIsSerializable.

@Test
public void testReferenceAdapterIsSerializable() throws IOException {
    final AuthHolder auth = new AuthHolder("name", "somestring");
    final ReferenceMultiSource referenceMultiSource = new ReferenceMultiSource(auth, b37_2bit_reference_20_21, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SparkTestUtils.roundTripInKryo(referenceMultiSource, ReferenceMultiSource.class, SparkContextFactory.getTestSparkContext().getConf());
    final HaplotypeCallerSpark.ReferenceMultiSourceAdapter adapter = new HaplotypeCallerSpark.ReferenceMultiSourceAdapter(referenceMultiSource, auth);
    SparkTestUtils.roundTripInKryo(adapter, HaplotypeCallerSpark.ReferenceMultiSourceAdapter.class, SparkContextFactory.getTestSparkContext().getConf());
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) AuthHolder(org.broadinstitute.hellbender.engine.AuthHolder) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) HaplotypeCallerIntegrationTest(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerIntegrationTest)

Example 13 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class GATKSparkTool method initializeReference.

/**
     * Initializes our reference source. Does nothing if no reference was specified.
     */
private void initializeReference() {
    // null if we have no api key
    final GCSOptions gcsOptions = getAuthenticatedGCSOptions();
    final String referenceURL = referenceArguments.getReferenceFileName();
    if (referenceURL != null) {
        referenceSource = new ReferenceMultiSource(gcsOptions, referenceURL, getReferenceWindowFunction());
        referenceDictionary = referenceSource.getReferenceSequenceDictionary(readsHeader != null ? readsHeader.getSequenceDictionary() : null);
        if (referenceDictionary == null) {
            throw new UserException.MissingReferenceDictFile(referenceURL);
        }
    }
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) GCSOptions(com.google.cloud.genomics.dataflow.utils.GCSOptions)

Example 14 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class LocusWalkerSpark method getAlignments.

/**
     * Loads alignments and the corresponding reference and features into a {@link JavaRDD} for the intervals specified.
     *
     * If no intervals were specified, returns all the alignments.
     *
     * @return all alignments as a {@link JavaRDD}, bounded by intervals if specified.
     */
public JavaRDD<LocusWalkerContext> getAlignments(JavaSparkContext ctx) {
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    List<SimpleInterval> intervals = hasIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, readShardPadding, sequenceDictionary).stream()).collect(Collectors.toList());
    int maxLocatableSize = Math.min(readShardSize, readShardPadding);
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShards, maxLocatableSize, shuffle);
    Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
    return shardedReads.flatMap(getAlignmentsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, getHeaderForReads(), getDownsamplingInfo()));
}
Also used : Broadcast(org.apache.spark.broadcast.Broadcast) java.util(java.util) IntervalOverlappingIterator(org.broadinstitute.hellbender.utils.iterators.IntervalOverlappingIterator) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LocusIteratorByState(org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ImmutableList(com.google.common.collect.ImmutableList) StreamSupport(java.util.stream.StreamSupport) LIBSDownsamplingInfo(org.broadinstitute.hellbender.utils.locusiterator.LIBSDownsamplingInfo) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) CommandLineException(org.broadinstitute.barclay.argparser.CommandLineException) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 15 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class VariantWalkerSpark method getVariantsFunction.

private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(final Broadcast<ReferenceMultiSource> bReferenceSource, final Broadcast<FeatureManager> bFeatureManager, final SAMSequenceDictionary sequenceDictionary, final int variantShardPadding) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) shard -> {
        SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(variantShardPadding, sequenceDictionary);
        ReferenceDataSource reference = bReferenceSource == null ? null : new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedInterval), sequenceDictionary);
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return StreamSupport.stream(shard.spliterator(), false).filter(v -> v.getStart() >= shard.getStart() && v.getStart() <= shard.getEnd()).map(v -> {
            final SimpleInterval variantInterval = new SimpleInterval(v);
            return new VariantWalkerContext(v, new ReadsContext(), new ReferenceContext(reference, variantInterval), new FeatureContext(features, variantInterval));
        }).iterator();
    };
}
Also used : Broadcast(org.apache.spark.broadcast.Broadcast) VCFHeader(htsjdk.variant.vcf.VCFHeader) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) IndexUtils(org.broadinstitute.hellbender.utils.IndexUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) VariantFilterLibrary(org.broadinstitute.hellbender.engine.filters.VariantFilterLibrary) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) VariantFilter(org.broadinstitute.hellbender.engine.filters.VariantFilter) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) List(java.util.List) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource) StreamSupport(java.util.stream.StreamSupport) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) VariantContext(htsjdk.variant.variantcontext.VariantContext) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Aggregations

ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)30 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)18 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)17 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)15 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)12 Collectors (java.util.stream.Collectors)11 JavaRDD (org.apache.spark.api.java.JavaRDD)11 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)11 StreamSupport (java.util.stream.StreamSupport)10 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)10 Test (org.testng.annotations.Test)10 List (java.util.List)9 Broadcast (org.apache.spark.broadcast.Broadcast)9 Argument (org.broadinstitute.barclay.argparser.Argument)8 org.broadinstitute.hellbender.engine (org.broadinstitute.hellbender.engine)8 PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions)7 FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction)7 SAMFileHeader (htsjdk.samtools.SAMFileHeader)6 ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)6 Tuple2 (scala.Tuple2)5