Search in sources :

Example 1 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class FindBadGenomicKmersSpark method runTool.

/**
 * Finds the high copy number kmers in the reference and writes them to the output file.
 * If a high-copy FASTA file name is set, the kmers produced by {@code processFasta} for that file
 * are combined with the reference kmers through {@code uniquify} before writing.
 *
 * @param ctx the Spark context handed to {@code findBadGenomicKmers}
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader readsHeader = getHeaderForReads();
    // The sequence dictionary stays null when no reads header is available.
    final SAMSequenceDictionary sequenceDictionary =
            readsHeader == null ? null : readsHeader.getSequenceDictionary();
    final PipelineOptions gcsOptions = getAuthenticatedGCSOptions();
    final ReferenceMultiSource reference = getReference();

    final Collection<SVKmer> genomicKmers =
            findBadGenomicKmers(ctx, kSize, maxDUSTScore, reference, gcsOptions, sequenceDictionary);
    final Collection<SVKmer> killList = highCopyFastaFilename == null
            ? genomicKmers
            : uniquify(genomicKmers, processFasta(kSize, maxDUSTScore, highCopyFastaFilename, gcsOptions));

    SVUtils.writeKmersFile(kSize, outputFile, killList);
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 2 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class SVVCFWriter method writeVCF.

/**
 * Collects the given variants, sorts them by coordinate against the sequence dictionary of the
 * FASTA reference, logs the number of variants by type, and writes them to {@code vcfFileName}.
 *
 * FASTA and Broadcast references are both required because 2bit Broadcast references currently order their
 * sequence dictionaries in a scrambled order, see https://github.com/broadinstitute/gatk/issues/2037.
 *
 * @param pipelineOptions options used to open the reference and passed on to the variant writer
 * @param vcfFileName     name of the VCF file to write
 * @param fastaReference  location of the FASTA reference the sequence dictionary is read from
 * @param variantContexts the variants to write
 * @param logger          logger that receives the per-type variant counts
 */
public static void writeVCF(final PipelineOptions pipelineOptions, final String vcfFileName, final String fastaReference, final JavaRDD<VariantContext> variantContexts, final Logger logger) {
    // Open the FASTA reference only to obtain its sequence dictionary.
    final ReferenceMultiSource fastaSource =
            new ReferenceMultiSource(pipelineOptions, fastaReference, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    final SAMSequenceDictionary dictionary = fastaSource.getReferenceSequenceDictionary(null);

    // Bring all variants to the driver, then order them by coordinate using the dictionary.
    final List<VariantContext> collectedVariants = variantContexts.collect();
    final List<VariantContext> orderedVariants = sortVariantsByCoordinate(collectedVariants, dictionary);

    logNumOfVarByTypes(orderedVariants, logger);
    writeVariants(pipelineOptions, vcfFileName, orderedVariants, dictionary);
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 3 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class HaplotypeCallerSparkIntegrationTest method testReferenceAdapterIsSerializable.

/**
 * Round-trips a {@code ReferenceMultiSource}, and then a
 * {@code HaplotypeCallerSpark.ReferenceMultiSourceAdapter} built from it, through
 * {@code SparkTestUtils.roundTripInKryo} using the test Spark context's configuration.
 */
@Test
public void testReferenceAdapterIsSerializable() throws IOException {
    final AuthHolder authHolder = new AuthHolder("name", "somestring");

    // First the bare reference source...
    final ReferenceMultiSource reference = new ReferenceMultiSource(
            authHolder, b37_2bit_reference_20_21, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SparkTestUtils.roundTripInKryo(
            reference,
            ReferenceMultiSource.class,
            SparkContextFactory.getTestSparkContext().getConf());

    // ...then the adapter that wraps it.
    final HaplotypeCallerSpark.ReferenceMultiSourceAdapter wrappedReference =
            new HaplotypeCallerSpark.ReferenceMultiSourceAdapter(reference, authHolder);
    SparkTestUtils.roundTripInKryo(
            wrappedReference,
            HaplotypeCallerSpark.ReferenceMultiSourceAdapter.class,
            SparkContextFactory.getTestSparkContext().getConf());
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) AuthHolder(org.broadinstitute.hellbender.engine.AuthHolder) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) HaplotypeCallerIntegrationTest(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerIntegrationTest)

Example 4 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class HaplotypeCallerSparkIntegrationTest method testReferenceMultiSourceIsSerializable.

/**
 * Round-trips a {@code ReferenceMultiSource} created with null pipeline options through
 * {@code SparkTestUtils.roundTripInKryo} using the test Spark context's configuration.
 */
@Test
public void testReferenceMultiSourceIsSerializable() {
    // A typed null is needed so the PipelineOptions constructor overload is the one selected.
    final PipelineOptions noOptions = null;
    final ReferenceMultiSource reference = new ReferenceMultiSource(
            noOptions, BaseTest.b37_2bit_reference_20_21, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SparkTestUtils.roundTripInKryo(
            reference,
            ReferenceMultiSource.class,
            SparkContextFactory.getTestSparkContext().getConf());
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test) CommandLineProgramTest(org.broadinstitute.hellbender.CommandLineProgramTest) HaplotypeCallerIntegrationTest(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerIntegrationTest)

Example 5 with ReferenceMultiSource

use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.

the class ReadWalkerSpark method getReads.

/**
 * Loads reads and the corresponding reference and features into a {@link JavaRDD} for the intervals specified.
 *
 * If no intervals were specified, the intervals covering the whole reference (as given by the best
 * available sequence dictionary) are used, so all the reads are returned.
 *
 * @param ctx the Spark context used to shard the reads and to broadcast the reference and features
 * @return all reads as a {@link JavaRDD}, bounded by intervals if specified.
 */
public JavaRDD<ReadWalkerContext> getReads(JavaSparkContext ctx) {
    final SAMSequenceDictionary dictionary = getBestAvailableSequenceDictionary();

    final List<SimpleInterval> targetIntervals;
    if (hasIntervals()) {
        targetIntervals = getIntervals();
    } else {
        targetIntervals = IntervalUtils.getAllIntervalsForReference(dictionary);
    }

    // use unpadded shards (padding is only needed for reference bases)
    final List<ShardBoundary> shardBoundaries = targetIntervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, 0, dictionary).stream())
            .collect(Collectors.toList());

    final JavaRDD<Shard<GATKRead>> readShards =
            SparkSharder.shard(ctx, getReads(), GATKRead.class, dictionary, shardBoundaries, readShardSize, shuffle);

    // The reference and the feature manager are optional; a null broadcast marks each as absent.
    final Broadcast<ReferenceMultiSource> referenceBroadcast;
    if (hasReference()) {
        referenceBroadcast = ctx.broadcast(getReference());
    } else {
        referenceBroadcast = null;
    }
    final Broadcast<FeatureManager> featuresBroadcast;
    if (features != null) {
        featuresBroadcast = ctx.broadcast(features);
    } else {
        featuresBroadcast = null;
    }

    return readShards.flatMap(getReadsFunction(referenceBroadcast, featuresBroadcast, dictionary, readShardPadding));
}
Also used : Broadcast(org.apache.spark.broadcast.Broadcast) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) List(java.util.List) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) StreamSupport(java.util.stream.StreamSupport) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Aggregations

ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource): 30; SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 18; SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 17; JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 15; GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 12; Collectors (java.util.stream.Collectors): 11; JavaRDD (org.apache.spark.api.java.JavaRDD): 11; IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils): 11; StreamSupport (java.util.stream.StreamSupport): 10; BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 10; Test (org.testng.annotations.Test): 10; List (java.util.List): 9; Broadcast (org.apache.spark.broadcast.Broadcast): 9; Argument (org.broadinstitute.barclay.argparser.Argument): 8; org.broadinstitute.hellbender.engine (org.broadinstitute.hellbender.engine): 8; PipelineOptions (com.google.cloud.dataflow.sdk.options.PipelineOptions): 7; FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction): 7; SAMFileHeader (htsjdk.samtools.SAMFileHeader): 6; ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases): 6; Tuple2 (scala.Tuple2): 5