Use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.
In class FindBadGenomicKmersSpark, method runTool:
/** Get the list of high copy number kmers in the reference, and write them to a file. */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader hdr = getHeaderForReads();
    SAMSequenceDictionary dict = null;
    if (hdr != null)
        dict = hdr.getSequenceDictionary();
    final PipelineOptions options = getAuthenticatedGCSOptions();
    final ReferenceMultiSource referenceMultiSource = getReference();
    Collection<SVKmer> killList = findBadGenomicKmers(ctx, kSize, maxDUSTScore, referenceMultiSource, options, dict);
    if (highCopyFastaFilename != null) {
        killList = uniquify(killList, processFasta(kSize, maxDUSTScore, highCopyFastaFilename, options));
    }
    SVUtils.writeKmersFile(kSize, outputFile, killList);
}
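The uniquify call above merges the reference-derived kill list with kmers mined from the optional high-copy FASTA. A minimal sketch of what such a helper could look like, assuming it only merges the two collections and drops duplicates (the actual GATK helper is not shown on this page and may use a specialized set type):

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

// Sketch only, not the GATK implementation: merge both kmer collections
// into a set so that each kmer appears at most once in the output file.
private static Collection<SVKmer> uniquify(final Collection<SVKmer> kmers1, final Collection<SVKmer> kmers2) {
    final Set<SVKmer> uniqueKmers = new HashSet<>(kmers1.size() + kmers2.size());
    uniqueKmers.addAll(kmers1);
    uniqueKmers.addAll(kmers2);
    return uniqueKmers;
}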
Use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.
In class SVVCFWriter, method writeVCF:
/**
 * FASTA and Broadcast references are both required because 2bit Broadcast references currently order their
 * sequence dictionaries in a scrambled order, see https://github.com/broadinstitute/gatk/issues/2037.
 */
public static void writeVCF(final PipelineOptions pipelineOptions, final String vcfFileName, final String fastaReference, final JavaRDD<VariantContext> variantContexts, final Logger logger) {
    final SAMSequenceDictionary referenceSequenceDictionary = new ReferenceMultiSource(pipelineOptions, fastaReference, ReferenceWindowFunctions.IDENTITY_FUNCTION).getReferenceSequenceDictionary(null);
    final List<VariantContext> sortedVariantsList = sortVariantsByCoordinate(variantContexts.collect(), referenceSequenceDictionary);
    logNumOfVarByTypes(sortedVariantsList, logger);
    writeVariants(pipelineOptions, vcfFileName, sortedVariantsList, referenceSequenceDictionary);
}
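sortVariantsByCoordinate is a GATK helper whose source is not shown here. As a sketch under the assumption that htsjdk's VariantContextComparator(SAMSequenceDictionary) constructor is available, the same ordering (contig order of the dictionary, then start position) could be produced like this; the method name below is illustrative, not GATK's:

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import java.util.ArrayList;
import java.util.List;

// Sketch only, not the GATK source: copy the collected variants and sort
// them by the contig order defined by the reference sequence dictionary.
static List<VariantContext> sortVariantsByCoordinateSketch(final List<VariantContext> variants, final SAMSequenceDictionary dictionary) {
    final List<VariantContext> sorted = new ArrayList<>(variants);
    sorted.sort(new VariantContextComparator(dictionary));
    return sorted;
}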
Use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.
In class HaplotypeCallerSparkIntegrationTest, method testReferenceAdapterIsSerializable:
@Test
public void testReferenceAdapterIsSerializable() throws IOException {
    final AuthHolder auth = new AuthHolder("name", "somestring");
    final ReferenceMultiSource referenceMultiSource = new ReferenceMultiSource(auth, b37_2bit_reference_20_21, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SparkTestUtils.roundTripInKryo(referenceMultiSource, ReferenceMultiSource.class, SparkContextFactory.getTestSparkContext().getConf());
    final HaplotypeCallerSpark.ReferenceMultiSourceAdapter adapter = new HaplotypeCallerSpark.ReferenceMultiSourceAdapter(referenceMultiSource, auth);
    SparkTestUtils.roundTripInKryo(adapter, HaplotypeCallerSpark.ReferenceMultiSourceAdapter.class, SparkContextFactory.getTestSparkContext().getConf());
}
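roundTripInKryo serializes the object with Spark's Kryo configuration and reads it back, so any field Kryo cannot handle surfaces as a test failure. Conceptually, a Kryo round trip looks like the sketch below; this is an assumption-labeled illustration using a default Kryo instance, not the source of GATK's SparkTestUtils:

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

// Sketch only: write the object to an in-memory buffer with Kryo, then
// read it back and return the fresh copy.
static <T> T roundTripInKryoSketch(final T input, final Class<T> clazz) {
    final Kryo kryo = new Kryo();
    kryo.setRegistrationRequired(false);   // accept unregistered classes for this sketch
    final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (final Output out = new Output(bytes)) {
        kryo.writeObject(out, input);      // serialize into the byte buffer
    }
    try (final Input in = new Input(new ByteArrayInputStream(bytes.toByteArray()))) {
        return kryo.readObject(in, clazz); // deserialize a fresh copy
    }
}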
Use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.
In class HaplotypeCallerSparkIntegrationTest, method testReferenceMultiSourceIsSerializable:
@Test
public void testReferenceMultiSourceIsSerializable() {
    final ReferenceMultiSource args = new ReferenceMultiSource((PipelineOptions) null, BaseTest.b37_2bit_reference_20_21, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    SparkTestUtils.roundTripInKryo(args, ReferenceMultiSource.class, SparkContextFactory.getTestSparkContext().getConf());
}
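Unlike the previous test, this one exercises the no-auth code path: the source is built directly from a file path with null PipelineOptions. Since roundTripInKryo returns the deserialized copy, a hypothetical extension (not in the original test) could also verify behavior survives the round trip, for example by comparing sequence dictionaries via the getReferenceSequenceDictionary(null) accessor shown in the writeVCF snippet above:

// Hypothetical extension, not part of the original test: assert that the
// round-tripped copy still resolves a dictionary of the same size.
final ReferenceMultiSource copy = SparkTestUtils.roundTripInKryo(args, ReferenceMultiSource.class, SparkContextFactory.getTestSparkContext().getConf());
Assert.assertEquals(copy.getReferenceSequenceDictionary(null).getSequences().size(), args.getReferenceSequenceDictionary(null).getSequences().size());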
Use of org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource in project gatk by broadinstitute.
In class ReadWalkerSpark, method getReads:
/**
 * Loads reads and the corresponding reference and features into a {@link JavaRDD} for the intervals specified.
 *
 * If no intervals were specified, returns all the reads.
 *
 * @return all reads as a {@link JavaRDD}, bounded by intervals if specified.
 */
public JavaRDD<ReadWalkerContext> getReads(JavaSparkContext ctx) {
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    List<SimpleInterval> intervals = hasIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    // use unpadded shards (padding is only needed for reference bases)
    final List<ShardBoundary> intervalShards = intervals.stream().flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, 0, sequenceDictionary).stream()).collect(Collectors.toList());
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShards, readShardSize, shuffle);
    Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
    return shardedReads.flatMap(getReadsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, readShardPadding));
}
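Note how the ReferenceMultiSource is broadcast so every executor can fetch reference bases without reloading the source. A hypothetical consumption sketch (not from the GATK source, and assuming ReadWalkerContext exposes a getRead() accessor) shows how a ReadWalkerSpark subclass would typically traverse the returned RDD:

// Sketch only: apply per-read logic to each context in the sharded RDD.
getReads(ctx).foreach(context -> {
    final GATKRead read = context.getRead();
    // per-read logic goes here; reference bases and features for the read's
    // padded window are served from the broadcast sources
});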