Use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
In class FindBadGenomicKmersSpark, method runTool.
/** Get the list of high copy number kmers in the reference, and write them to a file. */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader hdr = getHeaderForReads();
    SAMSequenceDictionary dict = null;
    if (hdr != null) {
        dict = hdr.getSequenceDictionary();
    }
    final PipelineOptions options = getAuthenticatedGCSOptions();
    final ReferenceMultiSource referenceMultiSource = getReference();
    Collection<SVKmer> killList = findBadGenomicKmers(ctx, kSize, maxDUSTScore, referenceMultiSource, options, dict);
    if (highCopyFastaFilename != null) {
        killList = uniquify(killList, processFasta(kSize, maxDUSTScore, highCopyFastaFilename, options));
    }
    SVUtils.writeKmersFile(kSize, outputFile, killList);
}
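The uniquify call above merges the kill list derived from the reference with the kmers from the optional high-copy FASTA, dropping duplicates. A minimal sketch of such a helper, assuming only that it deduplicates the union of the two collections (GATK's actual implementation may use a specialized set type rather than java.util.HashSet):

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

// Hypothetical sketch of a uniquify helper: the union of two collections
// with duplicates removed. Sizing the set up front avoids rehashing.
static <T> Collection<T> uniquify(final Collection<T> coll1, final Collection<T> coll2) {
    final Set<T> result = new HashSet<>(coll1.size() + coll2.size());
    result.addAll(coll1);
    result.addAll(coll2);
    return result;
}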
Use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
In class FindBadGenomicKmersSpark, method processFasta.
@VisibleForTesting
static List<SVKmer> processFasta(final int kSize, final int maxDUSTScore, final String fastaFilename, final PipelineOptions options) {
    try (BufferedReader rdr = new BufferedReader(new InputStreamReader(BucketUtils.openFile(fastaFilename)))) {
        // the file size in bytes is a cheap upper bound on the number of kmers
        final List<SVKmer> kmers = new ArrayList<>((int) BucketUtils.fileSize(fastaFilename));
        String line;
        final StringBuilder sb = new StringBuilder();
        final SVKmer kmerSeed = new SVKmerLong();
        while ((line = rdr.readLine()) != null) {
            // guard against blank lines before testing the first character
            if (line.isEmpty() || line.charAt(0) != '>') {
                sb.append(line);
            } else if (sb.length() > 0) {
                // a new header ends the previous record: kmerize the accumulated sequence
                SVDUSTFilteredKmerizer.stream(sb, kSize, maxDUSTScore, kmerSeed).map(kmer -> kmer.canonical(kSize)).forEach(kmers::add);
                sb.setLength(0);
            }
        }
        // kmerize the final record, which has no trailing header to flush it
        if (sb.length() > 0) {
            SVDUSTFilteredKmerizer.stream(sb, kSize, maxDUSTScore, kmerSeed).map(kmer -> kmer.canonical(kSize)).forEach(kmers::add);
        }
        return kmers;
    } catch (IOException ioe) {
        throw new GATKException("Can't read high copy kmers fasta file " + fastaFilename, ioe);
    }
}
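Each kmer is passed through canonical(kSize) so that a kmer and its reverse complement count as the same sequence. For intuition, here is one common definition of the canonical form, the lexicographic minimum of a kmer and its reverse complement, sketched on plain Strings; SVKmerLong packs bases two bits apiece into longs and may apply a cheaper but equivalent rule:

// Illustrative only: canonicalization on plain Strings, not GATK's SVKmer code.
static String canonical(final String kmer) {
    final StringBuilder rc = new StringBuilder(kmer.length());
    // build the reverse complement by walking the kmer backwards
    for (int i = kmer.length() - 1; i >= 0; --i) {
        switch (kmer.charAt(i)) {
            case 'A': rc.append('T'); break;
            case 'C': rc.append('G'); break;
            case 'G': rc.append('C'); break;
            case 'T': rc.append('A'); break;
            default:  rc.append('N'); break;
        }
    }
    final String revComp = rc.toString();
    // pick whichever of the two strands sorts first
    return kmer.compareTo(revComp) <= 0 ? kmer : revComp;
}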
Use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
In class StructuralVariationDiscoveryPipelineSpark, method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader header = getHeaderForReads();
    final PipelineOptions pipelineOptions = getAuthenticatedGCSOptions();
    // gather evidence, run assembly, and align
    final List<AlignedAssemblyOrExcuse> alignedAssemblyOrExcuseList = FindBreakpointEvidenceSpark.gatherEvidenceAndWriteContigSamFile(ctx, evidenceAndAssemblyArgs, header, getUnfilteredReads(), outputSAM, localLogger);
    if (alignedAssemblyOrExcuseList.isEmpty()) {
        return;
    }
    // parse the contig alignments and extract necessary information
    @SuppressWarnings("unchecked")
    final JavaRDD<AlignedContig> parsedAlignments = new InMemoryAlignmentParser(ctx, alignedAssemblyOrExcuseList, header, localLogger).getAlignedContigs();
    if (parsedAlignments.isEmpty()) {
        return;
    }
    // discover variants and write to vcf
    DiscoverVariantsFromContigAlignmentsSAMSpark.discoverVariantsAndWriteVCF(parsedAlignments, discoverStageArgs.fastaReference, ctx.broadcast(getReference()), pipelineOptions, vcfOutputFileName, localLogger);
}
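The ctx.broadcast(getReference()) call ships a single read-only copy of the reference to each executor instead of serializing it into every task closure. A self-contained sketch of that Spark pattern, with a placeholder String standing in for the ReferenceMultiSource:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

// Sketch of the broadcast pattern: large shared data is registered once with
// the driver and fetched lazily on each executor via value().
static JavaRDD<Integer> broadcastExample(final JavaSparkContext ctx, final JavaRDD<String> contigs) {
    final Broadcast<String> refBcast = ctx.broadcast("ACGTACGT");
    return contigs.map(contig -> {
        final String ref = refBcast.value(); // fetched once per executor, not per task
        return Math.min(contig.length(), ref.length());
    });
}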
Use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
In class ReferenceUtilsUnitTest, method testLoadFastaDictionaryFromGCSBucket.
@Test(groups = { "bucket" })
public void testLoadFastaDictionaryFromGCSBucket() throws IOException {
    final String bucketDictionary = getGCPTestInputPath() + "org/broadinstitute/hellbender/utils/ReferenceUtilsTest.dict";
    final PipelineOptions popts = getAuthenticatedPipelineOptions();
    try (final InputStream referenceDictionaryStream = BucketUtils.openFile(bucketDictionary)) {
        final SAMSequenceDictionary dictionary = ReferenceUtils.loadFastaDictionary(referenceDictionaryStream);
        Assert.assertNotNull(dictionary, "Sequence dictionary null after loading");
        Assert.assertEquals(dictionary.size(), 4, "Wrong sequence dictionary size after loading");
    }
}
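A *.dict file is a SAM-format header: an @HD line plus one @SQ record per contig. Assuming loadFastaDictionary is built on htsjdk's text header codec, the decode step could look like the hypothetical loadDictionary helper below; this is a sketch, not GATK's actual implementation:

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.samtools.util.BufferedLineReader;
import java.io.InputStream;

// Decode the SAM-text header from the stream and pull out its sequence
// dictionary; sourceName is only used in error messages.
static SAMSequenceDictionary loadDictionary(final InputStream in, final String sourceName) {
    final SAMFileHeader header = new SAMTextHeaderCodec().decode(new BufferedLineReader(in), sourceName);
    return header.getSequenceDictionary();
}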
Use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
In class BaseRecalibratorSparkSharded, method hackilyCopyFromGCSIfNecessary.
// TODO: please add support for reading variant files from GCS.
private ArrayList<String> hackilyCopyFromGCSIfNecessary(List<String> localVariants) {
    int i = 0;
    Stopwatch hacking = Stopwatch.createStarted();
    boolean copied = false;
    ArrayList<String> ret = new ArrayList<>();
    for (String v : localVariants) {
        if (BucketUtils.isCloudStorageUrl(v)) {
            if (!copied) {
                logger.info("(HACK): copying the GCS variant file to local just so we can read it back.");
                copied = true;
            }
            // this only works with the API_KEY, but then again it's a hack so there's no point in polishing it. Please don't make me.
            PipelineOptions popts = auth.asPipelineOptionsDeprecated();
            String d = IOUtils.createTempFile("knownVariants-" + i, ".vcf").getAbsolutePath();
            // bump the counter so each temp file gets a distinct name prefix
            i++;
            try {
                BucketUtils.copyFile(v, d);
            } catch (IOException x) {
                throw new UserException.CouldNotReadInputFile(v, x);
            }
            ret.add(d);
        } else {
            ret.add(v);
        }
    }
    hacking.stop();
    if (copied) {
        logger.info("Copying the vcf took " + hacking.elapsed(TimeUnit.MILLISECONDS) + " ms.");
    }
    return ret;
}
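The same copy-to-local-temp pattern can be expressed with plain java.nio, independent of BucketUtils; in this sketch, openRemoteStream is a hypothetical stand-in for however the remote bytes are fetched:

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

// Stream a remote file into a local temp file and hand back its path.
static String copyToTempVcf(final InputStream openRemoteStream, final int index) throws IOException {
    final Path tmp = Files.createTempFile("knownVariants-" + index, ".vcf");
    tmp.toFile().deleteOnExit(); // clean up the copy when the JVM exits
    Files.copy(openRemoteStream, tmp, StandardCopyOption.REPLACE_EXISTING);
    return tmp.toAbsolutePath().toString();
}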