use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
the class FindBadGenomicKmersSpark, method processFasta.
@VisibleForTesting
static List<SVKmer> processFasta(final int kSize, final int maxDUSTScore, final String fastaFilename, final PipelineOptions options) {
    try (BufferedReader rdr = new BufferedReader(new InputStreamReader(BucketUtils.openFile(fastaFilename)))) {
        final List<SVKmer> kmers = new ArrayList<>((int) BucketUtils.fileSize(fastaFilename));
        String line;
        final StringBuilder sb = new StringBuilder();
        final SVKmer kmerSeed = new SVKmerLong();
        while ((line = rdr.readLine()) != null) {
            if (line.charAt(0) != '>')
                sb.append(line);
            else if (sb.length() > 0) {
                SVDUSTFilteredKmerizer.stream(sb, kSize, maxDUSTScore, kmerSeed).map(kmer -> kmer.canonical(kSize)).forEach(kmers::add);
                sb.setLength(0);
            }
        }
        if (sb.length() > 0) {
            SVDUSTFilteredKmerizer.stream(sb, kSize, maxDUSTScore, kmerSeed).map(kmer -> kmer.canonical(kSize)).forEach(kmers::add);
        }
        return kmers;
    } catch (IOException ioe) {
        throw new GATKException("Can't read high copy kmers fasta file " + fastaFilename, ioe);
    }
}
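For context, a minimal sketch of how processFasta might be invoked; the k-mer size, DUST score threshold, and FASTA path below are illustrative assumptions rather than values taken from the snippet, and passing null for the PipelineOptions assumes a local file.
// Hypothetical call: kmerize a local high-copy FASTA with illustrative parameters.
final List<SVKmer> highCopyKmers = FindBadGenomicKmersSpark.processFasta(51, 49, "/tmp/high_copy_sequences.fasta", null);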
use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
the class StructuralVariationDiscoveryPipelineSpark, method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader header = getHeaderForReads();
    final PipelineOptions pipelineOptions = getAuthenticatedGCSOptions();
    // gather evidence, run assembly, and align
    final List<AlignedAssemblyOrExcuse> alignedAssemblyOrExcuseList = FindBreakpointEvidenceSpark.gatherEvidenceAndWriteContigSamFile(ctx, evidenceAndAssemblyArgs, header, getUnfilteredReads(), outputSAM, localLogger);
    if (alignedAssemblyOrExcuseList.isEmpty())
        return;
    // parse the contig alignments and extract necessary information
    @SuppressWarnings("unchecked")
    final JavaRDD<AlignedContig> parsedAlignments = new InMemoryAlignmentParser(ctx, alignedAssemblyOrExcuseList, header, localLogger).getAlignedContigs();
    if (parsedAlignments.isEmpty())
        return;
    // discover variants and write to vcf
    DiscoverVariantsFromContigAlignmentsSAMSpark.discoverVariantsAndWriteVCF(parsedAlignments, discoverStageArgs.fastaReference, ctx.broadcast(getReference()), pipelineOptions, vcfOutputFileName, localLogger);
}
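The two early returns above stop the pipeline silently when a stage produces nothing. A hedged variant that logs the reason before returning, reusing the localLogger field already present in the snippet (the message text is an assumption, not from the GATK source):
if (alignedAssemblyOrExcuseList.isEmpty()) {
    // Illustrative log message before bailing out of the pipeline.
    localLogger.warn("No assembled contigs were produced; skipping alignment parsing and variant discovery.");
    return;
}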
use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
the class ReferenceUtilsUnitTest, method testLoadFastaDictionaryFromGCSBucket.
@Test(groups = { "bucket" })
public void testLoadFastaDictionaryFromGCSBucket() throws IOException {
    final String bucketDictionary = getGCPTestInputPath() + "org/broadinstitute/hellbender/utils/ReferenceUtilsTest.dict";
    final PipelineOptions popts = getAuthenticatedPipelineOptions();
    try (final InputStream referenceDictionaryStream = BucketUtils.openFile(bucketDictionary)) {
        final SAMSequenceDictionary dictionary = ReferenceUtils.loadFastaDictionary(referenceDictionaryStream);
        Assert.assertNotNull(dictionary, "Sequence dictionary null after loading");
        Assert.assertEquals(dictionary.size(), 4, "Wrong sequence dictionary size after loading");
    }
}
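The same ReferenceUtils.loadFastaDictionary(InputStream) call works on any stream, not only a GCS bucket; a minimal local-file sketch, assuming a .dict file on disk (the path below is illustrative):
// Sketch: load a sequence dictionary from a local file instead of a GCS bucket.
try (final InputStream localDictStream = new FileInputStream("src/test/resources/ReferenceUtilsTest.dict")) { // path is an assumption
    final SAMSequenceDictionary localDictionary = ReferenceUtils.loadFastaDictionary(localDictStream);
    Assert.assertNotNull(localDictionary, "Sequence dictionary null after loading from local file");
}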
use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
the class BucketUtilsTest, method testCopyAndDeleteHDFS.
@Test
public void testCopyAndDeleteHDFS() throws Exception {
    final String src = publicTestDir + "empty.vcf";
    File dest = createTempFile("copy-empty", ".vcf");
    MiniClusterUtils.runOnIsolatedMiniCluster(cluster -> {
        final String intermediate = BucketUtils.randomRemotePath(MiniClusterUtils.getWorkingDir(cluster).toString(), "test-copy-empty", ".vcf");
        Assert.assertTrue(BucketUtils.isHadoopUrl(intermediate), "!BucketUtils.isHadoopUrl(intermediate)");
        PipelineOptions popts = null;
        BucketUtils.copyFile(src, intermediate);
        BucketUtils.copyFile(intermediate, dest.getPath());
        IOUtil.assertFilesEqual(new File(src), dest);
        Assert.assertTrue(BucketUtils.fileExists(intermediate));
        BucketUtils.deleteFile(intermediate);
        Assert.assertFalse(BucketUtils.fileExists(intermediate));
    });
}
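The BucketUtils.copyFile, fileExists, and deleteFile calls used above take plain path strings, so the same round-trip can be sketched against local files without the mini-cluster (the temp-file name below is illustrative):
// Sketch: copy a local file, verify it exists, then delete it, using the same BucketUtils calls.
final String localSrc = publicTestDir + "empty.vcf";
final String localCopy = createTempFile("copy-empty-local", ".vcf").getPath();
BucketUtils.copyFile(localSrc, localCopy);
Assert.assertTrue(BucketUtils.fileExists(localCopy));
BucketUtils.deleteFile(localCopy);
Assert.assertFalse(BucketUtils.fileExists(localCopy));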
use of com.google.cloud.dataflow.sdk.options.PipelineOptions in project gatk by broadinstitute.
the class PathSeqFilterSpark, method doKmerFiltering.
@SuppressWarnings("unchecked")
private JavaRDD<GATKRead> doKmerFiltering(final JavaSparkContext ctx, final JavaRDD<GATKRead> reads) {
    final PipelineOptions options = getAuthenticatedGCSOptions();
    Input input = new Input(BucketUtils.openFile(KMER_LIB_PATH));
    Kryo kryo = new Kryo();
    kryo.setReferences(false);
    Set<SVKmer> kmerLibSet = (HopscotchSet<SVKmer>) kryo.readClassAndObject(input);
    return reads.filter(new ContainsKmerReadFilterSpark(ctx.broadcast(kmerLibSet), KMER_SIZE));
}
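Reading the k-mer library with kryo.readClassAndObject implies it was written with the matching write call and with references disabled. A hedged sketch of producing such a file, assuming an existing Collection<SVKmer> named kmers, an illustrative output path, and that HopscotchSet can be constructed from a collection:
// Sketch: serialize a kmer set in the form doKmerFiltering expects to deserialize.
final Kryo kryo = new Kryo();
kryo.setReferences(false);
try (final Output output = new Output(new FileOutputStream("kmer_lib.kryo"))) { // path is illustrative
    kryo.writeClassAndObject(output, new HopscotchSet<>(kmers)); // kmers: assumed Collection<SVKmer>
}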