Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
The class TransformTranslatorTest, method runPipeline:
private String runPipeline(String name, PipelineRunner<?> runner) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    String outFile = Joiner.on(File.separator).join(testDataDirName, "test_text_out_" + name);
    // Read the fixture text file and write it straight back out under the given name.
    PCollection<String> lines = p.apply(TextIO.Read.from("src/test/resources/test_text.txt"));
    lines.apply(TextIO.Write.to(outFile));
    runner.run(p);
    // Return the output path so the caller can verify the written contents.
    return outFile;
}
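A caller would typically invoke this helper once per runner and compare the outputs. A minimal sketch, assuming the enclosing test class; DirectPipelineRunner.createForTest() comes from the same Dataflow SDK, and the test name and assertions here are illustrative rather than the project's actual code:

@Test
public void testRunPipelineOnBothRunners() throws Exception {
    // Run the same read/write pipeline on the in-process runner and on Spark.
    String directOut = runPipeline("direct", DirectPipelineRunner.createForTest());
    String sparkOut = runPipeline("spark", SparkPipelineRunner.create());
    // TextIO.Write shards its output, so a real test would gather the shard
    // files under each prefix and compare their combined contents.
    assertNotNull(directOut);
    assertNotNull(sparkOut);
}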
Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
The class HadoopFileFormatPipelineTest, method testSequenceFile:
@Test
public void testSequenceFile() throws Exception {
    populateFile();
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    // The double cast works around Java generics when naming the Hadoop format classes.
    @SuppressWarnings("unchecked")
    Class<? extends FileInputFormat<IntWritable, Text>> inputFormatClass =
        (Class<? extends FileInputFormat<IntWritable, Text>>) (Class<?>) SequenceFileInputFormat.class;
    HadoopIO.Read.Bound<IntWritable, Text> read =
        HadoopIO.Read.from(inputFile.getAbsolutePath(), inputFormatClass, IntWritable.class, Text.class);
    PCollection<KV<IntWritable, Text>> input = p.apply(read);
    @SuppressWarnings("unchecked")
    Class<? extends FileOutputFormat<IntWritable, Text>> outputFormatClass =
        (Class<? extends FileOutputFormat<IntWritable, Text>>) (Class<?>) TemplatedSequenceFileOutputFormat.class;
    @SuppressWarnings("unchecked")
    HadoopIO.Write.Bound<IntWritable, Text> write =
        HadoopIO.Write.to(outputFile.getAbsolutePath(), outputFormatClass, IntWritable.class, Text.class);
    input.apply(write.withoutSharding());
    EvaluationResult res = SparkPipelineRunner.create().run(p);
    res.close();
    // Read the sequence file back and check that every record round-tripped.
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (Reader reader = new Reader(new Configuration(), Reader.file(new Path(outputFile.toURI())))) {
        int i = 0;
        while (reader.next(key, value)) {
            assertEquals(i, key.get());
            assertEquals("value-" + i, value.toString());
            i++;
        }
    }
}
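The populateFile() helper is defined elsewhere in the test class. A plausible sketch that produces records matching the assertions above (key i, value "value-" + i); the record count and the use of SequenceFile.Writer are assumptions, not the project's confirmed code:

// Hypothetical sketch of populateFile(); only the key/value pattern is
// implied by the test's assertions, and the record count is an assumption.
private void populateFile() throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (SequenceFile.Writer writer = SequenceFile.createWriter(new Configuration(),
            SequenceFile.Writer.file(new Path(inputFile.toURI())),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(Text.class))) {
        for (int i = 0; i < 5; i++) { // record count is a guess
            key.set(i);
            value.set("value-" + i);
            writer.append(key, value);
        }
    }
}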
Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
The class NumShardsTest, method testText:
@Test
public void testText() throws Exception {
    SparkPipelineOptions options = SparkPipelineOptionsFactory.create();
    options.setRunner(SparkPipelineRunner.class);
    Pipeline p = Pipeline.create(options);
    PCollection<String> inputWords = p.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());
    PCollection<String> output = inputWords.apply(new WordCount.CountWords())
        .apply(MapElements.via(new WordCount.FormatAsTextFn()));
    // Request exactly three output shards with a .txt suffix.
    output.apply(TextIO.Write.to(outputDir.getAbsolutePath()).withNumShards(3).withSuffix(".txt"));
    EvaluationResult res = SparkPipelineRunner.create().run(p);
    res.close();
    int count = 0;
    Set<String> expected = Sets.newHashSet("hi: 5", "there: 1", "sue: 2", "bob: 2");
    for (File f : tmpDir.getRoot().listFiles(new FileFilter() {
        @Override
        public boolean accept(File pathname) {
            return pathname.getName().matches("out-.*\\.txt");
        }
    })) {
        count++;
        for (String line : Files.readLines(f, Charsets.UTF_8)) {
            assertTrue(line + " not found", expected.remove(line));
        }
    }
    // Exactly three shard files, together containing every expected count line.
    assertEquals(3, count);
    assertTrue(expected.isEmpty());
}
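The WORDS fixture is declared elsewhere in the class. A definition consistent with the counts asserted above (hi: 5, there: 1, sue: 2, bob: 2), though the project's actual constant may differ in content and ordering:

// One input line per element; WordCount.CountWords splits lines into words.
// Chosen to yield hi: 5, there: 1, sue: 2, bob: 2; the exact lines are a guess.
private static final List<String> WORDS = Arrays.asList(
    "hi there", "hi", "hi sue bob", "hi sue", "", "bob hi");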
Use of com.google.cloud.dataflow.sdk.Pipeline in project gatk by broadinstitute.
The class ReferenceAPISourceUnitTest, method testCreateByAssemblyIDMultipleReferenceSets:
@Test(groups = "cloud", dataProvider = "assemblyIDDataMultiple", expectedExceptions = UserException.MultipleReferenceSets.class)
public void testCreateByAssemblyIDMultipleReferenceSets(final String assemblyID, final String refID) throws Exception {
    final Pipeline p = setupPipeline();
    // Looking up an assembly ID that matches more than one reference set
    // is expected to throw UserException.MultipleReferenceSets.
    final ReferenceAPISource apiSourceByAssemblyName = ReferenceAPISource.fromReferenceSetAssemblyID(p.getOptions(), assemblyID);
    final ReferenceAPISource apiSourceByID = makeReferenceAPISource(refID, p);
    Assert.assertEquals(apiSourceByAssemblyName.getReferenceMap(), apiSourceByID.getReferenceMap());
}
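Both GATK snippets depend on a setupPipeline() helper that the excerpt omits. A hypothetical sketch, assuming the test authenticates through Genomics-style pipeline options; GCSOptions and getGCPTestApiKey() are assumptions about the test fixture, not confirmed API:

// Hypothetical sketch of setupPipeline(): the Reference API calls need
// credentials, so the options presumably carry a test API key.
private Pipeline setupPipeline() {
    final GCSOptions options = PipelineOptionsFactory.as(GCSOptions.class); // option type assumed
    options.setApiKey(getGCPTestApiKey()); // fixture helper, assumed
    return Pipeline.create(options);
}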
Use of com.google.cloud.dataflow.sdk.Pipeline in project gatk by broadinstitute.
The class ReferenceAPISourceUnitTest, method queryReferenceAPI:
private ReferenceBases queryReferenceAPI(final String referenceName, final SimpleInterval interval) {
    final Pipeline p = setupPipeline();
    // Resolve the named reference, then fetch the bases covering the interval.
    ReferenceAPISource refAPISource = makeReferenceAPISource(referenceName, p);
    return refAPISource.getReferenceBases(p.getOptions(), interval);
}
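A test built on this helper would fetch a small interval and assert on the returned bases; an illustrative call, where the reference name and the interval are placeholders rather than values from the project:

// Illustrative only: "some-reference-name" and the coordinates are placeholders.
final SimpleInterval interval = new SimpleInterval("1", 50001, 50010);
final ReferenceBases bases = queryReferenceAPI("some-reference-name", interval);
Assert.assertEquals(bases.getInterval(), interval);
Assert.assertEquals(bases.getBases().length, interval.size());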