Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
From the class DoFnOutputTest, method test():
@Test
public void test() throws Exception {
  SparkPipelineOptions options = SparkPipelineOptionsFactory.create();
  options.setRunner(SparkPipelineRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> strings = pipeline.apply(Create.of("a"));
  // Test that values written from startBundle() and finishBundle() are
  // written to the output
  PCollection<String> output = strings.apply(ParDo.of(new DoFn<String, String>() {
    @Override
    public void startBundle(Context c) throws Exception {
      c.output("start");
    }

    @Override
    public void processElement(ProcessContext c) throws Exception {
      c.output(c.element());
    }

    @Override
    public void finishBundle(Context c) throws Exception {
      c.output("finish");
    }
  }));
  DataflowAssert.that(output).containsInAnyOrder("start", "a", "finish");
  EvaluationResult res = SparkPipelineRunner.create().run(pipeline);
  res.close();
}
Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
From the class EmptyInputTest, method test():
@Test
public void test() throws Exception {
  SparkPipelineOptions options = SparkPipelineOptionsFactory.create();
  Pipeline p = Pipeline.create(options);
  List<String> empty = Collections.emptyList();
  PCollection<String> inputWords = p.apply(Create.of(empty)).setCoder(StringUtf8Coder.of());
  PCollection<String> output = inputWords.apply(Combine.globally(new ConcatWords()));
  EvaluationResult res = SparkPipelineRunner.create().run(p);
  assertEquals("", Iterables.getOnlyElement(res.get(output)));
  res.close();
}
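The ConcatWords combiner is defined elsewhere in the test class. A minimal sketch consistent with the assertion above (an empty input combines to the empty string), assuming it takes the SerializableFunction form accepted by Combine.globally:

public static class ConcatWords implements SerializableFunction<Iterable<String>, String> {
  @Override
  public String apply(Iterable<String> input) {
    StringBuilder all = new StringBuilder();
    for (String item : input) {
      if (!item.isEmpty()) {
        if (all.length() > 0) {
          all.append(",");
        }
        all.append(item);
      }
    }
    // An empty input iterable falls straight through, yielding "",
    // which is what the test asserts for the empty PCollection.
    return all.toString();
  }
}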
Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
From the class MultiOutputWordCountTest, method testRun():
@Test
public void testRun() throws Exception {
  Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
  PCollection<String> regex = p.apply(Create.of("[^a-zA-Z']+"));
  PCollection<String> w1 = p.apply(Create.of("Here are some words to count", "and some others"));
  PCollection<String> w2 = p.apply(Create.of("Here are some more words", "and even more words"));
  PCollectionList<String> list = PCollectionList.of(w1).and(w2);
  PCollection<String> union = list.apply(Flatten.<String>pCollections());
  PCollectionView<String> regexView = regex.apply(View.<String>asSingleton());
  CountWords countWords = new CountWords(regexView);
  PCollectionTuple luc = union.apply(countWords);
  PCollection<Long> unique = luc.get(lowerCnts).apply(ApproximateUnique.<KV<String, Long>>globally(16));
  EvaluationResult res = SparkPipelineRunner.create().run(p);
  Iterable<KV<String, Long>> actualLower = res.get(luc.get(lowerCnts));
  Assert.assertEquals("are", actualLower.iterator().next().getKey());
  Iterable<KV<String, Long>> actualUpper = res.get(luc.get(upperCnts));
  Assert.assertEquals("Here", actualUpper.iterator().next().getKey());
  Iterable<Long> actualUniqCount = res.get(unique);
  Assert.assertEquals(9, (long) actualUniqCount.iterator().next());
  int actualTotalWords = res.getAggregatorValue("totalWords", Integer.class);
  Assert.assertEquals(18, actualTotalWords);
  int actualMaxWordLength = res.getAggregatorValue("maxWordLength", Integer.class);
  Assert.assertEquals(6, actualMaxWordLength);
  AggregatorValues<Integer> aggregatorValues = res.getAggregatorValues(countWords.getTotalWordsAggregator());
  Assert.assertEquals(18, Iterables.getOnlyElement(aggregatorValues.getValues()).intValue());
  res.close();
}
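CountWords, the lowerCnts/upperCnts tags, and the totalWords/maxWordLength aggregators are defined elsewhere in the test class. A sketch of the shape such a multi-output transform could take, assuming the word-extraction DoFn splits each line on the regex side input, routes lower-case words to the main output and capitalized words to a side output, and feeds the two named aggregators; only the tag and aggregator names come from the test itself, the rest is an illustrative reconstruction:

private static final TupleTag<String> lower = new TupleTag<>();
private static final TupleTag<String> upper = new TupleTag<>();
private static final TupleTag<KV<String, Long>> lowerCnts = new TupleTag<>();
private static final TupleTag<KV<String, Long>> upperCnts = new TupleTag<>();

static class ExtractWordsFn extends DoFn<String, String> {
  private final Aggregator<Integer, Integer> totalWords =
      createAggregator("totalWords", new Sum.SumIntegerFn());
  private final Aggregator<Integer, Integer> maxWordLength =
      createAggregator("maxWordLength", new Max.MaxIntegerFn());
  private final PCollectionView<String> regex;

  ExtractWordsFn(PCollectionView<String> regex) {
    this.regex = regex;
  }

  @Override
  public void processElement(ProcessContext c) {
    // Split on the regex delivered as a side input.
    String[] words = c.element().split(c.sideInput(regex));
    for (String word : words) {
      totalWords.addValue(1);
      if (!word.isEmpty()) {
        maxWordLength.addValue(word.length());
        if (Character.isLowerCase(word.charAt(0))) {
          c.output(word);             // main output: lower-case words
        } else {
          c.sideOutput(upper, word);  // side output: capitalized words
        }
      }
    }
  }
}

static class CountWords extends PTransform<PCollection<String>, PCollectionTuple> {
  private final PCollectionView<String> regex;
  private final ExtractWordsFn extractWordsFn;

  CountWords(PCollectionView<String> regex) {
    this.regex = regex;
    this.extractWordsFn = new ExtractWordsFn(regex);
  }

  @Override
  public PCollectionTuple apply(PCollection<String> lines) {
    PCollectionTuple lowerUpper = lines.apply(ParDo.of(extractWordsFn)
        .withSideInputs(regex)
        .withOutputTags(lower, TupleTagList.of(upper)));
    lowerUpper.get(lower).setCoder(StringUtf8Coder.of());
    lowerUpper.get(upper).setCoder(StringUtf8Coder.of());
    // Count each branch separately and repackage under the public tags.
    return PCollectionTuple
        .of(lowerCnts, lowerUpper.get(lower).apply(Count.<String>perElement()))
        .and(upperCnts, lowerUpper.get(upper).apply(Count.<String>perElement()));
  }

  Aggregator<Integer, Integer> getTotalWordsAggregator() {
    return extractWordsFn.totalWords;
  }
}

Keeping the DoFn instance as a field of the composite is what lets the test fetch aggregator values by reference via getTotalWordsAggregator(), in addition to looking them up by name.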
Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
From the class KafkaStreamingTest, method testRun():
@Test
public void testRun() throws Exception {
  // test read from Kafka
  SparkStreamingPipelineOptions options = SparkStreamingPipelineOptionsFactory.create();
  options.setAppName(this.getClass().getSimpleName());
  options.setRunner(SparkPipelineRunner.class);
  // run for one interval
  options.setTimeout(TEST_TIMEOUT_MSEC);
  Pipeline p = Pipeline.create(options);
  Map<String, String> kafkaParams = ImmutableMap.of(
      "metadata.broker.list", EMBEDDED_KAFKA_CLUSTER.getBrokerList(),
      "auto.offset.reset", "smallest");
  PCollection<KV<String, String>> kafkaInput = p.apply(KafkaIO.Read.from(
      StringDecoder.class, StringDecoder.class, String.class, String.class,
      Collections.singleton(TOPIC), kafkaParams));
  PCollection<KV<String, String>> windowedWords = kafkaInput.apply(
      Window.<KV<String, String>>into(FixedWindows.of(Duration.standardSeconds(1))));
  PCollection<String> formattedKV = windowedWords.apply(ParDo.of(new FormatKVFn()));
  DataflowAssert.thatIterable(formattedKV.apply(View.<String>asIterable())).containsInAnyOrder(EXPECTED);
  EvaluationResult res = SparkPipelineRunner.create(options).run(p);
  res.close();
  DataflowAssertStreaming.assertNoFailures(res);
}
Use of com.google.cloud.dataflow.sdk.Pipeline in project spark-dataflow by cloudera.
From the class CombineGloballyTest, method test():
@Test
public void test() throws Exception {
  SparkPipelineOptions options = SparkPipelineOptionsFactory.create();
  Pipeline p = Pipeline.create(options);
  PCollection<String> inputWords = p.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());
  PCollection<String> output = inputWords.apply(Combine.globally(new WordMerger()));
  EvaluationResult res = SparkPipelineRunner.create().run(p);
  assertEquals("hi there,hi,hi sue bob,hi sue,,bob hi", Iterables.getOnlyElement(res.get(output)));
  res.close();
}
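WORDS and WordMerger are defined elsewhere in the test class. The assertion implies WORDS holds the comma-separated entries in order, including one empty string, and that WordMerger concatenates its inputs with commas. A sketch as a Combine.CombineFn, using a null accumulator to distinguish "no input yet" from an empty-string input:

private static final List<String> WORDS =
    Arrays.asList("hi there", "hi", "hi sue bob", "hi sue", "", "bob hi");

public static class WordMerger extends Combine.CombineFn<String, StringBuilder, String> {
  @Override
  public StringBuilder createAccumulator() {
    // Start from null so that an empty-string input ("" in WORDS) is
    // distinguishable from an accumulator that has seen nothing.
    return null;
  }

  @Override
  public StringBuilder addInput(StringBuilder accumulator, String input) {
    return combine(accumulator, input);
  }

  @Override
  public StringBuilder mergeAccumulators(Iterable<StringBuilder> accumulators) {
    StringBuilder merged = null;
    for (StringBuilder accum : accumulators) {
      if (accum != null) {
        merged = combine(merged, accum.toString());
      }
    }
    return merged;
  }

  @Override
  public String extractOutput(StringBuilder accumulator) {
    return accumulator == null ? "" : accumulator.toString();
  }

  private static StringBuilder combine(StringBuilder accum, String datum) {
    if (accum == null) {
      return new StringBuilder(datum);
    }
    return accum.append(",").append(datum);
  }
}

Note that the asserted string depends on the input order being preserved, which holds here because the local runner processes the small Create input as a single in-order bundle.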