Use of com.google.cloud.dataflow.sdk.Pipeline in the project spark-dataflow by Cloudera.
The testRun method of the CombinePerKeyTest class.
@Test
public void testRun() {
  // Build a pipeline that counts occurrences of each word in WORDS.
  Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
  PCollection<String> inputWords = p.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());
  PCollection<KV<String, Long>> cnts = inputWords.apply(new SumPerKey<String>());
  EvaluationResult res = SparkPipelineRunner.create().run(p);
  try {
    // Collect the per-key counts into a map for direct lookup.
    Map<String, Long> actualCnts = new HashMap<>();
    for (KV<String, Long> kv : res.get(cnts)) {
      actualCnts.put(kv.getKey(), kv.getValue());
    }
    Assert.assertEquals(8, actualCnts.size());
    Assert.assertEquals(Long.valueOf(2L), actualCnts.get("the"));
  } finally {
    // Release the Spark evaluation resources even when an assertion fails;
    // the original closed only on the success path.
    res.close();
  }
}
Use of com.google.cloud.dataflow.sdk.Pipeline in the project spark-dataflow by Cloudera.
The testRun method of the DeDupTest class.
@Test
public void testRun() throws Exception {
  // Configure the pipeline to execute on the Spark runner.
  SparkPipelineOptions opts = SparkPipelineOptionsFactory.create();
  opts.setRunner(SparkPipelineRunner.class);
  Pipeline pipeline = Pipeline.create(opts);

  // Feed the sample lines through a de-duplication transform.
  PCollection<String> lines = pipeline.apply(Create.of(LINES)).setCoder(StringUtf8Coder.of());
  PCollection<String> deduped = lines.apply(RemoveDuplicates.<String>create());

  // The output must contain exactly the distinct expected elements.
  DataflowAssert.that(deduped).containsInAnyOrder(EXPECTED_SET);

  EvaluationResult result = SparkPipelineRunner.create().run(pipeline);
  result.close();
}
Use of com.google.cloud.dataflow.sdk.Pipeline in the project spark-dataflow by Cloudera.
The testRun method of the SerializationTest class.
@Test
public void testRun() throws Exception {
  // Run on the Spark runner, exercising serialization of custom-coded values.
  SparkPipelineOptions opts = SparkPipelineOptionsFactory.create();
  opts.setRunner(SparkPipelineRunner.class);
  Pipeline pipeline = Pipeline.create(opts);

  // StringHolder values use a custom coder, forcing (de)serialization paths.
  PCollection<StringHolder> words =
      pipeline.apply(Create.of(WORDS).withCoder(StringHolderUtf8Coder.of()));
  PCollection<StringHolder> counted = words.apply(new CountWords());

  // Word counts must round-trip through serialization unchanged.
  DataflowAssert.that(counted).containsInAnyOrder(EXPECTED_COUNT_SET);

  EvaluationResult result = SparkPipelineRunner.create().run(pipeline);
  result.close();
}
Aggregations