Search in sources :

Example 1 with FlatMap

Use of org.apache.beam.sdk.extensions.euphoria.core.client.operator.FlatMap in the Apache Beam project.

From the class DocumentationExamplesTest, method wordCountExample.

/**
 * Documentation example: a word count that mixes plain Beam transforms with Euphoria operators.
 *
 * <p>Steps, in order: create a pipeline, register Kryo as the fallback coder, load text lines,
 * tokenize them with {@code FlatMap}, count occurrences with {@code CountByKey}, format each
 * pair with {@code MapElements}, and write the result through {@code TextIO}.
 *
 * <p>The test is ignored because running it would write output files.
 */
@Ignore("We do not want to actually write output files from this test.")
@Test
public void wordCountExample() {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    // Kryo serves as the coder of last resort for types without an explicit coder.
    KryoCoderProvider.of().registerTo(pipeline);

    // Input comes from an ordinary Beam source transform.
    PCollection<String> textLines =
        pipeline
            .apply(Create.of(textLineByLine))
            .setTypeDescriptor(TypeDescriptor.of(String.class));

    // FlatMap visits one element at a time and may emit any number of outputs for it
    // (zero, one, or many). Here every line is broken up into its words.
    // The Splitter is built inside the lambda so the user function captures nothing.
    PCollection<String> tokens =
        FlatMap.named("TOKENIZER")
            .of(textLines)
            .using(
                (String line, Collector<String> context) ->
                    Splitter.onPattern("\\s+").split(line).forEach(context::collect))
            .output();

    // CountByKey brings all values sharing a key (the word itself) together and emits
    // how many times that key occurred in the 'tokens' data set.
    PCollection<KV<String, Long>> wordCounts =
        CountByKey.named("COUNT")
            .of(tokens)
            .keyBy(token -> token)
            .output();

    // Render each (word, count) pair as a single line of text.
    PCollection<String> formatted =
        MapElements.named("FORMAT")
            .of(wordCounts)
            .using(entry -> entry.getKey() + ": " + entry.getValue())
            .output();

    // Back to a plain Beam transformation: persist the words and their counts
    // into a text file.
    formatted.apply(TextIO.write().to("counted_words"));

    pipeline.run();
}
Also used : Filter(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Filter) Arrays(java.util.Arrays) Distinct(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Distinct) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) ReduceWindow(org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceWindow) SumByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.SumByKey) Join(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Join) Union(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Union) GenericTranslatorProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.provider.GenericTranslatorProvider) RightJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.RightJoin) Create(org.apache.beam.sdk.transforms.Create) Arrays.asList(java.util.Arrays.asList) LeftJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.LeftJoin) ReduceByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceByKey) KryoOptions(org.apache.beam.sdk.extensions.kryo.KryoOptions) Fold(org.apache.beam.sdk.extensions.euphoria.core.client.util.Fold) Triple(org.apache.beam.sdk.extensions.euphoria.core.client.util.Triple) CompositeProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.provider.CompositeProvider) CountByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.CountByKey) Serializable(java.io.Serializable) List(java.util.List) Stream(java.util.stream.Stream) DefaultTrigger(org.apache.beam.sdk.transforms.windowing.DefaultTrigger) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) AssignEventTime(org.apache.beam.sdk.extensions.euphoria.core.client.operator.AssignEventTime) Optional(java.util.Optional) BroadcastHashJoinTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.BroadcastHashJoinTranslator) Collector(org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector) KV(org.apache.beam.sdk.values.KV) 
TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) FullJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.FullJoin) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Splitter(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Splitter) MapElements(org.apache.beam.sdk.extensions.euphoria.core.client.operator.MapElements) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Before(org.junit.Before) PAssert(org.apache.beam.sdk.testing.PAssert) FlatMapTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.FlatMapTranslator) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) UnaryFunction(org.apache.beam.sdk.extensions.euphoria.core.client.functional.UnaryFunction) Operator(org.apache.beam.sdk.extensions.euphoria.core.client.operator.base.Operator) PCollection(org.apache.beam.sdk.values.PCollection) TopPerKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.TopPerKey) OperatorTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.OperatorTranslator) CompositeOperator(org.apache.beam.sdk.extensions.euphoria.core.client.operator.CompositeOperator) Rule(org.junit.Rule) Ignore(org.junit.Ignore) CompositeOperatorTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.CompositeOperatorTranslator) TranslatorProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.TranslatorProvider) OnTimeBehavior(org.apache.beam.sdk.transforms.windowing.Window.OnTimeBehavior) FlatMap(org.apache.beam.sdk.extensions.euphoria.core.client.operator.FlatMap) KryoCoderProvider(org.apache.beam.sdk.extensions.kryo.KryoCoderProvider) Assert(org.junit.Assert) TextIO(org.apache.beam.sdk.io.TextIO) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) 
KV(org.apache.beam.sdk.values.KV) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Serializable (java.io.Serializable)1 Arrays (java.util.Arrays)1 Arrays.asList (java.util.Arrays.asList)1 List (java.util.List)1 Optional (java.util.Optional)1 Stream (java.util.stream.Stream)1 Pipeline (org.apache.beam.sdk.Pipeline)1 UnaryFunction (org.apache.beam.sdk.extensions.euphoria.core.client.functional.UnaryFunction)1 Collector (org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector)1 AssignEventTime (org.apache.beam.sdk.extensions.euphoria.core.client.operator.AssignEventTime)1 CompositeOperator (org.apache.beam.sdk.extensions.euphoria.core.client.operator.CompositeOperator)1 CountByKey (org.apache.beam.sdk.extensions.euphoria.core.client.operator.CountByKey)1 Distinct (org.apache.beam.sdk.extensions.euphoria.core.client.operator.Distinct)1 Filter (org.apache.beam.sdk.extensions.euphoria.core.client.operator.Filter)1 FlatMap (org.apache.beam.sdk.extensions.euphoria.core.client.operator.FlatMap)1 FullJoin (org.apache.beam.sdk.extensions.euphoria.core.client.operator.FullJoin)1 Join (org.apache.beam.sdk.extensions.euphoria.core.client.operator.Join)1 LeftJoin (org.apache.beam.sdk.extensions.euphoria.core.client.operator.LeftJoin)1 MapElements (org.apache.beam.sdk.extensions.euphoria.core.client.operator.MapElements)1 ReduceByKey (org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceByKey)1