Search in sources:

Example 1 with Collector

use of org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector in project beam by apache.

the class DocumentationExamplesTest method wordCountExample.

/**
 * Documentation example: a word count assembled from Euphoria operators on top of a Beam
 * pipeline.
 *
 * <p>Input lines are tokenized into words, the words are counted per key, every count is rendered
 * as a {@code "word: count"} string and the result is handed to {@link TextIO} for writing. The
 * test is ignored so that no output files are actually produced.
 */
@Ignore("We do not want to actually write output files from this test.")
@Test
public void wordCountExample() {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    // Kryo serves as the fallback coder.
    KryoCoderProvider.of().registerTo(pipeline);

    // The input lines are produced by a regular Beam transform.
    PCollection<String> lines =
        pipeline
            .apply(Create.of(textLineByLine))
            .setTypeDescriptor(TypeDescriptor.of(String.class));

    // FlatMap handles one input element at a time and may emit zero, one, or many output
    // elements; here every line is split on whitespace and each token is emitted as a word.
    PCollection<String> words =
        FlatMap.named("TOKENIZER")
            .of(lines)
            .using(
                (String line, Collector<String> context) ->
                    Splitter.onPattern("\\s+").split(line).forEach(context::collect))
            .output();

    // CountByKey brings all values sharing a key (the word itself) together and emits how many
    // times that key appeared in the 'words' data set.
    PCollection<KV<String, Long>> counted =
        CountByKey.named("COUNT")
            .of(words)
            .keyBy(word -> word)
            .output();

    // Render each (word, count) pair as a single text line.
    PCollection<String> output =
        MapElements.named("FORMAT")
            .of(counted)
            .using(pair -> pair.getKey() + ": " + pair.getValue())
            .output();

    // Back to a plain Beam transform: store the words and their counts in a text file.
    output.apply(TextIO.write().to("counted_words"));

    pipeline.run();
}
Also used : Filter(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Filter) Arrays(java.util.Arrays) Distinct(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Distinct) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) ReduceWindow(org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceWindow) SumByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.SumByKey) Join(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Join) Union(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Union) GenericTranslatorProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.provider.GenericTranslatorProvider) RightJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.RightJoin) Create(org.apache.beam.sdk.transforms.Create) Arrays.asList(java.util.Arrays.asList) LeftJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.LeftJoin) ReduceByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceByKey) KryoOptions(org.apache.beam.sdk.extensions.kryo.KryoOptions) Fold(org.apache.beam.sdk.extensions.euphoria.core.client.util.Fold) Triple(org.apache.beam.sdk.extensions.euphoria.core.client.util.Triple) CompositeProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.provider.CompositeProvider) CountByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.CountByKey) Serializable(java.io.Serializable) List(java.util.List) Stream(java.util.stream.Stream) DefaultTrigger(org.apache.beam.sdk.transforms.windowing.DefaultTrigger) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) AssignEventTime(org.apache.beam.sdk.extensions.euphoria.core.client.operator.AssignEventTime) Optional(java.util.Optional) BroadcastHashJoinTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.BroadcastHashJoinTranslator) Collector(org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector) KV(org.apache.beam.sdk.values.KV) 
TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) FullJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.FullJoin) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Splitter(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Splitter) MapElements(org.apache.beam.sdk.extensions.euphoria.core.client.operator.MapElements) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Before(org.junit.Before) PAssert(org.apache.beam.sdk.testing.PAssert) FlatMapTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.FlatMapTranslator) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) UnaryFunction(org.apache.beam.sdk.extensions.euphoria.core.client.functional.UnaryFunction) Operator(org.apache.beam.sdk.extensions.euphoria.core.client.operator.base.Operator) PCollection(org.apache.beam.sdk.values.PCollection) TopPerKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.TopPerKey) OperatorTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.OperatorTranslator) CompositeOperator(org.apache.beam.sdk.extensions.euphoria.core.client.operator.CompositeOperator) Rule(org.junit.Rule) Ignore(org.junit.Ignore) CompositeOperatorTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.CompositeOperatorTranslator) TranslatorProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.TranslatorProvider) OnTimeBehavior(org.apache.beam.sdk.transforms.windowing.Window.OnTimeBehavior) FlatMap(org.apache.beam.sdk.extensions.euphoria.core.client.operator.FlatMap) KryoCoderProvider(org.apache.beam.sdk.extensions.kryo.KryoCoderProvider) Assert(org.junit.Assert) TextIO(org.apache.beam.sdk.io.TextIO) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) 
KV(org.apache.beam.sdk.values.KV) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 2 with Collector

use of org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector in project beam by apache.

the class JoinTest method testBuild_OptionalWindowing.

/**
 * Builds a {@link Join} whose windowing is attached through {@code applyIf(true, ...)} and checks
 * that the window function, trigger and accumulation mode configured there are present on the
 * resulting operator.
 */
@Test
public void testBuild_OptionalWindowing() {
    final Pipeline pipeline = TestUtils.createTestPipeline();
    final PCollection<String> leftInput =
        TestUtils.createMockDataset(pipeline, TypeDescriptors.strings());
    final PCollection<String> rightInput =
        TestUtils.createMockDataset(pipeline, TypeDescriptors.strings());

    final PCollection<KV<Integer, String>> joined =
        Join.named("Join1")
            .of(leftInput, rightInput)
            .by(String::length, String::length)
            .using(
                (String l, String r, Collector<String> out) -> {
                    out.collect(l + r);
                })
            // The condition is true, so the windowing below must end up on the operator.
            .applyIf(
                true,
                builder ->
                    builder
                        .windowBy(FixedWindows.of(org.joda.time.Duration.standardHours(1)))
                        .triggeredBy(AfterWatermark.pastEndOfWindow())
                        .accumulationMode(AccumulationMode.DISCARDING_FIRED_PANES))
            .output();

    final Join producer = (Join) TestUtils.getProducer(joined);
    assertTrue(producer.getWindow().isPresent());

    final Window<?> window = (Window) producer.getWindow().get();
    assertEquals(
        FixedWindows.of(org.joda.time.Duration.standardHours(1)),
        window.getWindowFn());
    assertEquals(
        AfterWatermark.pastEndOfWindow(),
        WindowDesc.of(window).getTrigger());
    assertEquals(
        AccumulationMode.DISCARDING_FIRED_PANES,
        WindowDesc.of(window).getAccumulationMode());
}
Also used : KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) AccumulationMode(org.apache.beam.sdk.values.WindowingStrategy.AccumulationMode) AfterWatermark(org.apache.beam.sdk.transforms.windowing.AfterWatermark) Assert.assertNotNull(org.junit.Assert.assertNotNull) WindowDesc(org.apache.beam.sdk.transforms.windowing.WindowDesc) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Assert.assertTrue(org.junit.Assert.assertTrue) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Rule(org.junit.Rule) Assert.assertFalse(org.junit.Assert.assertFalse) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) TypePropagationAssert(org.apache.beam.sdk.extensions.euphoria.core.client.type.TypePropagationAssert) Window(org.apache.beam.sdk.transforms.windowing.Window) Optional(java.util.Optional) Pipeline(org.apache.beam.sdk.Pipeline) Assert.assertEquals(org.junit.Assert.assertEquals) Collector(org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector) Window(org.apache.beam.sdk.transforms.windowing.Window) Collector(org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector) KV(org.apache.beam.sdk.values.KV) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 3 with Collector

use of org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector in project beam by apache.

the class BeamMetricsTranslationTest method testBeamMetricsTranslation.

/**
 * Tests metric counters and histograms on {@link ReduceByKey} and {@link MapElements} operators.
 * Flow:
 *
 * <ol>
 *   <li>step RBK: increment a counter for every value, add the value to a histogram, collect
 *       even numbers only.
 *   <li>step MapElements: increment a counter for every element, add its value to a histogram,
 *       map the pair to its integer value.
 *   <li>step MapElements again: increment the counter by the value of the element and add the
 *       value to the histogram two times. The operator is named explicitly here, although the
 *       verification helper is still called {@code testStep3WithDefaultOperatorName}.
 * </ol>
 */
@Test
public void testBeamMetricsTranslation() {
    final String counterName1 = "counter1";
    final String counterName2 = "counter2";
    final String operatorName1 = "count_elements_and_save_even_numbers";
    final String operatorName2 = "map_to_integer";
    final String operatorName3 = "map_elements";

    final PCollection<Integer> input =
        testPipeline.apply(
            "input", Create.of(1, 2, 3, 4, 5).withType(TypeDescriptors.integers()));

    final PCollection<KV<Integer, Integer>> kvInput =
        ReduceByKey.named(operatorName1)
            .of(input)
            .keyBy(element -> element)
            .reduceBy(
                (Stream<Integer> values, Collector<Integer> collector) -> {
                    values.forEach(
                        number -> {
                            collector.getCounter(counterName1).increment();
                            collector.getHistogram(counterName1).add(number);
                            if (number % 2 == 0) {
                                collector.collect(number);
                            }
                        });
                })
            .output();

    // kvInput = [<2,2>, <4,4>]
    final PCollection<Integer> mapElementsOutput =
        MapElements.named(operatorName2)
            .of(kvInput)
            .using(
                (pair, ctx) -> {
                    final Integer pairValue = pair.getValue();
                    ctx.getCounter(counterName2).increment();
                    ctx.getHistogram(counterName2).add(pairValue);
                    return pairValue;
                })
            .output();

    // mapElementsOutput = [2,4]
    final PCollection<Integer> output =
        MapElements.named(operatorName3)
            .of(mapElementsOutput)
            .using(
                (element, ctx) -> {
                    ctx.getCounter(counterName2).increment(element);
                    ctx.getHistogram(counterName2).add(element, 2);
                    return element;
                })
            .output();

    PAssert.that(output).containsInAnyOrder(2, 4);

    final PipelineResult result = testPipeline.run();
    result.waitUntilFinish();

    // Query the metrics reported under the namespaces of the three named operators.
    final MetricsFilter operatorMetricsFilter =
        MetricsFilter.builder()
            .addNameFilter(MetricNameFilter.inNamespace(operatorName1))
            .addNameFilter(MetricNameFilter.inNamespace(operatorName2))
            .addNameFilter(MetricNameFilter.inNamespace(operatorName3))
            .build();
    final MetricQueryResults metricQueryResults =
        result.metrics().queryMetrics(operatorMetricsFilter);

    testStep1Metrics(metricQueryResults, counterName1, operatorName1);
    testStep2Metrics(metricQueryResults, counterName2, operatorName2);
    testStep3WithDefaultOperatorName(metricQueryResults, counterName2, operatorName3);
}
Also used : ReduceByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceByKey) MetricNameFilter(org.apache.beam.sdk.metrics.MetricNameFilter) KV(org.apache.beam.sdk.values.KV) MetricResultsMatchers.metricsResult(org.apache.beam.sdk.metrics.MetricResultsMatchers.metricsResult) PAssert(org.apache.beam.sdk.testing.PAssert) PipelineResult(org.apache.beam.sdk.PipelineResult) RunWith(org.junit.runner.RunWith) Matchers(org.hamcrest.Matchers) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) MapElements(org.apache.beam.sdk.extensions.euphoria.core.client.operator.MapElements) PCollection(org.apache.beam.sdk.values.PCollection) MetricsFilter(org.apache.beam.sdk.metrics.MetricsFilter) Stream(java.util.stream.Stream) MatcherAssert(org.hamcrest.MatcherAssert) Rule(org.junit.Rule) DistributionResult(org.apache.beam.sdk.metrics.DistributionResult) Create(org.apache.beam.sdk.transforms.Create) MetricQueryResults(org.apache.beam.sdk.metrics.MetricQueryResults) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) KryoCoder(org.apache.beam.sdk.extensions.kryo.KryoCoder) Collector(org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector) Before(org.junit.Before) MetricQueryResults(org.apache.beam.sdk.metrics.MetricQueryResults) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Example 4 with Collector

use of org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector in project beam by apache.

the class DocumentationExamplesTest method metricsAndAccumulatorsSection.

/**
 * Documentation example for the metrics and accumulators section: shows how a counter is
 * obtained from the context inside {@link FlatMap} and {@link MapElements} user functions.
 *
 * <p>The operators are only constructed here; the pipeline is not run and nothing is asserted.
 */
@Test
public void metricsAndAccumulatorsSection() {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());
    PCollection<String> dataset = pipeline.apply(Create.of("a", "x"));

    // FlatMap: bump the counter, then pass the element through unchanged.
    FlatMap.named("FlatMap1")
        .of(dataset)
        .using(
            (String element, Collector<String> collector) -> {
                collector.getCounter("my-counter").increment();
                collector.collect(element);
            })
        .output();

    // MapElements: the same simple counter, this time while lower-casing the element.
    MapElements.named("MapThem")
        .of(dataset)
        .using(
            (element, ctx) -> {
                ctx.getCounter("my-counter").increment();
                return element.toLowerCase();
            })
        .output();
}
Also used : Filter(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Filter) Arrays(java.util.Arrays) Distinct(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Distinct) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) ReduceWindow(org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceWindow) SumByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.SumByKey) Join(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Join) Union(org.apache.beam.sdk.extensions.euphoria.core.client.operator.Union) GenericTranslatorProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.provider.GenericTranslatorProvider) RightJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.RightJoin) Create(org.apache.beam.sdk.transforms.Create) Arrays.asList(java.util.Arrays.asList) LeftJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.LeftJoin) ReduceByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceByKey) KryoOptions(org.apache.beam.sdk.extensions.kryo.KryoOptions) Fold(org.apache.beam.sdk.extensions.euphoria.core.client.util.Fold) Triple(org.apache.beam.sdk.extensions.euphoria.core.client.util.Triple) CompositeProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.provider.CompositeProvider) CountByKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.CountByKey) Serializable(java.io.Serializable) List(java.util.List) Stream(java.util.stream.Stream) DefaultTrigger(org.apache.beam.sdk.transforms.windowing.DefaultTrigger) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) AssignEventTime(org.apache.beam.sdk.extensions.euphoria.core.client.operator.AssignEventTime) Optional(java.util.Optional) BroadcastHashJoinTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.BroadcastHashJoinTranslator) Collector(org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector) KV(org.apache.beam.sdk.values.KV) 
TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) FullJoin(org.apache.beam.sdk.extensions.euphoria.core.client.operator.FullJoin) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Splitter(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Splitter) MapElements(org.apache.beam.sdk.extensions.euphoria.core.client.operator.MapElements) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Before(org.junit.Before) PAssert(org.apache.beam.sdk.testing.PAssert) FlatMapTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.FlatMapTranslator) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) UnaryFunction(org.apache.beam.sdk.extensions.euphoria.core.client.functional.UnaryFunction) Operator(org.apache.beam.sdk.extensions.euphoria.core.client.operator.base.Operator) PCollection(org.apache.beam.sdk.values.PCollection) TopPerKey(org.apache.beam.sdk.extensions.euphoria.core.client.operator.TopPerKey) OperatorTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.OperatorTranslator) CompositeOperator(org.apache.beam.sdk.extensions.euphoria.core.client.operator.CompositeOperator) Rule(org.junit.Rule) Ignore(org.junit.Ignore) CompositeOperatorTranslator(org.apache.beam.sdk.extensions.euphoria.core.translate.CompositeOperatorTranslator) TranslatorProvider(org.apache.beam.sdk.extensions.euphoria.core.translate.TranslatorProvider) OnTimeBehavior(org.apache.beam.sdk.transforms.windowing.Window.OnTimeBehavior) FlatMap(org.apache.beam.sdk.extensions.euphoria.core.client.operator.FlatMap) KryoCoderProvider(org.apache.beam.sdk.extensions.kryo.KryoCoderProvider) Assert(org.junit.Assert) TextIO(org.apache.beam.sdk.io.TextIO) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) 
TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

Collector (org.apache.beam.sdk.extensions.euphoria.core.client.io.Collector)4 Optional (java.util.Optional)3 Stream (java.util.stream.Stream)3 Pipeline (org.apache.beam.sdk.Pipeline)3 MapElements (org.apache.beam.sdk.extensions.euphoria.core.client.operator.MapElements)3 ReduceByKey (org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceByKey)3 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)3 KV (org.apache.beam.sdk.values.KV)3 PCollection (org.apache.beam.sdk.values.PCollection)3 TypeDescriptors (org.apache.beam.sdk.values.TypeDescriptors)3 Rule (org.junit.Rule)3 Test (org.junit.Test)3 RunWith (org.junit.runner.RunWith)3 JUnit4 (org.junit.runners.JUnit4)3 Serializable (java.io.Serializable)2 Arrays (java.util.Arrays)2 Arrays.asList (java.util.Arrays.asList)2 List (java.util.List)2 UnaryFunction (org.apache.beam.sdk.extensions.euphoria.core.client.functional.UnaryFunction)2 AssignEventTime (org.apache.beam.sdk.extensions.euphoria.core.client.operator.AssignEventTime)2