Search in sources:

Example 1 with View

use of org.apache.beam.sdk.transforms.View in project DataflowJavaSDK-examples by GoogleCloudPlatform.

From the class GameStats, method main.

/**
 * Builds and runs the streaming game-statistics pipeline.
 *
 * <p>The pipeline reads game events from Pub/Sub and then branches three ways: it derives a
 * side-input map of suspected spammers, writes per-team sums (excluding those users) to BigQuery,
 * and writes the mean of per-session values to BigQuery.
 *
 * @param args command-line arguments parsed into {@code Options}
 * @throws Exception propagated from pipeline construction or execution
 */
public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // This pipeline is only meaningful as a streaming job, so force streaming on.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    // Source: Pub/Sub messages, timestamped from a message attribute, parsed into game events.
    PCollection<GameActionInfo> rawEvents =
        pipeline
            .apply(PubsubIO.readStrings().withTimestampAttribute(TIMESTAMP_ATTRIBUTE).fromTopic(options.getTopic()))
            .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));

    // Project every event down to a (user, score) pair.
    PCollection<KV<String, Integer>> userEvents =
        rawEvents.apply(
            "ExtractUserScore",
            MapElements.into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                .via((GameActionInfo event) -> KV.of(event.getUser(), event.getScore())));

    // Branch 1: fixed-window the user scores, run the spammy-user detection transform, and
    // expose its output as a Map side input keyed by user name.
    PCollection<KV<String, Integer>> userEventsInFixedWindows =
        userEvents.apply(
            "FixedWindowsUser",
            Window.<KV<String, Integer>>into(
                FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))));
    final PCollectionView<Map<String, Integer>> spammersView =
        userEventsInFixedWindows
            .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
            .apply("CreateSpammersView", View.<String, Integer>asMap());

    // [START DocInclude_FilterAndCalc]
    // Branch 2: fixed-window the raw events, drop events whose user appears in the spammers
    // side input, sum scores per team, and write the result to BigQuery.
    PCollection<GameActionInfo> rawEventsInFixedWindows =
        rawEvents.apply(
            "WindowIntoFixedWindows",
            Window.<GameActionInfo>into(
                FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))));

    DoFn<GameActionInfo, GameActionInfo> keepNonSpammers =
        new DoFn<GameActionInfo, GameActionInfo>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
                Map<String, Integer> spammers = c.sideInput(spammersView);
                GameActionInfo event = c.element();
                // A user with an entry in the spammers map is filtered out.
                if (spammers.get(event.getUser().trim()) != null) {
                    return;
                }
                c.output(event);
            }
        };

    rawEventsInFixedWindows
        .apply("FilterOutSpammers", ParDo.of(keepNonSpammers).withSideInputs(spammersView))
        .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
        .apply(
            "WriteTeamSums",
            new WriteWindowedToBigQuery<KV<String, Integer>>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getGameStatsTablePrefix() + "_team",
                configureWindowedWrite()));
    // [END DocInclude_FilterAndCalc]

    // [START DocInclude_SessionCalc]
    // Branch 3: group each user's events into sessions separated by the configured gap. The
    // per-key combine discards the scores (every key maps to 0); only the session window itself
    // is of interest to the next step.
    PCollection<Integer> sessionValues =
        userEvents
            .apply(
                "WindowIntoSessions",
                Window.<KV<String, Integer>>into(
                        Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
                    .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW))
            .apply(Combine.perKey(x -> 0))
            // NOTE(review): presumably emits each session's length in minutes; confirm against
            // UserSessionInfoFn.
            .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()));
    // [END DocInclude_SessionCalc]

    // [START DocInclude_Rewindow]
    // Re-window the per-session values into fixed windows, average them, and write the mean.
    sessionValues
        .apply(
            "WindowToExtractSessionMean",
            Window.<Integer>into(
                FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
        .apply(Mean.<Integer>globally().withoutDefaults())
        .apply(
            "WriteAvgSessionLength",
            new WriteWindowedToBigQuery<Double>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getGameStatsTablePrefix() + "_sessions",
                configureSessionWindowWrite()));
    // [END DocInclude_Rewindow]

    // Launch the job and block until it finishes; ExampleUtils also handles cancellation
    // requests issued from the command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
}
Also used : KV(org.apache.beam.sdk.values.KV) DateTimeZone(org.joda.time.DateTimeZone) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) PipelineResult(org.apache.beam.sdk.PipelineResult) Default(org.apache.beam.sdk.options.Default) Combine(org.apache.beam.sdk.transforms.Combine) Duration(org.joda.time.Duration) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) View(org.apache.beam.sdk.transforms.View) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) Metrics(org.apache.beam.sdk.metrics.Metrics) Description(org.apache.beam.sdk.options.Description) PTransform(org.apache.beam.sdk.transforms.PTransform) Sessions(org.apache.beam.sdk.transforms.windowing.Sessions) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) WriteWindowedToBigQuery(com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery) Pipeline(org.apache.beam.sdk.Pipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) MapElements(org.apache.beam.sdk.transforms.MapElements) DateTimeFormat(org.joda.time.format.DateTimeFormat) Logger(org.slf4j.Logger) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) DateTimeFormatter(org.joda.time.format.DateTimeFormatter) TimeZone(java.util.TimeZone) Counter(org.apache.beam.sdk.metrics.Counter) Sum(org.apache.beam.sdk.transforms.Sum) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) PCollection(org.apache.beam.sdk.values.PCollection) Mean(org.apache.beam.sdk.transforms.Mean) ExampleUtils(com.google.cloud.dataflow.examples.common.ExampleUtils) PubsubIO(org.apache.beam.sdk.io.gcp.pubsub.PubsubIO) ParDo(org.apache.beam.sdk.transforms.ParDo) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Instant(org.joda.time.Instant) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) 
Values(org.apache.beam.sdk.transforms.Values) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) ExampleUtils(com.google.cloud.dataflow.examples.common.ExampleUtils) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) HashMap(java.util.HashMap) Map(java.util.Map)

Example 2 with View

use of org.apache.beam.sdk.transforms.View in project beam by apache.

From the class QueryablePipelineTest, method transformWithSideAndMainInputs.

/**
 * Checks how {@link QueryablePipeline} classifies the inputs of a ParDo that has both a main
 * input and a side input: the side-input PCollection must be reported by {@link
 * QueryablePipeline#getSideInputs(PTransformNode)}, and the ParDo must be listed as a per-element
 * consumer of its main input only (see {@link
 * QueryablePipeline#getPerElementConsumers(PCollectionNode)}).
 */
@Test
public void transformWithSideAndMainInputs() {
    // Build a pipeline in which "par_do" reads Impulse as its main input and a singleton view
    // as its side input.
    Pipeline p = Pipeline.create();
    PCollection<byte[]> impulse = p.apply("Impulse", Impulse.create());
    PCollection<String> created = p.apply("Create", Create.of("foo"));
    PCollectionView<String> view = created.apply("View", View.asSingleton());
    impulse.apply(
        "par_do",
        ParDo.of(new TestFn())
            .withSideInputs(view)
            .withOutputTags(new TupleTag<>(), TupleTagList.empty()));

    Components components = PipelineTranslation.toProto(p).getComponents();
    QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);

    // Main input: the single output PCollection of the Impulse transform.
    PTransformNode impulseNode =
        PipelineNode.pTransform("Impulse", components.getTransformsOrThrow("Impulse"));
    String mainInputName = getOnlyElement(impulseNode.getTransform().getOutputsMap().values());
    PCollectionNode mainInput =
        PipelineNode.pCollection(mainInputName, components.getPcollectionsOrThrow(mainInputName));

    // Side input: the one ParDo input whose PCollection id is not the main input's.
    PTransform parDoTransform = components.getTransformsOrThrow("par_do");
    Set<String> nonMainInputLocalNames =
        parDoTransform.getInputsMap().entrySet().stream()
            .filter(input -> !input.getValue().equals(mainInputName))
            .map(Map.Entry::getKey)
            .collect(Collectors.toSet());
    String sideInputLocalName = getOnlyElement(nonMainInputLocalNames);
    String sideInputCollectionId = parDoTransform.getInputsOrThrow(sideInputLocalName);
    PCollectionNode sideInput =
        PipelineNode.pCollection(
            sideInputCollectionId, components.getPcollectionsOrThrow(sideInputCollectionId));

    PTransformNode parDoNode = PipelineNode.pTransform("par_do", parDoTransform);
    SideInputReference expectedSideInput =
        SideInputReference.of(parDoNode, sideInputLocalName, sideInput);

    assertThat(qp.getSideInputs(parDoNode), contains(expectedSideInput));
    assertThat(qp.getPerElementConsumers(mainInput), contains(parDoNode));
    // NOTE(review): not(contains(x)) only asserts the consumers are not exactly [x]; a
    // not(hasItem(x)) check would be stricter. Left unchanged to preserve the assertion.
    assertThat(qp.getPerElementConsumers(sideInput), not(contains(parDoNode)));
}
Also used : Count(org.apache.beam.sdk.transforms.Count) PBegin(org.apache.beam.sdk.values.PBegin) Matchers.not(org.hamcrest.Matchers.not) Matchers.hasKey(org.hamcrest.Matchers.hasKey) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) PCollectionList(org.apache.beam.sdk.values.PCollectionList) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Collection(java.util.Collection) Set(java.util.Set) Collectors(java.util.stream.Collectors) ParDo(org.apache.beam.sdk.transforms.ParDo) Matchers.contains(org.hamcrest.Matchers.contains) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Matchers.is(org.hamcrest.Matchers.is) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Impulse(org.apache.beam.sdk.transforms.Impulse) View(org.apache.beam.sdk.transforms.View) PipelineTranslation(org.apache.beam.runners.core.construction.PipelineTranslation) TupleTagList(org.apache.beam.sdk.values.TupleTagList) Environments(org.apache.beam.runners.core.construction.Environments) ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) Read(org.apache.beam.sdk.io.Read) TupleTag(org.apache.beam.sdk.values.TupleTag) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) 
PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Pipeline(org.apache.beam.sdk.Pipeline) ExpectedException(org.junit.rules.ExpectedException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) CountingSource(org.apache.beam.sdk.io.CountingSource) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WithKeys(org.apache.beam.sdk.transforms.WithKeys) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Matchers.emptyIterable(org.hamcrest.Matchers.emptyIterable) Rule(org.junit.Rule) PCollectionView(org.apache.beam.sdk.values.PCollectionView) Iterables.getOnlyElement(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables.getOnlyElement) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) Pipeline(org.apache.beam.sdk.Pipeline) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Aggregations

Map (java.util.Map)2 Pipeline (org.apache.beam.sdk.Pipeline)2 DoFn (org.apache.beam.sdk.transforms.DoFn)2 MapElements (org.apache.beam.sdk.transforms.MapElements)2 ParDo (org.apache.beam.sdk.transforms.ParDo)2 View (org.apache.beam.sdk.transforms.View)2 FixedWindows (org.apache.beam.sdk.transforms.windowing.FixedWindows)2 Window (org.apache.beam.sdk.transforms.windowing.Window)2 PCollection (org.apache.beam.sdk.values.PCollection)2 PCollectionView (org.apache.beam.sdk.values.PCollectionView)2 TypeDescriptors (org.apache.beam.sdk.values.TypeDescriptors)2 Duration (org.joda.time.Duration)2 ExampleUtils (com.google.cloud.dataflow.examples.common.ExampleUtils)1 WriteWindowedToBigQuery (com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery)1 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1 Set (java.util.Set)1 TimeZone (java.util.TimeZone)1 Collectors (java.util.stream.Collectors)1 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)1