
Example 16 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class CalculateSchemas, method expand:

@Override
public PCollectionView<Map<DestinationT, String>> expand(
        PCollection<KV<DestinationT, TableRow>> input) {
    List<PCollectionView<?>> sideInputs = Lists.newArrayList();
    sideInputs.addAll(dynamicDestinations.getSideInputs());
    return input
        .apply("Keys", Keys.<DestinationT>create())
        .apply("Distinct Keys", Distinct.<DestinationT>create())
        .apply("GetSchemas", ParDo.of(
            new DoFn<DestinationT, KV<DestinationT, String>>() {

                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                    dynamicDestinations.setSideInputAccessorFromProcessContext(c);
                    TableSchema tableSchema = dynamicDestinations.getSchema(c.element());
                    // If the createDisposition is CREATE_NEVER, then there's no need for a
                    // schema, and getSchema might return null. In this case, we simply
                    // leave it out of the map.
                    if (tableSchema != null) {
                        c.output(KV.of(c.element(), BigQueryHelpers.toJsonString(tableSchema)));
                    }
                }
            }).withSideInputs(sideInputs))
        .apply("asMap", View.<DestinationT, String>asMap());
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFn(org.apache.beam.sdk.transforms.DoFn) TableSchema(com.google.api.services.bigquery.model.TableSchema)
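
The map-valued view returned by expand is meant to be consumed as a side input downstream. A minimal sketch of such a consumer, assuming a String destination key for brevity (the attachSchemas name and both parameters are illustrative, not part of the Beam source; standard Beam imports are omitted to match the excerpts on this page):

static PCollection<KV<String, String>> attachSchemas(
        PCollection<String> destinations,
        final PCollectionView<Map<String, String>> schemasView) {
    return destinations.apply("AttachSchema", ParDo.of(
        new DoFn<String, KV<String, String>>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
                // Destinations whose schema was null were left out of the map,
                // so a missing entry is expected and simply skipped.
                String schema = c.sideInput(schemasView).get(c.element());
                if (schema != null) {
                    c.output(KV.of(c.element(), schema));
                }
            }
        }).withSideInputs(schemasView));
}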

Example 17 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class GameStats, method main:

public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);
    // Read events from Pub/Sub using custom timestamps.
    PCollection<GameActionInfo> rawEvents =
        pipeline
            .apply(PubsubIO.readStrings()
                .withTimestampAttribute(TIMESTAMP_ATTRIBUTE)
                .fromTopic(options.getTopic()))
            .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));

    // Extract username/score pairs from the event stream.
    PCollection<KV<String, Integer>> userEvents =
        rawEvents.apply("ExtractUserScore",
            MapElements
                .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

    // Calculate per-user scores over fixed windows, with cumulative updates for
    // late data, and derive a map-valued view of the suspected spammers for use
    // as a side input below.
    final PCollectionView<Map<String, Integer>> spammersView =
        userEvents
            .apply("FixedWindowsUser",
                Window.<KV<String, Integer>>into(
                    FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
            .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
            .apply("CreateSpammersView", View.<String, Integer>asMap());
    // [START DocInclude_FilterAndCalc]
    // Calculate the total score per team over fixed windows, and emit cumulative
    // updates for late data. Uses the side input derived above (the set of
    // suspected robots) to filter out scores from those users from the sum.
    // Write the results to BigQuery.
    rawEvents
        .apply("WindowIntoFixedWindows",
            Window.<GameActionInfo>into(
                FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
        .apply("FilterOutSpammers", ParDo.of(
            new DoFn<GameActionInfo, GameActionInfo>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                    // If the user is not in the spammers map, output the data element.
                    if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
                        c.output(c.element());
                    }
                }
            }).withSideInputs(spammersView))
        .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
        // [END DocInclude_FilterAndCalc]
        .apply("WriteTeamSums",
            new WriteWindowedToBigQuery<KV<String, Integer>>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getGameStatsTablePrefix() + "_team",
                configureWindowedWrite()));
    // [START DocInclude_SessionCalc]
    // Detect user sessions, that is, a burst of activity separated by a gap from
    // further activity. Find and record the mean session lengths.
    // This information could help the game designers track changing user engagement
    // as their set of games changes.
    userEvents
        .apply("WindowIntoSessions",
            Window.<KV<String, Integer>>into(
                    Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
                .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW))
        // We only care that a session existed, so collapse each one to a constant value.
        .apply(Combine.perKey(x -> 0))
        .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
        // [END DocInclude_SessionCalc]
        // [START DocInclude_Rewindow]
        // Re-window to compute the mean of the session lengths that complete in each window.
        .apply("WindowToExtractSessionMean",
            Window.<Integer>into(
                FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
        .apply(Mean.<Integer>globally().withoutDefaults())
        .apply("WriteAvgSessionLength",
            new WriteWindowedToBigQuery<Double>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getGameStatsTablePrefix() + "_sessions",
                configureSessionWindowWrite()));
    // [END DocInclude_Rewindow]

    // Run the pipeline and wait for it to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
}
Also used : KV(org.apache.beam.sdk.values.KV) DateTimeZone(org.joda.time.DateTimeZone) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) PipelineResult(org.apache.beam.sdk.PipelineResult) Default(org.apache.beam.sdk.options.Default) Combine(org.apache.beam.sdk.transforms.Combine) Duration(org.joda.time.Duration) LoggerFactory(org.slf4j.LoggerFactory) WriteWindowedToBigQuery(org.apache.beam.examples.complete.game.utils.WriteWindowedToBigQuery) HashMap(java.util.HashMap) View(org.apache.beam.sdk.transforms.View) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) Metrics(org.apache.beam.sdk.metrics.Metrics) Description(org.apache.beam.sdk.options.Description) PTransform(org.apache.beam.sdk.transforms.PTransform) Sessions(org.apache.beam.sdk.transforms.windowing.Sessions) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) Pipeline(org.apache.beam.sdk.Pipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) MapElements(org.apache.beam.sdk.transforms.MapElements) DateTimeFormat(org.joda.time.format.DateTimeFormat) Logger(org.slf4j.Logger) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) DateTimeFormatter(org.joda.time.format.DateTimeFormatter) TimeZone(java.util.TimeZone) Counter(org.apache.beam.sdk.metrics.Counter) Sum(org.apache.beam.sdk.transforms.Sum) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) PCollection(org.apache.beam.sdk.values.PCollection) Mean(org.apache.beam.sdk.transforms.Mean) ExampleUtils(org.apache.beam.examples.common.ExampleUtils) PubsubIO(org.apache.beam.sdk.io.gcp.pubsub.PubsubIO) ParDo(org.apache.beam.sdk.transforms.ParDo) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Instant(org.joda.time.Instant) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Values(org.apache.beam.sdk.transforms.Values)
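
One detail the example relies on: a windowed side input such as spammersView can only be read from a main input whose windows map onto the side input's windows, which is why both branches apply the same FixedWindows before the lookup. A standalone sketch of that pattern, with illustrative names and standard Beam imports omitted as in the excerpts above (flaggedUsers is assumed to contain at most one entry per user per window, as View.asMap() requires):

static PCollection<KV<String, Integer>> dropFlaggedUsers(
        PCollection<KV<String, Integer>> events,
        PCollection<KV<String, Integer>> flaggedUsers,
        Duration windowSize) {
    // Window both branches identically so each main-input window has a
    // matching side-input window.
    final PCollectionView<Map<String, Integer>> flaggedView =
        flaggedUsers
            .apply("WindowSide", Window.<KV<String, Integer>>into(FixedWindows.of(windowSize)))
            .apply("AsMap", View.<String, Integer>asMap());
    return events
        .apply("WindowMain", Window.<KV<String, Integer>>into(FixedWindows.of(windowSize)))
        .apply("DropFlagged", ParDo.of(
            new DoFn<KV<String, Integer>, KV<String, Integer>>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                    // Keep the element only if its user is absent from the map.
                    if (!c.sideInput(flaggedView).containsKey(c.element().getKey())) {
                        c.output(c.element());
                    }
                }
            }).withSideInputs(flaggedView));
}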

Example 18 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class WriteFilesTest, method testCustomShardStrategyDisplayData:

@Test
public void testCustomShardStrategyDisplayData() {
    SimpleSink sink = new SimpleSink(getBaseOutputDirectory(), "file", "-SS-of-NN", "") {

        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
            builder.add(DisplayData.item("foo", "bar"));
        }
    };
    WriteFiles<String> write =
        WriteFiles.to(sink)
            .withSharding(new PTransform<PCollection<String>, PCollectionView<Integer>>() {

                @Override
                public PCollectionView<Integer> expand(PCollection<String> input) {
                    return null;
                }

                @Override
                public void populateDisplayData(DisplayData.Builder builder) {
                    builder.add(DisplayData.item("spam", "ham"));
                }
            });
    DisplayData displayData = DisplayData.from(write);
    assertThat(displayData, hasDisplayItem("sink", sink.getClass()));
    assertThat(displayData, includesDisplayDataFor("sink", sink));
    assertThat(displayData, hasDisplayItem("spam", "ham"));
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) Test(org.junit.Test)
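
Note that the test's sharding transform can return null from expand only because DisplayData.from inspects it for display data and never expands it. A real sharding function must return a singleton view of the desired shard count. A sketch of one under that contract (the 1-to-10 sizing rule is invented for illustration; Count, MapElements, TypeDescriptors, and View are the standard Beam transforms):

PTransform<PCollection<String>, PCollectionView<Integer>> sharding =
    new PTransform<PCollection<String>, PCollectionView<Integer>>() {

        @Override
        public PCollectionView<Integer> expand(PCollection<String> input) {
            return input
                // Count all records, derive a shard count between 1 and 10
                // from the total, and expose it as a singleton side input.
                .apply("CountRecords", Count.globally())
                .apply("ToShardCount", MapElements.into(TypeDescriptors.integers())
                    .via((Long n) -> (int) Math.max(1, Math.min(n / 1000, 10))))
                .apply("AsSingleton", View.<Integer>asSingleton());
        }
    };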

Example 19 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class DoFnOperatorTest, method testSideInputs:

public void testSideInputs(boolean keyed) throws Exception {
    WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder = WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
    TupleTag<String> outputTag = new TupleTag<>("main-output");
    ImmutableMap<Integer, PCollectionView<?>> sideInputMapping =
        ImmutableMap.<Integer, PCollectionView<?>>builder()
            .put(1, view1)
            .put(2, view2)
            .build();
    Coder<String> keyCoder = null;
    if (keyed) {
        keyCoder = StringUtf8Coder.of();
    }
    DoFnOperator<String, String, String> doFnOperator =
        new DoFnOperator<>(
            new IdentityDoFn<String>(),
            "stepName",
            windowedValueCoder,
            outputTag,
            Collections.<TupleTag<?>>emptyList(),
            new DoFnOperator.DefaultOutputManagerFactory<String>(),
            WindowingStrategy.globalDefault(),
            sideInputMapping, /* side-input mapping */
            ImmutableList.<PCollectionView<?>>of(view1, view2), /* side inputs */
            PipelineOptionsFactory.as(FlinkPipelineOptions.class),
            keyCoder);
    TwoInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue, String> testHarness = new TwoInputStreamOperatorTestHarness<>(doFnOperator);
    if (keyed) {
        // we use a dummy key for the second input since it is considered to be broadcast
        testHarness = new KeyedTwoInputStreamOperatorTestHarness<>(doFnOperator, new StringKeySelector(), new DummyKeySelector(), BasicTypeInfo.STRING_TYPE_INFO);
    }
    testHarness.open();
    IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(100));
    IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(500));
    // Push in side-input events for both windows; the operator should retain them.
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(1, valuesInWindow(ImmutableList.of("hello", "ciao"), new Instant(0), firstWindow))));
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(2, valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(0), secondWindow))));
    // Push in regular main-input elements.
    WindowedValue<String> helloElement = valueInWindow("Hello", new Instant(0), firstWindow);
    WindowedValue<String> worldElement = valueInWindow("World", new Instant(1000), firstWindow);
    testHarness.processElement1(new StreamRecord<>(helloElement));
    testHarness.processElement1(new StreamRecord<>(worldElement));
    // Push in more side-input events; pushed-back events should be kept as well.
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(1, valuesInWindow(ImmutableList.of("hello", "ciao"), new Instant(1000), firstWindow))));
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(2, valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(1000), secondWindow))));
    assertThat(this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()), contains(helloElement, worldElement));
    testHarness.close();
}
Also used : TwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness) KeyedTwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) Instant(org.joda.time.Instant) TupleTag(org.apache.beam.sdk.values.TupleTag) FlinkPipelineOptions(org.apache.beam.runners.flink.FlinkPipelineOptions) DoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator) PCollectionView(org.apache.beam.sdk.values.PCollectionView) WindowedValue(org.apache.beam.sdk.util.WindowedValue) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
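
The views view1 and view2 referenced above are fields of the test class. A sketch of how iterable-valued views with windows matching firstWindow and secondWindow could be built in setup code (this is an assumption about the fixture, not the test's actual setup; Create is org.apache.beam.sdk.transforms.Create):

Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
PCollectionView<Iterable<String>> view1 =
    p.apply("Side1", Create.of("hello", "ciao"))
        // 100 ms fixed windows line up with firstWindow = [0, 100).
        .apply("Window1", Window.<String>into(FixedWindows.of(new Duration(100))))
        .apply("View1", View.<String>asIterable());
PCollectionView<Iterable<String>> view2 =
    p.apply("Side2", Create.of("foo", "bar"))
        // 500 ms fixed windows line up with secondWindow = [0, 500).
        .apply("Window2", Window.<String>into(FixedWindows.of(new Duration(500))))
        .apply("View2", View.<String>asIterable());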

Example 20 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class SideInputContainerTest, method getReturnsLatestPaneInWindow:

@Test
public void getReturnsLatestPaneInWindow() throws Exception {
    WindowedValue<KV<String, Integer>> one = WindowedValue.of(KV.of("one", 1), new Instant(1L), SECOND_WINDOW, PaneInfo.createPane(true, false, Timing.EARLY));
    WindowedValue<KV<String, Integer>> two = WindowedValue.of(KV.of("two", 2), new Instant(20L), SECOND_WINDOW, PaneInfo.createPane(true, false, Timing.EARLY));
    container.write(mapView, ImmutableList.<WindowedValue<?>>of(one, two));
    Map<String, Integer> viewContents = container.createReaderForViews(ImmutableList.<PCollectionView<?>>of(mapView)).get(mapView, SECOND_WINDOW);
    assertThat(viewContents, hasEntry("one", 1));
    assertThat(viewContents, hasEntry("two", 2));
    assertThat(viewContents.size(), is(2));
    WindowedValue<KV<String, Integer>> three = WindowedValue.of(KV.of("three", 3), new Instant(300L), SECOND_WINDOW, PaneInfo.createPane(false, false, Timing.EARLY, 1, -1));
    container.write(mapView, ImmutableList.<WindowedValue<?>>of(three));
    Map<String, Integer> overwrittenViewContents = container.createReaderForViews(ImmutableList.<PCollectionView<?>>of(mapView)).get(mapView, SECOND_WINDOW);
    assertThat(overwrittenViewContents, hasEntry("three", 3));
    assertThat(overwrittenViewContents.size(), is(1));
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)
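
The overwrite behavior hinges on the pane metadata: the container keeps only the most recently written pane per window, and the third value arrives in a later pane of SECOND_WINDOW. For reference, the two PaneInfo factory forms used above (a restatement of the Beam API, not test code):

// (isFirst, isLast, timing): the first early pane of the window.
PaneInfo firstEarly = PaneInfo.createPane(true, false, Timing.EARLY);
// The five-argument form adds the pane index and the on-time pane index,
// which is -1 here because no on-time pane has fired yet.
PaneInfo laterEarly = PaneInfo.createPane(false, false, Timing.EARLY, 1, -1);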

Aggregations

PCollectionView (org.apache.beam.sdk.values.PCollectionView): 20
Test (org.junit.Test): 12
Instant (org.joda.time.Instant): 10
TupleTag (org.apache.beam.sdk.values.TupleTag): 9
KV (org.apache.beam.sdk.values.KV): 8
DoFn (org.apache.beam.sdk.transforms.DoFn): 7
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 6
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 6
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 5
PCollection (org.apache.beam.sdk.values.PCollection): 5
Pipeline (org.apache.beam.sdk.Pipeline): 4
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 4
Duration (org.joda.time.Duration): 4
HashMap (java.util.HashMap): 3
Map (java.util.Map): 3
List (java.util.List): 2
TimeZone (java.util.TimeZone): 2
FlinkPipelineOptions (org.apache.beam.runners.flink.FlinkPipelineOptions): 2
DoFnOperator (org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator): 2
PipelineResult (org.apache.beam.sdk.PipelineResult): 2