Search in sources :

Example 21 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class NodesTest method testFetchReadySideInputsAndFilterBlockedStreamingSideInputsNode.

@Test
public void testFetchReadySideInputsAndFilterBlockedStreamingSideInputsNode() {
    WindowingStrategy windowingStrategy = WindowingStrategy.globalDefault();
    Map<PCollectionView<?>, RunnerApi.FunctionSpec> pcollectionViewsToWindowMappingFns = ImmutableMap.of(mock(PCollectionView.class), FunctionSpec.newBuilder().setUrn("beam:test:urn:1.0").build());
    NameContext nameContext = NameContextsForTests.nameContextForTest();
    assertSame(FetchAndFilterStreamingSideInputsNode.create(windowingStrategy, pcollectionViewsToWindowMappingFns, nameContext).getWindowingStrategy(), windowingStrategy);
    assertSame(FetchAndFilterStreamingSideInputsNode.create(windowingStrategy, pcollectionViewsToWindowMappingFns, nameContext).getPCollectionViewsToWindowMappingFns(), pcollectionViewsToWindowMappingFns);
    assertSame(FetchAndFilterStreamingSideInputsNode.create(windowingStrategy, pcollectionViewsToWindowMappingFns, nameContext).getNameContext(), nameContext);
}
Also used : DataflowPortabilityPCollectionView(org.apache.beam.runners.dataflow.worker.DataflowPortabilityPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) NameContext(org.apache.beam.runners.dataflow.worker.counters.NameContext) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) Test(org.junit.Test)

Example 22 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class DataflowSideInputHandlerFactory method forMultimapSideInput.

@Override
public <K, V, W extends BoundedWindow> MultimapSideInputHandler<K, V, W> forMultimapSideInput(String pTransformId, String sideInputId, KvCoder<K, V> elementCoder, Coder<W> windowCoder) {
    checkArgument(pTransformId != null && pTransformId.length() > 0, "Expect a valid PTransform ID.");
    SideInputReader sideInputReader = ptransformIdToSideInputReader.get(pTransformId);
    checkState(sideInputReader != null, String.format("Unknown PTransform '%s'", pTransformId));
    PCollectionView<Materializations.MultimapView<Object, Object>> view = (PCollectionView<Materializations.MultimapView<Object, Object>>) sideInputIdToPCollectionViewMap.get(RunnerApi.ExecutableStagePayload.SideInputId.newBuilder().setTransformId(pTransformId).setLocalName(sideInputId).build());
    checkState(view != null, String.format("Unknown side input '%s' on PTransform '%s'", sideInputId, pTransformId));
    checkState(Materializations.MULTIMAP_MATERIALIZATION_URN.equals(view.getViewFn().getMaterialization().getUrn()), String.format("Unknown materialization for side input '%s' on PTransform '%s' with urn '%s'", sideInputId, pTransformId, view.getViewFn().getMaterialization().getUrn()));
    checkState(view.getCoderInternal() instanceof KvCoder, String.format("Materialization of side input '%s' on PTransform '%s' expects %s but received %s.", sideInputId, pTransformId, KvCoder.class.getSimpleName(), view.getCoderInternal().getClass().getSimpleName()));
    KvCoder<K, V> kvCoder = elementCoder;
    return new DataflowMultimapSideInputHandler<>(sideInputReader, view, kvCoder.getKeyCoder(), kvCoder.getValueCoder(), windowCoder);
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) KvCoder(org.apache.beam.sdk.coders.KvCoder) SideInputReader(org.apache.beam.runners.core.SideInputReader) Materializations(org.apache.beam.sdk.transforms.Materializations)

Example 23 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class SamzaDoFnRunners method createPortable.

/**
 * Create DoFnRunner for portable runner.
 */
@SuppressWarnings("unchecked")
public static <InT, FnOutT> DoFnRunner<InT, FnOutT> createPortable(String transformId, String bundleStateId, Coder<WindowedValue<InT>> windowedValueCoder, ExecutableStage executableStage, Map<?, PCollectionView<?>> sideInputMapping, SideInputHandler sideInputHandler, SamzaStoreStateInternals.Factory<?> nonKeyedStateInternalsFactory, SamzaTimerInternalsFactory<?> timerInternalsFactory, SamzaPipelineOptions pipelineOptions, DoFnRunners.OutputManager outputManager, StageBundleFactory stageBundleFactory, TupleTag<FnOutT> mainOutputTag, Map<String, TupleTag<?>> idToTupleTagMap, Context context, String transformFullName) {
    // storing events within a bundle in states
    final BagState<WindowedValue<InT>> bundledEventsBag = nonKeyedStateInternalsFactory.stateInternalsForKey(null).state(StateNamespaces.global(), StateTags.bag(bundleStateId, windowedValueCoder));
    final StateRequestHandler stateRequestHandler = SamzaStateRequestHandlers.of(transformId, context.getTaskContext(), pipelineOptions, executableStage, stageBundleFactory, (Map<RunnerApi.ExecutableStagePayload.SideInputId, PCollectionView<?>>) sideInputMapping, sideInputHandler);
    final SamzaExecutionContext executionContext = (SamzaExecutionContext) context.getApplicationContainerContext();
    final DoFnRunner<InT, FnOutT> underlyingRunner = new SdkHarnessDoFnRunner<>(timerInternalsFactory, WindowUtils.getWindowStrategy(executableStage.getInputPCollection().getId(), executableStage.getComponents()), outputManager, stageBundleFactory, idToTupleTagMap, bundledEventsBag, stateRequestHandler);
    return pipelineOptions.getEnableMetrics() ? DoFnRunnerWithMetrics.wrap(underlyingRunner, executionContext.getMetricsContainer(), transformFullName) : underlyingRunner;
}
Also used : SamzaExecutionContext(org.apache.beam.runners.samza.SamzaExecutionContext) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) StateRequestHandler(org.apache.beam.runners.fnexecution.state.StateRequestHandler) PCollectionView(org.apache.beam.sdk.values.PCollectionView) WindowedValue(org.apache.beam.sdk.util.WindowedValue)

Example 24 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class DoFnOperatorTest method keyedParDoSideInputCheckpointing.

@Test
public void keyedParDoSideInputCheckpointing() throws Exception {
    sideInputCheckpointing(() -> {
        StringUtf8Coder keyCoder = StringUtf8Coder.of();
        Coder<WindowedValue<String>> coder = WindowedValue.getFullCoder(keyCoder, IntervalWindow.getCoder());
        TupleTag<String> outputTag = new TupleTag<>("main-output");
        KeySelector<WindowedValue<String>, ByteBuffer> keySelector = e -> FlinkKeyUtils.encodeKey(e.getValue(), keyCoder);
        ImmutableMap<Integer, PCollectionView<?>> sideInputMapping = ImmutableMap.<Integer, PCollectionView<?>>builder().put(1, view1).put(2, view2).build();
        DoFnOperator<String, String> doFnOperator = new DoFnOperator<>(new IdentityDoFn<>(), "stepName", coder, Collections.emptyMap(), outputTag, Collections.emptyList(), new DoFnOperator.MultiOutputOutputManagerFactory<>(outputTag, coder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())), WindowingStrategy.of(FixedWindows.of(Duration.millis(100))), sideInputMapping, /* side-input mapping */
        ImmutableList.of(view1, view2), /* side inputs */
        FlinkPipelineOptions.defaults(), keyCoder, keySelector, DoFnSchemaInformation.create(), Collections.emptyMap());
        return new KeyedTwoInputStreamOperatorTestHarness<>(doFnOperator, keySelector, // we use a dummy key for the second input since it is considered to be broadcast
        null, new CoderTypeInformation<>(FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
    });
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) Arrays(java.util.Arrays) StateNamespace(org.apache.beam.runners.core.StateNamespace) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) IsIterableContainingInOrder.contains(org.hamcrest.collection.IsIterableContainingInOrder.contains) FlinkPipelineOptions(org.apache.beam.runners.flink.FlinkPipelineOptions) TimerSpecs(org.apache.beam.sdk.state.TimerSpecs) DoFnRunner(org.apache.beam.runners.core.DoFnRunner) FlinkMetricContainer(org.apache.beam.runners.flink.metrics.FlinkMetricContainer) StepContext(org.apache.beam.runners.core.StepContext) ValueState(org.apache.beam.sdk.state.ValueState) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) KeyedOneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) TimerInternals(org.apache.beam.runners.core.TimerInternals) ByteBuffer(java.nio.ByteBuffer) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) OneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness) TypeFactory(com.fasterxml.jackson.databind.type.TypeFactory) Create(org.apache.beam.sdk.transforms.Create) TwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) LRUMap(com.fasterxml.jackson.databind.util.LRUMap) Window(org.apache.beam.sdk.transforms.windowing.Window) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) FluentIterable(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.FluentIterable) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) KvCoder(org.apache.beam.sdk.coders.KvCoder) KeySelector(org.apache.flink.api.java.functions.KeySelector) KeyedTwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness) PaneInfo(org.apache.beam.sdk.transforms.windowing.PaneInfo) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) OutputTag(org.apache.flink.util.OutputTag) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) Collectors(java.util.stream.Collectors) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) Objects(java.util.Objects) List(java.util.List) WatermarkHoldState(org.apache.beam.sdk.state.WatermarkHoldState) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Timer(org.apache.beam.sdk.state.Timer) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Optional(java.util.Optional) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) Matchers.is(org.hamcrest.Matchers.is) StateTag(org.apache.beam.runners.core.StateTag) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) Whitebox(org.powermock.reflect.Whitebox) KV(org.apache.beam.sdk.values.KV) Assert.assertThrows(org.junit.Assert.assertThrows) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) View(org.apache.beam.sdk.transforms.View) StateNamespaces(org.apache.beam.runners.core.StateNamespaces) Supplier(java.util.function.Supplier) StateTags(org.apache.beam.runners.core.StateTags) ArrayList(java.util.ArrayList) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) TimerSpec(org.apache.beam.sdk.state.TimerSpec) CoderTypeSerializer(org.apache.beam.runners.flink.translation.types.CoderTypeSerializer) TupleTag(org.apache.beam.sdk.values.TupleTag) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Pipeline(org.apache.beam.sdk.Pipeline) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Before(org.junit.Before) DoFn(org.apache.beam.sdk.transforms.DoFn) PCollectionViewTesting(org.apache.beam.sdk.testing.PCollectionViewTesting) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Function(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Function) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Mockito(org.mockito.Mockito) Matchers.emptyIterable(org.hamcrest.Matchers.emptyIterable) StateSpecs(org.apache.beam.sdk.state.StateSpecs) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Collections(java.util.Collections) TimeDomain(org.apache.beam.sdk.state.TimeDomain) Assert.assertEquals(org.junit.Assert.assertEquals) KeyedTwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness) TupleTag(org.apache.beam.sdk.values.TupleTag) ByteBuffer(java.nio.ByteBuffer) PCollectionView(org.apache.beam.sdk.values.PCollectionView) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) Test(org.junit.Test)

Example 25 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project DataflowJavaSDK-examples by GoogleCloudPlatform.

the class GameStats method main.

public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);
    // Read Events from Pub/Sub using custom timestamps
    PCollection<GameActionInfo> rawEvents = pipeline.apply(PubsubIO.readStrings().withTimestampAttribute(TIMESTAMP_ATTRIBUTE).fromTopic(options.getTopic())).apply("ParseGameEvent", ParDo.of(new ParseEventFn()));
    // Extract username/score pairs from the event stream
    PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore", MapElements.into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())).via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));
    // Calculate the total score per user over fixed windows, and
    // cumulative updates for late data.
    final PCollectionView<Map<String, Integer>> spammersView = userEvents.apply("FixedWindowsUser", Window.<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))).apply("CalculateSpammyUsers", new CalculateSpammyUsers()).apply("CreateSpammersView", View.<String, Integer>asMap());
    // [START DocInclude_FilterAndCalc]
    // Calculate the total score per team over fixed windows,
    // and emit cumulative updates for late data. Uses the side input derived above-- the set of
    // suspected robots-- to filter out scores from those users from the sum.
    // Write the results to BigQuery.
    rawEvents.apply("WindowIntoFixedWindows", Window.<GameActionInfo>into(FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))).apply("FilterOutSpammers", ParDo.of(new DoFn<GameActionInfo, GameActionInfo>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            // If the user is not in the spammers Map, output the data element.
            if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
                c.output(c.element());
            }
        }
    }).withSideInputs(spammersView)).apply("ExtractTeamScore", new ExtractAndSumScore("team")).apply("WriteTeamSums", new WriteWindowedToBigQuery<KV<String, Integer>>(options.as(GcpOptions.class).getProject(), options.getDataset(), options.getGameStatsTablePrefix() + "_team", configureWindowedWrite()));
    // [START DocInclude_SessionCalc]
    // Detect user sessions-- that is, a burst of activity separated by a gap from further
    // activity. Find and record the mean session lengths.
    // This information could help the game designers track the changing user engagement
    // as their set of games changes.
    userEvents.apply("WindowIntoSessions", Window.<KV<String, Integer>>into(Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap()))).withTimestampCombiner(TimestampCombiner.END_OF_WINDOW)).apply(Combine.perKey(x -> 0)).apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())).apply("WindowToExtractSessionMean", Window.<Integer>into(FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration())))).apply(Mean.<Integer>globally().withoutDefaults()).apply("WriteAvgSessionLength", new WriteWindowedToBigQuery<Double>(options.as(GcpOptions.class).getProject(), options.getDataset(), options.getGameStatsTablePrefix() + "_sessions", configureSessionWindowWrite()));
    // [END DocInclude_Rewindow]
    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
}
Also used : KV(org.apache.beam.sdk.values.KV) DateTimeZone(org.joda.time.DateTimeZone) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) PipelineResult(org.apache.beam.sdk.PipelineResult) Default(org.apache.beam.sdk.options.Default) Combine(org.apache.beam.sdk.transforms.Combine) Duration(org.joda.time.Duration) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) View(org.apache.beam.sdk.transforms.View) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) Metrics(org.apache.beam.sdk.metrics.Metrics) Description(org.apache.beam.sdk.options.Description) PTransform(org.apache.beam.sdk.transforms.PTransform) Sessions(org.apache.beam.sdk.transforms.windowing.Sessions) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) WriteWindowedToBigQuery(com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery) Pipeline(org.apache.beam.sdk.Pipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) MapElements(org.apache.beam.sdk.transforms.MapElements) DateTimeFormat(org.joda.time.format.DateTimeFormat) Logger(org.slf4j.Logger) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) DateTimeFormatter(org.joda.time.format.DateTimeFormatter) TimeZone(java.util.TimeZone) Counter(org.apache.beam.sdk.metrics.Counter) Sum(org.apache.beam.sdk.transforms.Sum) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) PCollection(org.apache.beam.sdk.values.PCollection) Mean(org.apache.beam.sdk.transforms.Mean) ExampleUtils(com.google.cloud.dataflow.examples.common.ExampleUtils) PubsubIO(org.apache.beam.sdk.io.gcp.pubsub.PubsubIO) ParDo(org.apache.beam.sdk.transforms.ParDo) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Instant(org.joda.time.Instant) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Values(org.apache.beam.sdk.transforms.Values) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) ExampleUtils(com.google.cloud.dataflow.examples.common.ExampleUtils) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

PCollectionView (org.apache.beam.sdk.values.PCollectionView)67 Map (java.util.Map)29 HashMap (java.util.HashMap)28 Test (org.junit.Test)28 TupleTag (org.apache.beam.sdk.values.TupleTag)27 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)22 Coder (org.apache.beam.sdk.coders.Coder)21 KV (org.apache.beam.sdk.values.KV)20 Instant (org.joda.time.Instant)20 KvCoder (org.apache.beam.sdk.coders.KvCoder)18 WindowedValue (org.apache.beam.sdk.util.WindowedValue)18 PCollection (org.apache.beam.sdk.values.PCollection)18 DoFn (org.apache.beam.sdk.transforms.DoFn)16 ArrayList (java.util.ArrayList)15 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)14 List (java.util.List)13 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)13 IOException (java.io.IOException)12 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)12 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)10