Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
The class NodesTest, method testFetchReadySideInputsAndFilterBlockedStreamingSideInputsNode:
@Test
public void testFetchReadySideInputsAndFilterBlockedStreamingSideInputsNode() {
  WindowingStrategy windowingStrategy = WindowingStrategy.globalDefault();
  Map<PCollectionView<?>, RunnerApi.FunctionSpec> pcollectionViewsToWindowMappingFns =
      ImmutableMap.of(
          mock(PCollectionView.class),
          FunctionSpec.newBuilder().setUrn("beam:test:urn:1.0").build());
  NameContext nameContext = NameContextsForTests.nameContextForTest();
  assertSame(
      FetchAndFilterStreamingSideInputsNode.create(
              windowingStrategy, pcollectionViewsToWindowMappingFns, nameContext)
          .getWindowingStrategy(),
      windowingStrategy);
  assertSame(
      FetchAndFilterStreamingSideInputsNode.create(
              windowingStrategy, pcollectionViewsToWindowMappingFns, nameContext)
          .getPCollectionViewsToWindowMappingFns(),
      pcollectionViewsToWindowMappingFns);
  assertSame(
      FetchAndFilterStreamingSideInputsNode.create(
              windowingStrategy, pcollectionViewsToWindowMappingFns, nameContext)
          .getNameContext(),
      nameContext);
}
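Since all three assertions construct an identical node, an equivalent and slightly tighter form of the same test body builds the node once (a sketch using only the calls already shown above):

FetchAndFilterStreamingSideInputsNode node =
    FetchAndFilterStreamingSideInputsNode.create(
        windowingStrategy, pcollectionViewsToWindowMappingFns, nameContext);
// Each accessor should return the exact instance that was passed to create().
assertSame(node.getWindowingStrategy(), windowingStrategy);
assertSame(node.getPCollectionViewsToWindowMappingFns(), pcollectionViewsToWindowMappingFns);
assertSame(node.getNameContext(), nameContext);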
Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
The class DataflowSideInputHandlerFactory, method forMultimapSideInput:
@Override
public <K, V, W extends BoundedWindow> MultimapSideInputHandler<K, V, W> forMultimapSideInput(
    String pTransformId, String sideInputId, KvCoder<K, V> elementCoder, Coder<W> windowCoder) {
  checkArgument(
      pTransformId != null && pTransformId.length() > 0, "Expect a valid PTransform ID.");
  SideInputReader sideInputReader = ptransformIdToSideInputReader.get(pTransformId);
  checkState(sideInputReader != null, String.format("Unknown PTransform '%s'", pTransformId));
  PCollectionView<Materializations.MultimapView<Object, Object>> view =
      (PCollectionView<Materializations.MultimapView<Object, Object>>)
          sideInputIdToPCollectionViewMap.get(
              RunnerApi.ExecutableStagePayload.SideInputId.newBuilder()
                  .setTransformId(pTransformId)
                  .setLocalName(sideInputId)
                  .build());
  checkState(
      view != null,
      String.format("Unknown side input '%s' on PTransform '%s'", sideInputId, pTransformId));
  checkState(
      Materializations.MULTIMAP_MATERIALIZATION_URN.equals(
          view.getViewFn().getMaterialization().getUrn()),
      String.format(
          "Unknown materialization for side input '%s' on PTransform '%s' with urn '%s'",
          sideInputId, pTransformId, view.getViewFn().getMaterialization().getUrn()));
  checkState(
      view.getCoderInternal() instanceof KvCoder,
      String.format(
          "Materialization of side input '%s' on PTransform '%s' expects %s but received %s.",
          sideInputId,
          pTransformId,
          KvCoder.class.getSimpleName(),
          view.getCoderInternal().getClass().getSimpleName()));
  KvCoder<K, V> kvCoder = elementCoder;
  return new DataflowMultimapSideInputHandler<>(
      sideInputReader, view, kvCoder.getKeyCoder(), kvCoder.getValueCoder(), windowCoder);
}
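A hedged invocation sketch: the factory variable, IDs, and coders below are hypothetical stand-ins; only the method signature comes from the snippet above.

// Hypothetical call site; "factory" is a DataflowSideInputHandlerFactory built elsewhere.
MultimapSideInputHandler<String, Long, GlobalWindow> handler =
    factory.forMultimapSideInput(
        "myParDo", // hypothetical PTransform ID
        "mySideInput", // hypothetical side input local name
        KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of()),
        GlobalWindow.Coder.INSTANCE);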
Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
The class SamzaDoFnRunners, method createPortable:
/**
 * Creates a {@link DoFnRunner} for the portable runner.
*/
@SuppressWarnings("unchecked")
public static <InT, FnOutT> DoFnRunner<InT, FnOutT> createPortable(
    String transformId,
    String bundleStateId,
    Coder<WindowedValue<InT>> windowedValueCoder,
    ExecutableStage executableStage,
    Map<?, PCollectionView<?>> sideInputMapping,
    SideInputHandler sideInputHandler,
    SamzaStoreStateInternals.Factory<?> nonKeyedStateInternalsFactory,
    SamzaTimerInternalsFactory<?> timerInternalsFactory,
    SamzaPipelineOptions pipelineOptions,
    DoFnRunners.OutputManager outputManager,
    StageBundleFactory stageBundleFactory,
    TupleTag<FnOutT> mainOutputTag,
    Map<String, TupleTag<?>> idToTupleTagMap,
    Context context,
    String transformFullName) {
  // storing events within a bundle in states
  final BagState<WindowedValue<InT>> bundledEventsBag =
      nonKeyedStateInternalsFactory
          .stateInternalsForKey(null)
          .state(StateNamespaces.global(), StateTags.bag(bundleStateId, windowedValueCoder));
  final StateRequestHandler stateRequestHandler =
      SamzaStateRequestHandlers.of(
          transformId,
          context.getTaskContext(),
          pipelineOptions,
          executableStage,
          stageBundleFactory,
          (Map<RunnerApi.ExecutableStagePayload.SideInputId, PCollectionView<?>>) sideInputMapping,
          sideInputHandler);
  final SamzaExecutionContext executionContext =
      (SamzaExecutionContext) context.getApplicationContainerContext();
  final DoFnRunner<InT, FnOutT> underlyingRunner =
      new SdkHarnessDoFnRunner<>(
          timerInternalsFactory,
          WindowUtils.getWindowStrategy(
              executableStage.getInputPCollection().getId(), executableStage.getComponents()),
          outputManager,
          stageBundleFactory,
          idToTupleTagMap,
          bundledEventsBag,
          stateRequestHandler);
  return pipelineOptions.getEnableMetrics()
      ? DoFnRunnerWithMetrics.wrap(
          underlyingRunner, executionContext.getMetricsContainer(), transformFullName)
      : underlyingRunner;
}
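The returned runner is then driven through the standard DoFnRunner bundle lifecycle; a minimal sketch (the runner and windowedValue variables are hypothetical stand-ins):

// "runner" is the DoFnRunner returned by createPortable(...) above.
runner.startBundle();
runner.processElement(windowedValue); // a WindowedValue<InT>; hypothetical element
runner.finishBundle();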
Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
The class DoFnOperatorTest, method keyedParDoSideInputCheckpointing:
@Test
public void keyedParDoSideInputCheckpointing() throws Exception {
  sideInputCheckpointing(
      () -> {
        StringUtf8Coder keyCoder = StringUtf8Coder.of();
        Coder<WindowedValue<String>> coder =
            WindowedValue.getFullCoder(keyCoder, IntervalWindow.getCoder());
        TupleTag<String> outputTag = new TupleTag<>("main-output");
        KeySelector<WindowedValue<String>, ByteBuffer> keySelector =
            e -> FlinkKeyUtils.encodeKey(e.getValue(), keyCoder);
        ImmutableMap<Integer, PCollectionView<?>> sideInputMapping =
            ImmutableMap.<Integer, PCollectionView<?>>builder()
                .put(1, view1)
                .put(2, view2)
                .build();
        DoFnOperator<String, String> doFnOperator =
            new DoFnOperator<>(
                new IdentityDoFn<>(),
                "stepName",
                coder,
                Collections.emptyMap(),
                outputTag,
                Collections.emptyList(),
                new DoFnOperator.MultiOutputOutputManagerFactory<>(
                    outputTag,
                    coder,
                    new SerializablePipelineOptions(FlinkPipelineOptions.defaults())),
                WindowingStrategy.of(FixedWindows.of(Duration.millis(100))),
                sideInputMapping, /* side-input mapping */
                ImmutableList.of(view1, view2), /* side inputs */
                FlinkPipelineOptions.defaults(),
                keyCoder,
                keySelector,
                DoFnSchemaInformation.create(),
                Collections.emptyMap());
        return new KeyedTwoInputStreamOperatorTestHarness<>(
            doFnOperator,
            keySelector,
            // we use a dummy key for the second input since it is considered to be broadcast
            null,
            new CoderTypeInformation<>(
                FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
      });
}
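A hedged sketch of how a harness like the one returned above is typically exercised (the harness methods come from Flink's test utilities; the element value and checkpoint IDs are hypothetical):

testHarness.open();
testHarness.processElement1(
    new StreamRecord<>(WindowedValue.valueInGlobalWindow("event"))); // hypothetical element
OperatorSubtaskState snapshot = testHarness.snapshot(0L, 0L); // trigger a checkpoint
testHarness.close();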
Use of org.apache.beam.sdk.values.PCollectionView in project DataflowJavaSDK-examples by GoogleCloudPlatform.
The class GameStats, method main:
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  // Enforce that this pipeline is always run in streaming mode.
  options.setStreaming(true);
  ExampleUtils exampleUtils = new ExampleUtils(options);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from Pub/Sub using custom timestamps.
  PCollection<GameActionInfo> rawEvents =
      pipeline
          .apply(
              PubsubIO.readStrings()
                  .withTimestampAttribute(TIMESTAMP_ATTRIBUTE)
                  .fromTopic(options.getTopic()))
          .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));

  // Extract username/score pairs from the event stream.
  PCollection<KV<String, Integer>> userEvents =
      rawEvents.apply(
          "ExtractUserScore",
          MapElements.into(
                  TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
              .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

  // Calculate the total score per user over fixed windows, with cumulative updates for
  // late data, and derive a view of suspected spammers to use as a side input below.
  final PCollectionView<Map<String, Integer>> spammersView =
      userEvents
          .apply(
              "FixedWindowsUser",
              Window.<KV<String, Integer>>into(
                  FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
          .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
          .apply("CreateSpammersView", View.<String, Integer>asMap());
  // [START DocInclude_FilterAndCalc]
  // Calculate the total score per team over fixed windows, and emit cumulative updates for
  // late data. Uses the side input derived above -- the set of suspected robots -- to filter
  // out scores from those users from the sum. Write the results to BigQuery.
  rawEvents
      .apply(
          "WindowIntoFixedWindows",
          Window.<GameActionInfo>into(
              FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
      .apply(
          "FilterOutSpammers",
          ParDo.of(
                  new DoFn<GameActionInfo, GameActionInfo>() {

                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      // If the user is not in the spammers map, output the data element.
                      if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
                        c.output(c.element());
                      }
                    }
                  })
              .withSideInputs(spammersView))
      .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
      .apply(
          "WriteTeamSums",
          new WriteWindowedToBigQuery<KV<String, Integer>>(
              options.as(GcpOptions.class).getProject(),
              options.getDataset(),
              options.getGameStatsTablePrefix() + "_team",
              configureWindowedWrite()));
  // [END DocInclude_FilterAndCalc]
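  // NOTE: The spam check above relies on get(...) == null. Since the spammers map never
  // stores null values, an equivalent and arguably clearer form of the same predicate
  // would be (a sketch, not part of the original example):
  //
  //   Map<String, Integer> spammers = c.sideInput(spammersView);
  //   if (!spammers.containsKey(c.element().getUser().trim())) {
  //     c.output(c.element());
  //   }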
  // [START DocInclude_SessionCalc]
  // Detect user sessions -- that is, a burst of activity separated by a gap from further
  // activity. Find and record the mean session lengths. This information could help the
  // game designers track changing user engagement as their set of games changes.
  userEvents
      .apply(
          "WindowIntoSessions",
          Window.<KV<String, Integer>>into(
                  Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
              .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW))
      // Only the existence of each session matters here, so collapse its values.
      .apply(Combine.perKey(x -> 0))
      .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
      .apply(
          "WindowToExtractSessionMean",
          Window.<Integer>into(
              FixedWindows.of(
                  Duration.standardMinutes(options.getUserActivityWindowDuration()))))
      .apply(Mean.<Integer>globally().withoutDefaults())
      .apply(
          "WriteAvgSessionLength",
          new WriteWindowedToBigQuery<Double>(
              options.as(GcpOptions.class).getProject(),
              options.getDataset(),
              options.getGameStatsTablePrefix() + "_sessions",
              configureSessionWindowWrite()));
  // [END DocInclude_SessionCalc]
  // Run the pipeline and wait for it to finish; capture cancellation requests from the
  // command line.
  PipelineResult result = pipeline.run();
  exampleUtils.waitToFinish(result);
}
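One caveat on the CreateSpammersView step above: View.asMap requires each key to appear at most once per window, which should hold here because CalculateSpammyUsers emits at most one entry per user. If that invariant could not be guaranteed, View.asMultimap would be the safer choice; a minimal sketch (suspiciousUsers is a hypothetical stand-in for the CalculateSpammyUsers output):

PCollectionView<Map<String, Iterable<Integer>>> spammersMultimapView =
    suspiciousUsers.apply("CreateSpammersView", View.<String, Integer>asMultimap());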