use of org.apache.beam.sdk.values.PCollectionView in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class GameStats method main.
public static void main(String[] args) throws Exception {
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
// Enforce that this pipeline is always run in streaming mode.
options.setStreaming(true);
ExampleUtils exampleUtils = new ExampleUtils(options);
Pipeline pipeline = Pipeline.create(options);
// Read Events from Pub/Sub using custom timestamps
PCollection<GameActionInfo> rawEvents = pipeline
    .apply(PubsubIO.readStrings()
        .withTimestampAttribute(TIMESTAMP_ATTRIBUTE)
        .fromTopic(options.getTopic()))
    .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));
// Extract username/score pairs from the event stream
PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore",
    MapElements
        .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
        .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));
// Calculate the total score per user over fixed windows, and
// cumulative updates for late data.
final PCollectionView<Map<String, Integer>> spammersView = userEvents
    .apply("FixedWindowsUser", Window.<KV<String, Integer>>into(
        FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
    .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
    .apply("CreateSpammersView", View.<String, Integer>asMap());
// [START DocInclude_FilterAndCalc]
// Calculate the total score per team over fixed windows, and emit cumulative updates for
// late data. Uses the side input derived above (the set of suspected robots) to filter
// those users' scores out of the sum. Write the results to BigQuery.
rawEvents
    .apply("WindowIntoFixedWindows", Window.<GameActionInfo>into(
        FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
    .apply("FilterOutSpammers", ParDo.of(new DoFn<GameActionInfo, GameActionInfo>() {
      @ProcessElement
      public void processElement(ProcessContext c) {
        // If the user is not in the spammers Map, output the data element.
        if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
          c.output(c.element());
        }
      }
    }).withSideInputs(spammersView))
    .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
    // [END DocInclude_FilterAndCalc]
    .apply("WriteTeamSums", new WriteWindowedToBigQuery<KV<String, Integer>>(
        options.as(GcpOptions.class).getProject(),
        options.getDataset(),
        options.getGameStatsTablePrefix() + "_team",
        configureWindowedWrite()));
// [START DocInclude_SessionCalc]
// Detect user sessions, i.e. a burst of activity separated by a gap from further activity.
// Find and record the mean session lengths. This information could help the game designers
// track how user engagement changes as their set of games evolves.
userEvents
    .apply("WindowIntoSessions", Window.<KV<String, Integer>>into(
        Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
        .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW))
    .apply(Combine.perKey(x -> 0))
    .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
    // [END DocInclude_SessionCalc]
    // [START DocInclude_Rewindow]
    .apply("WindowToExtractSessionMean", Window.<Integer>into(
        FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
    .apply(Mean.<Integer>globally().withoutDefaults())
    .apply("WriteAvgSessionLength", new WriteWindowedToBigQuery<Double>(
        options.as(GcpOptions.class).getProject(),
        options.getDataset(),
        options.getGameStatsTablePrefix() + "_sessions",
        configureSessionWindowWrite()));
// [END DocInclude_Rewindow]
// Run the pipeline and wait for it to finish; capture cancellation requests from the
// command line.
PipelineResult result = pipeline.run();
exampleUtils.waitToFinish(result);
}
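The key technique above is the Map-valued side input: View.asMap() turns the per-window spammer totals into a PCollectionView<Map<String, Integer>> that the filtering DoFn reads per element via c.sideInput(spammersView). Below is a minimal, batch-mode sketch of that pattern in isolation; the BlocklistFilterSketch name, the sample data, and the filtering predicate are illustrative stand-ins, not part of GameStats.
import java.util.Map;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class BlocklistFilterSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Hypothetical blocklist (user -> score), standing in for spammersView above.
    final PCollectionView<Map<String, Integer>> blocklistView = p
        .apply("CreateBlocklist", Create.of(KV.of("spammer", 9999)))
        .apply("BlocklistAsMap", View.<String, Integer>asMap());

    // Keep only users absent from the blocklist map, mirroring FilterOutSpammers.
    PCollection<String> kept = p
        .apply("CreateUsers", Create.of("alice", "spammer", "bob"))
        .apply("FilterBlocked", ParDo.of(new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            if (!c.sideInput(blocklistView).containsKey(c.element())) {
              c.output(c.element());
            }
          }
        }).withSideInputs(blocklistView));

    p.run().waitUntilFinish();
  }
}
A map side input is generally expected to fit comfortably in worker memory, which is one reason GameStats first reduces the event stream to a small per-window map of suspected spammers before consuming it as a side input.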
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class WriteFiles method createWrite.
/**
 * A write is performed as a sequence of three {@link ParDo}'s.
 *
 * <p>In the first, do-once ParDo, the {@link WriteOperation} is initialized and emitted as a
 * singleton collection. That singleton collection is then used as a side
 * input to a ParDo over the PCollection of elements to write. In this bundle-writing phase,
 * {@link WriteOperation#createWriter} is called to obtain a {@link Writer}.
 * {@link Writer#open} and {@link Writer#close} are called in
 * {@link DoFn.StartBundle} and {@link DoFn.FinishBundle}, respectively, and the
 * {@link Writer#write} method is called for every element in the bundle. The output
 * of this ParDo is a PCollection of <i>writer result</i> objects (see {@link FileBasedSink}
 * for a description of writer results), one for each bundle.
 *
 * <p>The final do-once ParDo uses a singleton collection as input and the collection of writer
 * results as a side input. In this ParDo, {@link WriteOperation#finalize} is called
 * to finalize the write.
*
* <p>If the write of any element in the PCollection fails, {@link Writer#close} will be
* called before the exception that caused the write to fail is propagated and the write result
* will be discarded.
*
* <p>Since the {@link WriteOperation} is serialized after the initialization ParDo and
* deserialized in the bundle-writing and finalization phases, any state change to the
* WriteOperation object that occurs during initialization is visible in the latter
* phases. However, the WriteOperation is not serialized after the bundle-writing
* phase. This is why implementations should guarantee that
* {@link WriteOperation#createWriter} does not mutate the WriteOperation.
*/
private PDone createWrite(PCollection<T> input) {
Pipeline p = input.getPipeline();
if (!windowedWrites) {
// Re-window the data into the global window and remove any existing triggers.
input = input.apply(Window.<T>into(new GlobalWindows())
    .triggering(DefaultTrigger.of())
    .discardingFiredPanes());
}
// Perform the per-bundle writes as a ParDo on the input PCollection (with the
// WriteOperation as a side input) and collect the results of the writes in a
// PCollection. There is a dependency between this ParDo and the first (the
// WriteOperation PCollection as a side input), so this will happen after the
// initial ParDo.
PCollection<FileResult> results;
final PCollectionView<Integer> numShardsView;
Coder<BoundedWindow> shardedWindowCoder =
    (Coder<BoundedWindow>) input.getWindowingStrategy().getWindowFn().windowCoder();
if (computeNumShards == null && numShardsProvider == null) {
numShardsView = null;
results = input.apply("WriteBundles", ParDo.of(windowedWrites ? new WriteWindowedBundles() : new WriteUnwindowedBundles()));
} else {
List<PCollectionView<?>> sideInputs = Lists.newArrayList();
if (computeNumShards != null) {
numShardsView = input.apply(computeNumShards);
sideInputs.add(numShardsView);
} else {
numShardsView = null;
}
PCollection<KV<Integer, Iterable<T>>> sharded = input
    .apply("ApplyShardLabel", ParDo.of(new ApplyShardingKey<T>(
        numShardsView, (numShardsView != null) ? null : numShardsProvider))
        .withSideInputs(sideInputs))
    .apply("GroupIntoShards", GroupByKey.<Integer, T>create());
shardedWindowCoder =
    (Coder<BoundedWindow>) sharded.getWindowingStrategy().getWindowFn().windowCoder();
results = sharded.apply("WriteShardedBundles", ParDo.of(new WriteShardedBundles()));
}
results.setCoder(FileResultCoder.of(shardedWindowCoder));
if (windowedWrites) {
// When processing streaming windowed writes, results will arrive multiple times. This
// means we can't share the below implementation that turns the results into a side input,
// as new data arriving into a side input does not trigger the listening DoFn. Instead
// we aggregate the result set using a singleton GroupByKey, so the DoFn will be triggered
// whenever new data arrives.
PCollection<KV<Void, FileResult>> keyedResults =
    results.apply("AttachSingletonKey", WithKeys.<Void, FileResult>of((Void) null));
keyedResults.setCoder(KvCoder.of(VoidCoder.of(), FileResultCoder.of(shardedWindowCoder)));
// Is the continuation trigger sufficient?
keyedResults.apply("FinalizeGroupByKey", GroupByKey.<Void, FileResult>create()).apply("Finalize", ParDo.of(new DoFn<KV<Void, Iterable<FileResult>>, Integer>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
LOG.info("Finalizing write operation {}.", writeOperation);
List<FileResult> results = Lists.newArrayList(c.element().getValue());
writeOperation.finalize(results);
LOG.debug("Done finalizing write operation");
}
}));
} else {
final PCollectionView<Iterable<FileResult>> resultsView = results.apply(View.<FileResult>asIterable());
ImmutableList.Builder<PCollectionView<?>> sideInputs =
    ImmutableList.<PCollectionView<?>>builder().add(resultsView);
if (numShardsView != null) {
sideInputs.add(numShardsView);
}
// Finalize the write in another do-once ParDo on the singleton collection containing the
// Writer. The results from the per-bundle writes are given as an Iterable side input.
// The WriteOperation's state is the same as after its initialization in the first
// do-once ParDo. There is a dependency between this ParDo and the parallel write (the writer
// results collection as a side input), so it will happen after the parallel write.
// For the non-windowed case, we guarantee that if no data is written but the user has
// set numShards, then all shards will be written out as empty files. For this reason we
// use a side input here.
PCollection<Void> singletonCollection = p.apply(Create.of((Void) null));
singletonCollection.apply("Finalize", ParDo.of(new DoFn<Void, Integer>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
LOG.info("Finalizing write operation {}.", writeOperation);
List<FileResult> results = Lists.newArrayList(c.sideInput(resultsView));
LOG.debug("Side input initialized to finalize write operation {}.", writeOperation);
// We must always output at least 1 shard, and honor user-specified numShards if
// set.
int minShardsNeeded;
if (numShardsView != null) {
minShardsNeeded = c.sideInput(numShardsView);
} else if (numShardsProvider != null) {
minShardsNeeded = numShardsProvider.get();
} else {
minShardsNeeded = 1;
}
int extraShardsNeeded = minShardsNeeded - results.size();
if (extraShardsNeeded > 0) {
LOG.info("Creating {} empty output shards in addition to {} written for a total of {}.", extraShardsNeeded, results.size(), minShardsNeeded);
for (int i = 0; i < extraShardsNeeded; ++i) {
Writer<T> writer = writeOperation.createWriter();
writer.openUnwindowed(UUID.randomUUID().toString(), UNKNOWN_SHARDNUM);
FileResult emptyWrite = writer.close();
results.add(emptyWrite);
}
LOG.debug("Done creating extra shards.");
}
writeOperation.finalize(results);
LOG.debug("Done finalizing write operation {}", writeOperation);
}
}).withSideInputs(sideInputs.build()));
}
return PDone.in(input.getPipeline());
}
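Two side-input idioms in createWrite deserve a standalone look. In the batch branch, finalization is a do-once ParDo over a one-element collection, with the per-bundle results delivered as an Iterable side input; the side input both carries the data and forces finalization to run after all writes complete. In the streaming branch, a GroupByKey under a single Void key plays the same role, because new data arriving in a side input would not re-trigger the listening DoFn. Here is a stripped-down sketch of the batch idiom; every name in it is invented for illustration, and it is not the WriteFiles implementation itself.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class DoOnceFinalizeSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Stand-in for the per-bundle writer results.
    PCollection<String> results = p.apply("FakeResults", Create.of("shard-0", "shard-1"));
    final PCollectionView<Iterable<String>> resultsView =
        results.apply(View.<String>asIterable());

    // A do-once ParDo: one element, so exactly one invocation. The side input both carries
    // the data and sequences this step after 'results' is fully computed.
    p.apply("Singleton", Create.of((Void) null).withCoder(VoidCoder.of()))
        .apply("FinalizeOnce", ParDo.of(new DoFn<Void, Void>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            for (String r : c.sideInput(resultsView)) {
              // WriteFiles would rename temporary files here; we only log them.
              System.out.println("finalizing " + r);
            }
          }
        }).withSideInputs(resultsView));

    p.run().waitUntilFinish();
  }
}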
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class SimplePushbackSideInputDoFnRunnerTest method processElementSideInputReadyAllWindows.
@Test
public void processElementSideInputReadyAllWindows() {
when(reader.isReady(Mockito.eq(singletonView), Mockito.any(BoundedWindow.class)))
    .thenReturn(true);
ImmutableList<PCollectionView<?>> views = ImmutableList.<PCollectionView<?>>of(singletonView);
SimplePushbackSideInputDoFnRunner<Integer, Integer> runner = createRunner(views);
WindowedValue<Integer> multiWindow = WindowedValue.of(
    2,
    new Instant(-2),
    ImmutableList.of(
        new IntervalWindow(new Instant(-500L), new Instant(0L)),
        new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L)),
        GlobalWindow.INSTANCE),
    PaneInfo.ON_TIME_AND_ONLY_FIRING);
Iterable<WindowedValue<Integer>> multiWindowPushback = runner.processElementInReadyWindows(multiWindow);
assertThat(multiWindowPushback, emptyIterable());
assertThat(
    underlying.inputElems,
    containsInAnyOrder(ImmutableList.copyOf(multiWindow.explodeWindows()).toArray()));
}
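For context: processElementInReadyWindows returns the windowed values that had to be pushed back because a side input was not yet ready in their window. Since the mocked reader reports every window as ready here, the pushback is empty, and all three windows of the exploded element reach the underlying DoFn runner, which the final assertion verifies.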
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class ParDoTest method testParDoWithTaggedOutputName.
@Test
public void testParDoWithTaggedOutputName() {
pipeline.enableAbandonedNodeEnforcement(false);
TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
TupleTag<String> additionalOutputTag1 = new TupleTag<String>("output1") {};
TupleTag<String> additionalOutputTag2 = new TupleTag<String>("output2") {};
TupleTag<String> additionalOutputTag3 = new TupleTag<String>("output3") {};
TupleTag<String> additionalOutputTagUnwritten = new TupleTag<String>("unwrittenOutput") {};
PCollectionTuple outputs = pipeline
    .apply(Create.of(Arrays.asList(3, -42, 666)))
    .setName("MyInput")
    .apply("MyParDo", ParDo
        .of(new TestDoFn(
            Arrays.<PCollectionView<Integer>>asList(),
            Arrays.asList(additionalOutputTag1, additionalOutputTag2, additionalOutputTag3)))
        .withOutputTags(
            mainOutputTag,
            TupleTagList.of(additionalOutputTag3)
                .and(additionalOutputTag1)
                .and(additionalOutputTagUnwritten)
                .and(additionalOutputTag2)));
assertEquals("MyParDo.main", outputs.get(mainOutputTag).getName());
assertEquals("MyParDo.output1", outputs.get(additionalOutputTag1).getName());
assertEquals("MyParDo.output2", outputs.get(additionalOutputTag2).getName());
assertEquals("MyParDo.output3", outputs.get(additionalOutputTag3).getName());
assertEquals("MyParDo.unwrittenOutput", outputs.get(additionalOutputTagUnwritten).getName());
}
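As the assertions demonstrate, an output PCollection's default name is the producing transform's name joined to the tag's id with a dot ("MyParDo" + "." + "output1"), so naming the ParDo application explicitly keeps output names stable and readable.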
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class ParDoTest method testParDoWithTaggedOutput.
@Test
@Category(ValidatesRunner.class)
public void testParDoWithTaggedOutput() {
List<Integer> inputs = Arrays.asList(3, -42, 666);
TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
TupleTag<String> additionalOutputTag1 = new TupleTag<String>("additional1") {};
TupleTag<String> additionalOutputTag2 = new TupleTag<String>("additional2") {};
TupleTag<String> additionalOutputTag3 = new TupleTag<String>("additional3") {};
TupleTag<String> additionalOutputTagUnwritten = new TupleTag<String>("unwrittenOutput") {};
PCollectionTuple outputs = pipeline
    .apply(Create.of(inputs))
    .apply(ParDo
        .of(new TestDoFn(
            Arrays.<PCollectionView<Integer>>asList(),
            Arrays.asList(additionalOutputTag1, additionalOutputTag2, additionalOutputTag3)))
        .withOutputTags(
            mainOutputTag,
            TupleTagList.of(additionalOutputTag3)
                .and(additionalOutputTag1)
                .and(additionalOutputTagUnwritten)
                .and(additionalOutputTag2)));
PAssert.that(outputs.get(mainOutputTag)).satisfies(ParDoTest.HasExpectedOutput.forInput(inputs));
PAssert.that(outputs.get(additionalOutputTag1)).satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag1));
PAssert.that(outputs.get(additionalOutputTag2)).satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag2));
PAssert.that(outputs.get(additionalOutputTag3)).satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag3));
PAssert.that(outputs.get(additionalOutputTagUnwritten)).empty();
pipeline.run();
}
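Both tests delegate the actual tagged emission to TestDoFn, which is not shown in this excerpt. The sketch below shows the shape such a DoFn typically takes: the main output goes through c.output(value), while additional outputs go through c.output(tag, value). The tag names and the length-based routing here are invented for illustration.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class TaggedOutputSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Anonymous subclasses so the element type survives erasure.
    final TupleTag<String> shortWords = new TupleTag<String>("short") {};
    final TupleTag<String> longWords = new TupleTag<String>("long") {};

    PCollectionTuple words = p
        .apply(Create.of("a", "bb", "elephant"))
        .apply("SplitByLength", ParDo.of(new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            if (c.element().length() <= 2) {
              // Main output.
              c.output(c.element());
            } else {
              // Additional tagged output.
              c.output(longWords, c.element());
            }
          }
        }).withOutputTags(shortWords, TupleTagList.of(longWords)));

    PCollection<String> shorts = words.get(shortWords);
    PCollection<String> longs = words.get(longWords);
    p.run().waitUntilFinish();
  }
}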