use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class LeaderBoardTest method testTeamScoresUnobservablyLate.
/**
 * Checks that elements which show up behind the watermark, but while the watermark is still
 * short of the end of the window, are reported in the on-time pane.
 */
@Test
public void testTeamScoresUnobservablyLate() {
  // The second batch is added once the watermark sits one minute before the end of the window;
  // the watermark is then pushed one minute past the end of the window and finally to infinity.
  TestStream<GameActionInfo> lateButOnTimeEvents =
      TestStream.create(AvroCoder.of(GameActionInfo.class))
          .advanceWatermarkTo(baseTime)
          .addElements(
              event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)),
              event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8)),
              event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
              event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5)))
          .advanceWatermarkTo(
              baseTime.plus(TEAM_WINDOW_DURATION).minus(Duration.standardMinutes(1)))
          .addElements(
              event(TestUser.RED_TWO, 2, Duration.ZERO),
              event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)),
              event(TestUser.BLUE_TWO, 2, Duration.standardSeconds(90)),
              event(TestUser.RED_TWO, 3, Duration.standardMinutes(3)))
          .advanceWatermarkTo(
              baseTime.plus(TEAM_WINDOW_DURATION).plus(Duration.standardMinutes(1)))
          .advanceWatermarkToInfinity();

  PCollection<KV<String, Integer>> scoresByTeam =
      p.apply(lateButOnTimeEvents)
          .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS));

  BoundedWindow teamWindow = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION);
  String red = TestUser.RED_ONE.getTeam();
  String blue = TestUser.BLUE_ONE.getTeam();

  // Both batches land in the on-time pane: red = 4 + 2 + 5 + 3, blue = 3 + 5 + 3 + 2
  // (assumes the RED_* users share one team and the BLUE_* users share another).
  PAssert.that(scoresByTeam)
      .inOnTimePane(teamWindow)
      .containsInAnyOrder(KV.of(red, 14), KV.of(blue, 13));

  p.run().waitUntilFinish();
}
use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class LeaderBoardTest method testTeamScoresObservablyLate.
/**
 * Checks the handling of elements that arrive after the watermark has passed the end of the
 * window but before the allowed lateness has run out: they must be reported in a late pane.
 */
@Test
public void testTeamScoresObservablyLate() {
  // Start of the window plus the allowed lateness plus the window duration.
  Instant firstWindowCloses = baseTime.plus(ALLOWED_LATENESS).plus(TEAM_WINDOW_DURATION);

  TestStream<GameActionInfo> lateEvents =
      TestStream.create(AvroCoder.of(GameActionInfo.class))
          .advanceWatermarkTo(baseTime)
          .addElements(
              event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)),
              event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8)))
          .advanceProcessingTime(Duration.standardMinutes(10))
          .advanceWatermarkTo(baseTime.plus(Duration.standardMinutes(3)))
          .addElements(
              event(TestUser.RED_ONE, 3, Duration.standardMinutes(1)),
              event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
              event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5)))
          .advanceWatermarkTo(firstWindowCloses.minus(Duration.standardMinutes(1)))
          // Added one minute before firstWindowCloses.
          .addElements(
              event(TestUser.RED_TWO, 2, Duration.ZERO),
              event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)),
              event(TestUser.RED_TWO, 3, Duration.standardMinutes(3)))
          .advanceProcessingTime(Duration.standardMinutes(12))
          // A second round of additions after processing time has moved on again.
          .addElements(
              event(TestUser.RED_TWO, 9, Duration.standardMinutes(1)),
              event(TestUser.RED_TWO, 1, Duration.standardMinutes(3)))
          .advanceWatermarkToInfinity();

  PCollection<KV<String, Integer>> scoresByTeam =
      p.apply(lateEvents)
          .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS));

  String red = TestUser.RED_ONE.getTeam();
  String blue = TestUser.BLUE_ONE.getTeam();
  BoundedWindow teamWindow = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION);

  // Somewhere among all panes of the window the complete totals must show up.
  PAssert.that(scoresByTeam)
      .inWindow(teamWindow)
      .satisfies(
          (SerializableFunction<Iterable<KV<String, Integer>>, Void>)
              scores -> {
                assertThat(scores, hasItem(KV.of(blue, 11)));
                assertThat(scores, hasItem(KV.of(red, 27)));
                return null;
              });

  // On-time pane: red = 3 + 4, blue = 3 + 5 + 3.
  PAssert.thatMap(scoresByTeam)
      .inOnTimePane(teamWindow)
      .isEqualTo(ImmutableMap.<String, Integer>builder().put(red, 7).put(blue, 11).build());

  // The blue team gets no final pane because every blue update was already covered by an
  // earlier pane; red ends at 7 + 2 + 5 + 3 + 9 + 1.
  PAssert.that(scoresByTeam).inFinalPane(teamWindow).containsInAnyOrder(KV.of(red, 27));

  p.run().waitUntilFinish();
}
use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.
the class WriteFiles method createWrite.
/**
* Expands this transform into the steps that write {@code input} and then finalize the write.
*
* <p>The expansion proceeds as follows:
*
* <ol>
* <li>Unless {@code windowedWrites} is set, the input is re-windowed into the global window
* with the default trigger, discarding fired panes.
* <li>The elements are written by a ParDo. When neither {@code computeNumShards} nor
* {@code numShardsProvider} is set this is a single "WriteBundles" ParDo (windowed or
* unwindowed variant). Otherwise every element is first tagged with a shard key
* ("ApplyShardLabel"), grouped by that key ("GroupIntoShards") and written per shard
* ("WriteShardedBundles"). The output is a PCollection of {@code FileResult}s encoded with a
* {@code FileResultCoder} built from the window coder of the collection that was written.
* <li>The write is finalized by calling {@code writeOperation.finalize} with the collected
* results. For windowed writes the results are keyed with a single {@code null} key and
* grouped, and finalization runs on each group. For unwindowed writes the results are read
* as an iterable side input by a ParDo over a one-element collection, which also creates
* empty shards when fewer results exist than the minimum number of shards.
* </ol>
*
* <p>NOTE(review): earlier documentation of this method described a sequence of three ParDos
* in which a singleton collection holding the WriteOperation is produced by an initialization
* ParDo and passed on as a side input. No such initialization step is visible in this method;
* the DoFns below reference the {@code writeOperation} field directly. The behaviour of the
* bundle-writing DoFns (for example how {@link Writer#close} is handled when a write fails) is
* defined outside this method and should be confirmed there.
*
* @param input the elements to write
* @return a {@link PDone} in the pipeline of {@code input}
*/
private PDone createWrite(PCollection<T> input) {
Pipeline p = input.getPipeline();
if (!windowedWrites) {
// Re-window the data into the global window and remove any existing triggers.
input = input.apply(Window.<T>into(new GlobalWindows()).triggering(DefaultTrigger.of()).discardingFiredPanes());
}
// Perform the writes as a ParDo over the input PCollection (or over the input grouped into
// shards) and collect the results of the writes in a PCollection.
PCollection<FileResult> results;
// Side input holding the computed shard count; stays null unless computeNumShards is set.
final PCollectionView<Integer> numShardsView;
// Window coder of the collection that is actually written; replaced below when the input is
// grouped into shards first.
Coder<BoundedWindow> shardedWindowCoder = (Coder<BoundedWindow>) input.getWindowingStrategy().getWindowFn().windowCoder();
if (computeNumShards == null && numShardsProvider == null) {
// No sharding requested: write the input directly.
numShardsView = null;
results = input.apply("WriteBundles", ParDo.of(windowedWrites ? new WriteWindowedBundles() : new WriteUnwindowedBundles()));
} else {
List<PCollectionView<?>> sideInputs = Lists.newArrayList();
if (computeNumShards != null) {
// The shard count is computed from the input and handed to the sharding DoFn as a side input.
numShardsView = input.apply(computeNumShards);
sideInputs.add(numShardsView);
} else {
numShardsView = null;
}
// ApplyShardingKey receives either the side-input view or the provider, never both.
PCollection<KV<Integer, Iterable<T>>> sharded = input.apply("ApplyShardLabel", ParDo.of(new ApplyShardingKey<T>(numShardsView, (numShardsView != null) ? null : numShardsProvider)).withSideInputs(sideInputs)).apply("GroupIntoShards", GroupByKey.<Integer, T>create());
shardedWindowCoder = (Coder<BoundedWindow>) sharded.getWindowingStrategy().getWindowFn().windowCoder();
results = sharded.apply("WriteShardedBundles", ParDo.of(new WriteShardedBundles()));
}
results.setCoder(FileResultCoder.of(shardedWindowCoder));
if (windowedWrites) {
// When processing streaming windowed writes, results will arrive multiple times. This
// means we can't share the below implementation that turns the results into a side input,
// as new data arriving into a side input does not trigger the listening DoFn. Instead
// we aggregate the result set using a singleton GroupByKey, so the DoFn will be triggered
// whenever new data arrives.
PCollection<KV<Void, FileResult>> keyedResults = results.apply("AttachSingletonKey", WithKeys.<Void, FileResult>of((Void) null));
keyedResults.setCoder(KvCoder.of(VoidCoder.of(), FileResultCoder.of(shardedWindowCoder)));
// Is the continuation trigger sufficient?
keyedResults.apply("FinalizeGroupByKey", GroupByKey.<Void, FileResult>create()).apply("Finalize", ParDo.of(new DoFn<KV<Void, Iterable<FileResult>>, Integer>() {
// Finalizes the write with the results contained in one group.
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
LOG.info("Finalizing write operation {}.", writeOperation);
List<FileResult> results = Lists.newArrayList(c.element().getValue());
writeOperation.finalize(results);
LOG.debug("Done finalizing write operation");
}
}));
} else {
final PCollectionView<Iterable<FileResult>> resultsView = results.apply(View.<FileResult>asIterable());
ImmutableList.Builder<PCollectionView<?>> sideInputs = ImmutableList.<PCollectionView<?>>builder().add(resultsView);
if (numShardsView != null) {
sideInputs.add(numShardsView);
}
// Finalize the write in a do-once ParDo over a one-element collection. The results of the
// writes are given as an Iterable side input, which creates a dependency between this ParDo
// and the parallel write, so it will happen after the parallel write.
// For the non-windowed case, we guarantee that if no data is written but the user has
// set numShards, then all shards will be written out as empty files. For this reason we
// use a side input here.
PCollection<Void> singletonCollection = p.apply(Create.of((Void) null));
singletonCollection.apply("Finalize", ParDo.of(new DoFn<Void, Integer>() {
// Pads the results with empty shards where required, then finalizes the write.
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
LOG.info("Finalizing write operation {}.", writeOperation);
List<FileResult> results = Lists.newArrayList(c.sideInput(resultsView));
LOG.debug("Side input initialized to finalize write operation {}.", writeOperation);
// We must always output at least 1 shard, and honor user-specified numShards if
// set.
int minShardsNeeded;
if (numShardsView != null) {
minShardsNeeded = c.sideInput(numShardsView);
} else if (numShardsProvider != null) {
minShardsNeeded = numShardsProvider.get();
} else {
minShardsNeeded = 1;
}
int extraShardsNeeded = minShardsNeeded - results.size();
if (extraShardsNeeded > 0) {
LOG.info("Creating {} empty output shards in addition to {} written for a total of {}.", extraShardsNeeded, results.size(), minShardsNeeded);
for (int i = 0; i < extraShardsNeeded; ++i) {
// Opening and immediately closing a writer yields the result for an empty shard.
Writer<T> writer = writeOperation.createWriter();
writer.openUnwindowed(UUID.randomUUID().toString(), UNKNOWN_SHARDNUM);
FileResult emptyWrite = writer.close();
results.add(emptyWrite);
}
LOG.debug("Done creating extra shards.");
}
writeOperation.finalize(results);
LOG.debug("Done finalizing write operation {}", writeOperation);
}
}).withSideInputs(sideInputs.build()));
}
return PDone.in(input.getPipeline());
}
use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.
the class WindowFnTestUtils method validateGetOutputTimestamp.
/**
 * Assigns {@code timestamp} to windows with {@code windowFn} and checks, window by window in
 * order of {@code maxTimestamp}, that {@link WindowFn#getOutputTime windowFn.getOutputTime}
 * for a later window cannot hold the watermark back from passing the end of an earlier one.
 *
 * <p>For the earliest window the output time only has to lie between the input timestamp and
 * the window's maximum timestamp. For every following window it has to be strictly after the
 * maximum timestamp of the preceding window and no later than its own maximum timestamp.
 *
 * <p>This requires that overlapping windows do not interfere at all, which may be stricter
 * than desired for some {@code windowFn}s.
 */
public static <T, W extends BoundedWindow> void validateGetOutputTimestamp(
    WindowFn<T, W> windowFn, long timestamp) throws Exception {
  Comparator<BoundedWindow> byMaxTimestamp =
      new Comparator<BoundedWindow>() {
        @Override
        public int compare(BoundedWindow left, BoundedWindow right) {
          return left.maxTimestamp().compareTo(right.maxTimestamp());
        }
      };

  Collection<W> assigned = WindowFnTestUtils.<T, W>assignedWindows(windowFn, timestamp);
  List<W> ordered = new ArrayList<>(assigned);
  Collections.sort(ordered, byMaxTimestamp);

  Instant inputTime = new Instant(timestamp);
  Instant previousEnd = null;
  for (W current : ordered) {
    Instant outputTime = windowFn.getOutputTime(inputTime, current);
    if (previousEnd != null) {
      // A later window must not produce an output time inside the previous window.
      assertTrue(
          "getOutputTime must be greater than the end of the previous window",
          outputTime.isAfter(previousEnd));
    } else {
      // The first window only has to keep the output time at or after the input timestamp.
      assertFalse(
          "getOutputTime must be greater than or equal to input timestamp",
          outputTime.isBefore(inputTime));
    }
    // The upper bound is the same for every window.
    assertFalse(
        "getOutputTime must be less than or equal to the max timestamp",
        outputTime.isAfter(current.maxTimestamp()));
    previousEnd = current.maxTimestamp();
  }
}
use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.
the class DoFnSignatures method analyzeExtraParameter.
/**
 * Classifies one parameter of a {@link DoFn} method by its raw type and returns the matching
 * {@link Parameter} descriptor.
 *
 * <p>The raw type is tested in this order: {@code DoFn.ProcessContext}, {@code
 * DoFn.OnTimerContext}, any {@link BoundedWindow}, any {@code RestrictionTracker}, {@code
 * Timer}, any {@code State}. Each case validates the parameter through the supplied error
 * reporters before building the descriptor. A parameter matching none of the cases is reported
 * via {@code throwIllegalArgument}.
 *
 * @param methodErrors reporter used for the checks on duplicate window and restriction tracker
 *     parameters
 * @param fnContext source of the timer and state declarations that ids are resolved against
 * @param methodContext parameters already recorded for the method, used to detect duplicates
 * @param fnClass not read by this method
 * @param param the parameter to classify
 * @param inputT input type used to derive the expected context types
 * @param outputT output type used to derive the expected context types
 * @return the descriptor for {@code param}
 */
private static Parameter analyzeExtraParameter(
    ErrorReporter methodErrors,
    FnAnalysisContext fnContext,
    MethodAnalysisContext methodContext,
    TypeDescriptor<? extends DoFn<?, ?>> fnClass,
    ParameterDescription param,
    TypeDescriptor<?> inputT,
    TypeDescriptor<?> outputT) {
  TypeDescriptor<?> processContextType = doFnProcessContextTypeOf(inputT, outputT);
  TypeDescriptor<?> onTimerContextType = doFnOnTimerContextTypeOf(inputT, outputT);
  TypeDescriptor<?> declaredType = param.getType();
  Class<?> erasedType = declaredType.getRawType();
  ErrorReporter errors = methodErrors.forParameter(param);

  if (erasedType.equals(DoFn.ProcessContext.class)) {
    errors.checkArgument(
        declaredType.equals(processContextType),
        "ProcessContext argument must have type %s",
        formatType(processContextType));
    return Parameter.processContext();
  }

  if (erasedType.equals(DoFn.OnTimerContext.class)) {
    errors.checkArgument(
        declaredType.equals(onTimerContextType),
        "OnTimerContext argument must have type %s",
        formatType(onTimerContextType));
    return Parameter.onTimerContext();
  }

  if (BoundedWindow.class.isAssignableFrom(erasedType)) {
    // At most one window parameter per method.
    methodErrors.checkArgument(
        !methodContext.hasWindowParameter(),
        "Multiple %s parameters",
        BoundedWindow.class.getSimpleName());
    return Parameter.boundedWindow((TypeDescriptor<? extends BoundedWindow>) declaredType);
  }

  if (RestrictionTracker.class.isAssignableFrom(erasedType)) {
    // At most one restriction tracker parameter per method.
    methodErrors.checkArgument(
        !methodContext.hasRestrictionTrackerParameter(),
        "Multiple %s parameters",
        RestrictionTracker.class.getSimpleName());
    return Parameter.restrictionTracker(declaredType);
  }

  if (erasedType.equals(Timer.class)) {
    // m.getParameters() is not available until Java 8
    String timerId = getTimerId(param.getAnnotations());
    errors.checkArgument(
        timerId != null,
        "%s missing %s annotation",
        Timer.class.getSimpleName(),
        TimerId.class.getSimpleName());
    errors.checkArgument(
        !methodContext.getTimerParameters().containsKey(timerId),
        "duplicate %s: \"%s\"",
        TimerId.class.getSimpleName(),
        timerId);

    TimerDeclaration timerDeclaration = fnContext.getTimerDeclarations().get(timerId);
    errors.checkArgument(
        timerDeclaration != null,
        "reference to undeclared %s: \"%s\"",
        TimerId.class.getSimpleName(),
        timerId);
    // The declaring class of the timer field must be the class declaring the method.
    errors.checkArgument(
        timerDeclaration.field().getDeclaringClass().equals(param.getMethod().getDeclaringClass()),
        "%s %s declared in a different class %s."
            + " Timers may be referenced only in the lexical scope where they are declared.",
        TimerId.class.getSimpleName(),
        timerId,
        timerDeclaration.field().getDeclaringClass().getName());
    return Parameter.timerParameter(timerDeclaration);
  }

  if (State.class.isAssignableFrom(erasedType)) {
    // m.getParameters() is not available until Java 8
    String stateId = getStateId(param.getAnnotations());
    errors.checkArgument(
        stateId != null, "missing %s annotation", DoFn.StateId.class.getSimpleName());
    errors.checkArgument(
        !methodContext.getStateParameters().containsKey(stateId),
        "duplicate %s: \"%s\"",
        DoFn.StateId.class.getSimpleName(),
        stateId);

    // By static typing this is already a well-formed State subclass
    TypeDescriptor<? extends State> referencedStateType =
        (TypeDescriptor<? extends State>) param.getType();

    StateDeclaration stateDeclaration = fnContext.getStateDeclarations().get(stateId);
    errors.checkArgument(
        stateDeclaration != null,
        "reference to undeclared %s: \"%s\"",
        DoFn.StateId.class.getSimpleName(),
        stateId);
    errors.checkArgument(
        stateDeclaration.stateType().equals(referencedStateType),
        "reference to %s %s with different type %s",
        StateId.class.getSimpleName(),
        stateId,
        formatType(stateDeclaration.stateType()));
    // The declaring class of the state field must be the class declaring the method.
    errors.checkArgument(
        stateDeclaration.field().getDeclaringClass().equals(param.getMethod().getDeclaringClass()),
        "%s %s declared in a different class %s."
            + " State may be referenced only in the class where it is declared.",
        StateId.class.getSimpleName(),
        stateId,
        stateDeclaration.field().getDeclaringClass().getName());
    return Parameter.stateParameter(stateDeclaration);
  }

  // None of the recognised parameter kinds matched.
  List<String> allowedParamTypes =
      Arrays.asList(
          formatType(new TypeDescriptor<BoundedWindow>() {}),
          formatType(new TypeDescriptor<RestrictionTracker<?>>() {}));
  errors.throwIllegalArgument(
      "%s is not a valid context parameter. Should be one of %s",
      formatType(declaredType),
      allowedParamTypes);
  // Unreachable
  return null;
}
Aggregations