use of org.apache.beam.sdk.transforms.windowing.GlobalWindows in project beam by apache.
the class SimpleDoFnRunnerTest method testSkew.
/**
* Demonstrates that attempting to output an element before the timestamp of the current element
* plus the value of {@link DoFn#getAllowedTimestampSkew()} throws, but between that value and
* the current timestamp succeeds.
*/
@Test
public void testSkew() {
SkewingDoFn fn = new SkewingDoFn(Duration.standardMinutes(10L));
DoFnRunner<Duration, Duration> runner = new SimpleDoFnRunner<>(null, fn, NullSideInputReader.empty(), new ListOutputManager(), new TupleTag<Duration>(), Collections.<TupleTag<?>>emptyList(), mockStepContext, WindowingStrategy.of(new GlobalWindows()));
runner.startBundle();
// Outputting between "now" and "now - allowed skew" succeeds.
runner.processElement(WindowedValue.timestampedValueInGlobalWindow(Duration.standardMinutes(5L), new Instant(0)));
thrown.expect(UserCodeException.class);
thrown.expectCause(isA(IllegalArgumentException.class));
thrown.expectMessage("must be no earlier");
thrown.expectMessage(String.format("timestamp of the current input (%s)", new Instant(0).toString()));
thrown.expectMessage(String.format("the allowed skew (%s)", PeriodFormat.getDefault().print(Duration.standardMinutes(10L).toPeriod())));
// Outputting before "now - allowed skew" fails.
runner.processElement(WindowedValue.timestampedValueInGlobalWindow(Duration.standardHours(1L), new Instant(0)));
}
use of org.apache.beam.sdk.transforms.windowing.GlobalWindows in project beam by apache.
the class ReduceFnRunnerTest method fireNonEmptyOnDrainInGlobalWindow.
/**
* We should fire a non-empty ON_TIME pane in the GlobalWindow when the watermark moves to
* end-of-time.
*/
@Test
public void fireNonEmptyOnDrainInGlobalWindow() throws Exception {
ReduceFnTester<Integer, Iterable<Integer>, GlobalWindow> tester = ReduceFnTester.nonCombining(WindowingStrategy.of(new GlobalWindows()).withTrigger(Repeatedly.<GlobalWindow>forever(AfterPane.elementCountAtLeast(3))).withMode(AccumulationMode.DISCARDING_FIRED_PANES));
tester.advanceInputWatermark(new Instant(0));
final int n = 20;
for (int i = 0; i < n; i++) {
tester.injectElements(TimestampedValue.of(i, new Instant(i)));
}
List<WindowedValue<Iterable<Integer>>> output = tester.extractOutput();
assertEquals(n / 3, output.size());
for (int i = 0; i < output.size(); i++) {
assertEquals(Timing.EARLY, output.get(i).getPane().getTiming());
assertEquals(i, output.get(i).getPane().getIndex());
assertEquals(3, Iterables.size(output.get(i).getValue()));
}
tester.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
output = tester.extractOutput();
assertEquals(1, output.size());
assertEquals(Timing.ON_TIME, output.get(0).getPane().getTiming());
assertEquals(n / 3, output.get(0).getPane().getIndex());
assertEquals(n - ((n / 3) * 3), Iterables.size(output.get(0).getValue()));
}
use of org.apache.beam.sdk.transforms.windowing.GlobalWindows in project beam by apache.
the class BatchLoads method expand.
@Override
public WriteResult expand(PCollection<KV<DestinationT, TableRow>> input) {
Pipeline p = input.getPipeline();
final String stepUuid = BigQueryHelpers.randomUUIDString();
PCollectionView<String> tempFilePrefix = p.apply("Create", Create.of((Void) null)).apply("GetTempFilePrefix", ParDo.of(new DoFn<Void, String>() {
@ProcessElement
public void getTempFilePrefix(ProcessContext c) {
c.output(resolveTempLocation(c.getPipelineOptions().getTempLocation(), "BigQueryWriteTemp", stepUuid));
}
})).apply("TempFilePrefixView", View.<String>asSingleton());
// Create a singleton job ID token at execution time. This will be used as the base for all
// load jobs issued from this instance of the transform.
PCollectionView<String> jobIdTokenView = p.apply("TriggerIdCreation", Create.of("ignored")).apply("CreateJobId", MapElements.via(new SimpleFunction<String, String>() {
@Override
public String apply(String input) {
return stepUuid;
}
})).apply(View.<String>asSingleton());
PCollection<KV<DestinationT, TableRow>> inputInGlobalWindow = input.apply("rewindowIntoGlobal", Window.<KV<DestinationT, TableRow>>into(new GlobalWindows()).triggering(DefaultTrigger.of()).discardingFiredPanes());
PCollectionView<Map<DestinationT, String>> schemasView = inputInGlobalWindow.apply(new CalculateSchemas<>(dynamicDestinations));
TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag = new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {
};
TupleTag<KV<ShardedKey<DestinationT>, TableRow>> unwrittedRecordsTag = new TupleTag<KV<ShardedKey<DestinationT>, TableRow>>("unwrittenRecords") {
};
PCollectionTuple writeBundlesTuple = inputInGlobalWindow.apply("WriteBundlesToFiles", ParDo.of(new WriteBundlesToFiles<>(stepUuid, unwrittedRecordsTag, maxNumWritersPerBundle, maxFileSize)).withOutputTags(writtenFilesTag, TupleTagList.of(unwrittedRecordsTag)));
PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles = writeBundlesTuple.get(writtenFilesTag).setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
// If the bundles contain too many output tables to be written inline to files (due to memory
// limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
// Group these records by key, and write the files after grouping. Since the record is grouped
// by key, we can ensure that only one file is open at a time in each bundle.
PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped = writeBundlesTuple.get(unwrittedRecordsTag).setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), TableRowJsonCoder.of())).apply(GroupByKey.<ShardedKey<DestinationT>, TableRow>create()).apply(ParDo.of(new WriteGroupedRecordsToFiles<DestinationT>(tempFilePrefix, maxFileSize)).withSideInputs(tempFilePrefix)).setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
// PCollection of filename, file byte size, and table destination.
PCollection<WriteBundlesToFiles.Result<DestinationT>> results = PCollectionList.of(writtenFiles).and(writtenFilesGrouped).apply(Flatten.<Result<DestinationT>>pCollections());
TupleTag<KV<ShardedKey<DestinationT>, List<String>>> multiPartitionsTag = new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("multiPartitionsTag") {
};
TupleTag<KV<ShardedKey<DestinationT>, List<String>>> singlePartitionTag = new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("singlePartitionTag") {
};
// Turn the list of files and record counts in a PCollectionView that can be used as a
// side input.
PCollectionView<Iterable<WriteBundlesToFiles.Result<DestinationT>>> resultsView = results.apply("ResultsView", View.<WriteBundlesToFiles.Result<DestinationT>>asIterable());
// This transform will look at the set of files written for each table, and if any table has
// too many files or bytes, will partition that table's files into multiple partitions for
// loading.
PCollection<Void> singleton = p.apply("singleton", Create.of((Void) null).withCoder(VoidCoder.of()));
PCollectionTuple partitions = singleton.apply("WritePartition", ParDo.of(new WritePartition<>(singletonTable, tempFilePrefix, resultsView, multiPartitionsTag, singlePartitionTag)).withSideInputs(tempFilePrefix, resultsView).withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
List<PCollectionView<?>> writeTablesSideInputs = Lists.newArrayList(jobIdTokenView, schemasView);
writeTablesSideInputs.addAll(dynamicDestinations.getSideInputs());
Coder<KV<ShardedKey<DestinationT>, List<String>>> partitionsCoder = KvCoder.of(ShardedKeyCoder.of(NullableCoder.of(destinationCoder)), ListCoder.of(StringUtf8Coder.of()));
// If WriteBundlesToFiles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
// the import needs to be split into multiple partitions, and those partitions will be
// specified in multiPartitionsTag.
PCollection<KV<TableDestination, String>> tempTables = partitions.get(multiPartitionsTag).setCoder(partitionsCoder).apply("MultiPartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of()).apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables<>(false, bigQueryServices, jobIdTokenView, schemasView, WriteDisposition.WRITE_EMPTY, CreateDisposition.CREATE_IF_NEEDED, dynamicDestinations)).withSideInputs(writeTablesSideInputs));
// This view maps each final table destination to the set of temporary partitioned tables
// the PCollection was loaded into.
PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView = tempTables.apply("TempTablesView", View.<TableDestination, String>asMultimap());
singleton.apply("WriteRename", ParDo.of(new WriteRename(bigQueryServices, jobIdTokenView, writeDisposition, createDisposition, tempTablesView)).withSideInputs(tempTablesView, jobIdTokenView));
// Write single partition to final table
partitions.get(singlePartitionTag).setCoder(partitionsCoder).apply("SinglePartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of()).apply("SinglePartitionWriteTables", ParDo.of(new WriteTables<>(true, bigQueryServices, jobIdTokenView, schemasView, writeDisposition, createDisposition, dynamicDestinations)).withSideInputs(writeTablesSideInputs));
PCollection<TableRow> empty = p.apply("CreateEmptyFailedInserts", Create.empty(TypeDescriptor.of(TableRow.class)));
return WriteResult.in(input.getPipeline(), new TupleTag<TableRow>("failedInserts"), empty);
}
use of org.apache.beam.sdk.transforms.windowing.GlobalWindows in project beam by apache.
the class RepeatedlyStateMachineTest method testRepeatedlyElementCount.
@Test
public void testRepeatedlyElementCount() throws Exception {
SimpleTriggerStateMachineTester<GlobalWindow> tester = TriggerStateMachineTester.forTrigger(RepeatedlyStateMachine.forever(AfterPaneStateMachine.elementCountAtLeast(5)), new GlobalWindows());
GlobalWindow window = GlobalWindow.INSTANCE;
tester.injectElements(1);
assertFalse(tester.shouldFire(window));
tester.injectElements(2, 3, 4, 5);
assertTrue(tester.shouldFire(window));
tester.fireIfShouldFire(window);
assertFalse(tester.shouldFire(window));
}
use of org.apache.beam.sdk.transforms.windowing.GlobalWindows in project beam by apache.
the class RepeatedlyStateMachineTest method testRepeatedlyAfterFirstElementCount.
@Test
public void testRepeatedlyAfterFirstElementCount() throws Exception {
SimpleTriggerStateMachineTester<GlobalWindow> tester = TriggerStateMachineTester.forTrigger(RepeatedlyStateMachine.forever(AfterFirstStateMachine.of(AfterProcessingTimeStateMachine.pastFirstElementInPane().plusDelayOf(Duration.standardMinutes(15)), AfterPaneStateMachine.elementCountAtLeast(5))), new GlobalWindows());
GlobalWindow window = GlobalWindow.INSTANCE;
tester.injectElements(1);
assertFalse(tester.shouldFire(window));
tester.injectElements(2, 3, 4, 5);
assertTrue(tester.shouldFire(window));
tester.fireIfShouldFire(window);
assertFalse(tester.shouldFire(window));
}
Aggregations