use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
the class MultiStepCombineTest method testMultiStepCombineTimestampCombiner.
/**
 * Applies {@code MultiStepCombineFn} per key over 5 ms fixed windows configured with {@code
 * TimestampCombiner.LATEST}, then pairs every combined value with the timestamp it was emitted at.
 *
 * <p>The expectations pin both the combined value and its timestamp: the two "foo" inputs at
 * timestamps 1 and 4 are expected to yield 5 at timestamp 4, while "bazzle" (timestamp 4) and the
 * "foo" input at timestamp 12 are expected to pass through with their own value and timestamp.
 */
@Test
public void testMultiStepCombineTimestampCombiner() {
  // Stage 1: four timestamped key/value inputs.
  PCollection<KV<String, Long>> timestampedInput =
      pipeline.apply(
          Create.timestamped(
              TimestampedValue.of(KV.of("foo", 4L), new Instant(1L)),
              TimestampedValue.of(KV.of("foo", 1L), new Instant(4L)),
              TimestampedValue.of(KV.of("bazzle", 4L), new Instant(4L)),
              TimestampedValue.of(KV.of("foo", 12L), new Instant(12L))));

  // Stage 2: 5 ms fixed windows using the LATEST timestamp combiner.
  PCollection<KV<String, Long>> windowedInput =
      timestampedInput.apply(
          Window.<KV<String, Long>>into(FixedWindows.of(Duration.millis(5L)))
              .withTimestampCombiner(TimestampCombiner.LATEST));

  // Stage 3: the combine under test.
  PCollection<KV<String, Long>> combined =
      windowedInput.apply(Combine.perKey(new MultiStepCombineFn()));

  // Stage 4: expose each output's timestamp as part of the value so it can be asserted on.
  DoFn<KV<String, Long>, KV<String, TimestampedValue<Long>>> attachTimestamp =
      new DoFn<KV<String, Long>, KV<String, TimestampedValue<Long>>>() {
        @ProcessElement
        public void reifyTimestamp(ProcessContext ctx) {
          KV<String, Long> kv = ctx.element();
          TimestampedValue<Long> stamped = TimestampedValue.of(kv.getValue(), ctx.timestamp());
          ctx.output(KV.of(kv.getKey(), stamped));
        }
      };
  PCollection<KV<String, TimestampedValue<Long>>> reified =
      combined.apply(ParDo.of(attachTimestamp));

  PAssert.that(reified)
      .containsInAnyOrder(
          KV.of("foo", TimestampedValue.of(5L, new Instant(4L))),
          KV.of("bazzle", TimestampedValue.of(4L, new Instant(4L))),
          KV.of("foo", TimestampedValue.of(12L, new Instant(12L))));
  pipeline.run();
}
use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
the class StatefulParDoEvaluatorFactoryTest method testUnprocessedElements.
/**
 * Exercises {@code StatefulParDoEvaluatorFactory} with a side input that is never ready.
 *
 * <p>The mocked side input reader always answers {@code false} from {@code isReady}, and the test
 * then checks that every element of the fabricated GBK work item comes back from {@code
 * finishBundle()} through {@code getUnprocessedElements()}, in the original window and under the
 * original key.
 */
@Test
public void testUnprocessedElements() throws Exception {
  // Build a real pipeline first; the applied transform it yields is what the factory is given.
  final String stateId = "my-state-id";

  // The main input is windowed into FixedWindows for consistency only: the bundle that is
  // actually evaluated is fabricated by hand further down.
  PCollection<KV<String, Integer>> mainInput =
      pipeline
          .apply(Create.of(KV.of("hello", 1), KV.of("hello", 2)))
          .apply(Window.into(FixedWindows.of(Duration.millis(10))));

  final PCollectionView<List<Integer>> sideInput =
      pipeline
          .apply("Create side input", Create.of(42))
          .apply("Window side input", Window.into(FixedWindows.of(Duration.millis(10))))
          .apply("View side input", View.asList());

  TupleTag<Integer> mainOutput = new TupleTag<>();

  // A DoFn that declares one value state and does nothing per element.
  DoFn<KV<String, Integer>, Integer> statefulNoOpFn =
      new DoFn<KV<String, Integer>, Integer>() {
        @StateId(stateId)
        private final StateSpec<ValueState<String>> spec = StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void process(ProcessContext c) {}
      };

  PCollection<Integer> produced =
      mainInput
          .apply(
              new ParDoMultiOverrideFactory.GbkThenStatefulParDo<>(
                  statefulNoOpFn,
                  mainOutput,
                  TupleTagList.empty(),
                  Collections.singletonList(sideInput),
                  DoFnSchemaInformation.create(),
                  Collections.emptyMap()))
          .get(mainOutput)
          .setCoder(VarIntCoder.of());

  StatefulParDoEvaluatorFactory<String, Integer, Integer> factory =
      new StatefulParDoEvaluatorFactory<>(mockEvaluationContext, options);

  // The producer of the output collection is the stateful ParDo created by the expansion.
  AppliedPTransform<
          PCollection<KeyedWorkItem<String, KV<String, Integer>>>,
          PCollectionTuple,
          StatefulParDo<String, Integer, Integer>>
      producingTransform = (AppliedPTransform) DirectGraphs.getProducer(produced);

  // Stubs for the lookups that lead to the step context, output bundle and timer update.
  when(mockEvaluationContext.getExecutionContext(
          eq(producingTransform), Mockito.<StructuralKey>any()))
      .thenReturn(mockExecutionContext);
  when(mockExecutionContext.getStepContext(any())).thenReturn(mockStepContext);
  when(mockEvaluationContext.createBundle(Matchers.<PCollection<Integer>>any()))
      .thenReturn(mockUncommittedBundle);
  when(mockStepContext.getTimerUpdate()).thenReturn(TimerUpdate.empty());

  // Stubs for the side input readiness check: the side input is reported as not ready.
  when(mockEvaluationContext.createSideInputReader(anyList())).thenReturn(mockSideInputReader);
  when(mockSideInputReader.isReady(Matchers.any(), Matchers.any())).thenReturn(false);

  IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(9));

  // Fabricate a single work item for one key holding three values (1, 13, 15), all sharing the
  // same timestamp, window and pane.
  String key = "hello";
  WindowedValue<KV<String, Integer>> firstKv =
      WindowedValue.of(KV.of(key, 1), new Instant(3), firstWindow, PaneInfo.NO_FIRING);
  KeyedWorkItem<String, KV<String, Integer>> workItem =
      KeyedWorkItems.elementsWorkItem(
          "hello",
          ImmutableList.of(
              firstKv, firstKv.withValue(KV.of(key, 13)), firstKv.withValue(KV.of(key, 15))));
  WindowedValue<KeyedWorkItem<String, KV<String, Integer>>> gbkOutputElement =
      firstKv.withValue(workItem);

  PCollection<KeyedWorkItem<String, KV<String, Integer>>> evaluatorInput =
      (PCollection<KeyedWorkItem<String, KV<String, Integer>>>)
          Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(producingTransform));
  CommittedBundle<KeyedWorkItem<String, KV<String, Integer>>> inputBundle =
      BUNDLE_FACTORY.createBundle(evaluatorInput).add(gbkOutputElement).commit(Instant.now());

  TransformEvaluator<KeyedWorkItem<String, KV<String, Integer>>> evaluator =
      factory.forApplication(producingTransform, inputBundle);
  evaluator.processElement(gbkOutputElement);

  // Every element is expected to come back as unprocessed, in the same window and under the
  // same key.
  TransformResult<KeyedWorkItem<String, KV<String, Integer>>> result = evaluator.finishBundle();

  List<Integer> pushedBackInts = new ArrayList<>();
  for (WindowedValue<? extends KeyedWorkItem<String, KV<String, Integer>>> pushedBack :
      result.getUnprocessedElements()) {
    assertThat(
        Iterables.getOnlyElement(pushedBack.getWindows()), equalTo((BoundedWindow) firstWindow));
    assertThat(pushedBack.getValue().key(), equalTo("hello"));
    for (WindowedValue<KV<String, Integer>> member : pushedBack.getValue().elementsIterable()) {
      pushedBackInts.add(member.getValue().getValue());
    }
  }
  assertThat(pushedBackInts, containsInAnyOrder(1, 13, 15));
}
use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
the class GroupByWithNullValuesTest method testGroupByWithNullValues.
/**
 * Runs a streaming pipeline on the Flink test runner in which every element is mapped to the key
 * "hello" with a {@code null} value, grouped by key, and counted per group.
 *
 * <p>The sequence 0..99 is timestamped with its own value and placed in 10 ms fixed windows; the
 * assertion expects ten groups of ten elements, and each grouped value is asserted to be null.
 */
@Test
public void testGroupByWithNullValues() {
  FlinkPipelineOptions options = FlinkPipelineOptions.defaults();
  options.setRunner(TestFlinkRunner.class);
  options.setStreaming(true);
  Pipeline pipeline = Pipeline.create(options);

  // Each generated number doubles as its own event timestamp (in milliseconds).
  SerializableFunction<Long, Instant> valueAsTimestamp =
      new SerializableFunction<Long, Instant>() {
        @Override
        public Instant apply(Long input) {
          return new Instant(input);
        }
      };

  // Replaces every number with the same key and a null value.
  DoFn<Long, KV<String, Void>> toNullValuedPair =
      new DoFn<Long, KV<String, Void>>() {
        @ProcessElement
        public void processElement(ProcessContext pc) {
          pc.output(KV.of("hello", null));
        }
      };

  // Emits the size of each group after checking that every grouped value is null.
  DoFn<KV<String, Iterable<Void>>, Integer> countNullValues =
      new DoFn<KV<String, Iterable<Void>>, Integer>() {
        @ProcessElement
        public void processElement(ProcessContext pc) {
          int seen = 0;
          for (Void value : pc.element().getValue()) {
            assertNull("Element should be null", value);
            seen++;
          }
          pc.output(seen);
        }
      };

  PCollection<Long> windowedNumbers =
      pipeline
          .apply(GenerateSequence.from(0).to(100).withTimestampFn(valueAsTimestamp))
          .apply(Window.into(FixedWindows.of(Duration.millis(10))));
  PCollection<Integer> result =
      windowedNumbers
          .apply(ParDo.of(toNullValuedPair))
          .apply(GroupByKey.create())
          .apply(ParDo.of(countNullValues));

  PAssert.that(result).containsInAnyOrder(10, 10, 10, 10, 10, 10, 10, 10, 10, 10);
  pipeline.run();
}
use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
the class DoFnOperatorTest method testWatermarkUpdateAfterWatermarkHoldRelease.
/**
 * Checks which watermark holds are recorded while elements and watermarks are pushed through a
 * {@code DoFnOperator} whose runner has been wrapped to add a watermark hold and set event-time
 * timers for every element.
 *
 * <p>The operator subclass overrides {@code emitWatermarkIfHoldChanged} so that each call appends
 * {@code keyedStateInternals.minWatermarkHoldMs()} to a list; the assertions compare that list
 * against the expected sequence {@code now}, {@code now + 1}, {@code now + 2}.
 */
@Test
public void testWatermarkUpdateAfterWatermarkHoldRelease() throws Exception {
Coder<WindowedValue<KV<String, String>>> coder = WindowedValue.getValueOnlyCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
TupleTag<KV<String, String>> outputTag = new TupleTag<>("main-output");
// Collects the value recorded on every emitWatermarkIfHoldChanged call (see override below).
List<Long> emittedWatermarkHolds = new ArrayList<>();
KeySelector<WindowedValue<KV<String, String>>, ByteBuffer> keySelector = e -> FlinkKeyUtils.encodeKey(e.getValue().getKey(), StringUtf8Coder.of());
DoFnOperator<KV<String, String>, KV<String, String>> doFnOperator = new DoFnOperator<KV<String, String>, KV<String, String>>(new IdentityDoFn<>(), "stepName", coder, Collections.emptyMap(), outputTag, Collections.emptyList(), new DoFnOperator.MultiOutputOutputManagerFactory<>(outputTag, coder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())), WindowingStrategy.globalDefault(), new HashMap<>(), /* side-input mapping */
Collections.emptyList(), /* side inputs */
FlinkPipelineOptions.defaults(), StringUtf8Coder.of(), keySelector, DoFnSchemaInformation.create(), Collections.emptyMap()) {
// Wraps the supplied runner: bundle start/finish, element processing and window expiration are
// delegated to it, while element processing additionally manipulates a watermark hold and
// timers, and onTimer is handled here without delegating.
@Override
protected DoFnRunner<KV<String, String>, KV<String, String>> createWrappingDoFnRunner(DoFnRunner<KV<String, String>, KV<String, String>> wrappedRunner, StepContext stepContext) {
// Hold state lives in the global-window namespace under the tag "hold" with the LATEST combiner.
StateNamespace namespace = StateNamespaces.window(GlobalWindow.Coder.INSTANCE, GlobalWindow.INSTANCE);
StateTag<WatermarkHoldState> holdTag = StateTags.watermarkStateInternal("hold", TimestampCombiner.LATEST);
WatermarkHoldState holdState = stepContext.stateInternals().state(namespace, holdTag);
TimerInternals timerInternals = stepContext.timerInternals();
return new DoFnRunner<KV<String, String>, KV<String, String>>() {
@Override
public void startBundle() {
wrappedRunner.startBundle();
}
@Override
public void processElement(WindowedValue<KV<String, String>> elem) {
wrappedRunner.processElement(elem);
// Add a hold at the element's timestamp.
holdState.add(elem.getTimestamp());
// Event-time timer "timer" with both timestamp arguments set to element timestamp + 1 ms.
timerInternals.setTimer(namespace, "timer", "family", elem.getTimestamp().plus(Duration.millis(1)), elem.getTimestamp().plus(Duration.millis(1)), TimeDomain.EVENT_TIME);
// Event-time timer "cleanup" with both timestamp arguments set to the global window's max timestamp.
timerInternals.setTimer(namespace, "cleanup", "", GlobalWindow.INSTANCE.maxTimestamp(), GlobalWindow.INSTANCE.maxTimestamp(), TimeDomain.EVENT_TIME);
}
@Override
public <KeyT> void onTimer(String timerId, String timerFamilyId, KeyT key, BoundedWindow window, Instant timestamp, Instant outputTimestamp, TimeDomain timeDomain) {
// The "cleanup" timer clears the hold; any other timer adds its output timestamp to the hold.
if ("cleanup".equals(timerId)) {
holdState.clear();
} else {
holdState.add(outputTimestamp);
}
}
@Override
public void finishBundle() {
wrappedRunner.finishBundle();
}
@Override
public <KeyT> void onWindowExpiration(BoundedWindow window, Instant timestamp, KeyT key) {
wrappedRunner.onWindowExpiration(window, timestamp, key);
}
@Override
public DoFn<KV<String, String>, KV<String, String>> getFn() {
// NOTE(review): doFn is not declared in this method; presumably a field inherited from
// DoFnOperator -- confirm.
return doFn;
}
};
}
// Records keyedStateInternals.minWatermarkHoldMs() on every call; the currentWatermarkHold
// argument is not used.
@Override
void emitWatermarkIfHoldChanged(long currentWatermarkHold) {
emittedWatermarkHolds.add(keyedStateInternals.minWatermarkHoldMs());
}
};
OneInputStreamOperatorTestHarness<WindowedValue<KV<String, String>>, WindowedValue<KV<String, String>>> testHarness = new KeyedOneInputStreamOperatorTestHarness<>(doFnOperator, keySelector, new CoderTypeInformation<>(FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
testHarness.setup();
Instant now = Instant.now();
testHarness.open();
// process first element, set hold to `now', setup timer for `now + 1'
testHarness.processElement(new StreamRecord<>(WindowedValue.timestampedValueInGlobalWindow(KV.of("Key", "Hello"), now)));
assertThat(emittedWatermarkHolds, is(equalTo(Collections.singletonList(now.getMillis()))));
// advance the watermark to `now + 2'; the assertion below expects `now + 1' (the output
// timestamp given to the "timer" timer) to be the next recorded hold
testHarness.processWatermark(now.getMillis() + 2);
assertThat(emittedWatermarkHolds, is(equalTo(Arrays.asList(now.getMillis(), now.getMillis() + 1))));
// process second element, verify we emitted changed hold
testHarness.processElement(new StreamRecord<>(WindowedValue.timestampedValueInGlobalWindow(KV.of("Key", "Hello"), now.plus(Duration.millis(2)))));
assertThat(emittedWatermarkHolds, is(equalTo(Arrays.asList(now.getMillis(), now.getMillis() + 1, now.getMillis() + 2))));
// advance the watermark to one millisecond past the global window's max timestamp, then to
// TIMESTAMP_MAX_VALUE, before closing the harness
testHarness.processWatermark(GlobalWindow.INSTANCE.maxTimestamp().plus(Duration.millis(1)).getMillis());
testHarness.processWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis());
testHarness.close();
}
use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
the class FlinkPipelineExecutionEnvironmentTest method shouldLogWarningWhenCheckpointingIsDisabled.
/**
 * Runs a pipeline reading from {@code GenerateSequence.from(0)} on the Flink test runner and
 * checks that the checkpointing-disabled message is written to {@code System.err}.
 *
 * <p>{@code System.err} is temporarily replaced by an in-memory stream and is always restored in
 * a {@code finally} block. The pipeline's only DoFn throws, so {@code pipeline.run()} is expected
 * to fail; that failure is deliberately ignored because only the captured output is inspected.
 *
 * @throws Exception if the UTF-8 capture stream cannot be created
 */
@Test
public void shouldLogWarningWhenCheckpointingIsDisabled() throws Exception {
  Pipeline pipeline = Pipeline.create();
  pipeline.getOptions().setRunner(TestFlinkRunner.class);
  pipeline
      .apply(GenerateSequence.from(0))
      .apply(
          ParDo.of(
              new DoFn<Long, Void>() {
                @ProcessElement
                public void processElement(ProcessContext ctx) {
                  throw new RuntimeException("Failing here is ok.");
                }
              }));

  final PrintStream oldErr = System.err;
  ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
  // Encode the captured output explicitly as UTF-8 so that it matches the charset used to decode
  // it below; the single-argument PrintStream constructor would use the platform default charset.
  PrintStream replacementStdErr =
      new PrintStream(byteArrayOutputStream, false, Charsets.UTF_8.name());
  try {
    System.setErr(replacementStdErr);
    // Run pipeline and fail during execution
    pipeline.run();
    fail("Should have failed");
  } catch (Exception expected) {
    // Intentionally ignored: the pipeline is meant to fail. fail() above throws an
    // AssertionError, which is not an Exception and therefore still propagates.
  } finally {
    System.setErr(oldErr);
  }
  replacementStdErr.flush();
  assertThat(
      new String(byteArrayOutputStream.toByteArray(), Charsets.UTF_8),
      containsString(
          "UnboundedSources present which rely on checkpointing, but checkpointing is disabled."));
}
Aggregations