Use of org.apache.beam.sdk.values.KV in project beam by apache.
From the class DirectGroupByKeyOverrideFactoryTest, method getInputSucceeds:
@Test
public void getInputSucceeds() {
  TestPipeline p = TestPipeline.create();
  PCollection<KV<String, Integer>> input =
      p.apply(
          Create.of(KV.of("foo", 1))
              .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())));
  PCollection<KV<String, Iterable<Integer>>> grouped =
      input.apply(GroupByKey.<String, Integer>create());
  AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(grouped);
  PTransformReplacement<
          PCollection<KV<String, Integer>>, PCollection<KV<String, Iterable<Integer>>>>
      replacement = factory.getReplacementTransform((AppliedPTransform) producer);
  assertThat(replacement.getInput(), Matchers.<PCollection<?>>equalTo(input));
}
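The factory field is declared elsewhere in the test class and is not part of this snippet. A minimal sketch of the declaration the test assumes (the generic parameters here are an illustration, not the verbatim field):

// Assumed test fixture: the override factory under test. The concrete
// generics are an illustration, not the project's exact declaration.
private final DirectGroupByKeyOverrideFactory<String, Integer> factory =
    new DirectGroupByKeyOverrideFactory<>();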
Use of org.apache.beam.sdk.values.KV in project beam by apache.
From the class DirectRunnerTest, method wordCountShouldSucceed:
@Test
public void wordCountShouldSucceed() throws Throwable {
  Pipeline p = getPipeline();
  PCollection<KV<String, Long>> counts =
      p.apply(Create.of("foo", "bar", "foo", "baz", "bar", "foo"))
          .apply(
              MapElements.via(
                  new SimpleFunction<String, String>() {
                    @Override
                    public String apply(String input) {
                      return input;
                    }
                  }))
          .apply(Count.<String>perElement());
  PCollection<String> countStrs =
      counts.apply(
          MapElements.via(
              new SimpleFunction<KV<String, Long>, String>() {
                @Override
                public String apply(KV<String, Long> input) {
                  return String.format("%s: %s", input.getKey(), input.getValue());
                }
              }));
  PAssert.that(countStrs).containsInAnyOrder("baz: 1", "bar: 2", "foo: 3");
  DirectPipelineResult result = (DirectPipelineResult) p.run();
  result.waitUntilFinish();
}
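getPipeline() is a helper defined elsewhere in DirectRunnerTest. A minimal sketch of what such a helper could look like, assuming it simply binds the pipeline to the DirectRunner (the options wiring is an assumption, not the test's actual code):

// Hypothetical helper: builds a Pipeline bound to the DirectRunner.
private Pipeline getPipeline() {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class);
  return Pipeline.create(options);
}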
Use of org.apache.beam.sdk.values.KV in project beam by apache.
From the class ResumeFromCheckpointStreamingTest, method run:
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
  KafkaIO.Read<String, Instant> read =
      KafkaIO.<String, Instant>read()
          .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
          .withTopics(Collections.singletonList(TOPIC))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(InstantDeserializer.class)
          .updateConsumerProperties(
              ImmutableMap.<String, Object>of("auto.offset.reset", "earliest"))
          .withTimestampFn(
              new SerializableFunction<KV<String, Instant>, Instant>() {
                @Override
                public Instant apply(KV<String, Instant> kv) {
                  return kv.getValue();
                }
              })
          .withWatermarkFn(
              new SerializableFunction<KV<String, Instant>, Instant>() {
                @Override
                public Instant apply(KV<String, Instant> kv) {
                  // At EOF, move the watermark to infinity.
                  String key = kv.getKey();
                  Instant instant = kv.getValue();
                  return key.equals("EOF") ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
                }
              });
  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setSparkMaster("local[*]");
  options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
  options.setExpectedAssertions(expectedAssertions);
  options.setRunner(TestSparkRunner.class);
  options.setEnableSparkMetricSinks(false);
  options.setForceStreaming(true);
  options.setCheckpointDir(temporaryFolder.getRoot().getPath());
  // The timeout is per execution, so it can be injected by the caller.
  if (stopWatermarkOption.isPresent()) {
    options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
  }
  Pipeline p = Pipeline.create(options);
  PCollection<String> expectedCol =
      p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
  PCollectionView<List<String>> view = expectedCol.apply(View.<String>asList());
  PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());
  PCollection<Iterable<String>> grouped =
      kafkaStream
          .apply(Keys.<String>create())
          .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
          .apply(
              Window.<String>into(FixedWindows.of(Duration.millis(500)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .accumulatingFiredPanes()
                  .withAllowedLateness(Duration.ZERO))
          .apply(WithKeys.<Integer, String>of(1))
          .apply(GroupByKey.<Integer, String>create())
          .apply(Values.<Iterable<String>>create());
  grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));
  return (SparkPipelineResult) p.run();
}
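EOFShallNotPassFn is defined elsewhere in the test class and is only referenced here. A minimal sketch of what such a DoFn could look like, assuming it drops the "EOF" marker element and reads the side input to verify it survives the checkpoint-resume (the body is an illustration, not the project's code):

// Hypothetical sketch of the DoFn referenced above: filters out the "EOF"
// marker while touching the side-input view to prove it is still available.
private static class EOFShallNotPassFn extends DoFn<String, String> {
  private final PCollectionView<List<String>> view;

  EOFShallNotPassFn(PCollectionView<List<String>> view) {
    this.view = view;
  }

  @ProcessElement
  public void process(ProcessContext c) {
    // A checkpoint-resumed pipeline must still see the side input.
    List<String> sideInput = c.sideInput(view);
    assert sideInput != null;
    if (!"EOF".equals(c.element())) {
      c.output(c.element());
    }
  }
}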
Use of org.apache.beam.sdk.values.KV in project beam by apache.
From the class SparkKeyedCombineFn, method mergeCombiners:
/**
 * Implements Spark's mergeCombiners function in
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 */
Iterable<WindowedValue<KV<K, AccumT>>> mergeCombiners(
    Iterable<WindowedValue<KV<K, AccumT>>> a1, Iterable<WindowedValue<KV<K, AccumT>>> a2) {
  // Concatenate the two partial accumulations.
  Iterable<WindowedValue<KV<K, AccumT>>> accumulators = Iterables.concat(a1, a2);
  // Sort accumulators by window; no need to explode since inputs were already exploded.
  Iterable<WindowedValue<KV<K, AccumT>>> sortedAccumulators = sortByWindows(accumulators);
  @SuppressWarnings("unchecked")
  TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  // Iterate over the accumulators in window order.
  final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedAccumulators.iterator();
  // Get the first accumulator and seed the current window's accumulators with it.
  WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
  K key = currentValue.getValue().getKey();
  BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
  List<AccumT> currentWindowAccumulators = Lists.newArrayList();
  currentWindowAccumulators.add(currentValue.getValue().getValue());
  // Keep track of the timestamps assigned by the TimestampCombiner; createCombiner
  // already combined the timestamps assigned to individual elements, so here we
  // only need to merge them.
  List<Instant> windowTimestamps = Lists.newArrayList();
  windowTimestamps.add(currentValue.getTimestamp());
  // Accumulate into the current window, or emit it and start the next one.
  List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
  // If the WindowFn merges, merge overlapping windows, e.g. Sessions.
  final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
  while (iterator.hasNext()) {
    WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
    BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
    boolean mergingAndIntersecting =
        merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
    if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
      if (mergingAndIntersecting) {
        // Merge the intersecting windows.
        currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
      }
      // Add to the current window's accumulators.
      currentWindowAccumulators.add(nextValue.getValue().getValue());
      windowTimestamps.add(nextValue.getTimestamp());
    } else {
      // Before moving to the next window, add the current accumulation to the
      // output and re-initialize the accumulation.
      // Merge the timestamps of all accumulators being merged.
      Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
      // Merge the accumulators, turning a KV<K, Iterable<AccumT>> into a
      // KV<K, AccumT> for the (possibly merged) window.
      Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
      WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue =
          WindowedValue.of(
              KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
      // Apply the actual combiner to the accumulators.
      AccumT accumulated =
          combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
      WindowedValue<KV<K, AccumT>> postMergeWindowedValue =
          preMergeWindowedValue.withValue(KV.of(key, accumulated));
      // Emit the accumulated output.
      output.add(postMergeWindowedValue);
      // Re-initialize the accumulators, window and timestamps.
      currentWindowAccumulators.clear();
      currentWindowAccumulators.add(nextValue.getValue().getValue());
      currentWindow = nextWindow;
      windowTimestamps.clear();
      windowTimestamps.add(nextValue.getTimestamp());
    }
  }
  // Merge the last chunk of accumulators.
  Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
  Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
  WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue =
      WindowedValue.of(
          KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
  AccumT accumulated =
      combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
  WindowedValue<KV<K, AccumT>> postMergeWindowedValue =
      preMergeWindowedValue.withValue(KV.of(key, accumulated));
  output.add(postMergeWindowedValue);
  return output;
}
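For context, a sketch of how functions like these would plug into Spark's combineByKey. The variable names, the mergeValue signature, and the surrounding wiring are assumptions for illustration; the Beam Spark runner's actual call site differs:

// Hypothetical wiring: createCombiner/mergeValue/mergeCombiners mapped onto
// Spark's combineByKey. 'fn' stands in for a SparkKeyedCombineFn instance and
// 'pairRdd' for a JavaPairRDD<K, WindowedValue<KV<K, InputT>>>.
JavaPairRDD<K, Iterable<WindowedValue<KV<K, AccumT>>>> combined =
    pairRdd.combineByKey(
        wkvi -> fn.createCombiner(wkvi),               // seed per-key accumulators
        (accums, wkvi) -> fn.mergeValue(wkvi, accums), // fold another input in (assumed helper)
        (a1, a2) -> fn.mergeCombiners(a1, a2));        // merge partial accumulations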
Use of org.apache.beam.sdk.values.KV in project beam by apache.
From the class SparkKeyedCombineFn, method createCombiner:
/**
 * Implements Spark's createCombiner function in
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 */
Iterable<WindowedValue<KV<K, AccumT>>> createCombiner(WindowedValue<KV<K, InputT>> wkvi) {
  // Sort the exploded inputs by window.
  Iterable<WindowedValue<KV<K, InputT>>> sortedInputs = sortByWindows(wkvi.explodeWindows());
  TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  // Iterate over the inputs in window order.
  final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInputs.iterator();
  WindowedValue<KV<K, InputT>> currentInput = iterator.next();
  BoundedWindow currentWindow = Iterables.getFirst(currentInput.getWindows(), null);
  // First create the accumulator and accumulate the first input.
  K key = currentInput.getValue().getKey();
  AccumT accumulator = combineFn.createAccumulator(ctxtForInput(currentInput));
  accumulator =
      combineFn.addInput(
          accumulator, currentInput.getValue().getValue(), ctxtForInput(currentInput));
  // Keep track of the timestamps assigned by the TimestampCombiner.
  Instant windowTimestamp =
      timestampCombiner.assign(
          currentWindow, windowFn.getOutputTime(currentInput.getTimestamp(), currentWindow));
  // Accumulate into the current window, or emit it and start the next one.
  List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
  // If the WindowFn merges, merge overlapping windows, e.g. Sessions.
  final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
  while (iterator.hasNext()) {
    WindowedValue<KV<K, InputT>> nextValue = iterator.next();
    BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
    boolean mergingAndIntersecting =
        merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
    if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
      if (mergingAndIntersecting) {
        // Merge the intersecting windows.
        currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
      }
      // Keep accumulating into the current window.
      accumulator =
          combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
      windowTimestamp =
          timestampCombiner.combine(
              windowTimestamp,
              timestampCombiner.assign(
                  currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow)));
    } else {
      // Moving to the next window: first add the current accumulation to the
      // output, then re-initialize the accumulator.
      output.add(
          WindowedValue.of(
              KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
      // Re-initialize the accumulator, window and timestamp.
      accumulator = combineFn.createAccumulator(ctxtForInput(nextValue));
      accumulator =
          combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
      currentWindow = nextWindow;
      windowTimestamp =
          timestampCombiner.assign(
              currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow));
    }
  }
  // Add the last accumulator to the output.
  output.add(
      WindowedValue.of(
          KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
  return output;
}
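Both methods rely on a sortByWindows helper defined elsewhere in the class. A minimal sketch of such a helper, assuming it orders values by the end of their single window so same-window values sit adjacent in the iteration (the comparator is an illustration):

// Hypothetical sketch of the helper both methods assume: orders windowed
// values by their window's maxTimestamp so equal windows are contiguous.
private static <T> Iterable<WindowedValue<T>> sortByWindows(Iterable<WindowedValue<T>> values) {
  List<WindowedValue<T>> sorted = Lists.newArrayList(values);
  sorted.sort(
      Comparator.comparing(
          (WindowedValue<T> wv) -> Iterables.getOnlyElement(wv.getWindows()).maxTimestamp()));
  return sorted;
}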