Example 86 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

The class DirectGroupByKeyOverrideFactoryTest, method getInputSucceeds.

@Test
public void getInputSucceeds() {
    TestPipeline p = TestPipeline.create();
    PCollection<KV<String, Integer>> input =
        p.apply(
            Create.of(KV.of("foo", 1))
                .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())));
    PCollection<KV<String, Iterable<Integer>>> grouped =
        input.apply(GroupByKey.<String, Integer>create());
    AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(grouped);
    PTransformReplacement<
            PCollection<KV<String, Integer>>,
            PCollection<KV<String, Iterable<Integer>>>>
        replacement = factory.getReplacementTransform((AppliedPTransform) producer);
    assertThat(replacement.getInput(), Matchers.<PCollection<?>>equalTo(input));
}
Also used: PCollection(org.apache.beam.sdk.values.PCollection), TestPipeline(org.apache.beam.sdk.testing.TestPipeline), KV(org.apache.beam.sdk.values.KV), Test(org.junit.Test)
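
For context, the semantics the factory must preserve are those of GroupByKey: every value sharing a key ends up in a single grouping for that key. Below is a minimal, self-contained sketch of those semantics (not from the Beam test suite; the class name is ours, and we assume the DirectRunner is on the classpath). Count.perKey() groups by key and then counts each group, which gives deterministic output to assert on:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class GroupByKeySemanticsSketch {
    public static void main(String[] args) {
        // The DirectRunner is picked by default when it is the only runner on the classpath.
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        PCollection<KV<String, Long>> perKey =
            p.apply(Create.of(KV.of("foo", 1), KV.of("foo", 2), KV.of("bar", 3)))
                // Groups by key, then counts the values of each group.
                .apply(Count.<String, Integer>perKey());
        PAssert.that(perKey).containsInAnyOrder(KV.of("foo", 2L), KV.of("bar", 1L));
        p.run().waitUntilFinish();
    }
}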

Example 87 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

The class DirectRunnerTest, method wordCountShouldSucceed.

@Test
public void wordCountShouldSucceed() throws Throwable {
    Pipeline p = getPipeline();
    PCollection<KV<String, Long>> counts =
        p.apply(Create.of("foo", "bar", "foo", "baz", "bar", "foo"))
            .apply(
                MapElements.via(
                    new SimpleFunction<String, String>() {

                        @Override
                        public String apply(String input) {
                            // identity mapping.
                            return input;
                        }
                    }))
            .apply(Count.<String>perElement());
    PCollection<String> countStrs =
        counts.apply(
            MapElements.via(
                new SimpleFunction<KV<String, Long>, String>() {

                    @Override
                    public String apply(KV<String, Long> input) {
                        // render each count as "key: count".
                        return String.format("%s: %s", input.getKey(), input.getValue());
                    }
                }));
    PAssert.that(countStrs).containsInAnyOrder("baz: 1", "bar: 2", "foo: 3");
    DirectPipelineResult result = ((DirectPipelineResult) p.run());
    result.waitUntilFinish();
}
Also used: SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction), KV(org.apache.beam.sdk.values.KV), DirectPipelineResult(org.apache.beam.runners.direct.DirectRunner.DirectPipelineResult), TestPipeline(org.apache.beam.sdk.testing.TestPipeline), Pipeline(org.apache.beam.sdk.Pipeline), Test(org.junit.Test)
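
The identity MapElements step above passes each element through unchanged, presumably just to put an extra transform in the graph under test; the count itself needs only two transforms. As a hedged aside, the same word count can be written with the lambda-friendly MapElements.into(...).via(...) form (class name ours; the Beam API calls are standard):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class WordCountLambdaSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        PCollection<String> countStrs =
            p.apply(Create.of("foo", "bar", "foo", "baz", "bar", "foo"))
                .apply(Count.<String>perElement())
                // into(...) declares the output type, which a lambda alone cannot convey.
                .apply(MapElements.into(TypeDescriptors.strings())
                    .via(kv -> String.format("%s: %s", kv.getKey(), kv.getValue())));
        PAssert.that(countStrs).containsInAnyOrder("baz: 1", "bar: 2", "foo: 3");
        p.run().waitUntilFinish();
    }
}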

Example 88 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

The class ResumeFromCheckpointStreamingTest, method run.

@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
    KafkaIO.Read<String, Instant> read =
        KafkaIO.<String, Instant>read()
            .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
            .withTopics(Collections.singletonList(TOPIC))
            .withKeyDeserializer(StringDeserializer.class)
            .withValueDeserializer(InstantDeserializer.class)
            .updateConsumerProperties(
                ImmutableMap.<String, Object>of("auto.offset.reset", "earliest"))
            .withTimestampFn(
                new SerializableFunction<KV<String, Instant>, Instant>() {

                    @Override
                    public Instant apply(KV<String, Instant> kv) {
                        // event time is carried in the record's value.
                        return kv.getValue();
                    }
                })
            .withWatermarkFn(
                new SerializableFunction<KV<String, Instant>, Instant>() {

                    @Override
                    public Instant apply(KV<String, Instant> kv) {
                        // at EOF, move the watermark to infinity.
                        String key = kv.getKey();
                        Instant instant = kv.getValue();
                        return key.equals("EOF") ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
                    }
                });
    TestSparkPipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
    options.setSparkMaster("local[*]");
    options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
    options.setExpectedAssertions(expectedAssertions);
    options.setRunner(TestSparkRunner.class);
    options.setEnableSparkMetricSinks(false);
    options.setForceStreaming(true);
    options.setCheckpointDir(temporaryFolder.getRoot().getPath());
    // timeout is per execution so it can be injected by the caller.
    if (stopWatermarkOption.isPresent()) {
        options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
    }
    Pipeline p = Pipeline.create(options);
    PCollection<String> expectedCol = p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
    PCollectionView<List<String>> view = expectedCol.apply(View.<String>asList());
    PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());
    PCollection<Iterable<String>> grouped =
        kafkaStream
            .apply(Keys.<String>create())
            .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
            .apply(
                Window.<String>into(FixedWindows.of(Duration.millis(500)))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .accumulatingFiredPanes()
                    .withAllowedLateness(Duration.ZERO))
            .apply(WithKeys.<Integer, String>of(1))
            .apply(GroupByKey.<Integer, String>create())
            .apply(Values.<Iterable<String>>create());
    grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));
    return (SparkPipelineResult) p.run();
}
Also used: SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction), KafkaIO(org.apache.beam.sdk.io.kafka.KafkaIO), Instant(org.joda.time.Instant), KV(org.apache.beam.sdk.values.KV), Pipeline(org.apache.beam.sdk.Pipeline), SparkPipelineResult(org.apache.beam.runners.spark.SparkPipelineResult), InstantDeserializer(org.apache.beam.sdk.io.kafka.serialization.InstantDeserializer), TestSparkPipelineOptions(org.apache.beam.runners.spark.TestSparkPipelineOptions), List(java.util.List), ImmutableList(com.google.common.collect.ImmutableList)
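
The watermark function is the heart of this test: ordinary records report their own event time, while the "EOF" sentinel key pushes the watermark to TIMESTAMP_MAX_VALUE, closing every open window so the pipeline can drain. A minimal sketch of that logic pulled out into a testable constant (class and constant names are ours):

import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.KV;
import org.joda.time.Instant;

public class EofWatermarkFnSketch {
    // Same behavior as the anonymous withWatermarkFn above, extracted as a constant.
    static final SerializableFunction<KV<String, Instant>, Instant> WATERMARK_FN =
        kv -> "EOF".equals(kv.getKey()) ? BoundedWindow.TIMESTAMP_MAX_VALUE : kv.getValue();

    public static void main(String[] args) {
        Instant now = Instant.now();
        // An ordinary record reports its own event time.
        System.out.println(WATERMARK_FN.apply(KV.of("k1", now)));
        // The sentinel drives the watermark to +infinity.
        System.out.println(WATERMARK_FN.apply(KV.of("EOF", now)));
    }
}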

Example 89 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

The class SparkKeyedCombineFn, method mergeCombiners.

/**
 * Implements Spark's mergeCombiners function in
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 */
Iterable<WindowedValue<KV<K, AccumT>>> mergeCombiners(Iterable<WindowedValue<KV<K, AccumT>>> a1, Iterable<WindowedValue<KV<K, AccumT>>> a2) {
    // concatenate accumulators.
    Iterable<WindowedValue<KV<K, AccumT>>> accumulators = Iterables.concat(a1, a2);
    // sort accumulators, no need to explode since inputs were exploded.
    Iterable<WindowedValue<KV<K, AccumT>>> sortedAccumulators = sortByWindows(accumulators);
    @SuppressWarnings("unchecked")
    TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    //--- accumulators iterator, by window order.
    final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedAccumulators.iterator();
    // get the first accumulator and assign it to the current window's accumulators.
    WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
    K key = currentValue.getValue().getKey();
    BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
    List<AccumT> currentWindowAccumulators = Lists.newArrayList();
    currentWindowAccumulators.add(currentValue.getValue().getValue());
    // keep track of the timestamps assigned by the TimestampCombiner,
    // in createCombiner we already merge the timestamps assigned
    // to individual elements, here we will just merge them.
    List<Instant> windowTimestamps = Lists.newArrayList();
    windowTimestamps.add(currentValue.getTimestamp());
    // accumulate the next windows, or output.
    List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // add to window accumulators.
            currentWindowAccumulators.add(nextValue.getValue().getValue());
            windowTimestamps.add(nextValue.getTimestamp());
        } else {
            // before moving to the next window,
            // add the current accumulation to the output and initialize the accumulation.
            // merge the timestamps of all accumulators to merge.
            Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
            // merge accumulators: transform the KV<K, Iterable<AccumT>> of collected
            // accumulators into a single KV<K, AccumT> for the (possibly merged) window.
            Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
            WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue = WindowedValue.of(KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
            // applying the actual combiner onto the accumulators.
            AccumT accumulated = combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
            WindowedValue<KV<K, AccumT>> postMergeWindowedValue = preMergeWindowedValue.withValue(KV.of(key, accumulated));
            // emit the accumulated output.
            output.add(postMergeWindowedValue);
            // re-init accumulator, window and timestamps.
            currentWindowAccumulators.clear();
            currentWindowAccumulators.add(nextValue.getValue().getValue());
            currentWindow = nextWindow;
            windowTimestamps.clear();
            windowTimestamps.add(nextValue.getTimestamp());
        }
    }
    // merge the last chunk of accumulators.
    Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
    Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
    WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue = WindowedValue.of(KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
    AccumT accumulated = combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
    WindowedValue<KV<K, AccumT>> postMergeWindowedValue = preMergeWindowedValue.withValue(KV.of(key, accumulated));
    output.add(postMergeWindowedValue);
    return output;
}
Also used: TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner), Instant(org.joda.time.Instant), KV(org.apache.beam.sdk.values.KV), WindowedValue(org.apache.beam.sdk.util.WindowedValue), BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow), IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
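
The heavy lifting above is done by combineFn.mergeAccumulators, which folds all per-window partial accumulators into one. The combineFn in this class is a CombineFnWithContext, so it also threads the side-input context produced by ctxtForInput; below is a minimal sketch of the same call using the plain Combine.CombineFn shape and a toy sum (class and names ours):

import java.util.Arrays;
import org.apache.beam.sdk.transforms.Combine;

public class MergeAccumulatorsSketch {
    // A toy sum CombineFn; the accumulator is a running Long total.
    static class SumFn extends Combine.CombineFn<Integer, Long, Long> {
        @Override public Long createAccumulator() { return 0L; }
        @Override public Long addInput(Long acc, Integer in) { return acc + in; }
        @Override public Long mergeAccumulators(Iterable<Long> accs) {
            long total = 0;
            for (long a : accs) { total += a; }
            return total;
        }
        @Override public Long extractOutput(Long acc) { return acc; }
    }

    public static void main(String[] args) {
        // Two partial accumulators for one window, as mergeCombiners would
        // collect them after concatenating and window-sorting its two inputs.
        Long merged = new SumFn().mergeAccumulators(Arrays.asList(3L, 4L));
        System.out.println(merged); // 7
    }
}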

Example 90 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

The class SparkKeyedCombineFn, method createCombiner.

/**
 * Implements Spark's createCombiner function in
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 */
Iterable<WindowedValue<KV<K, AccumT>>> createCombiner(WindowedValue<KV<K, InputT>> wkvi) {
    // sort exploded inputs.
    Iterable<WindowedValue<KV<K, InputT>>> sortedInputs = sortByWindows(wkvi.explodeWindows());
    TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
    //--- inputs iterator, by window order.
    final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInputs.iterator();
    WindowedValue<KV<K, InputT>> currentInput = iterator.next();
    BoundedWindow currentWindow = Iterables.getFirst(currentInput.getWindows(), null);
    // first create the accumulator and accumulate first input.
    K key = currentInput.getValue().getKey();
    AccumT accumulator = combineFn.createAccumulator(ctxtForInput(currentInput));
    accumulator = combineFn.addInput(accumulator, currentInput.getValue().getValue(), ctxtForInput(currentInput));
    // keep track of the timestamps assigned by the TimestampCombiner.
    Instant windowTimestamp =
        timestampCombiner.assign(
            currentWindow,
            windowingStrategy.getWindowFn().getOutputTime(currentInput.getTimestamp(), currentWindow));
    // accumulate the next windows, or output.
    List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<KV<K, InputT>> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // keep accumulating and carry on ;-)
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            windowTimestamp =
                timestampCombiner.combine(
                    windowTimestamp,
                    timestampCombiner.assign(
                        currentWindow,
                        windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow)));
        } else {
            // moving to the next window, first add the current accumulation to output
            // and initialize the accumulator.
            output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
            // re-init accumulator, window and timestamp.
            accumulator = combineFn.createAccumulator(ctxtForInput(nextValue));
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            currentWindow = nextWindow;
            windowTimestamp = timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow));
        }
    }
    // add last accumulator to the output.
    output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
    return output;
}
Also used: TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner), Instant(org.joda.time.Instant), KV(org.apache.beam.sdk.values.KV), WindowedValue(org.apache.beam.sdk.util.WindowedValue), BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow), IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
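
Both createCombiner and mergeCombiners lean on two small helpers, isIntersecting and merge, which these excerpts do not show. A plausible sketch built on IntervalWindow's own intersects and span methods (the helper bodies are our reconstruction, not the Beam source):

import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.joda.time.Instant;

public class IntervalWindowHelpersSketch {
    // True when the two windows overlap; IntervalWindow#intersects is part of the SDK.
    static boolean isIntersecting(IntervalWindow w1, IntervalWindow w2) {
        return w1.intersects(w2);
    }

    // The smallest window covering both; IntervalWindow#span is part of the SDK.
    static IntervalWindow merge(IntervalWindow w1, IntervalWindow w2) {
        return w1.span(w2);
    }

    public static void main(String[] args) {
        IntervalWindow a = new IntervalWindow(new Instant(0), new Instant(10));
        IntervalWindow b = new IntervalWindow(new Instant(5), new Instant(15));
        System.out.println(isIntersecting(a, b)); // true
        System.out.println(merge(a, b));          // the spanning window from 0 to 15
    }
}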

Aggregations

KV (org.apache.beam.sdk.values.KV): 192
Test (org.junit.Test): 143
Instant (org.joda.time.Instant): 66
Category (org.junit.experimental.categories.Category): 62
Pipeline (org.apache.beam.sdk.Pipeline): 35
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 34
StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString): 33
Matchers.containsString (org.hamcrest.Matchers.containsString): 33
StateSpec (org.apache.beam.sdk.state.StateSpec): 25
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 22
ArrayList (java.util.ArrayList): 19
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 19
TupleTag (org.apache.beam.sdk.values.TupleTag): 16
TableRow (com.google.api.services.bigquery.model.TableRow): 15
Map (java.util.Map): 15
ValueState (org.apache.beam.sdk.state.ValueState): 15
List (java.util.List): 14
ImmutableList (com.google.common.collect.ImmutableList): 12
HashMap (java.util.HashMap): 12
Timer (org.apache.beam.sdk.state.Timer): 12