
Example 26 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class SourceTestUtils method assertSplitAtFractionConcurrent.

private static <T> boolean assertSplitAtFractionConcurrent(
        ExecutorService executor,
        BoundedSource<T> source,
        List<T> expectedItems,
        final int numItemsToReadBeforeSplitting,
        final double fraction,
        PipelineOptions options) throws Exception {
    // Closed in readerThread
    @SuppressWarnings("resource") final BoundedSource.BoundedReader<T> reader = source.createReader(options);
    final CountDownLatch unblockSplitter = new CountDownLatch(1);
    Future<List<T>> readerThread = executor.submit(new Callable<List<T>>() {

        @Override
        public List<T> call() throws Exception {
            try {
                List<T> items = readNItemsFromUnstartedReader(reader, numItemsToReadBeforeSplitting);
                unblockSplitter.countDown();
                items.addAll(readRemainingFromReader(reader, numItemsToReadBeforeSplitting > 0));
                return items;
            } finally {
                reader.close();
            }
        }
    });
    Future<KV<BoundedSource<T>, BoundedSource<T>>> splitterThread = executor.submit(new Callable<KV<BoundedSource<T>, BoundedSource<T>>>() {

        @Override
        public KV<BoundedSource<T>, BoundedSource<T>> call() throws Exception {
            unblockSplitter.await();
            BoundedSource<T> residual = reader.splitAtFraction(fraction);
            if (residual == null) {
                return null;
            }
            return KV.of(reader.getCurrentSource(), residual);
        }
    });
    List<T> currentItems = readerThread.get();
    KV<BoundedSource<T>, BoundedSource<T>> splitSources = splitterThread.get();
    if (splitSources == null) {
        return false;
    }
    SplitAtFractionResult res =
        verifySingleSplitAtFractionResult(
            source,
            expectedItems,
            currentItems,
            splitSources.getKey(),
            splitSources.getValue(),
            numItemsToReadBeforeSplitting,
            fraction,
            options);
    return (res.numResidualItems > 0);
}
Also used : BoundedSource(org.apache.beam.sdk.io.BoundedSource) KV(org.apache.beam.sdk.values.KV) CountDownLatch(java.util.concurrent.CountDownLatch) NoSuchElementException(java.util.NoSuchElementException) IOException(java.io.IOException) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List)
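
For context on how the KV pairing is consumed: the key is the primary source after the split (reader.getCurrentSource()) and the value is the residual source, and re-reading both from scratch must together cover the original source. Below is a minimal standalone sketch of that idea, not part of SourceTestUtils itself; it assumes CountingSource.upTo as a convenient splittable source and SourceTestUtils.readFromSource for re-reading each half.

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.values.KV;

public class SplitAtFractionSketch {

    public static void main(String[] args) throws Exception {
        PipelineOptions options = PipelineOptionsFactory.create();
        BoundedSource<Long> source = CountingSource.upTo(100);

        KV<BoundedSource<Long>, BoundedSource<Long>> split;
        try (BoundedSource.BoundedReader<Long> reader = source.createReader(options)) {
            // Consume one element so the reader has a position before splitting.
            if (!reader.start()) {
                throw new AssertionError("expected at least one element");
            }
            // splitAtFraction may return null if the source cannot split there.
            BoundedSource<Long> residual = reader.splitAtFraction(0.5);
            split = (residual == null) ? null : KV.of(reader.getCurrentSource(), residual);
        }

        if (split != null) {
            // Re-reading primary (key) and residual (value) from scratch must cover the whole range.
            List<Long> all = new ArrayList<>(SourceTestUtils.readFromSource(split.getKey(), options));
            all.addAll(SourceTestUtils.readFromSource(split.getValue(), options));
            System.out.println("primary + residual items: " + all.size()); // expected: 100
        }
    }
}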

Example 27 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class WithKeys method expand.

@Override
public PCollection<KV<K, V>> expand(PCollection<V> in) {
    PCollection<KV<K, V>> result = in.apply("AddKeys", MapElements.via(new SimpleFunction<V, KV<K, V>>() {

        @Override
        public KV<K, V> apply(V element) {
            return KV.of(fn.apply(element), element);
        }
    }));
    try {
        Coder<K> keyCoder;
        CoderRegistry coderRegistry = in.getPipeline().getCoderRegistry();
        if (keyClass == null) {
            keyCoder = coderRegistry.getOutputCoder(fn, in.getCoder());
        } else {
            keyCoder = coderRegistry.getCoder(TypeDescriptor.of(keyClass));
        }
        // TODO: Remove when we can set the coder inference context.
        result.setCoder(KvCoder.of(keyCoder, in.getCoder()));
    } catch (CannotProvideCoderException exc) {
    // let lazy coder inference have a try
    }
    return result;
}
Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) CannotProvideCoderException(org.apache.beam.sdk.coders.CannotProvideCoderException) KV(org.apache.beam.sdk.values.KV) KV(org.apache.beam.sdk.values.KV)
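
From the caller's side, WithKeys is applied with a key function; when that function is a lambda, the key type is erased, so withKeyType(...) supplies a TypeDescriptor and lets the expand() above resolve the key coder eagerly instead of falling back to lazy inference. A short usage sketch under those assumptions (hypothetical pipeline and input, direct runner on the classpath):

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class WithKeysSketch {

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

        PCollection<String> words = p.apply(Create.of(Arrays.asList("ab", "cde", "f")));

        // Key each word by its length; with the key type declared, the result coder
        // is set eagerly to KvCoder of VarIntCoder and StringUtf8Coder.
        PCollection<KV<Integer, String>> keyed =
            words.apply(
                WithKeys.<Integer, String>of(word -> word.length())
                    .withKeyType(TypeDescriptors.integers()));

        p.run().waitUntilFinish();
    }
}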

Example 28 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class GroupByKeyTest method testTimestampCombinerLatest.

/**
   * Tests that when two elements are combined via a GroupByKey, their output timestamp agrees
   * with the window's timestamp combiner, here customized to use the latest input timestamp.
   */
@Test
@Category(ValidatesRunner.class)
public void testTimestampCombinerLatest() {
    p.apply(
            Create.timestamped(
                TimestampedValue.of(KV.of(0, "hello"), new Instant(0)),
                TimestampedValue.of(KV.of(0, "goodbye"), new Instant(10))))
        .apply(
            Window.<KV<Integer, String>>into(FixedWindows.of(Duration.standardMinutes(10)))
                .withTimestampCombiner(TimestampCombiner.LATEST))
        .apply(GroupByKey.<Integer, String>create())
        .apply(ParDo.of(new AssertTimestamp(new Instant(10))));
    p.run();
}
Also used : Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
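
AssertTimestamp is a private helper defined elsewhere in GroupByKeyTest and not shown on this page. As a hedged sketch only (a hypothetical reconstruction, not the actual Beam source), such a DoFn could simply compare each grouped element's timestamp to the expected Instant:

import static org.junit.Assert.assertEquals;

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;
import org.joda.time.Instant;

// Hypothetical reconstruction of the helper used above: asserts that every
// grouped element carries the expected output timestamp.
class AssertTimestamp extends DoFn<KV<Integer, Iterable<String>>, Void> {

    private final Instant timestamp;

    AssertTimestamp(Instant timestamp) {
        this.timestamp = timestamp;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        assertEquals(timestamp, c.timestamp());
    }
}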

Example 29 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class GroupIntoBatchesTest method testInStreamingMode.

@Test
@Category({ NeedsRunner.class, UsesTimersInParDo.class, UsesTestStream.class, UsesStatefulParDo.class })
public void testInStreamingMode() {
    int timestampInterval = 1;
    Instant startInstant = new Instant(0L);
    TestStream.Builder<KV<String, String>> streamBuilder = TestStream.create(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())).advanceWatermarkTo(startInstant);
    long offset = 0L;
    for (KV<String, String> element : data) {
        streamBuilder = streamBuilder.addElements(TimestampedValue.of(element, startInstant.plus(Duration.standardSeconds(offset * timestampInterval))));
        offset++;
    }
    final long windowDuration = 6;
    TestStream<KV<String, String>> stream =
        streamBuilder
            .advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(windowDuration - 1)))
            .advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(windowDuration + 1)))
            .advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(NUM_ELEMENTS)))
            .advanceWatermarkToInfinity();
    PCollection<KV<String, String>> inputCollection =
        pipeline
            .apply(stream)
            .apply(
                Window.<KV<String, String>>into(FixedWindows.of(Duration.standardSeconds(windowDuration)))
                    .withAllowedLateness(Duration.millis(ALLOWED_LATENESS)));
    inputCollection.apply(ParDo.of(new DoFn<KV<String, String>, Void>() {

        @ProcessElement
        public void processElement(ProcessContext c, BoundedWindow window) {
            LOG.debug(
                "*** ELEMENT: ({},{}) *** with timestamp {} in window {}",
                c.element().getKey(),
                c.element().getValue(),
                c.timestamp(),
                window);
        }
    }));
    PCollection<KV<String, Iterable<String>>> outputCollection =
        inputCollection
            .apply(GroupIntoBatches.<String, String>ofSize(BATCH_SIZE))
            .setCoder(KvCoder.of(StringUtf8Coder.of(), IterableCoder.of(StringUtf8Coder.of())));
    // elements have the same key and collection is divided into windows,
    // so Count.perKey values are the number of elements in windows
    PCollection<KV<String, Long>> countOutput =
        outputCollection.apply(
            "Count elements in windows after applying GroupIntoBatches",
            Count.<String, Iterable<String>>perKey());
    PAssert.that("Wrong number of elements in windows after GroupIntoBatches", countOutput)
        .satisfies(new SerializableFunction<Iterable<KV<String, Long>>, Void>() {

        @Override
        public Void apply(Iterable<KV<String, Long>> input) {
            Iterator<KV<String, Long>> inputIterator = input.iterator();
            // first element
            long count0 = inputIterator.next().getValue();
            // window duration is 6 and batch size is 5, so there should be 2 elements in the
            // window (one flush because the batch size was reached, one at the end of the window)
            assertEquals("Wrong number of elements in first window", 2, count0);
            // second element
            long count1 = inputIterator.next().getValue();
            // the collection has 10 elements, so only 4 elements are left and there should be only
            // one element in the window (flush because the end of the window/collection was reached)
            assertEquals("Wrong number of elements in second window", 1, count1);
            // third element
            return null;
        }
    });
    PAssert.that("Incorrect output collection after GroupIntoBatches", outputCollection)
        .satisfies(new SerializableFunction<Iterable<KV<String, Iterable<String>>>, Void>() {

        @Override
        public Void apply(Iterable<KV<String, Iterable<String>>> input) {
            Iterator<KV<String, Iterable<String>>> inputIterator = input.iterator();
            // first element
            int size0 = Iterables.size(inputIterator.next().getValue());
            // window duration is 6 and batch size is 5, so the output batch size should be 5
            // (flush because the batch size was reached)
            assertEquals("Wrong first element batch Size", 5, size0);
            // second element
            int size1 = Iterables.size(inputIterator.next().getValue());
            // there is only one element left in the window, so the batch size should be 1
            // (flush because the end of the window was reached)
            assertEquals("Wrong second element batch Size", 1, size1);
            // third element
            int size2 = Iterables.size(inputIterator.next().getValue());
            // the collection has 10 elements and only 4 are left, so the batch size should be 4
            // (flush because the end of the collection was reached)
            assertEquals("Wrong third element batch Size", 4, size2);
            return null;
        }
    });
    pipeline.run().waitUntilFinish();
}
Also used : Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Iterator(java.util.Iterator) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) UsesTestStream(org.apache.beam.sdk.testing.UsesTestStream) TestStream(org.apache.beam.sdk.testing.TestStream) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
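
For comparison with the streaming test above, the same KV batching can be exercised in a much smaller bounded-input sketch (hypothetical key and values, direct runner assumed): with the default global window and a batch size of 4, ten values for one key should come out as batches of 4, 4 and 2, the remainder being flushed when the window expires at the end of the input.

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.IterableCoder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupIntoBatches;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class GroupIntoBatchesBoundedSketch {

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

        // Ten values under a single (hypothetical) key "k".
        List<KV<String, String>> input = new ArrayList<>();
        for (int i = 0; i < 10; i++) {
            input.add(KV.of("k", "value-" + i));
        }

        // Batches of up to 4 elements per key; expected batch sizes: 4, 4 and 2.
        PCollection<KV<String, Iterable<String>>> batches =
            p.apply(Create.of(input))
                .apply(GroupIntoBatches.<String, String>ofSize(4))
                .setCoder(KvCoder.of(StringUtf8Coder.of(), IterableCoder.of(StringUtf8Coder.of())));

        p.run().waitUntilFinish();
    }
}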

Example 30 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class ParDoTest method testValueStateFixedWindows.

@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class })
public void testValueStateFixedWindows() {
    final String stateId = "foo";
    DoFn<KV<String, Integer>, Integer> fn = new DoFn<KV<String, Integer>, Integer>() {

        @StateId(stateId)
        private final StateSpec<ValueState<Integer>> intState = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(stateId) ValueState<Integer> state) {
            Integer currentValue = MoreObjects.firstNonNull(state.read(), 0);
            c.output(currentValue);
            state.write(currentValue + 1);
        }
    };
    IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(10));
    IntervalWindow secondWindow = new IntervalWindow(new Instant(10), new Instant(20));
    PCollection<Integer> output =
        pipeline
            .apply(
                Create.timestamped(
                    // first window
                    TimestampedValue.of(KV.of("hello", 7), new Instant(1)),
                    TimestampedValue.of(KV.of("hello", 14), new Instant(2)),
                    TimestampedValue.of(KV.of("hello", 21), new Instant(3)),
                    // second window
                    TimestampedValue.of(KV.of("hello", 28), new Instant(11)),
                    TimestampedValue.of(KV.of("hello", 35), new Instant(13))))
            .apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.millis(10))))
            .apply("Stateful ParDo", ParDo.of(fn));
    PAssert.that(output).inWindow(firstWindow).containsInAnyOrder(0, 1, 2);
    PAssert.that(output).inWindow(secondWindow).containsInAnyOrder(0, 1);
    pipeline.run();
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) ValueState(org.apache.beam.sdk.state.ValueState) Instant(org.joda.time.Instant) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) Matchers.containsString(org.hamcrest.Matchers.containsString) KV(org.apache.beam.sdk.values.KV) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
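
The point of the ValueState above is that state is scoped per key and per window, which is why each window restarts the 0, 1, 2, ... sequence. A minimal standalone sketch of the same pattern (hypothetical keys and values, global window, direct runner assumed) shows the per-key scoping on its own: each key gets an independent counter.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.state.StateSpec;
import org.apache.beam.sdk.state.StateSpecs;
import org.apache.beam.sdk.state.ValueState;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class PerKeyIndexSketch {

    // Assigns each element a per-key index: 0 for the first element of a key, then 1, 2, ...
    static class IndexPerKeyFn extends DoFn<KV<String, String>, KV<String, Integer>> {

        @StateId("index")
        private final StateSpec<ValueState<Integer>> indexSpec = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId("index") ValueState<Integer> index) {
            Integer stored = index.read();
            int current = (stored == null) ? 0 : stored;
            c.output(KV.of(c.element().getKey(), current));
            index.write(current + 1);
        }
    }

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

        PCollection<KV<String, Integer>> indexed =
            p.apply(Create.of(KV.of("a", "x"), KV.of("a", "y"), KV.of("b", "z")))
                .apply("IndexPerKey", ParDo.of(new IndexPerKeyFn()));
        // Expected output (in some order): ("a", 0), ("a", 1), ("b", 0).

        p.run().waitUntilFinish();
    }
}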

Aggregations

KV (org.apache.beam.sdk.values.KV) 192
Test (org.junit.Test) 143
Instant (org.joda.time.Instant) 66
Category (org.junit.experimental.categories.Category) 62
Pipeline (org.apache.beam.sdk.Pipeline) 35
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow) 34
StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) 33
Matchers.containsString (org.hamcrest.Matchers.containsString) 33
StateSpec (org.apache.beam.sdk.state.StateSpec) 25
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow) 22
ArrayList (java.util.ArrayList) 19
WindowedValue (org.apache.beam.sdk.util.WindowedValue) 19
TupleTag (org.apache.beam.sdk.values.TupleTag) 16
TableRow (com.google.api.services.bigquery.model.TableRow) 15
Map (java.util.Map) 15
ValueState (org.apache.beam.sdk.state.ValueState) 15
List (java.util.List) 14
ImmutableList (com.google.common.collect.ImmutableList) 12
HashMap (java.util.HashMap) 12
Timer (org.apache.beam.sdk.state.Timer) 12