Search in sources :

Example 1 with Reiterable

use of org.apache.beam.sdk.util.common.Reiterable in project beam by apache.

In class GroupingShuffleReaderTest, method testReadFromShuffleAndDynamicSplit.

/**
 * Reads from a two-shard test shuffle, requests a dynamic split at the start of the second
 * shard, and verifies that iteration then yields only the first shard's records.
 *
 * <p>Also checks that {@code getCurrent()} and {@code hasNext()} can be called repeatedly
 * without changing the outcome, that the underlying shuffle reader is closed once the iterator
 * is closed, and that the per-dataset byte counter reports the expected number of bytes.
 *
 * @throws Exception if encoding a key or reading from the shuffle fails
 */
@Test
public void testReadFromShuffleAndDynamicSplit() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    BatchModeExecutionContext context = BatchModeExecutionContext.forTesting(options, "testStage");
    TestOperationContext operationContext = TestOperationContext.create();
    GroupingShuffleReader<Integer, Integer> groupingShuffleReader = new GroupingShuffleReader<>(options, null, null, null, WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder()), context, operationContext, ShuffleReadCounterFactory.INSTANCE, false);
    groupingShuffleReader.perOperationPerDatasetBytesCounter = operationContext.counterFactory().longSum(CounterName.named("dax-shuffle-test-wf-read-bytes"));
    TestShuffleReader shuffleReader = new TestShuffleReader();
    final int kNumRecords = 10;
    final int kFirstShard = 0;
    final int kSecondShard = 1;
    // Populate two shards with kNumRecords entries each. Every entry's position is fabricated
    // from its shard id and its encoded key; the keys are distinct integers,
    // therefore each record comes with a unique position constructed.
    for (int i = 0; i < kNumRecords; ++i) {
        byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
        ShuffleEntry entry = new ShuffleEntry(fabricatePosition(kFirstShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
        shuffleReader.addEntry(entry);
    }
    for (int i = kNumRecords; i < 2 * kNumRecords; ++i) {
        byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
        ShuffleEntry entry = new ShuffleEntry(fabricatePosition(kSecondShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
        shuffleReader.addEntry(entry);
    }
    // Counts the records consumed so far; it doubles as the expected key of the current record
    // because keys were added as 0, 1, 2, ...
    int i = 0;
    assertFalse(shuffleReader.isClosed());
    try (GroupingShuffleReaderIterator<Integer, Integer> iter = groupingShuffleReader.iterator(shuffleReader)) {
        // Poke the iterator so we can test dynamic splitting.
        assertTrue(iter.start());
        ++i;
        // A split request carrying an empty Position must be rejected.
        assertNull(iter.requestDynamicSplit(splitRequestAtPosition(new Position())));
        // Split at the shard boundary
        NativeReader.DynamicSplitResult dynamicSplitResult = iter.requestDynamicSplit(splitRequestAtPosition(makeShufflePosition(kSecondShard, null)));
        assertNotNull(dynamicSplitResult);
        assertEquals(encodeBase64URLSafeString(fabricatePosition(kSecondShard).getPosition()), positionFromSplitResult(dynamicSplitResult).getShufflePosition());
        for (; iter.advance(); ++i) {
            // iter.getCurrent() is supposed to be side-effect-free and give the same result if called
            // repeatedly. Test that this is indeed the case.
            iter.getCurrent();
            iter.getCurrent();
            KV<Integer, Reiterable<Integer>> elem = iter.getCurrent().getValue();
            int key = elem.getKey();
            // JUnit's assertEquals takes (expected, actual).
            assertEquals(i, key);
            Reiterable<Integer> valuesIterable = elem.getValue();
            Reiterator<Integer> valuesIterator = valuesIterable.iterator();
            int j = 0;
            while (valuesIterator.hasNext()) {
                // hasNext() must be repeatable without consuming anything.
                assertTrue(valuesIterator.hasNext());
                assertTrue(valuesIterator.hasNext());
                int value = valuesIterator.next();
                assertEquals(i, value);
                ++j;
            }
            assertFalse(valuesIterator.hasNext());
            assertFalse(valuesIterator.hasNext());
            // Each key was added with exactly one value.
            assertEquals(1, j);
        }
        assertFalse(iter.advance());
    }
    assertTrue(shuffleReader.isClosed());
    // Only the first shard's records may have been read after the split.
    assertEquals(kNumRecords, i);
    // There are 10 Shuffle records that each encode an integer key (4 bytes) and integer value (4
    // bytes). We therefore expect to read 80 bytes.
    assertEquals(80L, (long) groupingShuffleReader.perOperationPerDatasetBytesCounter.getAggregate());
}
Also used : Reiterable(org.apache.beam.sdk.util.common.Reiterable) ReaderTestUtils.approximateSplitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition) ByteArrayShufflePosition(org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition) Position(com.google.api.services.dataflow.model.Position) ReaderTestUtils.splitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) NativeReader(org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Test(org.junit.Test)

Example 2 with Reiterable

use of org.apache.beam.sdk.util.common.Reiterable in project beam by apache.

In class BatchGroupAlsoByWindowViaIteratorsFn, method processElement.

/**
 * Processes the values of a single key and emits one output per newly seen window.
 *
 * <p>The input values are walked through a {@code PeekingReiterator}. For every window of the
 * current element that is not yet tracked as active, the window is recorded and a
 * {@code KV} of the key and a {@code WindowReiterable} built from the current iterator and that
 * window is output, stamped with the timestamp assigned by the strategy's timestamp combiner.
 *
 * @param element the key together with its windowed values; the values must be a
 *     {@code Collection} or a {@code Reiterable}
 * @param options not read by this method
 * @param stepContext not read by this method
 * @param sideInputReader not read by this method
 * @param output receiver for the per-window grouped values
 * @throws IllegalArgumentException if the values are neither a {@code Collection} nor a
 *     {@code Reiterable}
 */
@Override
@SuppressWarnings("ReferenceEquality")
public void processElement(KV<K, Iterable<WindowedValue<V>>> element, PipelineOptions options, StepContext stepContext, SideInputReader sideInputReader, OutputWindowedValue<KV<K, Iterable<V>>> output) throws Exception {
    K key = element.getKey();
    // This iterable is required to be in order of increasing timestamps
    Iterable<WindowedValue<V>> value = element.getValue();
    PeekingReiterator<WindowedValue<V>> iterator;
    if (value instanceof Collection) {
        // Collections are copied into a fresh ArrayList and iterated from index 0.
        iterator = new PeekingReiterator<>(new ListReiterator<WindowedValue<V>>(new ArrayList<WindowedValue<V>>((Collection<WindowedValue<V>>) value), 0));
    } else if (value instanceof Reiterable) {
        iterator = new PeekingReiterator<>(((Reiterable<WindowedValue<V>>) value).iterator());
    } else {
        throw new IllegalArgumentException("Input to GroupAlsoByWindowsDoFn must be a Collection or Reiterable");
    }
    // This ListMultimap is a map of window maxTimestamps to the list of active
    // windows with that maxTimestamp.
    ListMultimap<Instant, BoundedWindow> windows = ArrayListMultimap.create();
    while (iterator.hasNext()) {
        // Peek rather than consume: the element is only consumed further down, and only if
        // nothing downstream has already moved past it.
        WindowedValue<V> e = iterator.peek();
        for (BoundedWindow window : e.getWindows()) {
            // If this window is not yet active, emit a new WindowReiterable
            // corresponding to this window, starting at this element in the input Reiterable.
            if (!windows.containsEntry(window.maxTimestamp(), window)) {
                // This window was produced by strategy.getWindowFn()
                @SuppressWarnings("unchecked") W typedWindow = (W) window;
                // Iterating through the WindowReiterable may advance iterator as an optimization
                // for as long as it detects that there are no new windows.
                windows.put(window.maxTimestamp(), window);
                output.outputWindowedValue(KV.of(key, (Iterable<V>) new WindowReiterable<V>(iterator, window)), strategy.getTimestampCombiner().assign(typedWindow, e.getTimestamp()), Arrays.asList(window), PaneInfo.ON_TIME_AND_ONLY_FIRING);
            }
        }
        // Copy the iterator in case the next DoFn cached its version of the iterator instead
        // of immediately iterating through it.
        // And, only advance the iterator if the consuming operation hasn't done so.
        iterator = iterator.copy();
        // Deliberate identity comparison (hence the ReferenceEquality suppression): checks
        // whether the iterator is still positioned on the very same element object.
        if (iterator.hasNext() && iterator.peek() == e) {
            iterator.next();
        }
        // Remove all windows with maxTimestamp behind the current timestamp.
        // NOTE(review): the loop stops at the first key that is not before e's timestamp, and
        // that key has already been consumed from windowIterator; confirm the iteration order
        // of windows.keys() makes this sufficient to drop every expired window.
        Iterator<Instant> windowIterator = windows.keys().iterator();
        while (windowIterator.hasNext() && windowIterator.next().isBefore(e.getTimestamp())) {
            windowIterator.remove();
        }
    }
}
Also used : Reiterable(org.apache.beam.sdk.util.common.Reiterable) ElementByteSizeObservableIterable(org.apache.beam.sdk.util.common.ElementByteSizeObservableIterable) Instant(org.joda.time.Instant) WindowedValue(org.apache.beam.sdk.util.WindowedValue) OutputWindowedValue(org.apache.beam.runners.core.OutputWindowedValue) KV(org.apache.beam.sdk.values.KV) PeekingReiterator(org.apache.beam.runners.core.PeekingReiterator) Collection(java.util.Collection) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow)

Aggregations

Reiterable (org.apache.beam.sdk.util.common.Reiterable)2 Position (com.google.api.services.dataflow.model.Position)1 Collection (java.util.Collection)1 OutputWindowedValue (org.apache.beam.runners.core.OutputWindowedValue)1 PeekingReiterator (org.apache.beam.runners.core.PeekingReiterator)1 ReaderTestUtils.approximateSplitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition)1 ReaderTestUtils.splitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition)1 ByteArrayShufflePosition (org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition)1 NativeReader (org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader)1 ShuffleEntry (org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry)1 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)1 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)1 WindowedValue (org.apache.beam.sdk.util.WindowedValue)1 ElementByteSizeObservableIterable (org.apache.beam.sdk.util.common.ElementByteSizeObservableIterable)1 KV (org.apache.beam.sdk.values.KV)1 Instant (org.joda.time.Instant)1 Test (org.junit.Test)1