Use of org.apache.beam.sdk.util.common.Reiterable in project beam by Apache.
The method testReadFromShuffleAndDynamicSplit of the class GroupingShuffleReaderTest.
/**
 * Checks that a {@code GroupingShuffleReader} iterator accepts a dynamic split at a shard
 * boundary and afterwards returns only the records that precede the split point.
 *
 * <p>The test loads two shards of {@code kNumRecords} singleton groups each, starts the iterator,
 * verifies that a split request carrying an empty {@code Position} yields {@code null}, and then
 * splits at the start of the second shard. It finally asserts that exactly the first shard was
 * consumed, that the underlying shuffle reader was closed, and that the bytes-read counter
 * reports 80 bytes.
 *
 * @throws Exception if encoding a key or reading from the shuffle reader fails
 */
@Test
public void testReadFromShuffleAndDynamicSplit() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BatchModeExecutionContext context = BatchModeExecutionContext.forTesting(options, "testStage");
  TestOperationContext operationContext = TestOperationContext.create();
  GroupingShuffleReader<Integer, Integer> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          null,
          WindowedValue.getFullCoder(
              KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())),
              IntervalWindow.getCoder()),
          context,
          operationContext,
          ShuffleReadCounterFactory.INSTANCE,
          false);
  groupingShuffleReader.perOperationPerDatasetBytesCounter =
      operationContext
          .counterFactory()
          .longSum(CounterName.named("dax-shuffle-test-wf-read-bytes"));

  TestShuffleReader shuffleReader = new TestShuffleReader();
  final int kNumRecords = 10;
  final int kFirstShard = 0;
  final int kSecondShard = 1;

  // Set up two shards with kNumRecords each; keys are unique (hence groups of values for the
  // same key are singletons), therefore each record comes with a unique position constructed.
  for (int i = 0; i < kNumRecords; ++i) {
    byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
    ShuffleEntry entry =
        new ShuffleEntry(
            fabricatePosition(kFirstShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
    shuffleReader.addEntry(entry);
  }
  for (int i = kNumRecords; i < 2 * kNumRecords; ++i) {
    byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
    ShuffleEntry entry =
        new ShuffleEntry(
            fabricatePosition(kSecondShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
    shuffleReader.addEntry(entry);
  }

  // Counts the records read so far; it doubles as the expected key of the current record.
  int i = 0;
  assertFalse(shuffleReader.isClosed());
  try (GroupingShuffleReaderIterator<Integer, Integer> iter =
      groupingShuffleReader.iterator(shuffleReader)) {
    // Poke the iterator so we can test dynamic splitting.
    assertTrue(iter.start());
    ++i;

    // A split request at an empty Position is expected to be rejected.
    assertNull(iter.requestDynamicSplit(splitRequestAtPosition(new Position())));

    // Split at the shard boundary
    NativeReader.DynamicSplitResult dynamicSplitResult =
        iter.requestDynamicSplit(
            splitRequestAtPosition(makeShufflePosition(kSecondShard, null)));
    assertNotNull(dynamicSplitResult);
    assertEquals(
        encodeBase64URLSafeString(fabricatePosition(kSecondShard).getPosition()),
        positionFromSplitResult(dynamicSplitResult).getShufflePosition());

    for (; iter.advance(); ++i) {
      // iter.getCurrent() is supposed to be side-effect-free and give the same result if called
      // repeatedly. Test that this is indeed the case.
      iter.getCurrent();
      iter.getCurrent();

      KV<Integer, Reiterable<Integer>> elem = iter.getCurrent().getValue();
      int key = elem.getKey();
      // JUnit's contract is assertEquals(expected, actual).
      assertEquals(i, key);

      Reiterable<Integer> valuesIterable = elem.getValue();
      Reiterator<Integer> valuesIterator = valuesIterable.iterator();

      int j = 0;
      while (valuesIterator.hasNext()) {
        // hasNext() is called repeatedly on purpose: it must not consume anything.
        assertTrue(valuesIterator.hasNext());
        assertTrue(valuesIterator.hasNext());

        int value = valuesIterator.next();
        assertEquals(i, value);
        ++j;
      }
      assertFalse(valuesIterator.hasNext());
      assertFalse(valuesIterator.hasNext());
      // Every key was written exactly once, so each group holds a single value.
      assertEquals(1, j);
    }
    assertFalse(iter.advance());
  }
  assertTrue(shuffleReader.isClosed());
  // Only the first shard may have been read after the split.
  assertEquals(kNumRecords, i);
  // There are 10 Shuffle records that each encode an integer key (4 bytes) and integer value (4
  // bytes). We therefore expect to read 80 bytes.
  assertEquals(
      80L, (long) groupingShuffleReader.perOperationPerDatasetBytesCounter.getAggregate());
}
Use of org.apache.beam.sdk.util.common.Reiterable in project beam by Apache.
The method processElement of the class BatchGroupAlsoByWindowViaIteratorsFn.
/**
 * Groups the values of a single key by window and emits one output per newly seen window.
 *
 * <p>The values are wrapped in a {@code PeekingReiterator}. For every element at the head of
 * that iterator, each of its windows that is not yet recorded as active is recorded and emitted
 * as {@code KV.of(key, new WindowReiterable<V>(iterator, window))}; the output timestamp is
 * obtained from {@code strategy.getTimestampCombiner()} and the pane is
 * {@code PaneInfo.ON_TIME_AND_ONLY_FIRING}.
 *
 * @param element the key together with its windowed values; the values are required to be in
 *     order of increasing timestamps and must be a {@code Collection} or a {@code Reiterable}
 * @param options not read by this method body
 * @param stepContext not read by this method body
 * @param sideInputReader not read by this method body
 * @param output receiver of the per-window grouped values
 * @throws IllegalArgumentException if the values are neither a {@code Collection} nor a
 *     {@code Reiterable}
 * @throws Exception declared by the signature; nothing visible in this body throws it directly
 */
@Override
@SuppressWarnings("ReferenceEquality")
public void processElement(KV<K, Iterable<WindowedValue<V>>> element, PipelineOptions options, StepContext stepContext, SideInputReader sideInputReader, OutputWindowedValue<KV<K, Iterable<V>>> output) throws Exception {
K key = element.getKey();
// This iterable is required to be in order of increasing timestamps
Iterable<WindowedValue<V>> value = element.getValue();
PeekingReiterator<WindowedValue<V>> iterator;
if (value instanceof Collection) {
// A Collection is first copied into a fresh ArrayList and then wrapped in a ListReiterator
// (second argument 0 is presumably the start index; confirm against ListReiterator).
iterator = new PeekingReiterator<>(new ListReiterator<WindowedValue<V>>(new ArrayList<WindowedValue<V>>((Collection<WindowedValue<V>>) value), 0));
} else if (value instanceof Reiterable) {
// A Reiterable already supplies a Reiterator, so it is wrapped directly.
iterator = new PeekingReiterator<>(((Reiterable<WindowedValue<V>>) value).iterator());
} else {
throw new IllegalArgumentException("Input to GroupAlsoByWindowsDoFn must be a Collection or Reiterable");
}
// This ListMultimap is a map of window maxTimestamps to the list of active
// windows with that maxTimestamp.
ListMultimap<Instant, BoundedWindow> windows = ArrayListMultimap.create();
while (iterator.hasNext()) {
// Peek rather than consume: the WindowReiterable created below is handed this same iterator.
WindowedValue<V> e = iterator.peek();
for (BoundedWindow window : e.getWindows()) {
// If this window is not already in the active set, emit a new WindowReiterable
// corresponding to this window, starting at this element in the input Reiterable.
if (!windows.containsEntry(window.maxTimestamp(), window)) {
// This window was produced by strategy.getWindowFn()
@SuppressWarnings("unchecked") W typedWindow = (W) window;
// Iterating through the WindowReiterable may advance iterator as an optimization
// for as long as it detects that there are no new windows.
windows.put(window.maxTimestamp(), window);
output.outputWindowedValue(KV.of(key, (Iterable<V>) new WindowReiterable<V>(iterator, window)), strategy.getTimestampCombiner().assign(typedWindow, e.getTimestamp()), Arrays.asList(window), PaneInfo.ON_TIME_AND_ONLY_FIRING);
}
}
// Copy the iterator in case the next DoFn cached its version of the iterator instead
// of immediately iterating through it.
// And, only advance the iterator if the consuming operation hasn't done so.
iterator = iterator.copy();
// Reference comparison on purpose (see the ReferenceEquality suppression on the method):
// the head still being the very same object means nothing downstream consumed it.
if (iterator.hasNext() && iterator.peek() == e) {
iterator.next();
}
// Remove all windows with maxTimestamp behind the current timestamp.
// NOTE(review): the loop stops at the first key that is not before e's timestamp, so it only
// prunes a leading run in windows.keys() iteration order; confirm that order is as intended.
Iterator<Instant> windowIterator = windows.keys().iterator();
while (windowIterator.hasNext() && windowIterator.next().isBefore(e.getTimestamp())) {
windowIterator.remove();
}
}
}
Aggregations