Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.
In class UngroupedShuffleReaderTest, method runTestReadFromShuffle:
/**
 * Writes {@code expected} through an UNGROUPED {@link ShuffleSink}, replays the captured shuffle
 * entries through an {@link UngroupedShuffleReader}, and checks that values, timestamps, windows
 * and reported sizes match on both sides.
 *
 * @param expected the integers to write and then expect back, in the same order
 * @throws Exception if writing to or reading from the test shuffle fails
 */
void runTestReadFromShuffle(List<Integer> expected) throws Exception {
  Coder<WindowedValue<Integer>> elemCoder =
      WindowedValue.getFullCoder(BigEndianIntegerCoder.of(), IntervalWindow.getCoder());
  BatchModeExecutionContext executionContext =
      BatchModeExecutionContext.forTesting(PipelineOptionsFactory.create(), "STAGE");

  // ---- Write side: push every value through an UNGROUPED ShuffleSink. ----
  ShuffleSink<Integer> sink =
      new ShuffleSink<>(
          PipelineOptionsFactory.create(),
          null,
          ShuffleSink.ShuffleKind.UNGROUPED,
          elemCoder,
          executionContext,
          TestOperationContext.create());
  TestShuffleWriter capturingWriter = new TestShuffleWriter();
  List<Long> sizesReportedBySink = new ArrayList<>();
  try (Sink.SinkWriter<WindowedValue<Integer>> sinkWriter =
      sink.writer(capturingWriter, "dataset")) {
    for (Integer value : expected) {
      WindowedValue<Integer> windowedValue =
          WindowedValue.of(value, timestamp, Lists.newArrayList(window), PaneInfo.NO_FIRING);
      sizesReportedBySink.add(sinkWriter.add(windowedValue));
    }
  }

  // One shuffle record per input element, and the sizes returned by add() must agree with
  // the sizes the test writer recorded.
  List<ShuffleEntry> writtenRecords = capturingWriter.getRecords();
  Assert.assertEquals(expected.size(), writtenRecords.size());
  Assert.assertEquals(capturingWriter.getSizes(), sizesReportedBySink);

  // ---- Read side: replay the captured records through an UngroupedShuffleReader. ----
  UngroupedShuffleReader<WindowedValue<Integer>> reader =
      new UngroupedShuffleReader<>(
          PipelineOptionsFactory.create(),
          null,
          null,
          null,
          elemCoder,
          executionContext,
          TestOperationContext.create());
  // The observer is attached before iteration starts so it sees every element read.
  ExecutorTestUtils.TestReaderObserver sizeObserver =
      new ExecutorTestUtils.TestReaderObserver(reader);

  TestShuffleReader entrySource = new TestShuffleReader();
  List<Integer> expectedSizes = new ArrayList<>();
  for (ShuffleEntry entry : writtenRecords) {
    expectedSizes.add(entry.length());
    entrySource.addEntry(entry);
  }

  List<Integer> actual = new ArrayList<>();
  Assert.assertFalse(entrySource.isClosed());
  try (UngroupedShuffleReaderIterator<WindowedValue<Integer>> iter =
      reader.iterator(entrySource)) {
    boolean hasElement = iter.start();
    while (hasElement) {
      WindowedValue<Integer> current = iter.getCurrent();
      Assert.assertEquals(timestamp, current.getTimestamp());
      Assert.assertEquals(Lists.newArrayList(window), current.getWindows());
      actual.add(current.getValue());
      hasElement = iter.advance();
    }

    // Once exhausted, advance() keeps returning false and getCurrent() must throw.
    Assert.assertFalse(iter.advance());
    try {
      iter.getCurrent();
      Assert.fail("should have failed");
    } catch (NoSuchElementException expectedException) {
      // As expected.
    }
  }

  // Closing the iterator must close the underlying shuffle reader.
  Assert.assertTrue(entrySource.isClosed());
  Assert.assertEquals(expected, actual);
  Assert.assertEquals(expectedSizes, sizeObserver.getActualSizes());
}
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.
In class GroupingShuffleReaderTest, method testReadFromShuffleDataAndFailToSplit:
/**
 * Reads two records through a {@link GroupingShuffleReader} configured with a stop position and
 * checks that each dynamic split request issued along the way is rejected (returns null).
 */
@Test
public void testReadFromShuffleDataAndFailToSplit() throws Exception {
  final int kFirstShard = 0;
  final int kNumRecords = 2;

  PipelineOptions options = PipelineOptionsFactory.create();
  BatchModeExecutionContext context =
      BatchModeExecutionContext.forTesting(options, "testStage");

  TestShuffleReader shuffleReader = new TestShuffleReader();
  for (int recordIndex = 0; recordIndex < kNumRecords; recordIndex++) {
    byte[] encodedKey =
        CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), recordIndex);
    shuffleReader.addEntry(
        new ShuffleEntry(
            fabricatePosition(kFirstShard, encodedKey),
            encodedKey,
            EMPTY_BYTE_ARRAY,
            encodedKey));
  }

  // Note that TestShuffleReader start/end positions are in the
  // space of keys not the positions (TODO: should probably always
  // use positions instead).
  String stop = encodeBase64URLSafeString(fabricatePosition(kNumRecords).getPosition());

  TestOperationContext operationContext = TestOperationContext.create();
  GroupingShuffleReader<Integer, Integer> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          stop,
          WindowedValue.getFullCoder(
              KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())),
              IntervalWindow.getCoder()),
          context,
          operationContext,
          ShuffleReadCounterFactory.INSTANCE,
          false);

  assertFalse(shuffleReader.isClosed());
  try (GroupingShuffleReaderIterator<Integer, Integer> iter =
      groupingShuffleReader.iterator(shuffleReader)) {
    // Poke the iterator so we can test dynamic splitting.
    assertTrue(iter.start());

    // Cannot split since the value provided is past the current stop position.
    assertNull(
        iter.requestDynamicSplit(
            splitRequestAtPosition(makeShufflePosition(kNumRecords + 1, null))));

    // Cannot split since the split position is identical with the position of the record
    // that was just returned.
    byte[] firstKey = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), 0);
    assertNull(
        iter.requestDynamicSplit(
            splitRequestAtPosition(makeShufflePosition(kFirstShard, firstKey))));

    // Cannot split since the requested split position comes before current position.
    assertNull(
        iter.requestDynamicSplit(
            splitRequestAtPosition(makeShufflePosition(kFirstShard, null))));

    // Starts at 1 to account for the record returned by start() above.
    int numRecordsReturned = 1;
    while (iter.advance()) {
      // The value itself is ignored; only the number of records matters here.
      iter.getCurrent().getValue();
      ++numRecordsReturned;
    }
    assertEquals(kNumRecords, numRecordsReturned);

    // Cannot split since all input was consumed.
    assertNull(
        iter.requestDynamicSplit(
            splitRequestAtPosition(makeShufflePosition(kFirstShard, null))));
  }
  // Closing the iterator must close the underlying shuffle reader.
  assertTrue(shuffleReader.isClosed());
}
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.
In class GroupingShuffleReaderTest, method runTestBytesReadCounterForOptions:
/**
 * Loads {@code input} into a test shuffle reader, iterates it through a {@link
 * GroupingShuffleReader}, and checks the number of bytes reported as read.
 *
 * <p>Which counter is checked depends on whether the {@code IntertransformIO} experiment is
 * enabled in {@code options}: the shuffle read counter factory if it is, otherwise the reader's
 * per-operation per-dataset bytes counter.
 *
 * @param options pipeline options used for the execution context, the reader and the experiment
 *     lookup
 * @param input keyed groups of values to encode as shuffle entries
 * @param useSecondaryKey passed through to entry encoding and to the reader
 * @param valuesToRead passed through to the iteration helper
 * @param expectedReadBytes the byte count the selected counter must report
 * @throws Exception if encoding or reading fails
 */
private void runTestBytesReadCounterForOptions(
    PipelineOptions options,
    List<KV<Integer, List<KV<Integer, Integer>>>> input,
    boolean useSecondaryKey,
    ValuesToRead valuesToRead,
    long expectedReadBytes)
    throws Exception {
  // Create a shuffle reader with the shuffle values provided as input.
  List<ShuffleEntry> encodedEntries = writeShuffleEntries(input, useSecondaryKey);
  TestShuffleReader shuffleReader = new TestShuffleReader();
  for (ShuffleEntry entry : encodedEntries) {
    shuffleReader.addEntry(entry);
  }

  TestShuffleReadCounterFactory counterFactory = new TestShuffleReadCounterFactory();
  Coder<WindowedValue<KV<Integer, Iterable<KV<Integer, Integer>>>>> sourceElemCoder =
      WindowedValue.getFullCoder(
          KvCoder.of(
              BigEndianIntegerCoder.of(),
              IterableCoder.of(
                  KvCoder.of(BigEndianIntegerCoder.of(), BigEndianIntegerCoder.of()))),
          IntervalWindow.getCoder());

  // Read from shuffle with GroupingShuffleReader.
  BatchModeExecutionContext context =
      BatchModeExecutionContext.forTesting(options, "testStage");
  TestOperationContext operationContext = TestOperationContext.create();
  GroupingShuffleReader<Integer, KV<Integer, Integer>> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          null,
          sourceElemCoder,
          context,
          operationContext,
          counterFactory,
          useSecondaryKey);
  // Install a counter on the reader so its aggregate can be inspected after the read.
  groupingShuffleReader.perOperationPerDatasetBytesCounter =
      operationContext
          .counterFactory()
          .longSum(CounterName.named("dax-shuffle-test-wf-read-bytes"));

  runIterationOverGroupingShuffleReader(
      context, shuffleReader, groupingShuffleReader, sourceElemCoder, valuesToRead);

  if (ExperimentContext.parseFrom(options).isEnabled(Experiment.IntertransformIO)) {
    expectShuffleReadCounterEquals(counterFactory, expectedReadBytes);
  } else {
    assertEquals(
        expectedReadBytes,
        (long) groupingShuffleReader.perOperationPerDatasetBytesCounter.getAggregate());
  }
}
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.
In class GroupingShuffleReaderTest, method testConsumedParallelism:
/**
 * Tracks the consumed parallelism reported by a {@link GroupingShuffleReader} iterator before it
 * starts, after it starts, across a dynamic split, and while advancing to the end of the
 * (post-split) range.
 */
@Test
public void testConsumedParallelism() throws Exception {
  final int kFirstShard = 0;
  final int kNumRecords = 5;

  PipelineOptions options = PipelineOptionsFactory.create();
  BatchModeExecutionContext context =
      BatchModeExecutionContext.forTesting(options, "testStage");

  TestShuffleReader shuffleReader = new TestShuffleReader();
  for (int recordIndex = 0; recordIndex < kNumRecords; recordIndex++) {
    byte[] encodedKey =
        CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), recordIndex);
    shuffleReader.addEntry(
        new ShuffleEntry(
            fabricatePosition(kFirstShard, recordIndex),
            encodedKey,
            EMPTY_BYTE_ARRAY,
            encodedKey));
  }

  TestOperationContext operationContext = TestOperationContext.create();
  GroupingShuffleReader<Integer, Integer> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          null,
          WindowedValue.getFullCoder(
              KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())),
              IntervalWindow.getCoder()),
          context,
          operationContext,
          ShuffleReadCounterFactory.INSTANCE,
          false);

  assertFalse(shuffleReader.isClosed());
  try (GroupingShuffleReaderIterator<Integer, Integer> iter =
      groupingShuffleReader.iterator(shuffleReader)) {
    // Iterator hasn't started; consumed parallelism is 0.
    assertEquals(0.0, consumedParallelismFromProgress(iter.getProgress()), 0);

    // The only way to set a stop *position* in tests is via a split, and the iterator
    // has to be started before a split can be requested.
    // Should return entry at key 0.
    assertTrue(iter.start());

    // Iterator just started; consumed parallelism is 0.
    assertEquals(
        0.0,
        readerProgressToCloudProgress(iter.getProgress()).getConsumedParallelism().getValue(),
        0);

    // Split right after key 2, making key 2 the last key of the remaining range.
    assertNotNull(
        iter.requestDynamicSplit(
            splitRequestAtPosition(
                makeShufflePosition(
                    fabricatePosition(kFirstShard, 2).immediateSuccessor().getPosition()))));

    // Split does not affect consumed parallelism; consumed parallelism is still 0.
    assertEquals(0.0, consumedParallelismFromProgress(iter.getProgress()), 0);

    // Should return entry at key 1.
    assertTrue(iter.advance());
    assertEquals(1.0, consumedParallelismFromProgress(iter.getProgress()), 0);

    // Should return entry at key 2 (last key, because the stop position
    // is its immediate successor.) Consumed parallelism increments by one to 2.
    assertTrue(iter.advance());
    assertEquals(2.0, consumedParallelismFromProgress(iter.getProgress()), 0);

    // Iterator advanced by one and consumes one more split point (total consumed: 3).
    assertFalse(iter.advance());
    assertEquals(3.0, consumedParallelismFromProgress(iter.getProgress()), 0);
  }
  // Closing the iterator must close the underlying shuffle reader.
  assertTrue(shuffleReader.isClosed());
}
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.
In class GroupingShuffleReaderTest, method testReadFromShuffleAndDynamicSplit:
/**
 * Reads from a two-shard test shuffle, dynamically splits at the boundary between the shards
 * after the first record, and checks that only the first shard's records are then returned and
 * that the bytes-read counter matches.
 */
@Test
public void testReadFromShuffleAndDynamicSplit() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BatchModeExecutionContext context =
      BatchModeExecutionContext.forTesting(options, "testStage");
  TestOperationContext operationContext = TestOperationContext.create();
  GroupingShuffleReader<Integer, Integer> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          null,
          WindowedValue.getFullCoder(
              KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())),
              IntervalWindow.getCoder()),
          context,
          operationContext,
          ShuffleReadCounterFactory.INSTANCE,
          false);
  // Install a counter on the reader so the bytes read can be checked at the end.
  groupingShuffleReader.perOperationPerDatasetBytesCounter =
      operationContext
          .counterFactory()
          .longSum(CounterName.named("dax-shuffle-test-wf-read-bytes"));

  TestShuffleReader shuffleReader = new TestShuffleReader();
  final int kNumRecords = 10;
  final int kFirstShard = 0;
  final int kSecondShard = 1;

  // Two shards with kNumRecords entries each. Every key is distinct, so each record is
  // added with its own fabricated position.
  for (int i = 0; i < kNumRecords; ++i) {
    byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
    ShuffleEntry entry =
        new ShuffleEntry(
            fabricatePosition(kFirstShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
    shuffleReader.addEntry(entry);
  }
  for (int i = kNumRecords; i < 2 * kNumRecords; ++i) {
    byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
    ShuffleEntry entry =
        new ShuffleEntry(
            fabricatePosition(kSecondShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
    shuffleReader.addEntry(entry);
  }

  // Number of records returned so far; also the key expected from the next record.
  int i = 0;
  assertFalse(shuffleReader.isClosed());
  try (GroupingShuffleReaderIterator<Integer, Integer> iter =
      groupingShuffleReader.iterator(shuffleReader)) {
    // Poke the iterator so we can test dynamic splitting.
    assertTrue(iter.start());
    ++i;

    // A split request carrying an empty Position is rejected.
    assertNull(iter.requestDynamicSplit(splitRequestAtPosition(new Position())));

    // Split at the shard boundary.
    NativeReader.DynamicSplitResult dynamicSplitResult =
        iter.requestDynamicSplit(
            splitRequestAtPosition(makeShufflePosition(kSecondShard, null)));
    assertNotNull(dynamicSplitResult);
    assertEquals(
        encodeBase64URLSafeString(fabricatePosition(kSecondShard).getPosition()),
        positionFromSplitResult(dynamicSplitResult).getShufflePosition());

    for (; iter.advance(); ++i) {
      // iter.getCurrent() is supposed to be side-effect-free and give the same result if called
      // repeatedly. Test that this is indeed the case.
      iter.getCurrent();
      iter.getCurrent();

      KV<Integer, Reiterable<Integer>> elem = iter.getCurrent().getValue();
      int key = elem.getKey();
      // JUnit's contract is assertEquals(expected, actual); i is the expected key.
      assertEquals(i, key);

      Reiterable<Integer> valuesIterable = elem.getValue();
      Reiterator<Integer> valuesIterator = valuesIterable.iterator();

      int j = 0;
      while (valuesIterator.hasNext()) {
        // hasNext() is called repeatedly to check that it does not consume anything.
        assertTrue(valuesIterator.hasNext());
        assertTrue(valuesIterator.hasNext());

        int value = valuesIterator.next();
        assertEquals(i, value);
        ++j;
      }
      assertFalse(valuesIterator.hasNext());
      assertFalse(valuesIterator.hasNext());
      // Keys are unique, so every group holds exactly one value.
      assertEquals(1, j);
    }
    assertFalse(iter.advance());
  }
  // Closing the iterator must close the underlying shuffle reader.
  assertTrue(shuffleReader.isClosed());
  // Only the first shard's records are returned after the split.
  assertEquals(kNumRecords, i);
  // There are 10 Shuffle records that each encode an integer key (4 bytes) and integer value (4
  // bytes). We therefore expect to read 80 bytes.
  assertEquals(
      80L, (long) groupingShuffleReader.perOperationPerDatasetBytesCounter.getAggregate());
}
Aggregations