Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.
The class ChunkingShuffleBatchReader, method read.
@Override
public ShuffleBatchReader.Batch read(
    @Nullable ShufflePosition startShufflePosition,
    @Nullable ShufflePosition endShufflePosition)
    throws IOException {
  byte @Nullable [] startPosition =
      ByteArrayShufflePosition.getPosition(startShufflePosition);
  byte @Nullable [] endPosition =
      ByteArrayShufflePosition.getPosition(endShufflePosition);
  ShuffleReader.ReadChunkResult result;
  try (Closeable trackedReadState = tracker.enterState(readState)) {
    result = reader.readIncludingPosition(startPosition, endPosition);
  }
  DataInputStream input = new DataInputStream(new ByteArrayInputStream(result.chunk));
  ArrayList<ShuffleEntry> entries = new ArrayList<>();
  while (input.available() > 0) {
    entries.add(getShuffleEntry(input));
  }
  return new Batch(
      entries,
      result.nextStartPosition == null
          ? null
          : ByteArrayShufflePosition.of(result.nextStartPosition));
}
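The getShuffleEntry decoder is not shown above. The following is a minimal sketch of how such a decoder could look, assuming a length-prefixed wire layout (position, key, secondary key, value, each as a 4-byte big-endian length followed by that many bytes); the actual chunk format used by the Dataflow shuffle may differ.

// Hypothetical decoder sketch; the real getShuffleEntry and the actual chunk
// layout may differ. Assumes each field is a 4-byte big-endian length prefix
// followed by the field's bytes.
static ShuffleEntry getShuffleEntry(DataInputStream input) throws IOException {
  byte[] position = readLengthPrefixed(input);
  byte[] key = readLengthPrefixed(input);
  byte[] secondaryKey = readLengthPrefixed(input);
  byte[] value = readLengthPrefixed(input);
  return new ShuffleEntry(
      ByteArrayShufflePosition.of(position), key, secondaryKey, value);
}

private static byte[] readLengthPrefixed(DataInputStream input) throws IOException {
  int length = input.readInt(); // 4-byte big-endian length
  byte[] bytes = new byte[length];
  input.readFully(bytes); // throws EOFException on a truncated chunk
  return bytes;
}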
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.
The class GroupingShuffleReaderTest, method testGetApproximateProgress.
@Test
public void testGetApproximateProgress() throws Exception {
  // Store the positions of all KVs returned.
  List<ByteArrayShufflePosition> positionsList = new ArrayList<>();
  PipelineOptions options = PipelineOptionsFactory.create();
  BatchModeExecutionContext context =
      BatchModeExecutionContext.forTesting(options, "testStage");
  TestOperationContext operationContext = TestOperationContext.create();
  GroupingShuffleReader<Integer, Integer> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          null,
          WindowedValue.getFullCoder(
              KvCoder.of(
                  BigEndianIntegerCoder.of(),
                  IterableCoder.of(BigEndianIntegerCoder.of())),
              IntervalWindow.getCoder()),
          context,
          operationContext,
          ShuffleReadCounterFactory.INSTANCE,
          false);
  TestShuffleReader shuffleReader = new TestShuffleReader();
  final int kNumRecords = 10;
  for (int i = 0; i < kNumRecords; ++i) {
    ByteArrayShufflePosition position = fabricatePosition(i);
    byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
    positionsList.add(position);
    ShuffleEntry entry = new ShuffleEntry(position, keyByte, EMPTY_BYTE_ARRAY, keyByte);
    shuffleReader.addEntry(entry);
  }
  assertFalse(shuffleReader.isClosed());
  try (GroupingShuffleReaderIterator<Integer, Integer> iter =
      groupingShuffleReader.iterator(shuffleReader)) {
    Integer i = 0;
    for (boolean more = iter.start(); more; more = iter.advance()) {
      ApproximateReportedProgress progress =
          readerProgressToCloudProgress(iter.getProgress());
      assertNotNull(progress.getPosition().getShufflePosition());
      // Compare returned position with the expected position.
      assertEquals(
          positionsList.get(i).encodeBase64(),
          progress.getPosition().getShufflePosition());
      WindowedValue<KV<Integer, Reiterable<Integer>>> elem = iter.getCurrent();
      assertEquals(i, elem.getValue().getKey());
      i++;
    }
    assertFalse(iter.advance());
    // Cannot split since all input was consumed.
    Position proposedSplitPosition = new Position();
    String stop = encodeBase64URLSafeString(fabricatePosition(0).getPosition());
    proposedSplitPosition.setShufflePosition(stop);
    assertNull(
        iter.requestDynamicSplit(
            toDynamicSplitRequest(approximateSplitRequestAtPosition(proposedSplitPosition))));
  }
  assertTrue(shuffleReader.isClosed());
}
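The tests on this page call a fabricatePosition helper, in both one-argument and two-argument forms, that is not shown. A plausible sketch follows, assuming a position is just the shard index (optionally followed by a key hash) written big-endian; the helper in the actual Beam test may encode positions differently. Positions only need to be distinct and ordered consistently with record order.

// Hypothetical helper sketch; the real fabricatePosition may differ.
static ByteArrayShufflePosition fabricatePosition(int shard) throws IOException {
  return fabricatePosition(shard, null);
}

static ByteArrayShufflePosition fabricatePosition(int shard, byte[] key)
    throws IOException {
  ByteArrayOutputStream os = new ByteArrayOutputStream();
  DataOutputStream dos = new DataOutputStream(os);
  dos.writeInt(shard); // shard index, big-endian
  if (key != null) {
    dos.writeInt(Arrays.hashCode(key)); // disambiguate records within a shard
  }
  return ByteArrayShufflePosition.of(os.toByteArray());
}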
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.
The class GroupingShuffleReaderTest, method runTestReadFromShuffle.
private void runTestReadFromShuffle(
    List<KV<Integer, List<KV<Integer, Integer>>>> input,
    boolean sortValues,
    ValuesToRead valuesToRead)
    throws Exception {
  Coder<WindowedValue<KV<Integer, Iterable<KV<Integer, Integer>>>>> sourceElemCoder =
      WindowedValue.getFullCoder(
          KvCoder.of(
              BigEndianIntegerCoder.of(),
              IterableCoder.of(
                  KvCoder.of(BigEndianIntegerCoder.of(), BigEndianIntegerCoder.of()))),
          IntervalWindow.getCoder());
  List<ShuffleEntry> records = writeShuffleEntries(input, sortValues);
  PipelineOptions options = PipelineOptionsFactory.create();
  BatchModeExecutionContext context =
      BatchModeExecutionContext.forTesting(options, "testStage");
  GroupingShuffleReader<Integer, KV<Integer, Integer>> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          null,
          sourceElemCoder,
          context,
          TestOperationContext.create(),
          ShuffleReadCounterFactory.INSTANCE,
          sortValues);
  ExecutorTestUtils.TestReaderObserver observer =
      new ExecutorTestUtils.TestReaderObserver(groupingShuffleReader);
  TestShuffleReader shuffleReader = new TestShuffleReader();
  List<Integer> expectedSizes = new ArrayList<>();
  for (ShuffleEntry record : records) {
    expectedSizes.add(record.length());
    shuffleReader.addEntry(record);
  }
  List<KV<Integer, List<KV<Integer, Integer>>>> actual =
      runIterationOverGroupingShuffleReader(
          context, shuffleReader, groupingShuffleReader, sourceElemCoder, valuesToRead);
  List<KV<Integer, List<KV<Integer, Integer>>>> expected = new ArrayList<>();
  for (KV<Integer, List<KV<Integer, Integer>>> kvs : input) {
    Integer key = kvs.getKey();
    List<KV<Integer, Integer>> values = new ArrayList<>();
    if (valuesToRead.ordinal() >= ValuesToRead.READ_ONE_VALUE.ordinal()) {
      for (KV<Integer, Integer> value : kvs.getValue()) {
        values.add(value);
        if (valuesToRead == ValuesToRead.READ_ONE_VALUE) {
          break;
        }
      }
    }
    expected.add(KV.of(key, values));
  }
  assertEquals(expected, actual);
  assertEquals(expectedSizes, observer.getActualSizes());
}
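A caller might exercise this helper as sketched below. The concrete key/value pairs are illustrative, and READ_ALL_VALUES is an assumed constant of the ValuesToRead enum referenced above (only READ_ONE_VALUE appears in the snippet itself).

// Hypothetical caller sketch; values and the READ_ALL_VALUES constant are
// illustrative assumptions, not taken from the snippet above.
@Test
public void testReadFromShuffleReadingAllValues() throws Exception {
  runTestReadFromShuffle(
      Arrays.asList(
          KV.of(1, Arrays.asList(KV.of(1, 11), KV.of(2, 12))),
          KV.of(2, Arrays.asList(KV.of(1, 21))),
          KV.of(3, Arrays.asList(KV.of(1, 31), KV.of(2, 32), KV.of(3, 33)))),
      false /* sortValues */,
      ValuesToRead.READ_ALL_VALUES);
}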
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.
The class GroupingShuffleReaderTest, method testShuffleReadCounterMultipleExecutingSteps.
@Test
public void testShuffleReadCounterMultipleExecutingSteps() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  options
      .as(DataflowPipelineDebugOptions.class)
      .setExperiments(Lists.newArrayList(Experiment.IntertransformIO.getName()));
  BatchModeExecutionContext context =
      BatchModeExecutionContext.forTesting(options, "testStage");
  final int kFirstShard = 0;
  TestShuffleReader shuffleReader = new TestShuffleReader();
  final int kNumRecords = 10;
  for (int i = 0; i < kNumRecords; ++i) {
    byte[] key = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
    shuffleReader.addEntry(
        new ShuffleEntry(fabricatePosition(kFirstShard, key), key, EMPTY_BYTE_ARRAY, key));
  }
  TestShuffleReadCounterFactory shuffleReadCounterFactory =
      new TestShuffleReadCounterFactory();
  // Note that TestShuffleReader start/end positions are in the space of keys,
  // not the space of positions (TODO: should probably always use positions
  // instead).
  String stop = encodeBase64URLSafeString(fabricatePosition(kNumRecords).getPosition());
  TestOperationContext operationContext = TestOperationContext.create();
  GroupingShuffleReader<Integer, Integer> groupingShuffleReader =
      new GroupingShuffleReader<>(
          options,
          null,
          null,
          stop,
          WindowedValue.getFullCoder(
              KvCoder.of(
                  BigEndianIntegerCoder.of(),
                  IterableCoder.of(BigEndianIntegerCoder.of())),
              IntervalWindow.getCoder()),
          context,
          operationContext,
          shuffleReadCounterFactory,
          false);
  assertFalse(shuffleReader.isClosed());
  try (GroupingShuffleReaderIterator<Integer, Integer> iter =
      groupingShuffleReader.iterator(shuffleReader)) {
    // Poke the iterator so we can test dynamic splitting.
    assertTrue(iter.start());
    int numRecordsReturned = 1; // including start() above
    for (; iter.advance(); ++numRecordsReturned) {
      if (numRecordsReturned > 5) {
        setCurrentExecutionState(MOCK_ORIGINAL_NAME_FOR_EXECUTING_STEP2);
      }
      iter.getCurrent().getValue(); // result ignored
    }
    assertEquals(kNumRecords, numRecordsReturned);
  }
  assertTrue(shuffleReader.isClosed());
  Map<String, Long> expectedReadBytesMap = new HashMap<>();
  expectedReadBytesMap.put(MOCK_ORIGINAL_NAME_FOR_EXECUTING_STEP1, 48L);
  expectedReadBytesMap.put(MOCK_ORIGINAL_NAME_FOR_EXECUTING_STEP2, 32L);
  expectShuffleReadCounterEquals(shuffleReadCounterFactory, expectedReadBytesMap);
}
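The expected 48- and 32-byte totals can be reconstructed as below. This arithmetic assumes each record is charged as its 4-byte big-endian key plus its identical 4-byte value, with six records attributed to the first executing step and four to the second; that split is an inference from the numbers above, not documented counter behavior.

// Hedged reconstruction of the expected per-step byte totals above.
static void checkExpectedReadBytes() {
  int bytesPerRecord = 4 + 4; // BigEndianIntegerCoder key + identical value
  assertEquals(48L, 6L * bytesPerRecord); // records consumed under STEP1
  assertEquals(32L, 4L * bytesPerRecord); // records consumed under STEP2
}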
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.
The class ShuffleSinkTest, method runTestWriteGroupingSortingShuffleSink.
void runTestWriteGroupingSortingShuffleSink(
    List<KV<Integer, KV<String, Integer>>> expected) throws Exception {
  BatchModeExecutionContext executionContext =
      BatchModeExecutionContext.forTesting(PipelineOptionsFactory.create(), "STAGE");
  ShuffleSink<KV<Integer, KV<String, Integer>>> shuffleSink =
      new ShuffleSink<>(
          PipelineOptionsFactory.create(),
          null,
          ShuffleSink.ShuffleKind.GROUP_KEYS_AND_SORT_VALUES,
          WindowedValue.getFullCoder(
              KvCoder.of(
                  BigEndianIntegerCoder.of(),
                  KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())),
              new GlobalWindows().windowCoder()),
          executionContext,
          TestOperationContext.create());
  TestShuffleWriter shuffleWriter = new TestShuffleWriter();
  List<Long> actualSizes = new ArrayList<>();
  try (Sink.SinkWriter<WindowedValue<KV<Integer, KV<String, Integer>>>> shuffleSinkWriter =
      shuffleSink.writer(shuffleWriter, "dataset")) {
    for (KV<Integer, KV<String, Integer>> kv : expected) {
      actualSizes.add(shuffleSinkWriter.add(WindowedValue.valueInGlobalWindow(kv)));
    }
  }
  List<ShuffleEntry> records = shuffleWriter.getRecords();
  List<KV<Integer, KV<String, Integer>>> actual = new ArrayList<>();
  for (ShuffleEntry record : records) {
    byte[] keyBytes = record.getKey();
    byte[] valueBytes = record.getValue();
    byte[] sortKeyBytes = record.getSecondaryKey();
    Integer key = CoderUtils.decodeFromByteArray(BigEndianIntegerCoder.of(), keyBytes);
    ByteArrayInputStream bais = new ByteArrayInputStream(sortKeyBytes);
    String sortKey = StringUtf8Coder.of().decode(bais);
    Integer sortValue = CoderUtils.decodeFromByteArray(BigEndianIntegerCoder.of(), valueBytes);
    actual.add(KV.of(key, KV.of(sortKey, sortValue)));
  }
  Assert.assertEquals(expected, actual);
  Assert.assertEquals(shuffleWriter.getSizes(), actualSizes);
}
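A caller might drive this helper as sketched below; the concrete elements are illustrative. Keeping the input already ordered by key and secondary (sort) key makes the round-trip equality check order-stable under GROUP_KEYS_AND_SORT_VALUES.

// Hypothetical caller sketch; the element values are illustrative.
@Test
public void testWriteGroupingSortingShuffleSink() throws Exception {
  runTestWriteGroupingSortingShuffleSink(
      Arrays.asList(
          KV.of(1, KV.of("a", 11)),
          KV.of(1, KV.of("b", 12)),
          KV.of(2, KV.of("a", 21))));
}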