Search in sources:

Example 6 with ShuffleEntry

Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.

Class ChunkingShuffleBatchReader, method read.

/**
 * Reads a single chunk from the underlying shuffle reader and decodes it into a batch of entries.
 *
 * @param startShufflePosition position whose byte form is passed as the start of the read; may be null
 * @param endShufflePosition position whose byte form is passed as the end of the read; may be null
 * @return a batch with every entry decoded from the chunk, plus the next start position (null when
 *     the read result carries none)
 * @throws IOException if the read or the decoding of the chunk fails
 */
@Override
public ShuffleBatchReader.Batch read(@Nullable ShufflePosition startShufflePosition, @Nullable ShufflePosition endShufflePosition) throws IOException {
    byte @Nullable [] startBytes = ByteArrayShufflePosition.getPosition(startShufflePosition);
    byte @Nullable [] endBytes = ByteArrayShufflePosition.getPosition(endShufflePosition);

    // Only the call into the underlying reader happens while the tracker is in the read state.
    ShuffleReader.ReadChunkResult chunkResult;
    try (Closeable ignoredState = tracker.enterState(readState)) {
        chunkResult = reader.readIncludingPosition(startBytes, endBytes);
    }

    // Decode entries back to back until the chunk's bytes are used up.
    ArrayList<ShuffleEntry> decoded = new ArrayList<>();
    DataInputStream chunkStream = new DataInputStream(new ByteArrayInputStream(chunkResult.chunk));
    while (true) {
        if (chunkStream.available() <= 0) {
            break;
        }
        decoded.add(getShuffleEntry(chunkStream));
    }

    return new Batch(decoded, chunkResult.nextStartPosition != null ? ByteArrayShufflePosition.of(chunkResult.nextStartPosition) : null);
}
Also used : ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) ByteArrayInputStream(java.io.ByteArrayInputStream) Closeable(java.io.Closeable) ArrayList(java.util.ArrayList) DataInputStream(java.io.DataInputStream) Nullable(org.checkerframework.checker.nullness.qual.Nullable)

Example 7 with ShuffleEntry

Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.

Class GroupingShuffleReaderTest, method testGetApproximateProgress.

/**
 * Verifies that, for every element read, the reported progress position equals the position of the
 * corresponding shuffle entry, and that a dynamic split request is rejected once all input has
 * been consumed.
 */
@Test
public void testGetApproximateProgress() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    BatchModeExecutionContext context = BatchModeExecutionContext.forTesting(options, "testStage");
    TestOperationContext operationContext = TestOperationContext.create();
    Coder<WindowedValue<KV<Integer, Iterable<Integer>>>> elemCoder = WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder());
    GroupingShuffleReader<Integer, Integer> groupingShuffleReader = new GroupingShuffleReader<>(options, null, null, null, elemCoder, context, operationContext, ShuffleReadCounterFactory.INSTANCE, false);

    // Remember the position of each entry, in the order the entries are added to the reader.
    List<ByteArrayShufflePosition> expectedPositions = new ArrayList<>();
    TestShuffleReader shuffleReader = new TestShuffleReader();
    final int recordCount = 10;
    for (int index = 0; index < recordCount; index++) {
        ByteArrayShufflePosition entryPosition = fabricatePosition(index);
        byte[] encodedKey = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), index);
        expectedPositions.add(entryPosition);
        // The encoded key doubles as the entry's value; the secondary key is empty.
        shuffleReader.addEntry(new ShuffleEntry(entryPosition, encodedKey, EMPTY_BYTE_ARRAY, encodedKey));
    }
    assertFalse(shuffleReader.isClosed());

    try (GroupingShuffleReaderIterator<Integer, Integer> iter = groupingShuffleReader.iterator(shuffleReader)) {
        // Kept boxed so the key comparison below resolves to assertEquals(Object, Object).
        Integer expectedKey = 0;
        boolean hasCurrent = iter.start();
        while (hasCurrent) {
            ApproximateReportedProgress reported = readerProgressToCloudProgress(iter.getProgress());
            assertNotNull(reported.getPosition().getShufflePosition());
            // The reported position must be the one recorded for this entry.
            assertEquals(expectedPositions.get(expectedKey).encodeBase64(), reported.getPosition().getShufflePosition());
            assertEquals(expectedKey, iter.getCurrent().getValue().getKey());
            expectedKey++;
            hasCurrent = iter.advance();
        }
        assertFalse(iter.advance());

        // With all input consumed, a split request must be refused.
        Position splitAtFirstRecord = new Position();
        splitAtFirstRecord.setShufflePosition(encodeBase64URLSafeString(fabricatePosition(0).getPosition()));
        assertNull(iter.requestDynamicSplit(toDynamicSplitRequest(approximateSplitRequestAtPosition(splitAtFirstRecord))));
    }
    assertTrue(shuffleReader.isClosed());
}
Also used : ByteArrayShufflePosition(org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition) ReaderTestUtils.approximateSplitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition) Position(com.google.api.services.dataflow.model.Position) ReaderTestUtils.splitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition) ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Base64.encodeBase64URLSafeString(com.google.api.client.util.Base64.encodeBase64URLSafeString) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ApproximateReportedProgress(com.google.api.services.dataflow.model.ApproximateReportedProgress) Test(org.junit.Test)

Example 8 with ShuffleEntry

Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.

Class GroupingShuffleReaderTest, method runTestReadFromShuffle.

/**
 * Writes {@code input} as shuffle entries, reads them back through a {@link GroupingShuffleReader},
 * and asserts both the decoded key/value groups and the sizes reported to the reader observer.
 *
 * @param input keys paired with the values expected to be grouped under each key
 * @param sortValues forwarded to the entry writer and to the reader under test
 * @param valuesToRead controls how many values per key are expected back: none, only the first,
 *     or all of them
 * @throws Exception if writing, reading or decoding fails
 */
private void runTestReadFromShuffle(List<KV<Integer, List<KV<Integer, Integer>>>> input, boolean sortValues, ValuesToRead valuesToRead) throws Exception {
    Coder<WindowedValue<KV<Integer, Iterable<KV<Integer, Integer>>>>> sourceElemCoder = WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(KvCoder.of(BigEndianIntegerCoder.of(), BigEndianIntegerCoder.of()))), IntervalWindow.getCoder());
    List<ShuffleEntry> shuffleEntries = writeShuffleEntries(input, sortValues);

    PipelineOptions options = PipelineOptionsFactory.create();
    BatchModeExecutionContext context = BatchModeExecutionContext.forTesting(options, "testStage");
    GroupingShuffleReader<Integer, KV<Integer, Integer>> groupingShuffleReader = new GroupingShuffleReader<>(options, null, null, null, sourceElemCoder, context, TestOperationContext.create(), ShuffleReadCounterFactory.INSTANCE, sortValues);
    ExecutorTestUtils.TestReaderObserver observer = new ExecutorTestUtils.TestReaderObserver(groupingShuffleReader);

    // Load the fake reader and record each entry's length for the size assertion at the end.
    TestShuffleReader shuffleReader = new TestShuffleReader();
    List<Integer> entryLengths = new ArrayList<>();
    for (ShuffleEntry shuffleEntry : shuffleEntries) {
        entryLengths.add(shuffleEntry.length());
        shuffleReader.addEntry(shuffleEntry);
    }

    List<KV<Integer, List<KV<Integer, Integer>>>> actual = runIterationOverGroupingShuffleReader(context, shuffleReader, groupingShuffleReader, sourceElemCoder, valuesToRead);

    // Build the expectation: values are included only from READ_ONE_VALUE upwards, and
    // READ_ONE_VALUE itself keeps just the first value of each key.
    boolean anyValuesRead = valuesToRead.ordinal() >= ValuesToRead.READ_ONE_VALUE.ordinal();
    boolean firstValueOnly = valuesToRead == ValuesToRead.READ_ONE_VALUE;
    List<KV<Integer, List<KV<Integer, Integer>>>> expected = new ArrayList<>();
    for (KV<Integer, List<KV<Integer, Integer>>> group : input) {
        List<KV<Integer, Integer>> expectedValues = new ArrayList<>();
        if (anyValuesRead) {
            for (KV<Integer, Integer> groupedValue : group.getValue()) {
                expectedValues.add(groupedValue);
                if (firstValueOnly) {
                    break;
                }
            }
        }
        expected.add(KV.of(group.getKey(), expectedValues));
    }

    assertEquals(expected, actual);
    assertEquals(entryLengths, observer.getActualSizes());
}
Also used : ExecutorTestUtils(org.apache.beam.runners.dataflow.worker.util.common.worker.ExecutorTestUtils) ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) WindowedValue(org.apache.beam.sdk.util.WindowedValue) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) List(java.util.List)

Example 9 with ShuffleEntry

Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.

Class GroupingShuffleReaderTest, method testShuffleReadCounterMultipleExecutingSteps.

/**
 * Reads ten entries while switching the current execution state part-way through, then checks the
 * shuffle read byte counts recorded for each of the two executing steps.
 */
@Test
public void testShuffleReadCounterMultipleExecutingSteps() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    DataflowPipelineDebugOptions debugOptions = options.as(DataflowPipelineDebugOptions.class);
    debugOptions.setExperiments(Lists.newArrayList(Experiment.IntertransformIO.getName()));
    BatchModeExecutionContext context = BatchModeExecutionContext.forTesting(options, "testStage");

    final int firstShard = 0;
    final int recordCount = 10;
    TestShuffleReader shuffleReader = new TestShuffleReader();
    for (int index = 0; index < recordCount; index++) {
        byte[] encodedKey = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), index);
        ByteArrayShufflePosition entryPosition = fabricatePosition(firstShard, encodedKey);
        shuffleReader.addEntry(new ShuffleEntry(entryPosition, encodedKey, EMPTY_BYTE_ARRAY, encodedKey));
    }

    TestShuffleReadCounterFactory shuffleReadCounterFactory = new TestShuffleReadCounterFactory();
    // TestShuffleReader interprets start/end positions in key space rather than position space
    // (TODO: should probably always use positions instead).
    String stop = encodeBase64URLSafeString(fabricatePosition(recordCount).getPosition());
    TestOperationContext operationContext = TestOperationContext.create();
    Coder<WindowedValue<KV<Integer, Iterable<Integer>>>> elemCoder = WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder());
    GroupingShuffleReader<Integer, Integer> groupingShuffleReader = new GroupingShuffleReader<>(options, null, null, stop, elemCoder, context, operationContext, shuffleReadCounterFactory, false);

    assertFalse(shuffleReader.isClosed());
    try (GroupingShuffleReaderIterator<Integer, Integer> iter = groupingShuffleReader.iterator(shuffleReader)) {
        // Start the iterator; this already yields the first record.
        assertTrue(iter.start());
        // Counts the record returned by start() as well.
        int recordsSeen = 1;
        while (iter.advance()) {
            if (recordsSeen > 5) {
                // From here on, reads are performed under the second executing step.
                setCurrentExecutionState(MOCK_ORIGINAL_NAME_FOR_EXECUTING_STEP2);
            }
            // The value is fetched only for its side effects on the iterator; the result is unused.
            iter.getCurrent().getValue();
            recordsSeen++;
        }
        assertEquals(recordCount, recordsSeen);
    }
    assertTrue(shuffleReader.isClosed());

    Map<String, Long> expectedBytesByStep = new HashMap<>();
    expectedBytesByStep.put(MOCK_ORIGINAL_NAME_FOR_EXECUTING_STEP1, 48L);
    expectedBytesByStep.put(MOCK_ORIGINAL_NAME_FOR_EXECUTING_STEP2, 32L);
    expectShuffleReadCounterEquals(shuffleReadCounterFactory, expectedBytesByStep);
}
Also used : HashMap(java.util.HashMap) Base64.encodeBase64URLSafeString(com.google.api.client.util.Base64.encodeBase64URLSafeString) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) DataflowPipelineDebugOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions) Test(org.junit.Test)

Example 10 with ShuffleEntry

Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in the Apache Beam project.

Class ShuffleSinkTest, method runTestWriteGroupingSortingShuffleSink.

/**
 * Writes {@code expected} through a {@link ShuffleSink} configured for grouping keys and sorting
 * values, decodes the records captured by the test writer, and asserts that they round-trip to the
 * same elements and that the sizes returned by the sink writer match the sizes the test writer
 * recorded.
 *
 * @param expected elements to write; each is a key paired with a (sort key, value) pair
 * @throws Exception if writing or decoding fails
 */
void runTestWriteGroupingSortingShuffleSink(List<KV<Integer, KV<String, Integer>>> expected) throws Exception {
    BatchModeExecutionContext executionContext = BatchModeExecutionContext.forTesting(PipelineOptionsFactory.create(), "STAGE");
    ShuffleSink<KV<Integer, KV<String, Integer>>> shuffleSink = new ShuffleSink<>(PipelineOptionsFactory.create(), null, ShuffleSink.ShuffleKind.GROUP_KEYS_AND_SORT_VALUES, WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())), new GlobalWindows().windowCoder()), executionContext, TestOperationContext.create());

    // Push every element through the sink, keeping the size the sink writer reports for each.
    TestShuffleWriter shuffleWriter = new TestShuffleWriter();
    List<Long> reportedSizes = new ArrayList<>();
    try (Sink.SinkWriter<WindowedValue<KV<Integer, KV<String, Integer>>>> sinkWriter = shuffleSink.writer(shuffleWriter, "dataset")) {
        for (KV<Integer, KV<String, Integer>> element : expected) {
            reportedSizes.add(sinkWriter.add(WindowedValue.valueInGlobalWindow(element)));
        }
    }

    // Rebuild the elements from the key, secondary key and value bytes of each captured record.
    List<KV<Integer, KV<String, Integer>>> actual = new ArrayList<>();
    for (ShuffleEntry written : shuffleWriter.getRecords()) {
        Integer decodedKey = CoderUtils.decodeFromByteArray(BigEndianIntegerCoder.of(), written.getKey());
        // The sort key is decoded straight from a stream over the secondary key bytes.
        String decodedSortKey = StringUtf8Coder.of().decode(new ByteArrayInputStream(written.getSecondaryKey()));
        Integer decodedValue = CoderUtils.decodeFromByteArray(BigEndianIntegerCoder.of(), written.getValue());
        actual.add(KV.of(decodedKey, KV.of(decodedSortKey, decodedValue)));
    }

    Assert.assertEquals(expected, actual);
    Assert.assertEquals(shuffleWriter.getSizes(), reportedSizes);
}
Also used : GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) Sink(org.apache.beam.runners.dataflow.worker.util.common.worker.Sink) ByteArrayInputStream(java.io.ByteArrayInputStream) WindowedValue(org.apache.beam.sdk.util.WindowedValue)

Aggregations

ShuffleEntry (org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry)15 ArrayList (java.util.ArrayList)9 WindowedValue (org.apache.beam.sdk.util.WindowedValue)7 KV (org.apache.beam.sdk.values.KV)7 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)6 Test (org.junit.Test)5 Sink (org.apache.beam.runners.dataflow.worker.util.common.worker.Sink)4 Base64.encodeBase64URLSafeString (com.google.api.client.util.Base64.encodeBase64URLSafeString)3 ByteArrayInputStream (java.io.ByteArrayInputStream)3 ExecutorTestUtils (org.apache.beam.runners.dataflow.worker.util.common.worker.ExecutorTestUtils)3 Position (com.google.api.services.dataflow.model.Position)2 DataInputStream (java.io.DataInputStream)2 NoSuchElementException (java.util.NoSuchElementException)2 ReaderTestUtils.approximateSplitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition)2 ReaderTestUtils.splitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition)2 ByteArrayShufflePosition (org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition)2 GlobalWindows (org.apache.beam.sdk.transforms.windowing.GlobalWindows)2 ApproximateReportedProgress (com.google.api.services.dataflow.model.ApproximateReportedProgress)1 Closeable (java.io.Closeable)1 HashMap (java.util.HashMap)1