Search in sources :

Example 1 with ShuffleEntry

use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.

the class UngroupedShuffleReaderTest method runTestReadFromShuffle.

/**
 * Round-trips {@code expected} through an UNGROUPED {@code ShuffleSink} and an
 * {@code UngroupedShuffleReader}, checking that the values read back, and the sizes reported on
 * both the write and read side, match what was written.
 *
 * @param expected the integers to write to, and then read back from, the test shuffle
 * @throws Exception if writing to or reading from the test shuffle fails
 */
void runTestReadFromShuffle(List<Integer> expected) throws Exception {
    Coder<WindowedValue<Integer>> windowedIntCoder = WindowedValue.getFullCoder(BigEndianIntegerCoder.of(), IntervalWindow.getCoder());
    BatchModeExecutionContext batchContext = BatchModeExecutionContext.forTesting(PipelineOptionsFactory.create(), "STAGE");

    // Write phase: push every expected value through an UNGROUPED sink and remember the
    // size that each add() call reports.
    ShuffleSink<Integer> sink = new ShuffleSink<>(PipelineOptionsFactory.create(), null, ShuffleSink.ShuffleKind.UNGROUPED, windowedIntCoder, batchContext, TestOperationContext.create());
    TestShuffleWriter testWriter = new TestShuffleWriter();
    List<Long> writtenSizes = new ArrayList<>();
    try (Sink.SinkWriter<WindowedValue<Integer>> sinkWriter = sink.writer(testWriter, "dataset")) {
        for (Integer value : expected) {
            WindowedValue<Integer> windowed = WindowedValue.of(value, timestamp, Lists.newArrayList(window), PaneInfo.NO_FIRING);
            writtenSizes.add(sinkWriter.add(windowed));
        }
    }
    List<ShuffleEntry> writtenEntries = testWriter.getRecords();
    Assert.assertEquals(expected.size(), writtenEntries.size());
    Assert.assertEquals(testWriter.getSizes(), writtenSizes);

    // Read phase: replay the captured entries through an UngroupedShuffleReader.
    UngroupedShuffleReader<WindowedValue<Integer>> reader = new UngroupedShuffleReader<>(PipelineOptionsFactory.create(), null, null, null, windowedIntCoder, batchContext, TestOperationContext.create());
    ExecutorTestUtils.TestReaderObserver sizeObserver = new ExecutorTestUtils.TestReaderObserver(reader);
    TestShuffleReader testReader = new TestShuffleReader();
    List<Integer> entryLengths = new ArrayList<>();
    for (ShuffleEntry entry : writtenEntries) {
        entryLengths.add(entry.length());
        testReader.addEntry(entry);
    }

    List<Integer> readValues = new ArrayList<>();
    Assert.assertFalse(testReader.isClosed());
    try (UngroupedShuffleReaderIterator<WindowedValue<Integer>> iterator = reader.iterator(testReader)) {
        boolean hasCurrent = iterator.start();
        while (hasCurrent) {
            WindowedValue<Integer> current = iterator.getCurrent();
            Assert.assertEquals(timestamp, current.getTimestamp());
            Assert.assertEquals(Lists.newArrayList(window), current.getWindows());
            readValues.add(current.getValue());
            hasCurrent = iterator.advance();
        }
        // Once exhausted, advance() keeps returning false and getCurrent() must throw.
        Assert.assertFalse(iterator.advance());
        try {
            iterator.getCurrent();
            Assert.fail("should have failed");
        } catch (NoSuchElementException expectedFailure) {
            // Expected: there is no current element after the end of input.
        }
    }

    // Leaving the try-with-resources block is expected to have closed the underlying reader.
    Assert.assertTrue(testReader.isClosed());
    Assert.assertEquals(expected, readValues);
    Assert.assertEquals(entryLengths, sizeObserver.getActualSizes());
}
Also used : ExecutorTestUtils(org.apache.beam.runners.dataflow.worker.util.common.worker.ExecutorTestUtils) ArrayList(java.util.ArrayList) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) Sink(org.apache.beam.runners.dataflow.worker.util.common.worker.Sink) WindowedValue(org.apache.beam.sdk.util.WindowedValue) NoSuchElementException(java.util.NoSuchElementException)

Example 2 with ShuffleEntry

use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.

the class GroupingShuffleReaderTest method testReadFromShuffleDataAndFailToSplit.

/**
 * Checks that a {@code GroupingShuffleReader} iterator answers {@code null} to dynamic split
 * requests it cannot honor, and that it still returns every record afterwards.
 */
@Test
public void testReadFromShuffleDataAndFailToSplit() throws Exception {
    final int firstShard = 0;
    final int recordCount = 2;
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    BatchModeExecutionContext executionContext = BatchModeExecutionContext.forTesting(pipelineOptions, "testStage");
    TestShuffleReader testReader = new TestShuffleReader();
    int index = 0;
    while (index < recordCount) {
        // Each entry uses the encoded index as key, secondary key is empty, value equals the key.
        byte[] encodedKey = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), index);
        ShuffleEntry entry = new ShuffleEntry(fabricatePosition(firstShard, encodedKey), encodedKey, EMPTY_BYTE_ARRAY, encodedKey);
        testReader.addEntry(entry);
        index++;
    }

    // Note that TestShuffleReader start/end positions are in the space of keys,
    // not positions (TODO: should probably always use positions instead).
    String stopPosition = encodeBase64URLSafeString(fabricatePosition(recordCount).getPosition());
    TestOperationContext opContext = TestOperationContext.create();
    GroupingShuffleReader<Integer, Integer> reader =
        new GroupingShuffleReader<>(
            pipelineOptions,
            null,
            null,
            stopPosition,
            WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder()),
            executionContext,
            opContext,
            ShuffleReadCounterFactory.INSTANCE,
            false);

    assertFalse(testReader.isClosed());
    try (GroupingShuffleReaderIterator<Integer, Integer> iterator = reader.iterator(testReader)) {
        // Start the iterator so that dynamic splitting can be exercised.
        assertTrue(iterator.start());

        // Rejected: the requested position lies past the current stop position.
        assertNull(iterator.requestDynamicSplit(splitRequestAtPosition(makeShufflePosition(recordCount + 1, null))));

        // Rejected: the requested position is identical to the position of the record
        // that was just returned.
        byte[] firstKey = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), 0);
        assertNull(iterator.requestDynamicSplit(splitRequestAtPosition(makeShufflePosition(firstShard, firstKey))));

        // Rejected: the requested position comes before the current position.
        assertNull(iterator.requestDynamicSplit(splitRequestAtPosition(makeShufflePosition(firstShard, null))));

        // Starts at 1 to account for the record already returned by start() above.
        int returnedCount = 1;
        while (iterator.advance()) {
            // The value itself is ignored; only the number of records matters here.
            iterator.getCurrent().getValue();
            returnedCount++;
        }
        assertEquals(recordCount, returnedCount);

        // Rejected: all input has been consumed.
        assertNull(iterator.requestDynamicSplit(splitRequestAtPosition(makeShufflePosition(firstShard, null))));
    }
    assertTrue(testReader.isClosed());
}
Also used : ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Base64.encodeBase64URLSafeString(com.google.api.client.util.Base64.encodeBase64URLSafeString) Test(org.junit.Test)

Example 3 with ShuffleEntry

use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.

the class GroupingShuffleReaderTest method runTestBytesReadCounterForOptions.

/**
 * Feeds {@code input} through a {@code GroupingShuffleReader} and verifies the number of bytes
 * it reports as read.
 *
 * <p>When the {@code IntertransformIO} experiment is enabled in {@code options}, the byte count
 * is checked via the shuffle read counter factory; otherwise it is checked against the reader's
 * {@code perOperationPerDatasetBytesCounter}.
 *
 * @param options pipeline options used for the execution context, the reader, and the experiment
 *     lookup
 * @param input keyed groups to turn into shuffle entries
 * @param useSecondaryKey passed both to the entry writer and to the reader constructor
 * @param valuesToRead forwarded to {@code runIterationOverGroupingShuffleReader}
 * @param expectedReadBytes the byte count the selected counter must report
 * @throws Exception if building or iterating over the shuffle data fails
 */
private void runTestBytesReadCounterForOptions(PipelineOptions options, List<KV<Integer, List<KV<Integer, Integer>>>> input, boolean useSecondaryKey, ValuesToRead valuesToRead, long expectedReadBytes) throws Exception {
    // Load a test shuffle reader with the entries derived from the supplied input.
    TestShuffleReader testReader = new TestShuffleReader();
    for (ShuffleEntry entry : writeShuffleEntries(input, useSecondaryKey)) {
        testReader.addEntry(entry);
    }

    TestShuffleReadCounterFactory counterFactory = new TestShuffleReadCounterFactory();
    Coder<WindowedValue<KV<Integer, Iterable<KV<Integer, Integer>>>>> elementCoder =
        WindowedValue.getFullCoder(
            KvCoder.of(
                BigEndianIntegerCoder.of(),
                IterableCoder.of(KvCoder.of(BigEndianIntegerCoder.of(), BigEndianIntegerCoder.of()))),
            IntervalWindow.getCoder());

    // Build the GroupingShuffleReader under test and give it a byte counter to report into.
    BatchModeExecutionContext executionContext = BatchModeExecutionContext.forTesting(options, "testStage");
    TestOperationContext opContext = TestOperationContext.create();
    GroupingShuffleReader<Integer, KV<Integer, Integer>> reader =
        new GroupingShuffleReader<>(options, null, null, null, elementCoder, executionContext, opContext, counterFactory, useSecondaryKey);
    reader.perOperationPerDatasetBytesCounter = opContext.counterFactory().longSum(CounterName.named("dax-shuffle-test-wf-read-bytes"));

    runIterationOverGroupingShuffleReader(executionContext, testReader, reader, elementCoder, valuesToRead);

    // Which counter carries the byte count depends on the IntertransformIO experiment.
    boolean intertransformIoEnabled = ExperimentContext.parseFrom(options).isEnabled(Experiment.IntertransformIO);
    if (!intertransformIoEnabled) {
        assertEquals(expectedReadBytes, (long) reader.perOperationPerDatasetBytesCounter.getAggregate());
    } else {
        expectShuffleReadCounterEquals(counterFactory, expectedReadBytes);
    }
}
Also used : KV(org.apache.beam.sdk.values.KV) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) WindowedValue(org.apache.beam.sdk.util.WindowedValue)

Example 4 with ShuffleEntry

use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.

the class GroupingShuffleReaderTest method testConsumedParallelism.

/**
 * Tracks the consumed parallelism reported by a {@code GroupingShuffleReader} iterator before it
 * starts, right after it starts, across a successful dynamic split, and as it advances to the
 * end of its (split-shortened) range.
 */
@Test
public void testConsumedParallelism() throws Exception {
    final int firstShard = 0;
    final int recordCount = 5;
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    BatchModeExecutionContext executionContext = BatchModeExecutionContext.forTesting(pipelineOptions, "testStage");
    TestShuffleReader testReader = new TestShuffleReader();
    for (int keyIndex = 0; keyIndex < recordCount; ++keyIndex) {
        byte[] encodedKey = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), keyIndex);
        testReader.addEntry(new ShuffleEntry(fabricatePosition(firstShard, keyIndex), encodedKey, EMPTY_BYTE_ARRAY, encodedKey));
    }
    TestOperationContext opContext = TestOperationContext.create();
    GroupingShuffleReader<Integer, Integer> reader =
        new GroupingShuffleReader<>(
            pipelineOptions,
            null,
            null,
            null,
            WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder()),
            executionContext,
            opContext,
            ShuffleReadCounterFactory.INSTANCE,
            false);

    assertFalse(testReader.isClosed());
    try (GroupingShuffleReaderIterator<Integer, Integer> iterator = reader.iterator(testReader)) {
        // Before start(): nothing has been consumed.
        assertEquals(0.0, consumedParallelismFromProgress(iterator.getProgress()), 0);

        // A stop *position* can only be set in tests through a split, which requires the
        // iterator to have been started. start() should return the entry at key 0.
        assertTrue(iterator.start());

        // Immediately after start(): consumed parallelism is still 0.
        assertEquals(0.0, readerProgressToCloudProgress(iterator.getProgress()).getConsumedParallelism().getValue(), 0);

        // Split right after key 2, i.e. at the immediate successor of its position.
        assertNotNull(
            iterator.requestDynamicSplit(
                splitRequestAtPosition(
                    makeShufflePosition(fabricatePosition(firstShard, 2).immediateSuccessor().getPosition()))));

        // The split itself leaves consumed parallelism unchanged at 0.
        assertEquals(0.0, consumedParallelismFromProgress(iterator.getProgress()), 0);

        // Advancing to key 1 consumes one split point.
        assertTrue(iterator.advance());
        assertEquals(1.0, consumedParallelismFromProgress(iterator.getProgress()), 0);

        // Advancing to key 2 -- the last key, since the stop position is its immediate
        // successor -- brings the total to 2.
        assertTrue(iterator.advance());
        assertEquals(2.0, consumedParallelismFromProgress(iterator.getProgress()), 0);

        // Running off the end consumes one more split point, for a total of 3.
        assertFalse(iterator.advance());
        assertEquals(3.0, consumedParallelismFromProgress(iterator.getProgress()), 0);
    }
    assertTrue(testReader.isClosed());
}
Also used : ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Test(org.junit.Test)

Example 5 with ShuffleEntry

use of org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry in project beam by apache.

the class GroupingShuffleReaderTest method testReadFromShuffleAndDynamicSplit.

/**
 * Reads two shards of shuffle data through a {@code GroupingShuffleReader}, dynamically splits at
 * the boundary between them, and verifies that only the first shard's records are returned and
 * that the bytes-read counter reflects exactly those records.
 */
@Test
public void testReadFromShuffleAndDynamicSplit() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    BatchModeExecutionContext context = BatchModeExecutionContext.forTesting(options, "testStage");
    TestOperationContext operationContext = TestOperationContext.create();
    GroupingShuffleReader<Integer, Integer> groupingShuffleReader = new GroupingShuffleReader<>(options, null, null, null, WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder()), context, operationContext, ShuffleReadCounterFactory.INSTANCE, false);
    groupingShuffleReader.perOperationPerDatasetBytesCounter = operationContext.counterFactory().longSum(CounterName.named("dax-shuffle-test-wf-read-bytes"));
    TestShuffleReader shuffleReader = new TestShuffleReader();
    final int kNumRecords = 10;
    final int kFirstShard = 0;
    final int kSecondShard = 1;
    // Populate two shards with kNumRecords entries each. Every key is distinct (so each key's
    // group holds a single value, asserted below), therefore each record comes with its own
    // fabricated position.
    for (int i = 0; i < kNumRecords; ++i) {
        byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
        ShuffleEntry entry = new ShuffleEntry(fabricatePosition(kFirstShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
        shuffleReader.addEntry(entry);
    }
    for (int i = kNumRecords; i < 2 * kNumRecords; ++i) {
        byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
        ShuffleEntry entry = new ShuffleEntry(fabricatePosition(kSecondShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
        shuffleReader.addEntry(entry);
    }
    // Key expected from the next successful advance(); it also equals the number of records
    // returned so far.
    int nextKey = 0;
    assertFalse(shuffleReader.isClosed());
    try (GroupingShuffleReaderIterator<Integer, Integer> iter = groupingShuffleReader.iterator(shuffleReader)) {
        // Poke the iterator so we can test dynamic splitting.
        assertTrue(iter.start());
        ++nextKey;
        // A split request carrying an empty Position is rejected.
        assertNull(iter.requestDynamicSplit(splitRequestAtPosition(new Position())));
        // Split at the shard boundary
        NativeReader.DynamicSplitResult dynamicSplitResult = iter.requestDynamicSplit(splitRequestAtPosition(makeShufflePosition(kSecondShard, null)));
        assertNotNull(dynamicSplitResult);
        assertEquals(encodeBase64URLSafeString(fabricatePosition(kSecondShard).getPosition()), positionFromSplitResult(dynamicSplitResult).getShufflePosition());
        for (; iter.advance(); ++nextKey) {
            // iter.getCurrent() is supposed to be side-effect-free and give the same result if called
            // repeatedly. Test that this is indeed the case.
            iter.getCurrent();
            iter.getCurrent();
            KV<Integer, Reiterable<Integer>> elem = iter.getCurrent().getValue();
            int key = elem.getKey();
            // JUnit's contract is assertEquals(expected, actual); keeping that order makes the
            // failure message report the right side as "expected".
            assertEquals(nextKey, key);
            Reiterable<Integer> valuesIterable = elem.getValue();
            Reiterator<Integer> valuesIterator = valuesIterable.iterator();
            int j = 0;
            while (valuesIterator.hasNext()) {
                // hasNext() is called repeatedly to check that it does not consume input.
                assertTrue(valuesIterator.hasNext());
                assertTrue(valuesIterator.hasNext());
                int value = valuesIterator.next();
                // Each entry was written with its value equal to its key.
                assertEquals(nextKey, value);
                ++j;
            }
            assertFalse(valuesIterator.hasNext());
            assertFalse(valuesIterator.hasNext());
            // Exactly one value per key.
            assertEquals(1, j);
        }
        assertFalse(iter.advance());
    }
    assertTrue(shuffleReader.isClosed());
    // Only the first shard's records may have been returned after the split.
    assertEquals(kNumRecords, nextKey);
    // There are 10 Shuffle records that each encode an integer key (4 bytes) and integer value (4
    // bytes). We therefore expect to read 80 bytes.
    assertEquals(80L, (long) groupingShuffleReader.perOperationPerDatasetBytesCounter.getAggregate());
}
Also used : Reiterable(org.apache.beam.sdk.util.common.Reiterable) ReaderTestUtils.approximateSplitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition) ByteArrayShufflePosition(org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition) Position(com.google.api.services.dataflow.model.Position) ReaderTestUtils.splitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) NativeReader(org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Test(org.junit.Test)

Aggregations

ShuffleEntry (org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry)15 ArrayList (java.util.ArrayList)9 WindowedValue (org.apache.beam.sdk.util.WindowedValue)7 KV (org.apache.beam.sdk.values.KV)7 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)6 Test (org.junit.Test)5 Sink (org.apache.beam.runners.dataflow.worker.util.common.worker.Sink)4 Base64.encodeBase64URLSafeString (com.google.api.client.util.Base64.encodeBase64URLSafeString)3 ByteArrayInputStream (java.io.ByteArrayInputStream)3 ExecutorTestUtils (org.apache.beam.runners.dataflow.worker.util.common.worker.ExecutorTestUtils)3 Position (com.google.api.services.dataflow.model.Position)2 DataInputStream (java.io.DataInputStream)2 NoSuchElementException (java.util.NoSuchElementException)2 ReaderTestUtils.approximateSplitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition)2 ReaderTestUtils.splitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition)2 ByteArrayShufflePosition (org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition)2 GlobalWindows (org.apache.beam.sdk.transforms.windowing.GlobalWindows)2 ApproximateReportedProgress (com.google.api.services.dataflow.model.ApproximateReportedProgress)1 Closeable (java.io.Closeable)1 HashMap (java.util.HashMap)1