Search in sources:

Example 1 with Position

use of com.google.api.services.dataflow.model.Position in project beam by apache.

From the class GroupingShuffleReaderTest, method testReadFromShuffleAndDynamicSplit:

/**
 * Exercises dynamic splitting on a {@code GroupingShuffleReader} iterator.
 *
 * <p>The test loads two shards of {@code kNumRecords} records each, starts reading, expects a
 * split request with an empty {@link Position} to be refused (null result), then requests a
 * split at the start of the second shard and expects it to succeed. After the split it expects
 * the iterator to yield only the first shard's records, each with a single value equal to its
 * key, and expects the bytes-read counter to total 80.
 */
@Test
public void testReadFromShuffleAndDynamicSplit() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    BatchModeExecutionContext context = BatchModeExecutionContext.forTesting(options, "testStage");
    TestOperationContext operationContext = TestOperationContext.create();
    GroupingShuffleReader<Integer, Integer> groupingShuffleReader = new GroupingShuffleReader<>(options, null, null, null, WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder()), context, operationContext, ShuffleReadCounterFactory.INSTANCE, false);
    // Install a counter so the number of bytes read can be checked at the end of the test.
    groupingShuffleReader.perOperationPerDatasetBytesCounter = operationContext.counterFactory().longSum(CounterName.named("dax-shuffle-test-wf-read-bytes"));
    TestShuffleReader shuffleReader = new TestShuffleReader();
    final int kNumRecords = 10;
    final int kFirstShard = 0;
    final int kSecondShard = 1;
    // Set up two shards with kNumRecords records each. Every key is distinct, and the position of
    // a record is fabricated from its shard and key, therefore each record comes with a unique
    // position.
    for (int i = 0; i < kNumRecords; ++i) {
        byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
        ShuffleEntry entry = new ShuffleEntry(fabricatePosition(kFirstShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
        shuffleReader.addEntry(entry);
    }
    for (int i = kNumRecords; i < 2 * kNumRecords; ++i) {
        byte[] keyByte = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), i);
        ShuffleEntry entry = new ShuffleEntry(fabricatePosition(kSecondShard, keyByte), keyByte, EMPTY_BYTE_ARRAY, keyByte);
        shuffleReader.addEntry(entry);
    }
    // Counts the records consumed so far; it doubles as the expected key of the next record.
    int i = 0;
    assertFalse(shuffleReader.isClosed());
    try (GroupingShuffleReaderIterator<Integer, Integer> iter = groupingShuffleReader.iterator(shuffleReader)) {
        // Poke the iterator so we can test dynamic splitting.
        assertTrue(iter.start());
        ++i;
        // A split request whose position carries no shuffle position is expected to be refused.
        assertNull(iter.requestDynamicSplit(splitRequestAtPosition(new Position())));
        // Split at the shard boundary
        NativeReader.DynamicSplitResult dynamicSplitResult = iter.requestDynamicSplit(splitRequestAtPosition(makeShufflePosition(kSecondShard, null)));
        assertNotNull(dynamicSplitResult);
        assertEquals(encodeBase64URLSafeString(fabricatePosition(kSecondShard).getPosition()), positionFromSplitResult(dynamicSplitResult).getShufflePosition());
        for (; iter.advance(); ++i) {
            // iter.getCurrent() is supposed to be side-effect-free and give the same result if called
            // repeatedly. Test that this is indeed the case.
            iter.getCurrent();
            iter.getCurrent();
            KV<Integer, Reiterable<Integer>> elem = iter.getCurrent().getValue();
            int key = elem.getKey();
            assertEquals(i, key);
            Reiterable<Integer> valuesIterable = elem.getValue();
            Reiterator<Integer> valuesIterator = valuesIterable.iterator();
            int j = 0;
            while (valuesIterator.hasNext()) {
                // Repeated hasNext() calls must keep returning the same answer.
                assertTrue(valuesIterator.hasNext());
                assertTrue(valuesIterator.hasNext());
                int value = valuesIterator.next();
                assertEquals(i, value);
                ++j;
            }
            assertFalse(valuesIterator.hasNext());
            assertFalse(valuesIterator.hasNext());
            // Each key was added exactly once, so exactly one value is expected per key.
            assertEquals(1, j);
        }
        assertFalse(iter.advance());
    }
    assertTrue(shuffleReader.isClosed());
    // Only the first shard's records should have been read after the split.
    assertEquals(kNumRecords, i);
    // There are 10 Shuffle records that each encode an integer key (4 bytes) and integer value (4
    // bytes). We therefore expect to read 80 bytes.
    assertEquals(80L, (long) groupingShuffleReader.perOperationPerDatasetBytesCounter.getAggregate());
}
Also used : Reiterable(org.apache.beam.sdk.util.common.Reiterable) ReaderTestUtils.approximateSplitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition) ByteArrayShufflePosition(org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition) Position(com.google.api.services.dataflow.model.Position) ReaderTestUtils.splitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) NativeReader(org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Test(org.junit.Test)

Example 2 with Position

use of com.google.api.services.dataflow.model.Position in project beam by apache.

From the class GroupingShuffleReaderTest, method testGetApproximateProgress:

/**
 * Reads every record through the iterator and, for each one, expects the reported progress
 * position to equal the position the record was stored under. Once the input is exhausted, a
 * dynamic split request is expected to be refused (null result).
 */
@Test
public void testGetApproximateProgress() throws Exception {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    BatchModeExecutionContext batchContext = BatchModeExecutionContext.forTesting(pipelineOptions, "testStage");
    TestOperationContext testOperationContext = TestOperationContext.create();
    GroupingShuffleReader<Integer, Integer> reader = new GroupingShuffleReader<>(pipelineOptions, null, null, null, WindowedValue.getFullCoder(KvCoder.of(BigEndianIntegerCoder.of(), IterableCoder.of(BigEndianIntegerCoder.of())), IntervalWindow.getCoder()), batchContext, testOperationContext, ShuffleReadCounterFactory.INSTANCE, false);

    // Remember the position of every entry fed to the reader, in insertion order.
    List<ByteArrayShufflePosition> expectedPositions = new ArrayList<>();
    TestShuffleReader shuffleReader = new TestShuffleReader();
    final int kNumRecords = 10;
    for (int recordIndex = 0; recordIndex < kNumRecords; ++recordIndex) {
        ByteArrayShufflePosition entryPosition = fabricatePosition(recordIndex);
        byte[] encodedKey = CoderUtils.encodeToByteArray(BigEndianIntegerCoder.of(), recordIndex);
        expectedPositions.add(entryPosition);
        shuffleReader.addEntry(new ShuffleEntry(entryPosition, encodedKey, EMPTY_BYTE_ARRAY, encodedKey));
    }

    assertFalse(shuffleReader.isClosed());
    try (GroupingShuffleReaderIterator<Integer, Integer> iter = reader.iterator(shuffleReader)) {
        // Kept boxed so that the key comparison below resolves to assertEquals(Object, Object).
        Integer expectedKey = 0;
        boolean hasElement = iter.start();
        while (hasElement) {
            ApproximateReportedProgress reported = readerProgressToCloudProgress(iter.getProgress());
            assertNotNull(reported.getPosition().getShufflePosition());
            // The reported position must be the one recorded for the current entry.
            assertEquals(expectedPositions.get(expectedKey).encodeBase64(), reported.getPosition().getShufflePosition());
            WindowedValue<KV<Integer, Reiterable<Integer>>> current = iter.getCurrent();
            assertEquals(expectedKey, current.getValue().getKey());
            expectedKey++;
            hasElement = iter.advance();
        }
        assertFalse(iter.advance());

        // Cannot split since all input was consumed.
        Position splitTarget = new Position();
        splitTarget.setShufflePosition(encodeBase64URLSafeString(fabricatePosition(0).getPosition()));
        assertNull(iter.requestDynamicSplit(toDynamicSplitRequest(approximateSplitRequestAtPosition(splitTarget))));
    }
    assertTrue(shuffleReader.isClosed());
}
Also used : ByteArrayShufflePosition(org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition) ReaderTestUtils.approximateSplitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition) ByteArrayShufflePosition(org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition) Position(com.google.api.services.dataflow.model.Position) ReaderTestUtils.splitRequestAtPosition(org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition) ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Base64.encodeBase64URLSafeString(com.google.api.client.util.Base64.encodeBase64URLSafeString) ShuffleEntry(org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ApproximateReportedProgress(com.google.api.services.dataflow.model.ApproximateReportedProgress) Test(org.junit.Test)

Example 3 with Position

use of com.google.api.services.dataflow.model.Position in project beam by apache.

From the class WorkItemStatusClientTest, method populateSplitResultNativeReader:

/**
 * Populates a {@code WorkItemStatus} from a position-based split result and expects the stop
 * position to equal the supplied position while the dynamic source split stays unset.
 */
@Test
public void populateSplitResultNativeReader() throws Exception {
    statusClient.setWorker(worker, executionContext);

    // Build a split result that wraps a position at index 42.
    Position expectedStop = ReaderTestUtils.positionAtIndex(42L);
    DataflowReaderPosition readerPosition = new DataflowReaderPosition(expectedStop);
    DynamicSplitResult splitResult = new NativeReader.DynamicSplitResultWithPosition(readerPosition);

    WorkItemStatus workItemStatus = new WorkItemStatus();
    statusClient.populateSplitResult(workItemStatus, splitResult);

    assertThat(workItemStatus.getStopPosition(), equalTo(expectedStop));
    assertThat(workItemStatus.getDynamicSourceSplit(), nullValue());
}
Also used : WorkItemStatus(com.google.api.services.dataflow.model.WorkItemStatus) DataflowReaderPosition(org.apache.beam.runners.dataflow.worker.SourceTranslationUtils.DataflowReaderPosition) Position(com.google.api.services.dataflow.model.Position) DataflowReaderPosition(org.apache.beam.runners.dataflow.worker.SourceTranslationUtils.DataflowReaderPosition) DynamicSplitResult(org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader.DynamicSplitResult) Test(org.junit.Test)

Aggregations

Position (com.google.api.services.dataflow.model.Position)3 Test (org.junit.Test)3 ReaderTestUtils.approximateSplitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.approximateSplitRequestAtPosition)2 ReaderTestUtils.splitRequestAtPosition (org.apache.beam.runners.dataflow.worker.ReaderTestUtils.splitRequestAtPosition)2 ByteArrayShufflePosition (org.apache.beam.runners.dataflow.worker.util.common.worker.ByteArrayShufflePosition)2 ShuffleEntry (org.apache.beam.runners.dataflow.worker.util.common.worker.ShuffleEntry)2 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)2 Base64.encodeBase64URLSafeString (com.google.api.client.util.Base64.encodeBase64URLSafeString)1 ApproximateReportedProgress (com.google.api.services.dataflow.model.ApproximateReportedProgress)1 WorkItemStatus (com.google.api.services.dataflow.model.WorkItemStatus)1 ArrayList (java.util.ArrayList)1 DataflowReaderPosition (org.apache.beam.runners.dataflow.worker.SourceTranslationUtils.DataflowReaderPosition)1 NativeReader (org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader)1 DynamicSplitResult (org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader.DynamicSplitResult)1 Reiterable (org.apache.beam.sdk.util.common.Reiterable)1 KV (org.apache.beam.sdk.values.KV)1