Search in sources :

Example 11 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testList.

/**
 * Reads a list side input stored in two Ism files from {@code NUM_THREADS} concurrent callables
 * and checks that each one sees the concatenated contents and the same cached instance.
 */
@Test
public void testList() throws Exception {
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    IsmRecordCoder<WindowedValue<Long>> recordCoder = IsmRecordCoder.of(1, 0, ImmutableList.of(GLOBAL_WINDOW_CODER, BigEndianLongCoder.of()), windowedLongCoder);
    final List<KV<Long, WindowedValue<Long>>> headRecords = Arrays.asList(KV.of(0L, valueInGlobalWindow(12L)), KV.of(1L, valueInGlobalWindow(22L)), KV.of(2L, valueInGlobalWindow(32L)));
    final List<KV<Long, WindowedValue<Long>>> tailRecords = Arrays.asList(KV.of(0L, valueInGlobalWindow(42L)), KV.of(1L, valueInGlobalWindow(52L)), KV.of(2L, valueInGlobalWindow(62L)));
    final PCollectionView<List<Long>> listView = Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asList());
    // Each batch of records goes into its own input file; both files feed the same reader.
    Source headSource = initInputFile(fromKvsForList(headRecords), recordCoder);
    Source tailSource = initInputFile(fromKvsForList(tailRecords), recordCoder);
    String viewId = listView.getTagInternal().getId();
    final IsmSideInputReader sideInput = sideInputReader(viewId, headSource, tailSource);

    // The work every thread performs: read, verify the contents, re-read and compare references.
    Callable<List<Long>> readAndVerify = () -> {
        // Hold on to the returned value with a strong reference so that the logical reference
        // cache is not cleared while this test is running.
        List<Long> firstRead = sideInput.get(listView, GlobalWindow.INSTANCE);
        verifyList(toValueList(concat(headRecords, tailRecords)), firstRead);
        // A second lookup must yield the very same reference, which shows the value was cached.
        assertSame(sideInput.get(listView, GlobalWindow.INSTANCE), firstRead);
        return firstRead;
    };
    List<Callable<List<Long>>> workers = new ArrayList<>();
    while (workers.size() < NUM_THREADS) {
        workers.add(readAndVerify);
    }

    List<Future<List<Long>>> outcomes = pipelineOptions.getExecutorService().invokeAll(workers);
    // Every thread must have received the same reference as the first one.
    List<Long> reference = outcomes.get(0).get();
    for (Future<List<Long>> outcome : outcomes) {
        assertSame(reference, outcome.get());
    }
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Future(java.util.concurrent.Future) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Test(org.junit.Test)

Example 12 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testSingleton.

/**
 * Reads a singleton side input backed by one Ism file from {@code NUM_THREADS} concurrent
 * callables and checks that each one sees the expected value and the same cached instance.
 */
@Test
public void testSingleton() throws Exception {
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    IsmRecordCoder<WindowedValue<Long>> recordCoder = IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), windowedLongCoder);
    final WindowedValue<Long> expected = valueInGlobalWindow(42L);
    final PCollectionView<Long> singletonView = Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asSingleton());
    final Source input = initInputFile(fromValues(Arrays.asList(expected)), recordCoder);
    String viewId = singletonView.getTagInternal().getId();
    final IsmSideInputReader sideInput = sideInputReader(viewId, input);

    // The work every thread performs: read, compare with the expected value, re-read.
    Callable<Long> readAndVerify = () -> {
        // Hold on to the returned value with a strong reference so that the logical reference
        // cache is not cleared while this test is running.
        Long firstRead = sideInput.get(singletonView, GlobalWindow.INSTANCE);
        assertEquals(expected.getValue(), firstRead);
        // A second lookup must yield the very same reference, which shows the value was cached.
        assertSame(firstRead, sideInput.get(singletonView, GlobalWindow.INSTANCE));
        return firstRead;
    };
    List<Callable<Long>> workers = new ArrayList<>();
    while (workers.size() < NUM_THREADS) {
        workers.add(readAndVerify);
    }

    List<Future<Long>> outcomes = pipelineOptions.getExecutorService().invokeAll(workers);
    // Every thread must have received the same reference as the first one.
    Long reference = outcomes.get(0).get();
    for (Future<Long> outcome : outcomes) {
        assertSame(reference, outcome.get());
    }
}
Also used : KvCoder(org.apache.beam.sdk.coders.KvCoder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) MapCoder(org.apache.beam.sdk.coders.MapCoder) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) BigEndianLongCoder(org.apache.beam.sdk.coders.BigEndianLongCoder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) MetadataKeyCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.MetadataKeyCoder) ArrayList(java.util.ArrayList) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Future(java.util.concurrent.Future) Test(org.junit.Test)

Example 13 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testListInWindow.

/**
 * Reads a windowed list side input from {@code NUM_THREADS} concurrent callables.
 *
 * <p>Records for the interval windows 10 and 20 are written to one input file and records for
 * window 30 to a second one. Each callable checks the per-window contents, checks that repeated
 * lookups return the same reference, and checks that a window without data (40) yields an empty
 * list. Finally all threads are required to have observed the same references.
 */
@Test
public void testListInWindow() throws Exception {
    Coder<WindowedValue<Long>> valueCoder = WindowedValue.getFullCoder(VarLongCoder.of(), INTERVAL_WINDOW_CODER);
    IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 0, ImmutableList.of(INTERVAL_WINDOW_CODER, BigEndianLongCoder.of()), valueCoder);
    final List<KV<Long, WindowedValue<Long>>> firstElements = Arrays.asList(KV.of(0L, valueInIntervalWindow(12, 10)), KV.of(1L, valueInIntervalWindow(22, 10)), KV.of(2L, valueInIntervalWindow(32, 10)));
    final List<KV<Long, WindowedValue<Long>>> secondElements = Arrays.asList(KV.of(0L, valueInIntervalWindow(42, 20)), KV.of(1L, valueInIntervalWindow(52, 20)), KV.of(2L, valueInIntervalWindow(62, 20)));
    final List<KV<Long, WindowedValue<Long>>> thirdElements = Arrays.asList(KV.of(0L, valueInIntervalWindow(42L, 30)), KV.of(1L, valueInIntervalWindow(52L, 30)), KV.of(2L, valueInIntervalWindow(62L, 30)));
    final PCollectionView<List<Long>> view = Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(Window.into(FixedWindows.of(Duration.millis(10)))).apply(View.asList());
    // The first file holds two windows' worth of records, the second file holds the third window.
    Source sourceA = initInputFile(fromKvsForList(concat(firstElements, secondElements)), ismCoder);
    Source sourceB = initInputFile(fromKvsForList(thirdElements), ismCoder);
    final IsmSideInputReader reader = sideInputReader(view.getTagInternal().getId(), sourceA, sourceB);
    List<Callable<Map<BoundedWindow, List<Long>>>> tasks = new ArrayList<>();
    for (int i = 0; i < NUM_THREADS; ++i) {
        tasks.add(() -> {
            // Store a strong reference to the returned value so that the logical reference
            // cache is not cleared for this test.
            List<Long> firstValues = reader.get(view, intervalWindow(10));
            List<Long> secondValues = reader.get(view, intervalWindow(20));
            List<Long> thirdValues = reader.get(view, intervalWindow(30));
            verifyList(toValueList(firstElements), firstValues);
            verifyList(toValueList(secondElements), secondValues);
            verifyList(toValueList(thirdElements), thirdValues);
            // Assert that the same value reference was returned showing that it was cached.
            assertSame(firstValues, reader.get(view, intervalWindow(10)));
            assertSame(secondValues, reader.get(view, intervalWindow(20)));
            assertSame(thirdValues, reader.get(view, intervalWindow(30)));
            // Also verify when requesting a window that is not part of the side input.
            // Collections.emptyList() is the type-safe replacement for the raw EMPTY_LIST constant.
            assertEquals(Collections.emptyList(), reader.get(view, intervalWindow(40)));
            return ImmutableMap.<BoundedWindow, List<Long>>of(intervalWindow(10), firstValues, intervalWindow(20), secondValues, intervalWindow(30), thirdValues);
        });
    }
    List<Future<Map<BoundedWindow, List<Long>>>> results = pipelineOptions.getExecutorService().invokeAll(tasks);
    Map<BoundedWindow, List<Long>> value = results.get(0).get();
    // Assert that all threads got back the same reference
    for (Future<Map<BoundedWindow, List<Long>>> result : results) {
        // The maps themselves are distinct objects, so they are compared by equality ...
        assertEquals(value, result.get());
        // ... while the per-window lists they hold must be identical references.
        for (Map.Entry<BoundedWindow, List<Long>> entry : result.get().entrySet()) {
            assertSame(value.get(entry.getKey()), entry.getValue());
        }
    }
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Future(java.util.concurrent.Future) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 14 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testIterableSideInputReadCounter.

/**
 * Reads an iterable side input while a separate execution state is active and checks that the
 * {@code read-sideinput-msecs} counter update and the {@code read-sideinput-byte-count} counter
 * exist for the requesting step afterwards.
 */
@Test
public void testIterableSideInputReadCounter() throws Exception {
    // Expected msec counter update, assembled piece by piece.
    CounterStructuredName msecCounterName = new CounterStructuredName().setOrigin("SYSTEM").setName("read-sideinput-msecs").setOriginalStepName("originalName").setExecutionStepName("stageName").setOriginalRequestingStepName("originalName2").setInputIndex(1);
    CounterMetadata sumMetadata = new CounterMetadata().setKind(Kind.SUM.toString());
    CounterStructuredNameAndMetadata msecNameAndMetadata = new CounterStructuredNameAndMetadata().setMetadata(sumMetadata).setName(msecCounterName);
    SplitInt64 zero = new SplitInt64().setHighBits(0).setLowBits(0L);
    CounterUpdate expectedSideInputMsecUpdate = new CounterUpdate().setStructuredNameAndMetadata(msecNameAndMetadata).setCumulative(true).setInteger(zero);
    // Name under which the byte counter is expected to be registered.
    CounterName expectedCounterName = CounterName.named("read-sideinput-byte-count").withOriginalName(operationContext.nameContext()).withOrigin("SYSTEM").withOriginalRequestingStepName("originalName2").withInputIndex(1);

    // Test startup:
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    IsmRecordCoder<WindowedValue<Long>> recordCoder = IsmRecordCoder.of(1, 0, ImmutableList.of(GLOBAL_WINDOW_CODER, BigEndianLongCoder.of()), windowedLongCoder);
    // A fresh state that stands for the step which receives the side input.
    NameContext requestingStep = NameContext.create("stageName", "originalName2", "systemName2", "userName2");
    DataflowExecutionState state2 = executionContext.getExecutionStateRegistry().getState(requestingStep, "process", null, NoopProfileScope.NOOP);

    final List<KV<Long, WindowedValue<Long>>> headRecords = Arrays.asList(KV.of(0L, valueInGlobalWindow(0L)));
    final List<KV<Long, WindowedValue<Long>>> tailRecords = new ArrayList<>();
    for (long key = 0; key < 100; key++) {
        tailRecords.add(KV.of(key, valueInGlobalWindow(key * 10)));
    }
    final PCollectionView<Iterable<Long>> iterableView = Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asIterable());
    Source headSource = initInputFile(fromKvsForList(headRecords), recordCoder);
    Source tailSource = initInputFile(fromKvsForList(tailRecords), recordCoder);

    // All reads below happen while state2 is the entered state.
    try (Closeable ignored = executionContext.getExecutionStateTracker().enterState(state2)) {
        String viewId = iterableView.getTagInternal().getId();
        final IsmSideInputReader sideInput = serialSideInputReader(viewId, headSource, tailSource);
        // Hold on to the returned value with a strong reference so that the logical reference
        // cache is not cleared while this test is running.
        Iterable<Long> firstRead = sideInput.get(iterableView, GlobalWindow.INSTANCE);
        verifyIterable(toValueList(concat(headRecords, tailRecords)), firstRead);
        // A second lookup must yield the very same reference, which shows the value was cached.
        assertSame(sideInput.get(iterableView, GlobalWindow.INSTANCE), firstRead);

        Iterable<CounterUpdate> extractedUpdates = executionContext.getExecutionStateRegistry().extractUpdates(true);
        assertThat(extractedUpdates, hasItem(expectedSideInputMsecUpdate));
        Counter<?, ?> byteCounter = counterFactory.getExistingCounter(expectedCounterName);
        assertNotNull(byteCounter);
    }
}
Also used : CounterMetadata(com.google.api.services.dataflow.model.CounterMetadata) CounterStructuredName(com.google.api.services.dataflow.model.CounterStructuredName) Closeable(java.io.Closeable) ArrayList(java.util.ArrayList) SplitInt64(com.google.api.services.dataflow.model.SplitInt64) KV(org.apache.beam.sdk.values.KV) Source(com.google.api.services.dataflow.model.Source) CounterUpdate(com.google.api.services.dataflow.model.CounterUpdate) DataflowExecutionState(org.apache.beam.runners.dataflow.worker.DataflowOperationContext.DataflowExecutionState) CounterName(org.apache.beam.runners.dataflow.worker.counters.CounterName) WindowedValue(org.apache.beam.sdk.util.WindowedValue) CounterStructuredNameAndMetadata(com.google.api.services.dataflow.model.CounterStructuredNameAndMetadata) Test(org.junit.Test)

Example 15 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testIterableAtN.

/**
 * Reads an iterable side input whose source is given as an {@code @2} file pattern covering two
 * shard files, from {@code NUM_THREADS} concurrent callables, and checks that each one sees the
 * concatenated contents and the same cached instance.
 */
@Test
public void testIterableAtN() throws Exception {
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    IsmRecordCoder<WindowedValue<Long>> recordCoder = IsmRecordCoder.of(1, 0, ImmutableList.of(GLOBAL_WINDOW_CODER, BigEndianLongCoder.of()), windowedLongCoder);
    final List<KV<Long, WindowedValue<Long>>> headRecords = Arrays.asList(KV.of(0L, valueInGlobalWindow(12L)), KV.of(1L, valueInGlobalWindow(22L)), KV.of(2L, valueInGlobalWindow(32L)));
    final List<KV<Long, WindowedValue<Long>>> tailRecords = Arrays.asList(KV.of(0L, valueInGlobalWindow(42L)), KV.of(1L, valueInGlobalWindow(52L)), KV.of(2L, valueInGlobalWindow(62L)));
    final PCollectionView<Iterable<Long>> iterableView = Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asIterable());

    // Write the two shard files explicitly, then refer to both of them through one pattern.
    String shardPrefix = tmpFolder.newFile().getPath();
    String firstShardPath = shardPrefix + "-00000-of-00002.ism";
    String secondShardPath = shardPrefix + "-00001-of-00002.ism";
    String shardedPattern = shardPrefix + "@2.ism";
    initInputFile(fromKvsForList(headRecords), recordCoder, firstShardPath);
    initInputFile(fromKvsForList(tailRecords), recordCoder, secondShardPath);
    Source shardedSource = newIsmSource(recordCoder, shardedPattern);
    String viewId = iterableView.getTagInternal().getId();
    final IsmSideInputReader sideInput = sideInputReader(viewId, shardedSource);

    // The work every thread performs: read, verify the contents, re-read and compare references.
    Callable<Iterable<Long>> readAndVerify = () -> {
        // Hold on to the returned value with a strong reference so that the logical reference
        // cache is not cleared while this test is running.
        Iterable<Long> firstRead = sideInput.get(iterableView, GlobalWindow.INSTANCE);
        verifyIterable(toValueList(concat(headRecords, tailRecords)), firstRead);
        // A second lookup must yield the very same reference, which shows the value was cached.
        assertSame(sideInput.get(iterableView, GlobalWindow.INSTANCE), firstRead);
        return firstRead;
    };
    List<Callable<Iterable<Long>>> workers = new ArrayList<>();
    while (workers.size() < NUM_THREADS) {
        workers.add(readAndVerify);
    }

    List<Future<Iterable<Long>>> outcomes = pipelineOptions.getExecutorService().invokeAll(workers);
    // Every thread must have received the same reference as the first one.
    Iterable<Long> reference = outcomes.get(0).get();
    for (Future<Iterable<Long>> outcome : outcomes) {
        assertSame(reference, outcome.get());
    }
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Future(java.util.concurrent.Future) Test(org.junit.Test)

Aggregations

Source (com.google.api.services.dataflow.model.Source)51 Test (org.junit.Test)31 ArrayList (java.util.ArrayList)20 WindowedValue (org.apache.beam.sdk.util.WindowedValue)18 CloudObject (org.apache.beam.runners.dataflow.util.CloudObject)16 Map (java.util.Map)15 Callable (java.util.concurrent.Callable)15 Future (java.util.concurrent.Future)15 HashMap (java.util.HashMap)13 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)12 SortedMap (java.util.SortedMap)11 TreeMap (java.util.TreeMap)11 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)8 ParallelInstruction (com.google.api.services.dataflow.model.ParallelInstruction)7 ReadInstruction (com.google.api.services.dataflow.model.ReadInstruction)6 KV (org.apache.beam.sdk.values.KV)6 Collection (java.util.Collection)5 List (java.util.List)5 IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord)5 Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString)5