Search in sources:

Example 16 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testIsmReaderReferenceCaching.

/**
 * Creates a side input reader over one file holding a single record and one file holding no
 * records, then checks that {@code tagToIsmReaderMap} and {@code tagToEmptyIsmReaderMap} each
 * hold exactly one reader under the view's tag and that each reader refers to the resource of
 * the corresponding file.
 */
@Test
public void testIsmReaderReferenceCaching() throws Exception {
    final Coder<WindowedValue<Long>> valueCoder = WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    final WindowedValue<Long> onlyValue = valueInGlobalWindow(42L);
    final PCollectionView<Long> view = Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asSingleton());

    // Both input files are written with the same record coder configuration.
    final IsmRecordCoder<WindowedValue<Long>> recordCoder =
        IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), valueCoder);
    final Source source = initInputFile(fromValues(Arrays.asList(onlyValue)), recordCoder);
    final Source emptySource = initInputFile(fromValues(Arrays.asList()), recordCoder);

    final String tagId = view.getTagInternal().getId();
    final IsmSideInputReader reader = sideInputReader(tagId, source, emptySource);

    // The file with a record is tracked in tagToIsmReaderMap.
    assertTrue(reader.tagToIsmReaderMap.containsKey(view.getTagInternal()));
    assertEquals(1, reader.tagToIsmReaderMap.get(view.getTagInternal()).size());
    final String sourceFilename = getString(source.getSpec(), WorkerPropertyNames.FILENAME);
    assertEquals(
        FileSystems.matchSingleFileSpec(sourceFilename).resourceId(),
        reader.tagToIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());

    // The file without records is tracked in tagToEmptyIsmReaderMap.
    assertTrue(reader.tagToEmptyIsmReaderMap.containsKey(view.getTagInternal()));
    assertEquals(1, reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).size());
    final String emptySourceFilename = getString(emptySource.getSpec(), WorkerPropertyNames.FILENAME);
    assertEquals(
        FileSystems.matchSingleFileSpec(emptySourceFilename).resourceId(),
        reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());
}
Also used : KvCoder(org.apache.beam.sdk.coders.KvCoder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) MapCoder(org.apache.beam.sdk.coders.MapCoder) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) BigEndianLongCoder(org.apache.beam.sdk.coders.BigEndianLongCoder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) MetadataKeyCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.MetadataKeyCoder) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Source(com.google.api.services.dataflow.model.Source) Test(org.junit.Test)

Example 17 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testMapInWindow.

/**
 * Reads a windowed map side input (byte[] keys, Long values) that is spread over two data files
 * and two metadata files, from {@code NUM_THREADS} concurrent tasks. Each task verifies the map
 * returned for three interval windows, verifies that repeated lookups return the same instance,
 * and verifies that a window absent from the input yields an empty map. Finally the test checks
 * that every task received the very same map instances.
 */
@Test
public void testMapInWindow() throws Exception {
    // Note that we purposely use byte[]s as keys to force structural equality testing
    // versus using java equality testing.
    Coder<WindowedValue<Long>> valueCoder = WindowedValue.getFullCoder(VarLongCoder.of(), INTERVAL_WINDOW_CODER);
    // Expected key/value pairs for the three windows (built with interval arguments 10, 20 and 30).
    // Keys 0x00 and 0x02 are deliberately repeated across windows.
    final ListMultimap<byte[], WindowedValue<Long>> firstWindow = ImmutableListMultimap.<byte[], WindowedValue<Long>>builder().put(new byte[] { 0x00 }, valueInIntervalWindow(12L, 10)).put(new byte[] { 0x01 }, valueInIntervalWindow(22L, 10)).put(new byte[] { 0x02 }, valueInIntervalWindow(32L, 10)).build();
    final ListMultimap<byte[], WindowedValue<Long>> secondWindow = ImmutableListMultimap.<byte[], WindowedValue<Long>>builder().put(new byte[] { 0x00 }, valueInIntervalWindow(42L, 20)).put(new byte[] { 0x03 }, valueInIntervalWindow(52L, 20)).put(new byte[] { 0x02 }, valueInIntervalWindow(62L, 20)).build();
    final ListMultimap<byte[], WindowedValue<Long>> thirdWindow = ImmutableListMultimap.<byte[], WindowedValue<Long>>builder().put(new byte[] { 0x02 }, valueInIntervalWindow(72L, 30)).put(new byte[] { 0x04 }, valueInIntervalWindow(82L, 30)).put(new byte[] { 0x05 }, valueInIntervalWindow(92L, 30)).build();
    final PCollectionView<Map<byte[], Long>> view = Pipeline.create().apply(Create.empty(KvCoder.of(ByteArrayCoder.of(), VarLongCoder.of()))).apply(Window.into(FixedWindows.of(Duration.millis(10)))).apply(View.asMap());
    // Record coder with three key components: metadata-aware byte[] key, interval window, and a long.
    // NOTE(review): the meaning of the numeric arguments (1, 2) is defined by IsmRecordCoder.of and
    // is not visible here - confirm against its documentation.
    IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 2, ImmutableList.of(MetadataKeyCoder.of(ByteArrayCoder.of()), INTERVAL_WINDOW_CODER, BigEndianLongCoder.of()), valueCoder);
    // Collect the records of all three windows into one multimap keyed by an Integer
    // (presumably the shard id, judging by the variable name - verify against forMap).
    Multimap<Integer, IsmRecord<WindowedValue<Long>>> elementsPerShard = forMap(ismCoder, firstWindow);
    elementsPerShard.putAll(forMap(ismCoder, secondWindow));
    elementsPerShard.putAll(forMap(ismCoder, thirdWindow));
    // Split the records over two lists by the parity of the Integer key: even keys go to the
    // first list, odd keys to the second. Each list becomes its own input file below.
    List<IsmRecord<WindowedValue<Long>>> firstElements = new ArrayList<>();
    List<IsmRecord<WindowedValue<Long>>> secondElements = new ArrayList<>();
    for (Map.Entry<Integer, Collection<IsmRecord<WindowedValue<Long>>>> entry : elementsPerShard.asMap().entrySet()) {
        if (entry.getKey() % 2 == 0) {
            firstElements.addAll(entry.getValue());
        } else {
            secondElements.addAll(entry.getValue());
        }
    }
    // Ensure that each file will have some records.
    checkState(!firstElements.isEmpty());
    checkState(!secondElements.isEmpty());
    Source sourceA = initInputFile(firstElements, ismCoder);
    Source sourceB = initInputFile(secondElements, ismCoder);
    // Metadata records are derived from each window's key set. The first window's metadata gets
    // its own file; the second and third windows' metadata are concatenated into another file.
    List<IsmRecord<WindowedValue<Long>>> firstWindowMapMetadata = forMapMetadata(ByteArrayCoder.of(), firstWindow.keySet(), intervalWindow(10));
    List<IsmRecord<WindowedValue<Long>>> secondWindowMapMetadata = forMapMetadata(ByteArrayCoder.of(), secondWindow.keySet(), intervalWindow(20));
    List<IsmRecord<WindowedValue<Long>>> thirdWindowMapMetadata = forMapMetadata(ByteArrayCoder.of(), thirdWindow.keySet(), intervalWindow(30));
    Source sourceMetaA = initInputFile(firstWindowMapMetadata, ismCoder);
    Source sourceMetaB = initInputFile(concat(secondWindowMapMetadata, thirdWindowMapMetadata), ismCoder);
    // A single reader over all four files is shared by every task below.
    final IsmSideInputReader reader = sideInputReader(view.getTagInternal().getId(), sourceA, sourceB, sourceMetaA, sourceMetaB);
    // Build NUM_THREADS identical tasks; each returns the map instances it obtained per window.
    List<Callable<Map<BoundedWindow, Map<byte[], Long>>>> tasks = new ArrayList<>();
    for (int i = 0; i < NUM_THREADS; ++i) {
        tasks.add(() -> {
            // Store a strong reference to the returned value so that the logical reference
            // cache is not cleared for this test.
            Map<byte[], Long> firstValues = reader.get(view, intervalWindow(10));
            Map<byte[], Long> secondValues = reader.get(view, intervalWindow(20));
            Map<byte[], Long> thirdValues = reader.get(view, intervalWindow(30));
            // Compare each returned map against the expected multimap for that window, using the
            // test's TransformForMap / ComparatorForMap helpers.
            verifyMap(Maps.transformValues(firstWindow.asMap(), new TransformForMap<Long>()), firstValues, new ComparatorForMap<Long>());
            verifyMap(Maps.transformValues(secondWindow.asMap(), new TransformForMap<Long>()), secondValues, new ComparatorForMap<Long>());
            verifyMap(Maps.transformValues(thirdWindow.asMap(), new TransformForMap<Long>()), thirdValues, new ComparatorForMap<Long>());
            // Assert that the same value reference was returned showing that it was cached.
            assertSame(firstValues, reader.get(view, intervalWindow(10)));
            assertSame(secondValues, reader.get(view, intervalWindow(20)));
            assertSame(thirdValues, reader.get(view, intervalWindow(30)));
            // Also verify when requesting a window that is not part of the side input
            assertEquals(Collections.EMPTY_MAP, reader.get(view, intervalWindow(40)));
            return ImmutableMap.<BoundedWindow, Map<byte[], Long>>of(intervalWindow(10), firstValues, intervalWindow(20), secondValues, intervalWindow(30), thirdValues);
        });
    }
    // Run every task on the executor service supplied by pipelineOptions; invokeAll returns once
    // all tasks have completed. The first task's result serves as the reference for the others.
    List<Future<Map<BoundedWindow, Map<byte[], Long>>>> results = pipelineOptions.getExecutorService().invokeAll(tasks);
    Map<BoundedWindow, Map<byte[], Long>> value = results.get(0).get();
    // Assert that all threads got back the same reference
    for (Future<Map<BoundedWindow, Map<byte[], Long>>> result : results) {
        assertEquals(value, result.get());
        // Beyond equality, require identity: each window's map must be the same instance.
        for (Map.Entry<BoundedWindow, Map<byte[], Long>> entry : result.get().entrySet()) {
            assertSame(value.get(entry.getKey()), entry.getValue());
        }
    }
}
Also used : ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Collection(java.util.Collection) Future(java.util.concurrent.Future) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 18 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class InMemoryReaderFactoryTest method runTestCreateInMemoryReader.

/**
 * Builds an in-memory cloud source from the given elements and optional index bounds, creates a
 * reader for it through the default {@code ReaderRegistry}, and checks that the result is an
 * {@code InMemoryReader} carrying the expected encoded elements, start index, end index and
 * coder.
 *
 * @param elements elements to encode into the source
 * @param start start index to put in the source spec, or {@code null} to leave it unset
 * @param end end index to put in the source spec, or {@code null} to leave it unset
 * @param expectedStart start index the created reader is expected to report
 * @param expectedEnd end index the created reader is expected to report
 * @param coder coder used to encode the elements and expected on the created reader
 */
<T> void runTestCreateInMemoryReader(List<T> elements, Long start, Long end, int expectedStart, int expectedEnd, Coder<T> coder) throws Exception {
    final Source source = createInMemoryCloudSource(elements, start, end, coder);
    final NativeReader<?> created =
        ReaderRegistry.defaultRegistry()
            .create(
                source,
                PipelineOptionsFactory.create(),
                BatchModeExecutionContext.forTesting(PipelineOptionsFactory.create(), "testStage"),
                TestOperationContext.create());

    // The registry must have selected the in-memory reader implementation.
    assertThat(created, new IsInstanceOf(InMemoryReader.class));
    final InMemoryReader<?> typed = (InMemoryReader<?>) created;

    final List<String> expectedEncoded = InMemoryReaderTest.encodedElements(elements, coder);
    Assert.assertEquals(expectedEncoded, typed.encodedElements);
    Assert.assertEquals(expectedStart, typed.startIndex);
    Assert.assertEquals(expectedEnd, typed.endIndex);
    Assert.assertEquals(coder, typed.coder);
}
Also used : IsInstanceOf(org.hamcrest.core.IsInstanceOf) Source(com.google.api.services.dataflow.model.Source)

Example 19 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class InMemoryReaderFactoryTest method createInMemoryCloudSource.

/**
 * Creates a cloud {@code Source} whose spec names the {@code "InMemorySource"} class and carries
 * the encoded form of {@code elements}, plus start/end indices when they are provided.
 *
 * @param elements elements to encode with {@code coder} and store in the spec
 * @param start start index to record, or {@code null} to omit it
 * @param end end index to record, or {@code null} to omit it
 * @param coder coder used for encoding and recorded as the source's codec
 * @return the populated source
 */
static <T> Source createInMemoryCloudSource(List<T> elements, Long start, Long end, Coder<T> coder) throws Exception {
    final List<String> encoded = InMemoryReaderTest.encodedElements(elements, coder);

    final CloudObject inMemorySpec = CloudObject.forClassName("InMemorySource");
    addStringList(inMemorySpec, WorkerPropertyNames.ELEMENTS, encoded);
    // Index bounds are optional; only write the ones the caller supplied.
    if (start != null) {
        addLong(inMemorySpec, WorkerPropertyNames.START_INDEX, start);
    }
    if (end != null) {
        addLong(inMemorySpec, WorkerPropertyNames.END_INDEX, end);
    }

    return new Source()
        .setSpec(inMemorySpec)
        .setCodec(CloudObjects.asCloudObject(coder, /*sdkComponents=*/ null));
}
Also used : CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) Source(com.google.api.services.dataflow.model.Source)

Example 20 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class ReaderFactoryTest method testCreateReader.

/**
 * Registers {@code TestReaderFactory} under its class name on the default registry and checks
 * that creating a reader for a source whose spec names that class yields a {@code TestReader}.
 */
@Test
public void testCreateReader() throws Exception {
    final CloudObject factorySpec = CloudObject.forClass(TestReaderFactory.class);
    final Source testSource =
        new Source()
            .setSpec(factorySpec)
            .setCodec(CloudObjects.asCloudObject(BigEndianIntegerCoder.of(), /*sdkComponents=*/ null));

    // These options feed the execution context only; the registry call below receives its own
    // freshly created instance.
    final PipelineOptions contextOptions = PipelineOptionsFactory.create();
    final ReaderRegistry withTestFactory =
        ReaderRegistry.defaultRegistry().register(TestReaderFactory.class.getName(), new TestReaderFactory());

    final NativeReader<?> created =
        withTestFactory.create(
            testSource,
            PipelineOptionsFactory.create(),
            BatchModeExecutionContext.forTesting(contextOptions, "testStage"),
            null);
    assertThat(created, new IsInstanceOf(TestReader.class));
}
Also used : CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) IsInstanceOf(org.hamcrest.core.IsInstanceOf) Source(com.google.api.services.dataflow.model.Source) Test(org.junit.Test)

Aggregations

Source (com.google.api.services.dataflow.model.Source): 51, Test (org.junit.Test): 31, ArrayList (java.util.ArrayList): 20, WindowedValue (org.apache.beam.sdk.util.WindowedValue): 18, CloudObject (org.apache.beam.runners.dataflow.util.CloudObject): 16, Map (java.util.Map): 15, Callable (java.util.concurrent.Callable): 15, Future (java.util.concurrent.Future): 15, HashMap (java.util.HashMap): 13, ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap): 12, SortedMap (java.util.SortedMap): 11, TreeMap (java.util.TreeMap): 11, BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 8, ParallelInstruction (com.google.api.services.dataflow.model.ParallelInstruction): 7, ReadInstruction (com.google.api.services.dataflow.model.ReadInstruction): 6, KV (org.apache.beam.sdk.values.KV): 6, Collection (java.util.Collection): 5, List (java.util.List): 5, IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord): 5, Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString): 5