Search in sources :

Example 6 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class LengthPrefixUnknownCodersTest method createSideInputInfosWithCoders.

/**
 * Builds a {@link SideInputInfo} containing one {@link Source} per supplied coder.
 *
 * <p>Each source's codec is the cloud-object form of the corresponding coder, and a fresh
 * {@link JacksonFactory} is attached to the side input info and to every source.
 *
 * @param coders the coders to wrap, in the order their sources should appear
 * @return the populated side input info
 */
private static SideInputInfo createSideInputInfosWithCoders(Coder<?>... coders) {
    SideInputInfo info = new SideInputInfo();
    info.setSources(new ArrayList<>());
    info.setFactory(new JacksonFactory());

    for (Coder<?> elementCoder : coders) {
        Source codedSource = new Source();
        codedSource.setCodec(CloudObjects.asCloudObject(elementCoder, /*sdkComponents=*/ null));
        codedSource.setFactory(new JacksonFactory());
        info.getSources().add(codedSource);
    }
    return info;
}
Also used : SideInputInfo(com.google.api.services.dataflow.model.SideInputInfo) JacksonFactory(com.google.api.client.json.jackson2.JacksonFactory) Source(com.google.api.services.dataflow.model.Source)

Example 7 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class CloudSourceUtilsTest method testFlattenBaseSpecs.

/**
 * Checks that {@code CloudSourceUtils.flattenBaseSpecs} yields a source with no base specs whose
 * spec carries every key from the grandparent, parent and child specs.
 *
 * <p>For keys defined at more than one level, the test expects the child's value to win over the
 * parent's, and the parent's to win over the grandparent's. The codec is expected to be carried
 * over unchanged.
 */
@Test
public void testFlattenBaseSpecs() throws Exception {
    // Key letters list the levels that define the key (G = grandparent, P = parent, C = child);
    // the suffix of each value names the level that supplied it.
    CloudObject grandparentSpec = CloudObject.forClassName("text");
    addString(grandparentSpec, "G", "g_g");
    addString(grandparentSpec, "GP", "gp_g");
    addString(grandparentSpec, "GC", "gc_g");
    addString(grandparentSpec, "GPC", "gpc_g");

    CloudObject parentSpec = CloudObject.forClassName("text");
    addString(parentSpec, "P", "p_p");
    addString(parentSpec, "PC", "pc_p");
    addString(parentSpec, "GP", "gp_p");
    addString(parentSpec, "GPC", "gpc_p");

    CloudObject childSpec = CloudObject.forClassName("text");
    addString(childSpec, "C", "c_c");
    addString(childSpec, "PC", "pc_c");
    addString(childSpec, "GC", "gc_c");
    addString(childSpec, "GPC", "gpc_c");

    // Base specs are listed from the most distant ancestor to the nearest one.
    ArrayList<Map<String, Object>> ancestors = new ArrayList<Map<String, Object>>();
    ancestors.add(grandparentSpec);
    ancestors.add(parentSpec);

    Source layered = new Source();
    layered.setBaseSpecs(ancestors);
    layered.setSpec(childSpec);
    layered.setCodec(CloudObjects.asCloudObject(StringUtf8Coder.of(), /*sdkComponents=*/ null));

    Source flattened = CloudSourceUtils.flattenBaseSpecs(layered);

    assertNull(flattened.getBaseSpecs());
    assertEquals(
            StringUtf8Coder.class.getName(),
            getString(flattened.getCodec(), PropertyNames.OBJECT_TYPE_NAME));

    CloudObject mergedSpec = CloudObject.fromSpec(flattened.getSpec());
    // Keys defined at exactly one level.
    assertEquals("g_g", getString(mergedSpec, "G"));
    assertEquals("p_p", getString(mergedSpec, "P"));
    assertEquals("c_c", getString(mergedSpec, "C"));
    // Keys defined at several levels.
    assertEquals("gp_p", getString(mergedSpec, "GP"));
    assertEquals("gc_c", getString(mergedSpec, "GC"));
    assertEquals("pc_c", getString(mergedSpec, "PC"));
    assertEquals("gpc_c", getString(mergedSpec, "GPC"));
}
Also used : CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) Map(java.util.Map) Source(com.google.api.services.dataflow.model.Source) Test(org.junit.Test)

Example 8 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testIterableInWindow.

/**
 * Reads an iterable side input for three interval windows from {@code NUM_THREADS} concurrent
 * tasks and checks that every task sees the expected contents and the very same cached
 * instances.
 */
@Test
public void testIterableInWindow() throws Exception {
    Coder<WindowedValue<Long>> valueCoder =
            WindowedValue.getFullCoder(VarLongCoder.of(), INTERVAL_WINDOW_CODER);
    IsmRecordCoder<WindowedValue<Long>> ismCoder =
            IsmRecordCoder.of(
                    1,
                    0,
                    ImmutableList.of(INTERVAL_WINDOW_CODER, BigEndianLongCoder.of()),
                    valueCoder);

    final List<KV<Long, WindowedValue<Long>>> elementsAt10 =
            Arrays.asList(
                    KV.of(0L, valueInIntervalWindow(12, 10)),
                    KV.of(1L, valueInIntervalWindow(22, 10)),
                    KV.of(2L, valueInIntervalWindow(32, 10)));
    final List<KV<Long, WindowedValue<Long>>> elementsAt20 =
            Arrays.asList(
                    KV.of(0L, valueInIntervalWindow(42, 20)),
                    KV.of(1L, valueInIntervalWindow(52, 20)),
                    KV.of(2L, valueInIntervalWindow(62, 20)));
    final List<KV<Long, WindowedValue<Long>>> elementsAt30 =
            Arrays.asList(
                    KV.of(0L, valueInIntervalWindow(42L, 30)),
                    KV.of(1L, valueInIntervalWindow(52L, 30)),
                    KV.of(2L, valueInIntervalWindow(62L, 30)));

    final PCollectionView<Iterable<Long>> view =
            Pipeline.create()
                    .apply(Create.empty(VarLongCoder.of()))
                    .apply(Window.into(FixedWindows.of(Duration.millis(10))))
                    .apply(View.asIterable());

    // The first file holds the elements read for windows 10 and 20, the second those for 30.
    Source sourceA = initInputFile(fromKvsForList(concat(elementsAt10, elementsAt20)), ismCoder);
    Source sourceB = initInputFile(fromKvsForList(elementsAt30), ismCoder);
    final IsmSideInputReader reader =
            sideInputReader(view.getTagInternal().getId(), sourceA, sourceB);

    List<Callable<Map<BoundedWindow, Iterable<Long>>>> lookups = new ArrayList<>();
    for (int thread = 0; thread < NUM_THREADS; ++thread) {
        lookups.add(
                () -> {
                    // Hold strong references to what the reader hands back so the logical
                    // reference cache cannot be cleared while this test is running.
                    Iterable<Long> at10 = reader.get(view, intervalWindow(10));
                    Iterable<Long> at20 = reader.get(view, intervalWindow(20));
                    Iterable<Long> at30 = reader.get(view, intervalWindow(30));
                    verifyIterable(toValueList(elementsAt10), at10);
                    verifyIterable(toValueList(elementsAt20), at20);
                    verifyIterable(toValueList(elementsAt30), at30);

                    // A second lookup must return the identical instance, i.e. it was cached.
                    assertSame(at10, reader.get(view, intervalWindow(10)));
                    assertSame(at20, reader.get(view, intervalWindow(20)));
                    assertSame(at30, reader.get(view, intervalWindow(30)));

                    return ImmutableMap.<BoundedWindow, Iterable<Long>>of(
                            intervalWindow(10), at10,
                            intervalWindow(20), at20,
                            intervalWindow(30), at30);
                });
    }

    List<Future<Map<BoundedWindow, Iterable<Long>>>> outcomes =
            pipelineOptions.getExecutorService().invokeAll(lookups);
    Map<BoundedWindow, Iterable<Long>> reference = outcomes.get(0).get();

    // Every thread must have received the same instances as the first one.
    for (Future<Map<BoundedWindow, Iterable<Long>>> outcome : outcomes) {
        Map<BoundedWindow, Iterable<Long>> perThread = outcome.get();
        assertEquals(reference, perThread);
        for (Map.Entry<BoundedWindow, Iterable<Long>> windowed : perThread.entrySet()) {
            assertSame(reference.get(windowed.getKey()), windowed.getValue());
        }
    }
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Future(java.util.concurrent.Future) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 9 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testMap.

/**
 * Reads a map side input in the global window from {@code NUM_THREADS} concurrent tasks and
 * checks that every task sees the expected contents and the very same cached instance.
 */
@Test
public void testMap() throws Exception {
    // Keys are deliberately byte[] so that lookups have to rely on structural equality rather
    // than on Java's equals().
    Coder<WindowedValue<Long>> valueCoder =
            WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    final ListMultimap<byte[], WindowedValue<Long>> elements =
            ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
                    .put(new byte[] { 0x00 }, valueInGlobalWindow(12L))
                    .put(new byte[] { 0x01 }, valueInGlobalWindow(22L))
                    .put(new byte[] { 0x02 }, valueInGlobalWindow(32L))
                    .put(new byte[] { 0x03 }, valueInGlobalWindow(42L))
                    .put(new byte[] { 0x04 }, valueInGlobalWindow(52L))
                    .put(new byte[] { 0x05 }, valueInGlobalWindow(62L))
                    .build();
    final PCollectionView<Map<byte[], Long>> view =
            Pipeline.create()
                    .apply(Create.empty(KvCoder.of(ByteArrayCoder.of(), VarLongCoder.of())))
                    .apply(View.asMap());
    IsmRecordCoder<WindowedValue<Long>> ismCoder =
            IsmRecordCoder.of(
                    1,
                    2,
                    ImmutableList.of(
                            MetadataKeyCoder.of(ByteArrayCoder.of()),
                            GLOBAL_WINDOW_CODER,
                            BigEndianLongCoder.of()),
                    valueCoder);

    // Split the records across two files: even-numbered shards in one, odd-numbered in the other.
    Multimap<Integer, IsmRecord<WindowedValue<Long>>> recordsByShard = forMap(ismCoder, elements);
    List<IsmRecord<WindowedValue<Long>>> evenShardRecords = new ArrayList<>();
    List<IsmRecord<WindowedValue<Long>>> oddShardRecords = new ArrayList<>();
    for (Map.Entry<Integer, Collection<IsmRecord<WindowedValue<Long>>>> shard :
            recordsByShard.asMap().entrySet()) {
        List<IsmRecord<WindowedValue<Long>>> target =
                (shard.getKey() % 2 == 0) ? evenShardRecords : oddShardRecords;
        target.addAll(shard.getValue());
    }
    // Neither file may end up empty.
    checkState(!evenShardRecords.isEmpty());
    checkState(!oddShardRecords.isEmpty());

    Source sourceA = initInputFile(evenShardRecords, ismCoder);
    Source sourceB = initInputFile(oddShardRecords, ismCoder);
    List<IsmRecord<WindowedValue<Long>>> metadataRecords =
            forMapMetadata(ByteArrayCoder.of(), elements.keySet(), GlobalWindow.INSTANCE);
    Source sourceMeta = initInputFile(metadataRecords, ismCoder);
    final IsmSideInputReader reader =
            sideInputReader(view.getTagInternal().getId(), sourceA, sourceB, sourceMeta);

    List<Callable<Map<byte[], Long>>> lookups = new ArrayList<>();
    for (int thread = 0; thread < NUM_THREADS; ++thread) {
        lookups.add(
                () -> {
                    // Hold a strong reference to what the reader hands back so the logical
                    // reference cache cannot be cleared while this test is running.
                    Map<byte[], Long> seen = reader.get(view, GlobalWindow.INSTANCE);
                    verifyMap(
                            Maps.transformValues(elements.asMap(), new TransformForMap<Long>()),
                            seen,
                            new ComparatorForMap<Long>());
                    // A second lookup must return the identical instance, i.e. it was cached.
                    assertSame(reader.get(view, GlobalWindow.INSTANCE), seen);
                    return seen;
                });
    }

    List<Future<Map<byte[], Long>>> outcomes =
            pipelineOptions.getExecutorService().invokeAll(lookups);
    Map<byte[], Long> reference = outcomes.get(0).get();
    // Every thread must have received the same instance as the first one.
    for (Future<Map<byte[], Long>> outcome : outcomes) {
        assertSame(reference, outcome.get());
    }
}
Also used : ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Collection(java.util.Collection) Future(java.util.concurrent.Future) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 10 with Source

use of com.google.api.services.dataflow.model.Source in project beam by apache.

the class IsmSideInputReaderTest method testIterable.

/**
 * Reads an iterable side input in the global window from {@code NUM_THREADS} concurrent tasks
 * and checks that every task sees the contents of both input files and the very same cached
 * instance.
 */
@Test
public void testIterable() throws Exception {
    Coder<WindowedValue<Long>> valueCoder =
            WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    IsmRecordCoder<WindowedValue<Long>> ismCoder =
            IsmRecordCoder.of(
                    1,
                    0,
                    ImmutableList.of(GLOBAL_WINDOW_CODER, BigEndianLongCoder.of()),
                    valueCoder);

    final List<KV<Long, WindowedValue<Long>>> fileAElements =
            Arrays.asList(
                    KV.of(0L, valueInGlobalWindow(12L)),
                    KV.of(1L, valueInGlobalWindow(22L)),
                    KV.of(2L, valueInGlobalWindow(32L)));
    final List<KV<Long, WindowedValue<Long>>> fileBElements =
            Arrays.asList(
                    KV.of(0L, valueInGlobalWindow(42L)),
                    KV.of(1L, valueInGlobalWindow(52L)),
                    KV.of(2L, valueInGlobalWindow(62L)));

    final PCollectionView<Iterable<Long>> view =
            Pipeline.create()
                    .apply(Create.empty(VarLongCoder.of()))
                    .apply(View.asIterable());

    Source sourceA = initInputFile(fromKvsForList(fileAElements), ismCoder);
    Source sourceB = initInputFile(fromKvsForList(fileBElements), ismCoder);
    final IsmSideInputReader reader =
            sideInputReader(view.getTagInternal().getId(), sourceA, sourceB);

    List<Callable<Iterable<Long>>> lookups = new ArrayList<>();
    for (int thread = 0; thread < NUM_THREADS; ++thread) {
        lookups.add(
                () -> {
                    // Hold a strong reference to what the reader hands back so the logical
                    // reference cache cannot be cleared while this test is running.
                    Iterable<Long> seen = reader.get(view, GlobalWindow.INSTANCE);
                    verifyIterable(toValueList(concat(fileAElements, fileBElements)), seen);
                    // A second lookup must return the identical instance, i.e. it was cached.
                    assertSame(reader.get(view, GlobalWindow.INSTANCE), seen);
                    return seen;
                });
    }

    List<Future<Iterable<Long>>> outcomes =
            pipelineOptions.getExecutorService().invokeAll(lookups);
    Iterable<Long> reference = outcomes.get(0).get();
    // Every thread must have received the same instance as the first one.
    for (Future<Iterable<Long>> outcome : outcomes) {
        assertSame(reference, outcome.get());
    }
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Future(java.util.concurrent.Future) Test(org.junit.Test)

Aggregations

Source (com.google.api.services.dataflow.model.Source) 51, Test (org.junit.Test) 31, ArrayList (java.util.ArrayList) 20, WindowedValue (org.apache.beam.sdk.util.WindowedValue) 18, CloudObject (org.apache.beam.runners.dataflow.util.CloudObject) 16, Map (java.util.Map) 15, Callable (java.util.concurrent.Callable) 15, Future (java.util.concurrent.Future) 15, HashMap (java.util.HashMap) 13, ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) 12, SortedMap (java.util.SortedMap) 11, TreeMap (java.util.TreeMap) 11, BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow) 8, ParallelInstruction (com.google.api.services.dataflow.model.ParallelInstruction) 7, ReadInstruction (com.google.api.services.dataflow.model.ReadInstruction) 6, KV (org.apache.beam.sdk.values.KV) 6, Collection (java.util.Collection) 5, List (java.util.List) 5, IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) 5, Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString) 5