Search in sources :

Example 16 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

In class IsmReaderImpl, method getBlock.

/**
 * Looks up the block associated with {@code keyBytes} inside the shard {@code shardId}.
 *
 * <p>Returns a map from key to value, where the keys are in increasing lexicographical order. If
 * the requested key is not contained within this file, an empty map is returned.
 *
 * @param keyBytes the key bytes to look up
 * @param shardId identifier of the shard to search
 * @param readCounter counter handed to the initialization helpers and installed as the side
 *     input read context while the block is fetched
 * @return the result of {@code fetch} for the index entry at or before {@code keyBytes}, or an
 *     empty map when the shard is unknown or the Bloom filter rules the key out
 * @throws IOException if initializing, closing the channel, or fetching the block fails
 */
private NavigableMap<RandomAccessData, WindowedValue<IsmRecord<V>>> getBlock(RandomAccessData keyBytes, int shardId, SideInputReadCounter readCounter) throws IOException {
    Optional<SeekableByteChannel> inChannel = initializeFooterAndShardIndex(Optional.<SeekableByteChannel>absent(), readCounter);
    // Key is not stored here so return an empty map.
    if (!shardIdToShardMap.containsKey(shardId) || !bloomFilterMightContain(keyBytes)) {
        // Release whatever channel the footer/shard-index initialization handed back; without
        // this the channel would be dropped unclosed on the early-return path.
        // NOTE(review): assumes closeIfPresent is a no-op for an absent Optional - confirm.
        closeIfPresent(inChannel);
        return ImmutableSortedMap.<RandomAccessData, WindowedValue<IsmRecord<V>>>orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR).build();
    }
    inChannel = initializeForKeyedRead(shardId, inChannel, readCounter);
    // The channel is not needed past initialization; the block itself is obtained via fetch().
    closeIfPresent(inChannel);
    final NavigableMap<RandomAccessData, IsmShardKey> indexInShard = indexPerShard.get(shardId);
    // The greatest index key that is <= keyBytes identifies the block to fetch.
    final IsmShardKey cacheEntry = indexInShard.floorEntry(keyBytes).getValue();
    try (Closeable readerCloseable = IsmReader.setSideInputReadContext(readCounter)) {
        return fetch(cacheEntry);
    }
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) Closeable(java.io.Closeable) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord)

Example 17 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

In class IsmSideInputReaderTest, method testMultimapViewInWindow.

/**
 * Reads a windowed {@code MultimapView} backed by two ISM files from several concurrent tasks
 * and checks both the contents of each window and that repeated lookups return the same cached
 * reference.
 */
@Test
public void testMultimapViewInWindow() throws Exception {
    // byte[] keys are deliberate: they force structural equality checks rather than Java
    // equality. The duplicate key of the multimap is expressed by reusing one byte[] instance.
    byte[] sharedKey = new byte[] { 0x01 };
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), INTERVAL_WINDOW_CODER);
    final ListMultimap<byte[], WindowedValue<Long>> windowTen =
        ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
            .put(new byte[] { 0x00 }, valueInIntervalWindow(12L, 10))
            .put(sharedKey, valueInIntervalWindow(22L, 10))
            .put(sharedKey, valueInIntervalWindow(23L, 10))
            .put(new byte[] { 0x02 }, valueInIntervalWindow(32L, 10))
            .build();
    final ListMultimap<byte[], WindowedValue<Long>> windowTwenty =
        ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
            .put(new byte[] { 0x00 }, valueInIntervalWindow(42L, 20))
            .put(new byte[] { 0x03 }, valueInIntervalWindow(52L, 20))
            .put(new byte[] { 0x02 }, valueInIntervalWindow(62L, 20))
            .build();
    final ListMultimap<byte[], WindowedValue<Long>> windowThirty =
        ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
            .put(new byte[] { 0x02 }, valueInIntervalWindow(73L, 30))
            .put(new byte[] { 0x04 }, valueInIntervalWindow(82L, 30))
            .put(new byte[] { 0x05 }, valueInIntervalWindow(92L, 30))
            .build();
    final PCollectionView<MultimapView<byte[], WindowedValue<Long>>> view = DataflowPortabilityPCollectionView.with(new TupleTag<>(), FullWindowedValueCoder.of(KvCoder.of(ByteArrayCoder.of(), windowedLongCoder), INTERVAL_WINDOW_CODER));
    IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 0, ImmutableList.of(ByteArrayCoder.of(), INTERVAL_WINDOW_CODER, BigEndianLongCoder.of()), windowedLongCoder);

    // Collect the records of all three windows, keyed by shard.
    Multimap<Integer, IsmRecord<WindowedValue<Long>>> recordsByShard = forMap(ismCoder, windowTen);
    recordsByShard.putAll(forMap(ismCoder, windowTwenty));
    recordsByShard.putAll(forMap(ismCoder, windowThirty));

    // Shards with an even id go to the first file, all remaining shards to the second.
    List<IsmRecord<WindowedValue<Long>>> evenShardRecords = new ArrayList<>();
    List<IsmRecord<WindowedValue<Long>>> otherShardRecords = new ArrayList<>();
    for (Map.Entry<Integer, Collection<IsmRecord<WindowedValue<Long>>>> shard : recordsByShard.asMap().entrySet()) {
        List<IsmRecord<WindowedValue<Long>>> destination = shard.getKey() % 2 == 0 ? evenShardRecords : otherShardRecords;
        destination.addAll(shard.getValue());
    }
    // Both files must end up with at least one record.
    checkState(!evenShardRecords.isEmpty());
    checkState(!otherShardRecords.isEmpty());
    Source sourceA = initInputFile(evenShardRecords, ismCoder);
    Source sourceB = initInputFile(otherShardRecords, ismCoder);
    final IsmSideInputReader reader = sideInputReader(view.getTagInternal().getId(), sourceA, sourceB);

    Callable<Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>>> lookupTask = () -> {
        // Hold strong references to the returned views so the logical reference cache is not
        // cleared while this test runs.
        MultimapView<byte[], WindowedValue<Long>> viewTen = reader.get(view, intervalWindow(10));
        MultimapView<byte[], WindowedValue<Long>> viewTwenty = reader.get(view, intervalWindow(20));
        MultimapView<byte[], WindowedValue<Long>> viewThirty = reader.get(view, intervalWindow(30));
        for (Map.Entry<byte[], Collection<WindowedValue<Long>>> expected : windowTen.asMap().entrySet()) {
            verifyIterable(expected.getValue(), viewTen.get(expected.getKey()));
        }
        for (Map.Entry<byte[], Collection<WindowedValue<Long>>> expected : windowTwenty.asMap().entrySet()) {
            verifyIterable(expected.getValue(), viewTwenty.get(expected.getKey()));
        }
        for (Map.Entry<byte[], Collection<WindowedValue<Long>>> expected : windowThirty.asMap().entrySet()) {
            verifyIterable(expected.getValue(), viewThirty.get(expected.getKey()));
        }
        // A second lookup must hand back the very same reference, i.e. the value was cached.
        assertSame(viewTen, reader.get(view, intervalWindow(10)));
        assertSame(viewTwenty, reader.get(view, intervalWindow(20)));
        assertSame(viewThirty, reader.get(view, intervalWindow(30)));
        return ImmutableMap.of(intervalWindow(10), viewTen, intervalWindow(20), viewTwenty, intervalWindow(30), viewThirty);
    };
    List<Callable<Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>>>> tasks = new ArrayList<>();
    for (int submitted = 0; submitted < 3; submitted++) {
        tasks.add(lookupTask);
    }

    List<Future<Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>>>> futures = pipelineOptions.getExecutorService().invokeAll(tasks);
    Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>> reference = futures.get(0).get();
    // Every task must have observed the same view references as the first one.
    for (Future<Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>>> future : futures) {
        Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>> observed = future.get();
        assertEquals(reference, observed);
        for (Map.Entry<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>> perWindow : observed.entrySet()) {
            assertSame(reference.get(perWindow.getKey()), perWindow.getValue());
        }
    }
}
Also used : MultimapView(org.apache.beam.sdk.transforms.Materializations.MultimapView) ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Collection(java.util.Collection) Future(java.util.concurrent.Future) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 18 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

In class IsmSideInputReaderTest, method initInputFile.

/**
 * Writes the given records to an ISM file at {@code tmpFilePath} and returns the source created
 * by {@code newIsmSource} for that file.
 *
 * <p>Records are bucketed by the shard id computed by {@code coder.hash(...)}; within a shard
 * they are written in the order of their encoded key portion, compared with
 * {@code RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR}.
 */
private <K, V> Source initInputFile(Iterable<IsmRecord<WindowedValue<V>>> elements, IsmRecordCoder<WindowedValue<V>> coder, String tmpFilePath) throws Exception {
    // shard id -> records of that shard, sorted by their encoded composite key.
    Map<Integer, SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>>> recordsByShard = new HashMap<>();
    for (IsmRecord<WindowedValue<V>> record : elements) {
        int shardId = coder.hash(record.getKeyComponents());
        SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>> shardRecords =
            recordsByShard.computeIfAbsent(
                shardId,
                unused -> new TreeMap<RandomAccessData, IsmRecord<WindowedValue<V>>>(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR));
        RandomAccessData encodedKey = encodeKeyPortion(coder, record);
        shardRecords.put(encodedKey, record);
    }
    IsmSink<WindowedValue<V>> sink = new IsmSink<>(FileSystems.matchNewResource(tmpFilePath, false), coder, BLOOM_FILTER_SIZE_LIMIT);
    try (SinkWriter<WindowedValue<IsmRecord<WindowedValue<V>>>> writer = sink.writer()) {
        // Emit one shard after another, each shard in sorted key order.
        for (SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>> shardRecords : recordsByShard.values()) {
            for (IsmRecord<WindowedValue<V>> record : shardRecords.values()) {
                writer.add(new ValueInEmptyWindows<>(record));
            }
        }
    }
    return newIsmSource(coder, tmpFilePath);
}
Also used : HashMap(java.util.HashMap) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) SortedMap(java.util.SortedMap)

Example 19 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

In class IsmSideInputReaderTest, method testMultimapInWindow.

/**
 * Reads a windowed multimap side input (data files plus map-metadata files) from
 * {@code NUM_THREADS} concurrent tasks and checks the per-window contents, the caching of the
 * returned maps, and the result for a window that is not part of the side input.
 */
@Test
public void testMultimapInWindow() throws Exception {
    // byte[] keys are deliberate: they force structural equality checks rather than Java
    // equality.
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), INTERVAL_WINDOW_CODER);
    final ListMultimap<byte[], WindowedValue<Long>> windowTen =
        ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
            .put(new byte[] { 0x00 }, valueInIntervalWindow(12L, 10))
            .put(new byte[] { 0x01 }, valueInIntervalWindow(22L, 10))
            .put(new byte[] { 0x02 }, valueInIntervalWindow(32L, 10))
            .build();
    final ListMultimap<byte[], WindowedValue<Long>> windowTwenty =
        ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
            .put(new byte[] { 0x00 }, valueInIntervalWindow(42L, 20))
            .put(new byte[] { 0x03 }, valueInIntervalWindow(52L, 20))
            .put(new byte[] { 0x02 }, valueInIntervalWindow(62L, 20))
            .build();
    final ListMultimap<byte[], WindowedValue<Long>> windowThirty =
        ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
            .put(new byte[] { 0x02 }, valueInIntervalWindow(72L, 30))
            .put(new byte[] { 0x04 }, valueInIntervalWindow(82L, 30))
            .put(new byte[] { 0x05 }, valueInIntervalWindow(92L, 30))
            .build();
    final PCollectionView<Map<byte[], Iterable<Long>>> view = Pipeline.create().apply(Create.empty(KvCoder.of(ByteArrayCoder.of(), VarLongCoder.of()))).apply(Window.into(FixedWindows.of(Duration.millis(10)))).apply(View.asMultimap());
    IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 2, ImmutableList.of(MetadataKeyCoder.of(ByteArrayCoder.of()), INTERVAL_WINDOW_CODER, BigEndianLongCoder.of()), windowedLongCoder);

    // Collect the records of all three windows, keyed by shard.
    Multimap<Integer, IsmRecord<WindowedValue<Long>>> recordsByShard = forMap(ismCoder, windowTen);
    recordsByShard.putAll(forMap(ismCoder, windowTwenty));
    recordsByShard.putAll(forMap(ismCoder, windowThirty));

    // Shards with an even id go to the first file, all remaining shards to the second.
    List<IsmRecord<WindowedValue<Long>>> evenShardRecords = new ArrayList<>();
    List<IsmRecord<WindowedValue<Long>>> otherShardRecords = new ArrayList<>();
    for (Map.Entry<Integer, Collection<IsmRecord<WindowedValue<Long>>>> shard : recordsByShard.asMap().entrySet()) {
        List<IsmRecord<WindowedValue<Long>>> destination = shard.getKey() % 2 == 0 ? evenShardRecords : otherShardRecords;
        destination.addAll(shard.getValue());
    }
    // Both data files must end up with at least one record.
    checkState(!evenShardRecords.isEmpty());
    checkState(!otherShardRecords.isEmpty());
    Source sourceA = initInputFile(evenShardRecords, ismCoder);
    Source sourceB = initInputFile(otherShardRecords, ismCoder);

    // Map metadata: the first window gets its own file, the second and third share one.
    List<IsmRecord<WindowedValue<Long>>> metadataTen = forMapMetadata(ByteArrayCoder.of(), windowTen.keySet(), intervalWindow(10));
    List<IsmRecord<WindowedValue<Long>>> metadataTwenty = forMapMetadata(ByteArrayCoder.of(), windowTwenty.keySet(), intervalWindow(20));
    List<IsmRecord<WindowedValue<Long>>> metadataThirty = forMapMetadata(ByteArrayCoder.of(), windowThirty.keySet(), intervalWindow(30));
    Source sourceMetaA = initInputFile(metadataTen, ismCoder);
    Source sourceMetaB = initInputFile(concat(metadataTwenty, metadataThirty), ismCoder);
    final IsmSideInputReader reader = sideInputReader(view.getTagInternal().getId(), sourceA, sourceB, sourceMetaA, sourceMetaB);

    Callable<Map<BoundedWindow, Map<byte[], Iterable<Long>>>> lookupTask = () -> {
        // Hold strong references to the returned maps so the logical reference cache is not
        // cleared while this test runs.
        Map<byte[], Iterable<Long>> mapTen = reader.get(view, intervalWindow(10));
        Map<byte[], Iterable<Long>> mapTwenty = reader.get(view, intervalWindow(20));
        Map<byte[], Iterable<Long>> mapThirty = reader.get(view, intervalWindow(30));
        verifyMap(Maps.transformValues(windowTen.asMap(), new TransformForMultimap<Long>()), mapTen, new ComparatorForMultimap<Long>());
        verifyMap(Maps.transformValues(windowTwenty.asMap(), new TransformForMultimap<Long>()), mapTwenty, new ComparatorForMultimap<Long>());
        verifyMap(Maps.transformValues(windowThirty.asMap(), new TransformForMultimap<Long>()), mapThirty, new ComparatorForMultimap<Long>());
        // A second lookup must hand back the very same reference, i.e. the value was cached.
        assertSame(mapTen, reader.get(view, intervalWindow(10)));
        assertSame(mapTwenty, reader.get(view, intervalWindow(20)));
        assertSame(mapThirty, reader.get(view, intervalWindow(30)));
        // A window that is not part of the side input yields an empty map.
        assertEquals(Collections.EMPTY_MAP, reader.get(view, intervalWindow(40)));
        return ImmutableMap.<BoundedWindow, Map<byte[], Iterable<Long>>>of(intervalWindow(10), mapTen, intervalWindow(20), mapTwenty, intervalWindow(30), mapThirty);
    };
    List<Callable<Map<BoundedWindow, Map<byte[], Iterable<Long>>>>> tasks = new ArrayList<>();
    for (int submitted = 0; submitted < NUM_THREADS; submitted++) {
        tasks.add(lookupTask);
    }

    List<Future<Map<BoundedWindow, Map<byte[], Iterable<Long>>>>> futures = pipelineOptions.getExecutorService().invokeAll(tasks);
    Map<BoundedWindow, Map<byte[], Iterable<Long>>> reference = futures.get(0).get();
    // Every task must have observed the same map references as the first one.
    for (Future<Map<BoundedWindow, Map<byte[], Iterable<Long>>>> future : futures) {
        Map<BoundedWindow, Map<byte[], Iterable<Long>>> observed = future.get();
        assertEquals(reference, observed);
        for (Map.Entry<BoundedWindow, Map<byte[], Iterable<Long>>> perWindow : observed.entrySet()) {
            assertSame(reference.get(perWindow.getKey()), perWindow.getValue());
        }
    }
}
Also used : ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Collection(java.util.Collection) Future(java.util.concurrent.Future) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Example 20 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

In class IsmSideInputReaderTest, method testMultimap.

/**
 * Reads a multimap side input in the global window (two data files plus one map-metadata file)
 * from {@code NUM_THREADS} concurrent tasks and checks its contents and that every lookup
 * returns the same cached reference.
 */
@Test
public void testMultimap() throws Exception {
    // byte[] keys are deliberate: they force structural equality checks rather than Java
    // equality.
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    final ListMultimap<byte[], WindowedValue<Long>> elements =
        ImmutableListMultimap.<byte[], WindowedValue<Long>>builder()
            .put(new byte[] { 0x00 }, valueInGlobalWindow(12L))
            .put(new byte[] { 0x01 }, valueInGlobalWindow(22L))
            .put(new byte[] { 0x02 }, valueInGlobalWindow(32L))
            .put(new byte[] { 0x03 }, valueInGlobalWindow(42L))
            .put(new byte[] { 0x04 }, valueInGlobalWindow(52L))
            .put(new byte[] { 0x05 }, valueInGlobalWindow(62L))
            .build();
    final PCollectionView<Map<byte[], Iterable<Long>>> view = Pipeline.create().apply(Create.empty(KvCoder.of(ByteArrayCoder.of(), VarLongCoder.of()))).apply(View.asMultimap());
    IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 2, ImmutableList.of(MetadataKeyCoder.of(ByteArrayCoder.of()), GLOBAL_WINDOW_CODER, BigEndianLongCoder.of()), windowedLongCoder);

    // Shards with an even id go to the first file, all remaining shards to the second.
    Multimap<Integer, IsmRecord<WindowedValue<Long>>> recordsByShard = forMap(ismCoder, elements);
    List<IsmRecord<WindowedValue<Long>>> evenShardRecords = new ArrayList<>();
    List<IsmRecord<WindowedValue<Long>>> otherShardRecords = new ArrayList<>();
    for (Map.Entry<Integer, Collection<IsmRecord<WindowedValue<Long>>>> shard : recordsByShard.asMap().entrySet()) {
        List<IsmRecord<WindowedValue<Long>>> destination = shard.getKey() % 2 == 0 ? evenShardRecords : otherShardRecords;
        destination.addAll(shard.getValue());
    }
    // Both data files must end up with at least one record.
    checkState(!evenShardRecords.isEmpty());
    checkState(!otherShardRecords.isEmpty());
    Source sourceA = initInputFile(evenShardRecords, ismCoder);
    Source sourceB = initInputFile(otherShardRecords, ismCoder);
    List<IsmRecord<WindowedValue<Long>>> mapMetadata = forMapMetadata(ByteArrayCoder.of(), elements.keySet(), GlobalWindow.INSTANCE);
    Source sourceMeta = initInputFile(mapMetadata, ismCoder);
    final IsmSideInputReader reader = sideInputReader(view.getTagInternal().getId(), sourceA, sourceB, sourceMeta);

    Callable<Map<byte[], Iterable<Long>>> lookupTask = () -> {
        // Hold a strong reference to the returned map so the logical reference cache is not
        // cleared while this test runs.
        Map<byte[], Iterable<Long>> observed = reader.get(view, GlobalWindow.INSTANCE);
        verifyMap(Maps.transformValues(elements.asMap(), new TransformForMultimap<Long>()), observed, new ComparatorForMultimap<Long>());
        // A second lookup must hand back the very same reference, i.e. the value was cached.
        assertSame(reader.get(view, GlobalWindow.INSTANCE), observed);
        return observed;
    };
    List<Callable<Map<byte[], Iterable<Long>>>> tasks = new ArrayList<>();
    for (int submitted = 0; submitted < NUM_THREADS; submitted++) {
        tasks.add(lookupTask);
    }

    List<Future<Map<byte[], Iterable<Long>>>> futures = pipelineOptions.getExecutorService().invokeAll(tasks);
    Map<byte[], Iterable<Long>> reference = futures.get(0).get();
    // Every task must have received the same map reference as the first one.
    for (Future<Map<byte[], Iterable<Long>>> future : futures) {
        assertSame(reference, future.get());
    }
}
Also used : ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) Source(com.google.api.services.dataflow.model.Source) Callable(java.util.concurrent.Callable) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Collection(java.util.Collection) Future(java.util.concurrent.Future) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) Test(org.junit.Test)

Aggregations

IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord)26 Test (org.junit.Test)17 WindowedValue (org.apache.beam.sdk.util.WindowedValue)16 ArrayList (java.util.ArrayList)12 File (java.io.File)8 KV (org.apache.beam.sdk.values.KV)8 HashMap (java.util.HashMap)7 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)7 Instant (org.joda.time.Instant)7 SortedMap (java.util.SortedMap)6 TreeMap (java.util.TreeMap)6 Callable (java.util.concurrent.Callable)6 Future (java.util.concurrent.Future)6 Source (com.google.api.services.dataflow.model.Source)5 Collection (java.util.Collection)5 Map (java.util.Map)5 RandomAccessData (org.apache.beam.runners.dataflow.util.RandomAccessData)5 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)5 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)3 TransformedMap (org.apache.beam.runners.dataflow.BatchViewOverrides.TransformedMap)2