Search in sources :

Example 1 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderImpl method overKeyComponents.

/**
 * Returns a prefix reader iterator for the given key components within the given shard.
 *
 * <p>Any channel obtained while initializing this reader is either handed to the returned
 * iterator (shard-aware case) or closed before this method returns, including on the paths
 * that return an empty iterator.
 *
 * @param keyComponents the key prefix components; must not be null and must not contain more
 *     entries than the coder has key component coders
 * @param shardId the shard to look in; must be 0 when {@code keyComponents} is empty
 * @param keyBytes the encoded form of the key prefix; must not be null and must be empty when
 *     {@code keyComponents} is empty
 * @return an empty iterator when the file has no keys, the shard is not present in this file or
 *     the Bloom filter rules the key out; otherwise an iterator over the candidate blocks
 * @throws IOException if reading or closing the underlying channel fails
 */
@Override
public IsmPrefixReaderIterator overKeyComponents(List<?> keyComponents, int shardId, RandomAccessData keyBytes) throws IOException {
    checkNotNull(keyComponents);
    checkNotNull(keyBytes);
    SideInputReadCounter readCounter = IsmReader.getCurrentSideInputCounter();
    if (keyComponents.isEmpty()) {
        checkArgument(shardId == 0 && keyBytes.size() == 0, "Expected shard id to be 0 and key bytes to be empty " + "but got shard id %s and key bytes of length %s", shardId, keyBytes.size());
    }
    checkArgument(keyComponents.size() <= coder.getKeyComponentCoders().size(), "Expected at most %s key component(s) but received %s.", coder.getKeyComponentCoders().size(), keyComponents);
    Optional<SeekableByteChannel> inChannel = initializeFooterAndShardIndex(Optional.<SeekableByteChannel>absent(), readCounter);
    // If this file is empty, we can return an empty iterator.
    if (footer.getNumberOfKeys() == 0) {
        // The empty iterator does not take the channel, so release it here.
        closeIfPresent(inChannel);
        return new EmptyIsmPrefixReaderIterator(keyComponents);
    }
    // If the request has fewer key components than shard key coders, return the
    // shard-aware iterator; it takes over the (possibly newly opened) channel.
    if (keyComponents.size() < coder.getNumberOfShardKeyCoders(keyComponents)) {
        return new ShardAwareIsmPrefixReaderIterator(keyComponents, openIfNeeded(inChannel), readCounter);
    }
    // If the shard id is not within this file, we know that we can return an empty
    // reader iterator.
    if (!shardIdToShardMap.containsKey(shardId)) {
        // The empty iterator does not take the channel, so release it here.
        closeIfPresent(inChannel);
        return new EmptyIsmPrefixReaderIterator(keyComponents);
    }
    inChannel = initializeForKeyedRead(shardId, inChannel, readCounter);
    // Everything below works from the in-memory Bloom filter and index only.
    closeIfPresent(inChannel);
    if (!bloomFilterMightContain(keyBytes)) {
        return new EmptyIsmPrefixReaderIterator(keyComponents);
    }
    // Otherwise we may actually contain the key so construct a reader iterator
    // which will fetch the data blocks containing the requested key prefix.
    // We find the first key in the index which may contain our prefix
    RandomAccessData floorKey = indexPerShard.get(shardId).floorKey(keyBytes);
    // We compute an upper bound on the key prefix by incrementing the prefix
    RandomAccessData keyBytesUpperBound = keyBytes.increment();
    // Compute the sub-range of the index map that we want to iterate over since
    // any of these blocks may contain the key prefix.
    Iterator<IsmShardKey> blockEntries = indexPerShard.get(shardId).subMap(floorKey, keyBytesUpperBound).values().iterator();
    return new WithinShardIsmPrefixReaderIterator(keyComponents, keyBytes, keyBytesUpperBound, blockEntries, readCounter);
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData)

Example 2 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderImpl method initializeBloomFilterAndIndexPerShard.

/**
 * Loads the Bloom filter and sets up the per-shard index map, once.
 *
 * <p>Shards whose index region is empty (their index offset coincides with the start of
 * whatever follows them in the file) get a single-entry placeholder index up front. The
 * supplied channel is re-used when present; otherwise one is opened. On first initialization
 * the channel that was used is returned; on later calls the argument is returned unchanged.
 */
private synchronized Optional<SeekableByteChannel> initializeBloomFilterAndIndexPerShard(Optional<SeekableByteChannel> inChannel) throws IOException {
    // Already initialized: nothing to read, hand the caller's channel straight back.
    if (indexPerShard != null) {
        checkState(bloomFilter != null, "Expected Bloom filter to have been initialized.");
        return inChannel;
    }
    SeekableByteChannel channel = openIfNeeded(inChannel);
    // Seek to the Bloom filter and decode it from there.
    position(channel, footer.getBloomFilterPosition());
    bloomFilter = ScalableBloomFilterCoder.of().decode(Channels.newInputStream(channel));
    indexPerShard = new HashMap<>();
    // A small shard may carry no index at all. That is detectable because its index
    // offset equals the offset of the next thing in the file: the following shard's
    // block offset, or the Bloom filter position for the last shard.
    Iterator<IsmShard> shards = shardOffsetToShardMap.values().iterator();
    // An empty file has no shards, so there is nothing to prepopulate.
    if (shards.hasNext()) {
        IsmShard shard = shards.next();
        while (shards.hasNext()) {
            IsmShard following = shards.next();
            if (shard.getIndexOffset() == following.getBlockOffset()) {
                indexPerShard.put(shard.getId(), placeholderIndexFor(shard));
            }
            shard = following;
        }
        // If the last shard's index position is equal to the
        // start of the Bloom filter, then we know that the index is empty.
        if (shard.getIndexOffset() == footer.getBloomFilterPosition()) {
            indexPerShard.put(shard.getId(), placeholderIndexFor(shard));
        }
    }
    return Optional.of(channel);
}

/** Builds the single-entry index, keyed by the empty key, used for a shard with no index data. */
private ImmutableSortedMap<RandomAccessData, IsmShardKey> placeholderIndexFor(IsmShard shard) {
    IsmShardKey shardKey = new IsmShardKey(IsmReaderImpl.this.resourceId.toString(), new RandomAccessData(0), shard.getBlockOffset(), shard.getIndexOffset());
    return ImmutableSortedMap.<RandomAccessData, IsmShardKey>orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR).put(new RandomAccessData(0), shardKey).build();
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard)

Example 3 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderImpl method overKeyComponents.

/**
 * Convenience overload that derives the shard id and encoded key bytes from the key components
 * and delegates to the three-argument overload.
 *
 * <p>An empty component list maps to shard id 0 with zero-length key bytes; otherwise the coder
 * encodes the components into the key bytes and supplies the shard id.
 */
@Override
public IsmPrefixReaderIterator overKeyComponents(List<?> keyComponents) throws IOException {
    final RandomAccessData encodedKey;
    final int shard;
    if (keyComponents.isEmpty()) {
        // Nothing to encode: use shard 0 and empty key bytes.
        encodedKey = new RandomAccessData(0);
        shard = 0;
    } else {
        // encodeAndHash fills encodedKey as a side effect and returns the shard id.
        encodedKey = new RandomAccessData();
        shard = coder.encodeAndHash(keyComponents, encodedKey);
    }
    return overKeyComponents(keyComponents, shard, encodedKey);
}
Also used : RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData)

Example 4 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderTest method testInitializationForSmallFilesIsCached.

/**
 * Checks that initializing a reader over a small file puts the file's block into the cache:
 * before the first read the reader is uninitialized and the cache is empty; afterwards the
 * cache holds an entry under the expected shard key containing both written records.
 */
@Test
public void testInitializationForSmallFilesIsCached() throws Exception {
    File ismFile = tmpFolder.newFile();
    String ismPath = ismFile.getAbsolutePath();

    // Write two records to the file.
    List<IsmRecord<byte[]>> records = new ArrayList<>();
    records.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] { 0x04 }), new byte[] { 0x04 }));
    records.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] { 0x08 }), new byte[] { 0x08 }));
    writeElementsToFile(records, ismFile);

    IsmReader<byte[]> reader = new IsmReaderImpl<byte[]>(FileSystems.matchSingleFileSpec(ismPath).resourceId(), CODER, cache);

    // Nothing has been read yet, so neither the reader nor the cache holds any state.
    assertFalse(reader.isInitialized());
    assertEquals(0, cache.size());

    // Requesting an iterator is enough to trigger initialization.
    reader.overKeyComponents(ImmutableList.of());
    assertTrue(reader.isInitialized());

    // The cache should now hold the block under this key.
    // NOTE(review): 0 and 13 look like the block and index offsets of the written file - confirm.
    IsmShardKey expectedKey = new IsmShardKey(ismPath, new RandomAccessData(0), 0, 13);
    WeightedValue<NavigableMap<RandomAccessData, WindowedValue<IsmRecord<byte[]>>>> cachedBlock = cache.getIfPresent(expectedKey);
    assertNotNull(cachedBlock);

    NavigableMap<RandomAccessData, WindowedValue<IsmRecord<byte[]>>> cachedRecords = cachedBlock.getValue();
    assertArrayEquals(new byte[] { 0x04 }, cachedRecords.firstEntry().getValue().getValue().getValue());
    assertArrayEquals(new byte[] { 0x08 }, cachedRecords.lastEntry().getValue().getValue().getValue());
}
Also used : NavigableMap(java.util.NavigableMap) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) IsmShardKey(org.apache.beam.runners.dataflow.worker.IsmReaderImpl.IsmShardKey) ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) File(java.io.File) Test(org.junit.Test)

Example 5 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmSideInputReader method findAndStartReaders.

/**
 * Opens a prefix iterator on every reader for the given key components and keeps only those
 * that started successfully.
 *
 * <p>The key is encoded and hashed once, using the first reader's coder, and the resulting shard
 * id and key bytes are passed to every reader. Each iterator in the result has already had
 * {@link NativeReader.NativeReaderIterator#start} called on it and that call returned
 * {@code true}; iterators for which it returned {@code false} are left out.
 *
 * @return the started iterators in reader order, or an empty list when no readers are given
 */
private <V> List<IsmReader<V>.IsmPrefixReaderIterator> findAndStartReaders(List<IsmReader<V>> readers, final List<?> keyComponents) throws IOException {
    if (readers.isEmpty()) {
        return Collections.emptyList();
    }
    // Encode once up front; encodeAndHash fills encodedKey and returns the shard id.
    RandomAccessData encodedKey = new RandomAccessData();
    int shard = readers.get(0).getCoder().encodeAndHash(keyComponents, encodedKey);
    List<IsmReader<V>.IsmPrefixReaderIterator> started = new ArrayList<>();
    for (final IsmReader<V> candidate : readers) {
        IsmReader<V>.IsmPrefixReaderIterator iterator = candidate.overKeyComponents(keyComponents, shard, encodedKey);
        if (!iterator.start()) {
            // start() returned false for this reader, so leave its iterator out.
            continue;
        }
        started.add(iterator);
    }
    return started;
}
Also used : KV(org.apache.beam.sdk.values.KV) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) ArrayList(java.util.ArrayList)

Aggregations

RandomAccessData (org.apache.beam.runners.dataflow.util.RandomAccessData)12 SeekableByteChannel (java.nio.channels.SeekableByteChannel)5 IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord)5 File (java.io.File)3 ArrayList (java.util.ArrayList)3 IsmShard (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard)3 WindowedValue (org.apache.beam.sdk.util.WindowedValue)3 Test (org.junit.Test)3 Closeable (java.io.Closeable)2 HashMap (java.util.HashMap)2 NavigableMap (java.util.NavigableMap)2 SortedMap (java.util.SortedMap)2 IsmShardKey (org.apache.beam.runners.dataflow.worker.IsmReaderImpl.IsmShardKey)2 WeightedValue (org.apache.beam.sdk.util.WeightedValue)2 KV (org.apache.beam.sdk.values.KV)2 ImmutableSortedMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap)2 SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings)1 InputStream (java.io.InputStream)1 Map (java.util.Map)1 IsmRecordCoder (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder)1