use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
the class IsmReaderImpl method overKeyComponents.
/**
 * Returns a reader iterator positioned over the records matching the given key prefix.
 *
 * <p>Depending on the state of the file this returns an empty iterator (empty file, shard not
 * present, or Bloom filter rejects the key), a shard aware iterator (the prefix is shorter than
 * the shard key), or an iterator restricted to the index blocks of a single shard.
 *
 * @param keyComponents the key prefix; when empty, {@code shardId} must be 0 and {@code keyBytes}
 *     must be empty
 * @param shardId the shard id associated with {@code keyComponents}
 * @param keyBytes the encoded form of {@code keyComponents}
 * @return an iterator over the matching records, never {@code null}
 * @throws IOException if reading or closing the underlying channel fails
 */
@Override
public IsmPrefixReaderIterator overKeyComponents(List<?> keyComponents, int shardId, RandomAccessData keyBytes) throws IOException {
  checkNotNull(keyComponents);
  checkNotNull(keyBytes);
  SideInputReadCounter readCounter = IsmReader.getCurrentSideInputCounter();
  if (keyComponents.isEmpty()) {
    checkArgument(shardId == 0 && keyBytes.size() == 0, "Expected shard id to be 0 and key bytes to be empty " + "but got shard id %s and key bytes of length %s", shardId, keyBytes.size());
  }
  checkArgument(keyComponents.size() <= coder.getKeyComponentCoders().size(), "Expected at most %s key component(s) but received %s.", coder.getKeyComponentCoders().size(), keyComponents);
  Optional<SeekableByteChannel> inChannel = initializeFooterAndShardIndex(Optional.<SeekableByteChannel>absent(), readCounter);
  // If this file is empty, we can return an empty iterator.
  if (footer.getNumberOfKeys() == 0) {
    // Initialization may have opened a channel; release it since nothing will read from it.
    closeIfPresent(inChannel);
    return new EmptyIsmPrefixReaderIterator(keyComponents);
  }
  // If the requested prefix has fewer components than the shard key, then we have to
  // iterate over all the keys. The channel is handed over to the returned iterator, so it
  // must stay open here.
  if (keyComponents.size() < coder.getNumberOfShardKeyCoders(keyComponents)) {
    return new ShardAwareIsmPrefixReaderIterator(keyComponents, openIfNeeded(inChannel), readCounter);
  }
  // If this file does not contain the shard we are looking for,
  // we know that we can return an empty reader iterator.
  if (!shardIdToShardMap.containsKey(shardId)) {
    // Same as the empty-file case: do not leak a channel opened during initialization.
    closeIfPresent(inChannel);
    return new EmptyIsmPrefixReaderIterator(keyComponents);
  }
  inChannel = initializeForKeyedRead(shardId, inChannel, readCounter);
  closeIfPresent(inChannel);
  if (!bloomFilterMightContain(keyBytes)) {
    return new EmptyIsmPrefixReaderIterator(keyComponents);
  }
  // Otherwise we may actually contain the key so construct a reader iterator
  // which will fetch the data blocks containing the requested key prefix.
  // We find the first key in the index which may contain our prefix
  RandomAccessData floorKey = indexPerShard.get(shardId).floorKey(keyBytes);
  // We compute an upper bound on the key prefix by incrementing the prefix
  RandomAccessData keyBytesUpperBound = keyBytes.increment();
  // Compute the sub-range of the index map that we want to iterate over since
  // any of these blocks may contain the key prefix.
  Iterator<IsmShardKey> blockEntries = indexPerShard.get(shardId).subMap(floorKey, keyBytesUpperBound).values().iterator();
  return new WithinShardIsmPrefixReaderIterator(keyComponents, keyBytes, keyBytesUpperBound, blockEntries, readCounter);
}
use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
the class IsmReaderImpl method initializeBloomFilterAndIndexPerShard.
/**
 * Loads the Bloom filter and creates the per shard index map, if that has not happened yet.
 *
 * <p>Shards whose index offset coincides with the start of whatever follows them (the next
 * shard's block offset, or the Bloom filter for the last shard) get a prepopulated index
 * holding a single entry keyed by the empty key. The supplied channel is re-used when present;
 * the channel that was used (supplied or newly opened) is returned to the caller.
 */
private synchronized Optional<SeekableByteChannel> initializeBloomFilterAndIndexPerShard(Optional<SeekableByteChannel> inChannel) throws IOException {
  // Nothing to do when a previous call already performed the initialization.
  if (indexPerShard != null) {
    checkState(bloomFilter != null, "Expected Bloom filter to have been initialized.");
    return inChannel;
  }

  SeekableByteChannel channel = openIfNeeded(inChannel);

  // Seek to the Bloom filter and decode it straight from the channel.
  position(channel, footer.getBloomFilterPosition());
  bloomFilter = ScalableBloomFilterCoder.of().decode(Channels.newInputStream(channel));

  indexPerShard = new HashMap<>();

  // A small shard may not contain an index. We detect this when the start of its index equals
  // the start of the region that follows it, and prepopulate its index map accordingly.
  Iterator<IsmShard> shards = shardOffsetToShardMap.values().iterator();

  // An empty file has no shards, in which case there is nothing to prepopulate.
  if (shards.hasNext()) {
    IsmShard shard = shards.next();

    // Every shard but the last is compared against the block offset of its successor.
    while (shards.hasNext()) {
      IsmShard successor = shards.next();
      if (shard.getIndexOffset() == successor.getBlockOffset()) {
        IsmShardKey wholeShardKey =
            new IsmShardKey(
                IsmReaderImpl.this.resourceId.toString(),
                new RandomAccessData(0),
                shard.getBlockOffset(),
                shard.getIndexOffset());
        indexPerShard.put(
            shard.getId(),
            ImmutableSortedMap.<RandomAccessData, IsmShardKey>orderedBy(
                    RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR)
                .put(new RandomAccessData(0), wholeShardKey)
                .build());
      }
      shard = successor;
    }

    // If the last shard's index position is equal to the start of the Bloom filter,
    // then we know that its index is empty.
    if (shard.getIndexOffset() == footer.getBloomFilterPosition()) {
      IsmShardKey wholeShardKey =
          new IsmShardKey(
              IsmReaderImpl.this.resourceId.toString(),
              new RandomAccessData(0),
              shard.getBlockOffset(),
              shard.getIndexOffset());
      indexPerShard.put(
          shard.getId(),
          ImmutableSortedMap.<RandomAccessData, IsmShardKey>orderedBy(
                  RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR)
              .put(new RandomAccessData(0), wholeShardKey)
              .build());
    }
  }

  return Optional.of(channel);
}
use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
the class IsmReaderImpl method overKeyComponents.
/**
 * Convenience overload that derives the shard id and encoded key bytes from the key components
 * and then delegates to the three argument variant.
 *
 * <p>An empty component list is mapped to shard id 0 and zero-capacity key bytes without
 * consulting the coder.
 */
@Override
public IsmPrefixReaderIterator overKeyComponents(List<?> keyComponents) throws IOException {
  final RandomAccessData encodedKey;
  final int shard;
  if (keyComponents.isEmpty()) {
    shard = 0;
    encodedKey = new RandomAccessData(0);
  } else {
    encodedKey = new RandomAccessData();
    // encodeAndHash writes the encoded key into encodedKey and returns the shard id.
    shard = coder.encodeAndHash(keyComponents, encodedKey);
  }
  return overKeyComponents(keyComponents, shard, encodedKey);
}
use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
the class IsmReaderTest method testInitializationForSmallFilesIsCached.
/**
 * Verifies that initializing a reader over a small two-record file places the file's block into
 * the cache under the expected shard key.
 */
@Test
public void testInitializationForSmallFilesIsCached() throws Exception {
  File ismFile = tmpFolder.newFile();
  String ismPath = ismFile.getAbsolutePath();
  IsmShardKey expectedShardKey = new IsmShardKey(ismPath, new RandomAccessData(0), 0, 13);

  // Two records sharing the EMPTY first key component.
  byte[] firstPayload = new byte[] { 0x04 };
  byte[] secondPayload = new byte[] { 0x08 };
  List<IsmRecord<byte[]>> records = new ArrayList<>();
  records.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] { 0x04 }), firstPayload));
  records.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] { 0x08 }), secondPayload));
  writeElementsToFile(records, ismFile);

  IsmReader<byte[]> reader =
      new IsmReaderImpl<byte[]>(
          FileSystems.matchSingleFileSpec(ismFile.getAbsolutePath()).resourceId(), CODER, cache);

  // Before any read, neither the reader nor the cache has been touched.
  assertFalse(reader.isInitialized());
  assertEquals(0, cache.size());

  // Requesting an iterator over the empty prefix forces initialization.
  reader.overKeyComponents(ImmutableList.of());

  // The reader must now be initialized and the block must be cached under the expected key.
  assertTrue(reader.isInitialized());
  WeightedValue<NavigableMap<RandomAccessData, WindowedValue<IsmRecord<byte[]>>>> cached =
      cache.getIfPresent(expectedShardKey);
  assertNotNull(cached);
  assertArrayEquals(
      new byte[] { 0x04 }, cached.getValue().firstEntry().getValue().getValue().getValue());
  assertArrayEquals(
      new byte[] { 0x08 }, cached.getValue().lastEntry().getValue().getValue().getValue());
}
use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
the class IsmSideInputReader method findAndStartReaders.
/**
 * Builds the list of started reader iterators for the given key components.
 *
 * <p>The key components are encoded and hashed once, using the coder of the first reader, and
 * the result is passed to every reader. Only iterators for which
 * {@link NativeReader.NativeReaderIterator#start} returned {@code true} are included, so each
 * returned iterator has at least one element and has already been started.
 */
private <V> List<IsmReader<V>.IsmPrefixReaderIterator> findAndStartReaders(List<IsmReader<V>> readers, final List<?> keyComponents) throws IOException {
  if (readers.isEmpty()) {
    return Collections.emptyList();
  }

  // Encode the key a single time; the same bytes and shard id are shared across all readers.
  RandomAccessData encodedKey = new RandomAccessData();
  int shard = readers.get(0).getCoder().encodeAndHash(keyComponents, encodedKey);

  List<IsmReader<V>.IsmPrefixReaderIterator> started = new ArrayList<>();
  for (final IsmReader<V> candidate : readers) {
    IsmReader<V>.IsmPrefixReaderIterator iterator =
        candidate.overKeyComponents(keyComponents, shard, encodedKey);
    // Skip iterators that report no elements on start.
    if (!iterator.start()) {
      continue;
    }
    started.add(iterator);
  }
  return started;
}
Aggregations