use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.
the class IsmReaderImpl method initializeBloomFilterAndIndexPerShard.
/**
 * Reads the Bloom filter and creates the per-shard index map, unless that has already happened.
 *
 * <p>A shard whose index offset equals the start of whatever follows it (the block offset of the
 * next shard, or the Bloom filter position for the last shard) is registered right away with a
 * single-entry index spanning the shard's block, because its index section contains no data.
 *
 * @param inChannel a channel to re-use, if the caller has one
 * @return {@code inChannel} unchanged when the state was already initialized; otherwise the
 *     channel obtained from {@code openIfNeeded}, which is the provided one or a newly opened one
 * @throws IOException if positioning the channel or decoding the Bloom filter fails
 */
private synchronized Optional<SeekableByteChannel> initializeBloomFilterAndIndexPerShard(
    Optional<SeekableByteChannel> inChannel) throws IOException {
  // Already initialized: the Bloom filter must have been loaded along with the index map.
  if (indexPerShard != null) {
    checkState(bloomFilter != null, "Expected Bloom filter to have been initialized.");
    return inChannel;
  }

  SeekableByteChannel channel = openIfNeeded(inChannel);

  // Move to the Bloom filter and decode it directly from the channel.
  position(channel, footer.getBloomFilterPosition());
  bloomFilter = ScalableBloomFilterCoder.of().decode(Channels.newInputStream(channel));

  indexPerShard = new HashMap<>();

  // Visit the shards in block-offset order. For every shard, the index section ends where the
  // next shard's block starts, or at the Bloom filter for the last shard. If the index section
  // starts at that very position it is empty, so we can pre-populate the shard's index with a
  // single entry covering the whole block. An empty file simply skips the loop.
  Iterator<IsmShard> remaining = shardOffsetToShardMap.values().iterator();
  IsmShard shard = remaining.hasNext() ? remaining.next() : null;
  while (shard != null) {
    IsmShard following = remaining.hasNext() ? remaining.next() : null;
    long endOfIndex =
        following != null ? following.getBlockOffset() : footer.getBloomFilterPosition();
    if (shard.getIndexOffset() == endOfIndex) {
      indexPerShard.put(
          shard.getId(),
          ImmutableSortedMap.<RandomAccessData, IsmShardKey>orderedBy(
                  RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR)
              .put(
                  new RandomAccessData(0),
                  new IsmShardKey(
                      IsmReaderImpl.this.resourceId.toString(),
                      new RandomAccessData(0),
                      shard.getBlockOffset(),
                      shard.getIndexOffset()))
              .build());
    }
    shard = following;
  }
  return Optional.of(channel);
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.
the class IsmReaderImpl method initializeFooterAndShardIndex.
/**
 * Initializes this Ism reader by reading the footer and shard index from the tail of the file.
 * Returns a channel for re-use if this method was required to open one.
 *
 * <p>The footer and both shard maps are decoded into local variables and only assigned to the
 * reader's fields once all of them have been produced. A failure while decoding (malformed file,
 * I/O error, duplicate shard keys) therefore leaves the reader uninitialized, rather than with a
 * footer but no shard maps, which would make every later call fail its state check.
 *
 * <p>If the bytes read from the tail also cover the Bloom filter, the Bloom filter and the
 * per-shard index map are initialized from those in-memory bytes.
 *
 * @param inChannel a channel to re-use, if the caller has one
 * @param readCounter counter that is entered around the read and told how many bytes were read
 * @return {@code inChannel} unchanged when the footer was already initialized; otherwise the
 *     channel obtained from {@code openIfNeeded}
 * @throws IOException if reading or decoding fails
 * @throws IllegalStateException if the reader state is inconsistent or the file is malformed
 */
private synchronized Optional<SeekableByteChannel> initializeFooterAndShardIndex(
    Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter)
    throws IOException {
  if (footer != null) {
    checkState(
        shardIdToShardMap != null, "Expected shard id to shard map to have been initialized.");
    checkState(
        shardOffsetToShardMap != null,
        "Expected shard offset to shard map to have been initialized.");
    return inChannel;
  }
  checkState(
      shardIdToShardMap == null, "Expected shard id to shard map to not have been initialized.");
  checkState(
      shardOffsetToShardMap == null,
      "Expected shard offset to shard map to not have been initialized.");

  SeekableByteChannel rawChannel;
  RandomAccessData data;
  long startPosition;
  try (Closeable closeReadCounter = readCounter.enter()) {
    rawChannel = openIfNeeded(inChannel);
    this.length = rawChannel.size();
    // We read the last chunk of data, for small files we will capture the entire file.
    // We may capture the Bloom filter, shard index, and footer for slightly larger files.
    // Otherwise we are guaranteed to capture the footer and the shard index.
    startPosition = Math.max(length - MAX_SHARD_INDEX_AND_FOOTER_SIZE, 0);
    position(rawChannel, startPosition);
    data = new RandomAccessData(ByteStreams.toByteArray(Channels.newInputStream(rawChannel)));
  }
  readCounter.addBytesRead(data.size());

  // Read the fixed length footer, which occupies the last bytes of what was read. It is kept in
  // a local until the shard maps have been built as well, so that a failure below does not leave
  // the reader half-initialized.
  Footer decodedFooter =
      FooterCoder.of()
          .decode(data.asInputStream(data.size() - Footer.FIXED_LENGTH, Footer.FIXED_LENGTH));
  checkState(
      startPosition < decodedFooter.getIndexPosition(),
      "Malformed file, expected to have been able to read entire shard index.");
  int offsetWithinReadData = (int) (decodedFooter.getIndexPosition() - startPosition);

  // Decode the list of Ism shard descriptors.
  List<IsmShard> ismShards =
      IsmFormat.ISM_SHARD_INDEX_CODER.decode(
          data.asInputStream(offsetWithinReadData, data.size() - offsetWithinReadData));

  // Build both lookup maps in one pass: shard id -> shard and shard block offset -> shard.
  ImmutableSortedMap.Builder<Integer, IsmShard> shardIdToShardMapBuilder =
      ImmutableSortedMap.orderedBy(Ordering.<Integer>natural());
  ImmutableSortedMap.Builder<Long, IsmShard> shardOffsetToShardMapBuilder =
      ImmutableSortedMap.orderedBy(Ordering.<Long>natural());
  for (IsmShard ismShard : ismShards) {
    shardIdToShardMapBuilder.put(ismShard.getId(), ismShard);
    shardOffsetToShardMapBuilder.put(ismShard.getBlockOffset(), ismShard);
  }
  ImmutableSortedMap<Integer, IsmShard> decodedShardIdToShardMap =
      shardIdToShardMapBuilder.build();
  ImmutableSortedMap<Long, IsmShard> decodedShardOffsetToShardMap =
      shardOffsetToShardMapBuilder.build();

  // Everything decoded successfully: publish the state together.
  this.footer = decodedFooter;
  this.shardIdToShardMap = decodedShardIdToShardMap;
  this.shardOffsetToShardMap = decodedShardOffsetToShardMap;

  // We may have gotten the Bloom filter, if so lets store it.
  if (startPosition < footer.getBloomFilterPosition()) {
    Optional<SeekableByteChannel> cachedDataChannel =
        Optional.<SeekableByteChannel>of(
            new CachedTailSeekableByteChannel(startPosition, data.array()));
    initializeBloomFilterAndIndexPerShard(cachedDataChannel);

    // If a cache is configured and the read started at offset 0, the bytes in memory are the
    // whole file: build the index of every shard and put the loaded value of every index entry
    // into the cache, all served from the in-memory channel.
    // NOTE(review): presumably done eagerly because IsmSideInputReader initializes readers in
    // parallel - confirm.
    if (cache != null && startPosition == 0) {
      for (IsmShard ismShard : ismShards) {
        initializeForKeyedRead(ismShard.getId(), cachedDataChannel, readCounter);
      }
      for (SortedMap<RandomAccessData, IsmShardKey> shards : indexPerShard.values()) {
        for (Map.Entry<RandomAccessData, IsmShardKey> block : shards.entrySet()) {
          cache.put(
              block.getValue(),
              new IsmCacheLoader(block.getValue()).call(cachedDataChannel.get()));
        }
      }
    }
  }
  return Optional.of(rawChannel);
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.
the class IsmReaderImpl method initializeForKeyedRead.
/**
 * Makes sure the footer, shard index, Bloom filter and the index of the requested shard are
 * loaded, reading whatever is still missing. Re-uses the provided channel, returning it or a new
 * one if this method was required to open one.
 *
 * @param shardId id of the shard whose index is needed
 * @param inChannel a channel to re-use, if the caller has one
 * @param readCounter counter handed to the footer/shard-index initialization
 * @return the channel to keep using for subsequent reads
 * @throws IOException if reading or decoding the index fails
 */
// Real bug - https://issues.apache.org/jira/browse/BEAM-6559
@SuppressFBWarnings("NP_NULL_ON_SOME_PATH")
private Optional<SeekableByteChannel> initializeForKeyedRead(
    int shardId, Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter)
    throws IOException {
  Optional<SeekableByteChannel> channel = initializeFooterAndShardIndex(inChannel, readCounter);

  IsmShard shard = shardIdToShardMap.get(shardId);
  if (shard == null) {
    // The requested shard is not part of this file; nothing more to load.
    return channel;
  }

  channel = initializeBloomFilterAndIndexPerShard(channel);

  // Nothing to do when this shard's index is already present.
  if (indexPerShard != null && indexPerShard.containsKey(shardId)) {
    checkState(bloomFilter != null, "Bloom filter expected to have been initialized.");
    return channel;
  }
  checkState(
      indexPerShard.get(shardId) == null,
      "Expected to not have initialized index for shard %s",
      shardId);

  // The index of this shard ends where the next shard's block begins. For the last shard there
  // is no next block, so the Bloom filter position is the upper bound instead.
  Long followingBlockStart = shardOffsetToShardMap.higherKey(shard.getBlockOffset());
  long indexEnd =
      followingBlockStart != null ? followingBlockStart : footer.getBloomFilterPosition();

  // Open the channel if needed and seek to the start of the index.
  SeekableByteChannel rawChannel = openIfNeeded(channel);
  rawChannel.position(shard.getIndexOffset());
  InputStream indexStream = Channels.newInputStream(rawChannel);
  ImmutableSortedMap.Builder<RandomAccessData, IsmShardKey> index =
      ImmutableSortedMap.orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR);

  // First index entry: its key and the block offset at which that key starts.
  RandomAccessData key = new RandomAccessData();
  readKey(indexStream, key);
  long blockStart = VarInt.decodeLong(indexStream);

  String resource = IsmReaderImpl.this.resourceId.toString();

  // Leading entry under the empty key: covers the shard block from its start up to the offset
  // of the first indexed key.
  index.put(
      new RandomAccessData(0),
      new IsmShardKey(resource, new RandomAccessData(0), shard.getBlockOffset(), blockStart));

  // While more index entries remain, emit an entry for the current key bounded by the offset of
  // the next key. The next key starts out as a copy of the current one before readKey fills it
  // in (presumably because readKey only overwrites the part that differs - confirm).
  while (rawChannel.position() < indexEnd) {
    RandomAccessData nextKey = key.copy();
    readKey(indexStream, nextKey);
    long nextBlockStart = VarInt.decodeLong(indexStream);
    index.put(key, new IsmShardKey(resource, key, blockStart, nextBlockStart));
    key = nextKey;
    blockStart = nextBlockStart;
  }

  // The final entry is bounded above by the start of the index itself.
  index.put(key, new IsmShardKey(resource, key, blockStart, shard.getIndexOffset()));
  indexPerShard.put(shardId, index.build());
  return Optional.of(rawChannel);
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.
the class IsmFormatTest method testIsmShardToStringEqualsAndHashCode.
@Test
public void testIsmShardToStringEqualsAndHashCode() {
  // Two shards built from the same values, and one built from different values.
  IsmShard shard = IsmShard.of(1, 2, 3);
  IsmShard sameShard = IsmShard.of(1, 2, 3);
  IsmShard otherShard = IsmShard.of(4, 5, 6);

  // equals() and hashCode() agree for equal shards and differ for the distinct one.
  assertEquals(shard, sameShard);
  assertNotEquals(shard, otherShard);
  assertEquals(shard.hashCode(), sameShard.hashCode());
  assertNotEquals(shard.hashCode(), otherShard.hashCode());

  // toString() mentions each of the three fields with its value.
  String rendered = shard.toString();
  assertThat(
      rendered,
      allOf(
          containsString("id=1"),
          containsString("blockOffset=2"),
          containsString("indexOffset=3")));
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.
the class IsmFormatTest method testIsmShardCoder.
@Test
public void testIsmShardCoder() throws Exception {
  // Two equal shards and the coder under test.
  IsmShard shard = IsmShard.of(1, 2, 3);
  IsmShard sameShard = IsmShard.of(1, 2, 3);
  IsmShardCoder coder = IsmShardCoder.of();

  // Run the coder through the CoderProperties checks, in the same order as before.
  CoderProperties.coderDecodeEncodeEqual(coder, shard);
  CoderProperties.coderDeterministic(coder, shard, sameShard);
  CoderProperties.coderConsistentWithEquals(coder, shard, sameShard);
  CoderProperties.coderSerializable(coder);
  CoderProperties.structuralValueConsistentWithEquals(coder, shard, sameShard);
}
Aggregations