Use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
In the class IsmReaderFactory, method createImpl.
/**
 * Returns the Ism reader for the file named by the {@code WorkerPropertyNames.FILENAME} property
 * of {@code spec}.
 *
 * <p>The coder must be a {@link WindowedValueCoder} whose value coder is an {@link
 * IsmRecordCoder}, and the execution context must be a {@link BatchModeExecutionContext}; each
 * requirement is enforced with {@code checkArgument}. The {@code options} and {@code
 * operationContext} arguments are not read by this method.
 *
 * @param spec the cloud object carrying the file name to read
 * @param coder the coder for the windowed Ism records stored in the file
 * @param options not used here
 * @param executionContext the execution context that owns the reader and data caches
 * @param operationContext not used here
 * @return the reader obtained from the execution context's logical reference cache
 */
<V> NativeReader<?> createImpl(
    CloudObject spec,
    Coder<?> coder,
    PipelineOptions options,
    DataflowExecutionContext executionContext,
    DataflowOperationContext operationContext)
    throws Exception {
  final String filename = getString(spec, WorkerPropertyNames.FILENAME);
  final ResourceId ismFile = FileSystems.matchNewResource(filename, false);

  // The outer coder has to be a windowed value coder ...
  checkArgument(
      coder instanceof WindowedValueCoder,
      "%s only supports using %s but got %s.",
      IsmReader.class,
      WindowedValueCoder.class,
      coder);
  @SuppressWarnings("unchecked")
  WindowedValueCoder<IsmRecord<V>> outerCoder = (WindowedValueCoder<IsmRecord<V>>) coder;

  // ... wrapping an Ism record coder.
  final Coder<?> innerCoder = outerCoder.getValueCoder();
  checkArgument(
      innerCoder instanceof IsmRecordCoder,
      "%s only supports using %s but got %s.",
      IsmReader.class,
      IsmRecordCoder.class,
      innerCoder);
  @SuppressWarnings("unchecked")
  final IsmRecordCoder<V> recordCoder = (IsmRecordCoder<V>) innerCoder;

  checkArgument(
      executionContext instanceof BatchModeExecutionContext,
      "%s only supports using %s but got %s.",
      IsmReader.class,
      BatchModeExecutionContext.class,
      executionContext);
  final BatchModeExecutionContext batchContext = (BatchModeExecutionContext) executionContext;

  // Readers are obtained through the context's logical reference cache, keyed by the file's
  // resource id, so that requests for the same file go through the same cache entry. The loader
  // below builds a new reader wired to the context's shared data cache.
  return batchContext
      .<IsmReaderKey, NativeReader<?>>getLogicalReferenceCache()
      .get(
          new IsmReaderKey(ismFile.toString()),
          () ->
              new IsmReaderImpl<V>(
                  ismFile,
                  recordCoder,
                  batchContext
                      .<IsmReaderImpl.IsmShardKey, WeightedValue<NavigableMap<RandomAccessData, WindowedValue<IsmRecord<V>>>>>getDataCache()));
}
Use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
In the class IsmReaderImpl, method getBlock.
/**
 * Returns a map from key to value, where the keys are in increasing lexicographical order. If the
 * requested key is not contained within this file, an empty map is returned.
 *
 * <p>Any channel handed back by the initialization helpers is closed before the block is fetched.
 * This includes the early return taken when the shard or key is not stored in this file, and the
 * case where {@code initializeForKeyedRead} throws after the footer read has opened a channel.
 *
 * @param keyBytes the encoded key to look up
 * @param shardId the id of the shard the key belongs to
 * @param readCounter the counter used to attribute side input reads
 * @return the block whose index entry is the floor of {@code keyBytes}, or an empty map
 * @throws IOException if initialization, closing the channel or fetching the block fails
 */
private NavigableMap<RandomAccessData, WindowedValue<IsmRecord<V>>> getBlock(RandomAccessData keyBytes, int shardId, SideInputReadCounter readCounter) throws IOException {
  Optional<SeekableByteChannel> inChannel = initializeFooterAndShardIndex(Optional.<SeekableByteChannel>absent(), readCounter);
  try {
    // Key is not stored here so return an empty map.
    if (!shardIdToShardMap.containsKey(shardId) || !bloomFilterMightContain(keyBytes)) {
      return ImmutableSortedMap.<RandomAccessData, WindowedValue<IsmRecord<V>>>orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR).build();
    }
    inChannel = initializeForKeyedRead(shardId, inChannel, readCounter);
  } finally {
    // The channel is only needed for initialization; release it on every exit path so that the
    // early return above (or a failure during initialization) does not leak it.
    closeIfPresent(inChannel);
  }
  final NavigableMap<RandomAccessData, IsmShardKey> indexInShard = indexPerShard.get(shardId);
  // The shard index is keyed by the first key of each block, so the floor entry identifies the
  // block that may hold keyBytes.
  final IsmShardKey cacheEntry = indexInShard.floorEntry(keyBytes).getValue();
  try (Closeable readerCloseable = IsmReader.setSideInputReadContext(readCounter)) {
    return fetch(cacheEntry);
  }
}
Use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
In the class IsmReaderImpl, method initializeFooterAndShardIndex.
/**
 * Initialize this Ism reader by reading the footer and shard index. Returns a channel for re-use
 * if this method was required to open one.
 *
 * <p>If the footer has already been read, this only verifies that both shard maps are populated
 * and returns {@code inChannel} unchanged. Otherwise the tail of the file (at most {@code
 * MAX_SHARD_INDEX_AND_FOOTER_SIZE} bytes) is read into memory, the footer and the shard index are
 * decoded from it, and the shard-id and shard-block-offset lookup maps are built. When the bytes
 * read start before the Bloom filter position, {@code initializeBloomFilterAndIndexPerShard} is
 * invoked on an in-memory channel over those bytes; when additionally {@code cache} is non-null
 * and the read started at offset 0, every shard is initialized for keyed reads and every indexed
 * block is loaded into {@code cache}.
 *
 * @param inChannel an already open channel to re-use, or absent to have {@code openIfNeeded}
 *     supply one
 * @param readCounter counter that is entered around the channel read and credited with the number
 *     of bytes read
 * @return {@code inChannel} as given when the footer was already initialized; otherwise the
 *     channel the file tail was read from
 * @throws IOException declared for the channel reads and decoding performed here
 */
private synchronized Optional<SeekableByteChannel> initializeFooterAndShardIndex(Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter) throws IOException {
// Already initialized: the footer and both shard maps are expected to be set together.
if (footer != null) {
checkState(shardIdToShardMap != null, "Expected shard id to shard map to have been initialized.");
checkState(shardOffsetToShardMap != null, "Expected shard offset to shard map to have been initialized.");
return inChannel;
}
checkState(shardIdToShardMap == null, "Expected shard id to shard map to not have been initialized.");
checkState(shardOffsetToShardMap == null, "Expected shard offset to shard map to not have been initialized.");
SeekableByteChannel rawChannel;
RandomAccessData data;
long startPosition;
// The read counter is entered only for the duration of the channel access.
try (Closeable closeReadCounter = readCounter.enter()) {
rawChannel = openIfNeeded(inChannel);
this.length = rawChannel.size();
// We read the last chunk of data, for small files we will capture the entire file.
// We may capture the Bloom filter, shard index, and footer for slightly larger files.
// Otherwise we are guaranteed to capture the footer and the shard index.
startPosition = Math.max(length - MAX_SHARD_INDEX_AND_FOOTER_SIZE, 0);
position(rawChannel, startPosition);
data = new RandomAccessData(ByteStreams.toByteArray(Channels.newInputStream(rawChannel)));
}
readCounter.addBytesRead(data.size());
// Read the fixed length footer.
// NOTE(review): the footer field is assigned before the shard maps below are built; if a later
// step in this method throws, a subsequent call would take the "already initialized" branch and
// fail its checkState. Confirm whether callers can retry after a failure here.
this.footer = FooterCoder.of().decode(data.asInputStream(data.size() - Footer.FIXED_LENGTH, Footer.FIXED_LENGTH));
checkState(startPosition < footer.getIndexPosition(), "Malformed file, expected to have been able to read entire shard index.");
// Translate the absolute index position into an offset within the bytes that were read.
int offsetWithinReadData = (int) (footer.getIndexPosition() - startPosition);
// Decode the list of Ism shard descriptors
List<IsmShard> ismShards = IsmFormat.ISM_SHARD_INDEX_CODER.decode(data.asInputStream(offsetWithinReadData, data.size() - offsetWithinReadData));
// Build the shard id to shard descriptor map
ImmutableSortedMap.Builder<Integer, IsmShard> shardIdToShardMapBuilder = ImmutableSortedMap.orderedBy(Ordering.<Integer>natural());
for (IsmShard ismShard : ismShards) {
shardIdToShardMapBuilder.put(ismShard.getId(), ismShard);
}
shardIdToShardMap = shardIdToShardMapBuilder.build();
// Build the shard block offset to shard descriptor map
ImmutableSortedMap.Builder<Long, IsmShard> shardOffsetToShardMapBuilder = ImmutableSortedMap.orderedBy(Ordering.<Long>natural());
for (IsmShard ismShard : ismShards) {
shardOffsetToShardMapBuilder.put(ismShard.getBlockOffset(), ismShard);
}
shardOffsetToShardMap = shardOffsetToShardMapBuilder.build();
// We may have gotten the Bloom filter, if so lets store it.
if (startPosition < footer.getBloomFilterPosition()) {
// Serve further reads from the bytes already in memory rather than from the raw channel.
Optional<SeekableByteChannel> cachedDataChannel = Optional.<SeekableByteChannel>of(new CachedTailSeekableByteChannel(startPosition, data.array()));
initializeBloomFilterAndIndexPerShard(cachedDataChannel);
// If a block cache is available and the read started at the beginning of the file (so the
// whole file is in memory), initialize every shard for keyed reads and load every indexed
// block into the cache from the in-memory copy.
// NOTE(review): the original comment here was truncated; its surviving fragment reads
// "... case since the IsmSideInputReader does initialization in parallel."
if (cache != null && startPosition == 0) {
for (IsmShard ismShard : ismShards) {
initializeForKeyedRead(ismShard.getId(), cachedDataChannel, readCounter);
}
for (SortedMap<RandomAccessData, IsmShardKey> shards : indexPerShard.values()) {
for (Map.Entry<RandomAccessData, IsmShardKey> block : shards.entrySet()) {
cache.put(block.getValue(), new IsmCacheLoader(block.getValue()).call(cachedDataChannel.get()));
}
}
}
}
// Hand the raw channel back so the caller can re-use or close it.
return Optional.of(rawChannel);
}
Use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
In the class IsmReaderImpl, method initializeForKeyedRead.
/**
 * Initializes the footer, shard index, Bloom filter and index for the requested shard id if they
 * have not been initialized yet. Re-uses the provided channel, returning it or a new one if this
 * method was required to open one.
 *
 * <p>The index built for a shard maps the first key of each range to an {@code IsmShardKey}
 * holding that range's start and end offsets. The first entry uses the empty key and spans from
 * the shard's block offset to the offset of the first index entry; each index entry spans from
 * its own offset to the offset of the following entry; the last entry is bounded above by the
 * shard's index offset.
 *
 * @param shardId the id of the shard whose index is required
 * @param inChannel an already open channel to re-use, or absent to have one opened on demand
 * @param readCounter the counter passed through to the footer and shard index initialization
 * @return the channel that was provided or opened, for the caller to re-use or close
 * @throws IOException if reading the shard index from the channel fails
 * @throws IllegalStateException if the per-shard index or Bloom filter is unexpectedly missing
 */
// Real bug - https://issues.apache.org/jira/browse/BEAM-6559
@SuppressFBWarnings("NP_NULL_ON_SOME_PATH")
private Optional<SeekableByteChannel> initializeForKeyedRead(int shardId, Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter) throws IOException {
  inChannel = initializeFooterAndShardIndex(inChannel, readCounter);
  IsmShard shardWithIndex = shardIdToShardMap.get(shardId);
  // If this shard id is not within this file, we can return immediately.
  if (shardWithIndex == null) {
    return inChannel;
  }
  inChannel = initializeBloomFilterAndIndexPerShard(inChannel);
  // The per-shard index map is dereferenced unconditionally below, so fail with a clear message
  // here rather than with a NullPointerException further down.
  checkState(indexPerShard != null, "Expected index per shard to have been initialized.");
  // If the index has been populated and contains the shard id, we can return.
  if (indexPerShard.containsKey(shardId)) {
    checkState(bloomFilter != null, "Bloom filter expected to have been initialized.");
    return inChannel;
  }
  checkState(indexPerShard.get(shardId) == null, "Expected to not have initialized index for shard %s", shardId);
  Long startOfNextBlock = shardOffsetToShardMap.higherKey(shardWithIndex.getBlockOffset());
  // If no shard starts after this one, use the Bloom filter position from the footer
  // as the upper bound.
  if (startOfNextBlock == null) {
    startOfNextBlock = footer.getBloomFilterPosition();
  }
  // Open the channel if needed and seek to the start of the index.
  SeekableByteChannel rawChannel = openIfNeeded(inChannel);
  rawChannel.position(shardWithIndex.getIndexOffset());
  InputStream inStream = Channels.newInputStream(rawChannel);
  ImmutableSortedMap.Builder<RandomAccessData, IsmShardKey> builder = ImmutableSortedMap.orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR);
  // Every shard key built below refers to the same file; compute its name once.
  final String filename = IsmReaderImpl.this.resourceId.toString();
  // Read the first key
  RandomAccessData currentKeyBytes = new RandomAccessData();
  readKey(inStream, currentKeyBytes);
  long currentOffset = VarInt.decodeLong(inStream);
  // Insert the entry that happens at the beginning limiting the shard block by the
  // first keys block offset.
  builder.put(new RandomAccessData(0), new IsmShardKey(filename, new RandomAccessData(0), shardWithIndex.getBlockOffset(), currentOffset));
  // While more index entries remain before the upper bound, insert an entry for the current key
  // with the pair of offsets that limit the range of the shard block.
  while (rawChannel.position() < startOfNextBlock) {
    // readKey is given a copy of the current key to fill in, leaving currentKeyBytes untouched
    // for the entry inserted below.
    RandomAccessData nextKeyBytes = currentKeyBytes.copy();
    readKey(inStream, nextKeyBytes);
    long nextOffset = VarInt.decodeLong(inStream);
    builder.put(currentKeyBytes, new IsmShardKey(filename, currentKeyBytes, currentOffset, nextOffset));
    currentKeyBytes = nextKeyBytes;
    currentOffset = nextOffset;
  }
  // Upper bound the last entry with the index offset.
  builder.put(currentKeyBytes, new IsmShardKey(filename, currentKeyBytes, currentOffset, shardWithIndex.getIndexOffset()));
  indexPerShard.put(shardId, builder.build());
  return Optional.of(rawChannel);
}
Use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.
In the class IsmSideInputReaderTest, method initInputFile.
/**
 * Writes the given records to an Ism file at {@code tmpFilePath} and returns a source reading it.
 *
 * <p>Records are bucketed by the shard id the coder computes from their key components and, within
 * each bucket, ordered by their encoded key using the unsigned lexicographical comparator. Each
 * bucket is then written out in that order.
 *
 * @param elements the records to write
 * @param coder the coder used to hash and encode record keys and to configure the sink
 * @param tmpFilePath the path of the file to create
 * @return the source produced by {@code newIsmSource} for the written file
 */
private <K, V> Source initInputFile(Iterable<IsmRecord<WindowedValue<V>>> elements, IsmRecordCoder<WindowedValue<V>> coder, String tmpFilePath) throws Exception {
  // Bucket the records per shard; each bucket keeps its records sorted by encoded key.
  Map<Integer, SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>>> recordsByShard = new HashMap<>();
  for (IsmRecord<WindowedValue<V>> element : elements) {
    int shardId = coder.hash(element.getKeyComponents());
    SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>> shardRecords = recordsByShard.get(shardId);
    if (shardRecords == null) {
      shardRecords = new TreeMap<RandomAccessData, IsmRecord<WindowedValue<V>>>(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR);
      recordsByShard.put(shardId, shardRecords);
    }
    shardRecords.put(encodeKeyPortion(coder, element), element);
  }

  // Write the buckets out one after another, each in key order.
  IsmSink<WindowedValue<V>> sink = new IsmSink<>(FileSystems.matchNewResource(tmpFilePath, false), coder, BLOOM_FILTER_SIZE_LIMIT);
  try (SinkWriter<WindowedValue<IsmRecord<WindowedValue<V>>>> writer = sink.writer()) {
    for (SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>> shardRecords : recordsByShard.values()) {
      for (IsmRecord<WindowedValue<V>> record : shardRecords.values()) {
        writer.add(new ValueInEmptyWindows<>(record));
      }
    }
  }
  return newIsmSource(coder, tmpFilePath);
}
Aggregations