Search in sources :

Example 6 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderFactory method createImpl.

/**
 * Builds the Ism reader for the file named by the {@code WorkerPropertyNames.FILENAME} property
 * of {@code spec}, going through the batch execution context's logical reference cache.
 *
 * @param spec cloud object from which the file name is read
 * @param coder must be a {@code WindowedValueCoder} whose value coder is an {@code IsmRecordCoder}
 * @param options not read by this method
 * @param executionContext must be a {@code BatchModeExecutionContext}
 * @param operationContext not read by this method
 * @return the reader returned by the logical reference cache for this file
 * @throws Exception if resolving the resource or the cache lookup fails; the argument checks
 *     reject unsupported coders and execution contexts
 */
<V> NativeReader<?> createImpl(CloudObject spec, Coder<?> coder, PipelineOptions options, DataflowExecutionContext executionContext, DataflowOperationContext operationContext) throws Exception {
    final String fileName = getString(spec, WorkerPropertyNames.FILENAME);
    final ResourceId resourceId = FileSystems.matchNewResource(fileName, false);

    // The outer coder has to be a windowed-value coder ...
    checkArgument(coder instanceof WindowedValueCoder, "%s only supports using %s but got %s.", IsmReader.class, WindowedValueCoder.class, coder);
    @SuppressWarnings("unchecked")
    WindowedValueCoder<IsmRecord<V>> valueInWindowCoder = (WindowedValueCoder<IsmRecord<V>>) coder;

    // ... wrapping an Ism record coder.
    Coder<IsmRecord<V>> valueCoder = valueInWindowCoder.getValueCoder();
    checkArgument(valueCoder instanceof IsmRecordCoder, "%s only supports using %s but got %s.", IsmReader.class, IsmRecordCoder.class, valueCoder);
    @SuppressWarnings("unchecked")
    final IsmRecordCoder<V> recordCoder = (IsmRecordCoder<V>) valueCoder;

    // Only the batch execution context is supported.
    checkArgument(executionContext instanceof BatchModeExecutionContext, "%s only supports using %s but got %s.", IsmReader.class, BatchModeExecutionContext.class, executionContext);
    final BatchModeExecutionContext batchContext = (BatchModeExecutionContext) executionContext;

    // Look the reader up in the logical reference cache, keyed by the resource id string; the
    // supplied callable constructs a new reader backed by the context's data cache.
    // NOTE(review): presumably the callable only runs when the cache has no entry for this key,
    // so the same reader is shared for the same file -- confirm against the cache implementation.
    return batchContext
        .<IsmReaderKey, NativeReader<?>>getLogicalReferenceCache()
        .get(
            new IsmReaderKey(resourceId.toString()),
            () -> new IsmReaderImpl<V>(
                resourceId,
                recordCoder,
                batchContext.<IsmReaderImpl.IsmShardKey, WeightedValue<NavigableMap<RandomAccessData, WindowedValue<IsmRecord<V>>>>>getDataCache()));
}
Also used : RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) WeightedValue(org.apache.beam.sdk.util.WeightedValue) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) WindowedValue(org.apache.beam.sdk.util.WindowedValue)

Example 7 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderImpl method getBlock.

/**
 * Returns a map from key to value, where the keys are in increasing lexicographical order. If the
 * requested key is not contained within this file, an empty map is returned.
 *
 * <p>Any channel handed back by the initialization helpers is closed before the block is fetched,
 * including on the empty-map path and when one of the helpers throws.
 *
 * @param keyBytes encoded key used for the Bloom filter check and the index lookup
 * @param shardId id of the shard expected to hold the key
 * @param readCounter counter passed to the initialization helpers and installed as the side input
 *     read context while the block is fetched
 * @return the block selected by the greatest index entry not after {@code keyBytes}, or an empty
 *     map when the shard is not in this file or the Bloom filter rules the key out
 * @throws IOException if initialization, closing the channel, or fetching the block fails
 */
private NavigableMap<RandomAccessData, WindowedValue<IsmRecord<V>>> getBlock(RandomAccessData keyBytes, int shardId, SideInputReadCounter readCounter) throws IOException {
    Optional<SeekableByteChannel> inChannel = initializeFooterAndShardIndex(Optional.<SeekableByteChannel>absent(), readCounter);
    try {
        // Key is not stored here so return an empty map.
        if (!shardIdToShardMap.containsKey(shardId) || !bloomFilterMightContain(keyBytes)) {
            return ImmutableSortedMap.<RandomAccessData, WindowedValue<IsmRecord<V>>>orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR).build();
        }
        inChannel = initializeForKeyedRead(shardId, inChannel, readCounter);
    } finally {
        // The helpers return a channel for re-use when they had to open one; this method is the
        // last user of it, so release it on every exit path (previously the early return above
        // left it open).
        closeIfPresent(inChannel);
    }
    final NavigableMap<RandomAccessData, IsmShardKey> indexInShard = indexPerShard.get(shardId);
    // Pick the index entry with the greatest key that is not after the requested key.
    final IsmShardKey cacheEntry = indexInShard.floorEntry(keyBytes).getValue();
    try (Closeable readerCloseable = IsmReader.setSideInputReadContext(readCounter)) {
        return fetch(cacheEntry);
    }
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) Closeable(java.io.Closeable) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord)

Example 8 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderImpl method initializeFooterAndShardIndex.

/**
 * Reads the footer and the shard index of this Ism file the first time it is called, and
 * populates the shard-id and shard-block-offset lookup maps from them. On later calls (footer
 * already set) it only verifies that both maps exist and returns {@code inChannel} unchanged.
 *
 * @param inChannel channel to re-use if present; otherwise one is obtained via
 *     {@code openIfNeeded}
 * @param readCounter entered while the tail of the file is read and credited with the number of
 *     bytes read
 * @return {@code inChannel} when already initialized, otherwise the channel used for the read so
 *     that the caller can re-use it
 * @throws IOException if reading or decoding the tail of the file fails
 */
private synchronized Optional<SeekableByteChannel> initializeFooterAndShardIndex(Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter) throws IOException {
    // Fast path: already initialized, just sanity-check the companion maps.
    if (footer != null) {
        checkState(shardIdToShardMap != null, "Expected shard id to shard map to have been initialized.");
        checkState(shardOffsetToShardMap != null, "Expected shard offset to shard map to have been initialized.");
        return inChannel;
    }
    checkState(shardIdToShardMap == null, "Expected shard id to shard map to not have been initialized.");
    checkState(shardOffsetToShardMap == null, "Expected shard offset to shard map to not have been initialized.");

    SeekableByteChannel channel;
    RandomAccessData tail;
    long tailStart;
    try (Closeable ignored = readCounter.enter()) {
        channel = openIfNeeded(inChannel);
        this.length = channel.size();
        // Read the last chunk of the file. Small files are captured entirely; slightly larger
        // ones may still include the Bloom filter; in every case the footer and the shard index
        // are expected to be inside this chunk.
        tailStart = Math.max(length - MAX_SHARD_INDEX_AND_FOOTER_SIZE, 0);
        position(channel, tailStart);
        tail = new RandomAccessData(ByteStreams.toByteArray(Channels.newInputStream(channel)));
    }
    readCounter.addBytesRead(tail.size());

    // The fixed-length footer occupies the final bytes of the chunk.
    this.footer = FooterCoder.of().decode(tail.asInputStream(tail.size() - Footer.FIXED_LENGTH, Footer.FIXED_LENGTH));
    checkState(tailStart < footer.getIndexPosition(), "Malformed file, expected to have been able to read entire shard index.");

    // Decode the list of shard descriptors, which starts at the index position.
    int indexOffsetInTail = (int) (footer.getIndexPosition() - tailStart);
    List<IsmShard> ismShards = IsmFormat.ISM_SHARD_INDEX_CODER.decode(tail.asInputStream(indexOffsetInTail, tail.size() - indexOffsetInTail));

    // Index the descriptors both by shard id and by shard block offset.
    ImmutableSortedMap.Builder<Integer, IsmShard> byShardId = ImmutableSortedMap.orderedBy(Ordering.<Integer>natural());
    ImmutableSortedMap.Builder<Long, IsmShard> byBlockOffset = ImmutableSortedMap.orderedBy(Ordering.<Long>natural());
    for (IsmShard shard : ismShards) {
        byShardId.put(shard.getId(), shard);
        byBlockOffset.put(shard.getBlockOffset(), shard);
    }
    shardIdToShardMap = byShardId.build();
    shardOffsetToShardMap = byBlockOffset.build();

    // Nothing more to do unless the chunk also covers the Bloom filter.
    if (tailStart >= footer.getBloomFilterPosition()) {
        return Optional.of(channel);
    }

    // Serve the Bloom filter initialization from the bytes already in memory.
    Optional<SeekableByteChannel> tailChannel = Optional.<SeekableByteChannel>of(new CachedTailSeekableByteChannel(tailStart, tail.array()));
    initializeBloomFilterAndIndexPerShard(tailChannel);

    if (cache == null || tailStart != 0) {
        return Optional.of(channel);
    }

    // The chunk started at offset 0, i.e. the whole file is in memory: index every shard and
    // load every block into the cache from the in-memory copy.
    // NOTE: per the original comment, this matters because the IsmSideInputReader does
    // initialization in parallel.
    for (IsmShard shard : ismShards) {
        initializeForKeyedRead(shard.getId(), tailChannel, readCounter);
    }
    for (SortedMap<RandomAccessData, IsmShardKey> shardIndex : indexPerShard.values()) {
        for (IsmShardKey blockKey : shardIndex.values()) {
            cache.put(blockKey, new IsmCacheLoader(blockKey).call(tailChannel.get()));
        }
    }
    return Optional.of(channel);
}
Also used : RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) Closeable(java.io.Closeable) ImmutableSortedMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap) SeekableByteChannel(java.nio.channels.SeekableByteChannel) Map(java.util.Map) NavigableMap(java.util.NavigableMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) ImmutableSortedMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap) IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard)

Example 9 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmReaderImpl method initializeForKeyedRead.

/**
 * Initializes the footer, shard index, Bloom filter and index for the requested shard id if they
 * have not been initialized yet. Re-uses the provided channel, returning it or a new one if this
 * method was required to open one.
 *
 * @param shardId id of the shard whose block index should be available after this call
 * @param inChannel channel to re-use if present
 * @param readCounter counter forwarded to the footer and shard index initialization
 * @return the channel to re-use: the one handed back by the initialization helpers when nothing
 *     had to be read here, otherwise the channel used to read the shard's index
 * @throws IOException if reading the index fails
 * @throws IllegalStateException if the per-shard index or the Bloom filter is unexpectedly
 *     uninitialized
 */
// BEAM-6559 (https://issues.apache.org/jira/browse/BEAM-6559): the possible null dereference of
// indexPerShard is now turned into an explicit state check below. The suppression is kept so the
// static-analysis configuration is unaffected.
@SuppressFBWarnings("NP_NULL_ON_SOME_PATH")
private Optional<SeekableByteChannel> initializeForKeyedRead(int shardId, Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter) throws IOException {
    inChannel = initializeFooterAndShardIndex(inChannel, readCounter);
    IsmShard shardWithIndex = shardIdToShardMap.get(shardId);
    // If this shard id is not within this file, we can return immediately.
    if (shardWithIndex == null) {
        return inChannel;
    }
    inChannel = initializeBloomFilterAndIndexPerShard(inChannel);
    // Fail with a descriptive message instead of a NullPointerException further down if the
    // per-shard index was not set up by the call above.
    checkState(indexPerShard != null, "Expected index per shard to have been initialized.");
    // If the index already contains the shard id, we can return.
    if (indexPerShard.containsKey(shardId)) {
        checkState(bloomFilter != null, "Bloom filter expected to have been initialized.");
        return inChannel;
    }
    checkState(indexPerShard.get(shardId) == null, "Expected to not have initialized index for shard %s", shardId);
    Long startOfNextBlock = shardOffsetToShardMap.higherKey(shardWithIndex.getBlockOffset());
    // No shard block starts after this one, so use the position of the Bloom filter
    // as the upper bound.
    if (startOfNextBlock == null) {
        startOfNextBlock = footer.getBloomFilterPosition();
    }
    // Open the channel if needed and seek to the start of the index.
    SeekableByteChannel rawChannel = openIfNeeded(inChannel);
    rawChannel.position(shardWithIndex.getIndexOffset());
    InputStream inStream = Channels.newInputStream(rawChannel);
    ImmutableSortedMap.Builder<RandomAccessData, IsmShardKey> builder = ImmutableSortedMap.orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR);
    // Read the first key
    RandomAccessData currentKeyBytes = new RandomAccessData();
    readKey(inStream, currentKeyBytes);
    long currentOffset = VarInt.decodeLong(inStream);
    // Every shard key created below names the same resource; compute the string once.
    final String resourceName = IsmReaderImpl.this.resourceId.toString();
    // Insert the entry that happens at the beginning limiting the shard block by the
    // first keys block offset.
    builder.put(new RandomAccessData(0), new IsmShardKey(resourceName, new RandomAccessData(0), shardWithIndex.getBlockOffset(), currentOffset));
    // While the channel is still before the upper bound, read the next index entry and insert
    // an entry for the current key whose start and end offsets limit the range of the shard block.
    while (rawChannel.position() < startOfNextBlock) {
        // Start from a copy of the current key; readKey then updates it in place.
        RandomAccessData nextKeyBytes = currentKeyBytes.copy();
        readKey(inStream, nextKeyBytes);
        long nextOffset = VarInt.decodeLong(inStream);
        builder.put(currentKeyBytes, new IsmShardKey(resourceName, currentKeyBytes, currentOffset, nextOffset));
        currentKeyBytes = nextKeyBytes;
        currentOffset = nextOffset;
    }
    // Upper bound the last entry with the index offset.
    builder.put(currentKeyBytes, new IsmShardKey(resourceName, currentKeyBytes, currentOffset, shardWithIndex.getIndexOffset()));
    indexPerShard.put(shardId, builder.build());
    return Optional.of(rawChannel);
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) InputStream(java.io.InputStream) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) ImmutableSortedMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap) IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings)

Example 10 with RandomAccessData

use of org.apache.beam.runners.dataflow.util.RandomAccessData in project beam by apache.

the class IsmSideInputReaderTest method initInputFile.

/**
 * Writes the given records to an Ism file at {@code tmpFilePath} and returns the source created
 * for that file by {@code newIsmSource}.
 *
 * <p>Records are bucketed by the shard id computed by {@code coder.hash} and, within a bucket,
 * ordered by their encoded key portion using the unsigned lexicographical comparator before
 * being handed to the sink writer.
 */
private <K, V> Source initInputFile(Iterable<IsmRecord<WindowedValue<V>>> elements, IsmRecordCoder<WindowedValue<V>> coder, String tmpFilePath) throws Exception {
    // Shard id -> records of that shard, sorted by encoded key.
    Map<Integer, SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>>> recordsByShard = new HashMap<>();
    for (IsmRecord<WindowedValue<V>> element : elements) {
        int shardId = coder.hash(element.getKeyComponents());
        SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>> shardRecords =
            recordsByShard.computeIfAbsent(
                shardId,
                id -> new TreeMap<RandomAccessData, IsmRecord<WindowedValue<V>>>(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR));
        shardRecords.put(encodeKeyPortion(coder, element), element);
    }

    IsmSink<WindowedValue<V>> sink = new IsmSink<>(FileSystems.matchNewResource(tmpFilePath, false), coder, BLOOM_FILTER_SIZE_LIMIT);
    // Emit shard by shard, each shard's records in key order; the writer is closed on exit.
    try (SinkWriter<WindowedValue<IsmRecord<WindowedValue<V>>>> writer = sink.writer()) {
        for (SortedMap<RandomAccessData, IsmRecord<WindowedValue<V>>> shardRecords : recordsByShard.values()) {
            for (IsmRecord<WindowedValue<V>> record : shardRecords.values()) {
                writer.add(new ValueInEmptyWindows<>(record));
            }
        }
    }
    return newIsmSource(coder, tmpFilePath);
}
Also used : HashMap(java.util.HashMap) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) SortedMap(java.util.SortedMap)

Aggregations

RandomAccessData (org.apache.beam.runners.dataflow.util.RandomAccessData)12 SeekableByteChannel (java.nio.channels.SeekableByteChannel)5 IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord)5 File (java.io.File)3 ArrayList (java.util.ArrayList)3 IsmShard (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard)3 WindowedValue (org.apache.beam.sdk.util.WindowedValue)3 Test (org.junit.Test)3 Closeable (java.io.Closeable)2 HashMap (java.util.HashMap)2 NavigableMap (java.util.NavigableMap)2 SortedMap (java.util.SortedMap)2 IsmShardKey (org.apache.beam.runners.dataflow.worker.IsmReaderImpl.IsmShardKey)2 WeightedValue (org.apache.beam.sdk.util.WeightedValue)2 KV (org.apache.beam.sdk.values.KV)2 ImmutableSortedMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap)2 SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings)1 InputStream (java.io.InputStream)1 Map (java.util.Map)1 IsmRecordCoder (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder)1