Search in sources :

Example 1 with IsmShard

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.

the class IsmReaderImpl method initializeBloomFilterAndIndexPerShard.

/**
 * Loads the Bloom filter and creates the per-shard index map, doing nothing when the index map
 * already exists. Shards whose index is empty get a single-entry index right away: a shard's
 * index is treated as empty when its index offset equals the block offset of the shard that
 * follows it or, for the last shard, the position of the Bloom filter. Re-uses the provided
 * channel, returning it or a new one if this method was required to open one.
 */
private synchronized Optional<SeekableByteChannel> initializeBloomFilterAndIndexPerShard(Optional<SeekableByteChannel> inChannel) throws IOException {
    // Already initialized; the Bloom filter is set before the index map, so it must be present.
    if (indexPerShard != null) {
        checkState(bloomFilter != null, "Expected Bloom filter to have been initialized.");
        return inChannel;
    }
    SeekableByteChannel channel = openIfNeeded(inChannel);
    // Seek to the Bloom filter and decode it directly from the channel.
    position(channel, footer.getBloomFilterPosition());
    bloomFilter = ScalableBloomFilterCoder.of().decode(Channels.newInputStream(channel));
    indexPerShard = new HashMap<>();
    // Visit each shard together with its successor in shardOffsetToShardMap. A null successor
    // marks the last shard; an empty file never enters the loop.
    Iterator<IsmShard> remainingShards = shardOffsetToShardMap.values().iterator();
    IsmShard shard = remainingShards.hasNext() ? remainingShards.next() : null;
    while (shard != null) {
        IsmShard successor = remainingShards.hasNext() ? remainingShards.next() : null;
        // The index holds no data when it ends exactly where the next region starts: the
        // successor's first block, or the Bloom filter for the last shard.
        boolean indexIsEmpty = successor != null
                ? shard.getIndexOffset() == successor.getBlockOffset()
                : shard.getIndexOffset() == footer.getBloomFilterPosition();
        if (indexIsEmpty) {
            // Prepopulate a one-entry index built from the shard's block and index offsets.
            IsmShardKey wholeShard = new IsmShardKey(IsmReaderImpl.this.resourceId.toString(), new RandomAccessData(0), shard.getBlockOffset(), shard.getIndexOffset());
            indexPerShard.put(
                    shard.getId(),
                    ImmutableSortedMap.<RandomAccessData, IsmShardKey>orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR)
                            .put(new RandomAccessData(0), wholeShard)
                            .build());
        }
        shard = successor;
    }
    return Optional.of(channel);
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard)

Example 2 with IsmShard

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.

the class IsmReaderImpl method initializeFooterAndShardIndex.

/**
 * Reads the footer and the shard index from the tail of the file and builds the shard lookup
 * maps, doing nothing when the footer has already been read. If the tail that was read also
 * covers the Bloom filter, the Bloom filter and per-shard index are initialized from the
 * in-memory copy; if in addition the read started at position 0 and a cache is present, every
 * shard's index is loaded and every indexed block is put into the cache.
 *
 * @return the provided channel when already initialized, otherwise the channel that was used
 *     for reading, for re-use by the caller
 */
private synchronized Optional<SeekableByteChannel> initializeFooterAndShardIndex(Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter) throws IOException {
    if (footer != null) {
        // Footer and both shard maps are populated together below.
        checkState(shardIdToShardMap != null, "Expected shard id to shard map to have been initialized.");
        checkState(shardOffsetToShardMap != null, "Expected shard offset to shard map to have been initialized.");
        return inChannel;
    }
    checkState(shardIdToShardMap == null, "Expected shard id to shard map to not have been initialized.");
    checkState(shardOffsetToShardMap == null, "Expected shard offset to shard map to not have been initialized.");

    SeekableByteChannel channel;
    RandomAccessData tail;
    long tailStart;
    try (Closeable readScope = readCounter.enter()) {
        channel = openIfNeeded(inChannel);
        this.length = channel.size();
        // Read at most MAX_SHARD_INDEX_AND_FOOTER_SIZE bytes from the end of the file. For a
        // small file this is the whole file; otherwise it must at least span the shard index
        // and the footer (verified by the checkState below).
        tailStart = Math.max(length - MAX_SHARD_INDEX_AND_FOOTER_SIZE, 0);
        position(channel, tailStart);
        tail = new RandomAccessData(ByteStreams.toByteArray(Channels.newInputStream(channel)));
    }
    readCounter.addBytesRead(tail.size());

    // The fixed-length footer occupies the last Footer.FIXED_LENGTH bytes of what was read.
    this.footer = FooterCoder.of().decode(tail.asInputStream(tail.size() - Footer.FIXED_LENGTH, Footer.FIXED_LENGTH));
    checkState(tailStart < footer.getIndexPosition(), "Malformed file, expected to have been able to read entire shard index.");

    // Translate the absolute index position into an offset within the bytes we hold.
    int indexStartInTail = (int) (footer.getIndexPosition() - tailStart);
    List<IsmShard> shards = IsmFormat.ISM_SHARD_INDEX_CODER.decode(tail.asInputStream(indexStartInTail, tail.size() - indexStartInTail));

    // Index the shard descriptors twice: by shard id and by block offset.
    ImmutableSortedMap.Builder<Integer, IsmShard> byId = ImmutableSortedMap.orderedBy(Ordering.<Integer>natural());
    ImmutableSortedMap.Builder<Long, IsmShard> byBlockOffset = ImmutableSortedMap.orderedBy(Ordering.<Long>natural());
    for (IsmShard shard : shards) {
        byId.put(shard.getId(), shard);
        byBlockOffset.put(shard.getBlockOffset(), shard);
    }
    shardIdToShardMap = byId.build();
    shardOffsetToShardMap = byBlockOffset.build();

    // Nothing more to do unless the bytes we read start before the Bloom filter.
    if (tailStart >= footer.getBloomFilterPosition()) {
        return Optional.of(channel);
    }
    // Serve the Bloom filter (and any follow-up reads below) from the in-memory tail.
    Optional<SeekableByteChannel> cachedTail = Optional.<SeekableByteChannel>of(new CachedTailSeekableByteChannel(tailStart, tail.array()));
    initializeBloomFilterAndIndexPerShard(cachedTail);

    // Only when the read started at position 0 (the whole file is in memory) and a cache exists
    // do we eagerly load every shard index and cache every block.
    // NOTE(review): the original comment here was truncated; it referred to IsmSideInputReader
    // doing initialization in parallel.
    if (cache == null || tailStart != 0) {
        return Optional.of(channel);
    }
    for (IsmShard shard : shards) {
        initializeForKeyedRead(shard.getId(), cachedTail, readCounter);
    }
    for (SortedMap<RandomAccessData, IsmShardKey> shardIndex : indexPerShard.values()) {
        for (IsmShardKey blockKey : shardIndex.values()) {
            cache.put(blockKey, new IsmCacheLoader(blockKey).call(cachedTail.get()));
        }
    }
    return Optional.of(channel);
}
Also used : RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) Closeable(java.io.Closeable) ImmutableSortedMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap) SeekableByteChannel(java.nio.channels.SeekableByteChannel) Map(java.util.Map) NavigableMap(java.util.NavigableMap) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) ImmutableSortedMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap) IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard)

Example 3 with IsmShard

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.

the class IsmReaderImpl method initializeForKeyedRead.

/**
 * Makes sure everything needed for a keyed read of {@code shardId} is loaded: the footer and
 * shard index, the Bloom filter, and the index of the requested shard. Work that has already
 * been done is skipped, and a shard id that is not in this file causes an immediate return.
 * Re-uses the provided channel, returning it or a new one if this method was required to open
 * one.
 */
// Real bug - https://issues.apache.org/jira/browse/BEAM-6559
@SuppressFBWarnings("NP_NULL_ON_SOME_PATH")
private Optional<SeekableByteChannel> initializeForKeyedRead(int shardId, Optional<SeekableByteChannel> inChannel, SideInputReadCounter readCounter) throws IOException {
    Optional<SeekableByteChannel> channel = initializeFooterAndShardIndex(inChannel, readCounter);
    IsmShard shard = shardIdToShardMap.get(shardId);
    // A shard id that this file does not contain needs no further initialization.
    if (shard == null) {
        return channel;
    }
    channel = initializeBloomFilterAndIndexPerShard(channel);
    // The shard's index may already be present (loaded earlier or prepopulated as empty).
    if (indexPerShard != null && indexPerShard.containsKey(shardId)) {
        checkState(bloomFilter != null, "Bloom filter expected to have been initialized.");
        return channel;
    }
    checkState(indexPerShard.get(shardId) == null, "Expected to not have initialized index for shard %s", shardId);

    // The index is read up to the block offset of the next shard or, when no shard has a
    // higher block offset, up to the Bloom filter.
    Long nextShardBlockStart = shardOffsetToShardMap.higherKey(shard.getBlockOffset());
    long indexEnd = (nextShardBlockStart == null) ? footer.getBloomFilterPosition() : nextShardBlockStart;

    // Open the channel if needed and seek to the start of this shard's index.
    SeekableByteChannel openedChannel = openIfNeeded(channel);
    openedChannel.position(shard.getIndexOffset());
    InputStream indexStream = Channels.newInputStream(openedChannel);
    ImmutableSortedMap.Builder<RandomAccessData, IsmShardKey> index = ImmutableSortedMap.orderedBy(RandomAccessData.UNSIGNED_LEXICOGRAPHICAL_COMPARATOR);

    // First index entry: a key followed by a var-int encoded offset.
    RandomAccessData key = new RandomAccessData();
    readKey(indexStream, key);
    long offset = VarInt.decodeLong(indexStream);
    // Leading entry under the empty key: from the shard's block offset up to the first
    // indexed offset.
    index.put(new RandomAccessData(0), new IsmShardKey(IsmReaderImpl.this.resourceId.toString(), new RandomAccessData(0), shard.getBlockOffset(), offset));

    // Each further entry closes the range of the previous key: [previous offset, this offset).
    while (openedChannel.position() < indexEnd) {
        // readKey is handed a copy of the previous key.
        // NOTE(review): presumably keys are encoded relative to the previous key; confirm
        // against readKey.
        RandomAccessData followingKey = key.copy();
        readKey(indexStream, followingKey);
        long followingOffset = VarInt.decodeLong(indexStream);
        index.put(key, new IsmShardKey(IsmReaderImpl.this.resourceId.toString(), key, offset, followingOffset));
        key = followingKey;
        offset = followingOffset;
    }
    // The last key's range is bounded above by the shard's index offset.
    index.put(key, new IsmShardKey(IsmReaderImpl.this.resourceId.toString(), key, offset, shard.getIndexOffset()));
    indexPerShard.put(shardId, index.build());
    return Optional.of(openedChannel);
}
Also used : SeekableByteChannel(java.nio.channels.SeekableByteChannel) InputStream(java.io.InputStream) RandomAccessData(org.apache.beam.runners.dataflow.util.RandomAccessData) ImmutableSortedMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap) IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings)

Example 4 with IsmShard

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.

the class IsmFormatTest method testIsmShardToStringEqualsAndHashCode.

/** Checks that IsmShard has value-based equals/hashCode and a toString that names each field. */
@Test
public void testIsmShardToStringEqualsAndHashCode() {
    IsmShard reference = IsmShard.of(1, 2, 3);
    IsmShard sameValues = IsmShard.of(1, 2, 3);
    IsmShard otherValues = IsmShard.of(4, 5, 6);

    // equals: same field values are equal, different values are not.
    assertEquals(reference, sameValues);
    assertNotEquals(reference, otherValues);

    // hashCode agrees with equals for these instances.
    assertEquals(reference.hashCode(), sameValues.hashCode());
    assertNotEquals(reference.hashCode(), otherValues.hashCode());

    // toString exposes id, block offset and index offset.
    String rendered = reference.toString();
    assertThat(
            rendered,
            allOf(
                    containsString("id=1"),
                    containsString("blockOffset=2"),
                    containsString("indexOffset=3")));
}
Also used : IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard) Test(org.junit.Test)

Example 5 with IsmShard

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard in project beam by apache.

the class IsmFormatTest method testIsmShardCoder.

/** Runs the standard CoderProperties checks against IsmShardCoder using two equal shards. */
@Test
public void testIsmShardCoder() throws Exception {
    IsmShard shard = IsmShard.of(1, 2, 3);
    IsmShard equalShard = IsmShard.of(1, 2, 3);

    // Round trip: decoding an encoded shard yields an equal shard.
    CoderProperties.coderDecodeEncodeEqual(IsmShardCoder.of(), shard);
    // Equal shards must be handled deterministically and consistently with equals().
    CoderProperties.coderDeterministic(IsmShardCoder.of(), shard, equalShard);
    CoderProperties.coderConsistentWithEquals(IsmShardCoder.of(), shard, equalShard);
    // The coder itself must be serializable.
    CoderProperties.coderSerializable(IsmShardCoder.of());
    // Structural values must agree with equals() as well.
    CoderProperties.structuralValueConsistentWithEquals(IsmShardCoder.of(), shard, equalShard);
}
Also used : IsmShard(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard) Test(org.junit.Test)

Aggregations

IsmShard (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmShard): 5, SeekableByteChannel (java.nio.channels.SeekableByteChannel): 3, RandomAccessData (org.apache.beam.runners.dataflow.util.RandomAccessData): 3, ImmutableSortedMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSortedMap): 2, Test (org.junit.Test): 2, SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings): 1, Closeable (java.io.Closeable): 1, InputStream (java.io.InputStream): 1, HashMap (java.util.HashMap): 1, Map (java.util.Map): 1, NavigableMap (java.util.NavigableMap): 1, SortedMap (java.util.SortedMap): 1