Search in sources :

Example 21 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class IsmReaderTest method dataGeneratorPerShard.

/**
 * Creates a map from Ism shard id to a sorted set of the IsmRecords that hash to that shard.
 *
 * <p>Random primary keys are generated until the map contains at least
 * {@code numberOfPrimaryKeys} distinct shard ids. Shard ids introduced by metadata records count
 * towards that total as well, because the loop condition looks at the size of the whole map. For
 * every primary key, {@code minNumberOfSecondaryKeys} records are produced; each one is either a
 * regular record under the primary key or, with probability {@code PERCENT_METADATA_RECORDS}, a
 * metadata record stored under the shard its own key components hash to.
 *
 * <p>The random generator is seeded with {@code minNumberOfSecondaryKeys}, so its output stream
 * is the same on every call with the same arguments.
 *
 * @param numberOfPrimaryKeys minimum number of distinct shard ids the returned map must contain
 * @param minNumberOfSecondaryKeys number of records generated per primary key; also the seed
 * @param maxKeySize size of the secondary key buffer and exclusive upper bound on the primary key
 *     size; must be at least {@code MIN_KEY_SIZE}. When it equals {@code MIN_KEY_SIZE} every
 *     primary key is exactly {@code MIN_KEY_SIZE} bytes long.
 * @param maxValueSize exclusive upper bound on the value size; {@link Random#nextInt(int)}
 *     requires it to be positive
 * @return the generated records grouped by shard id, each group ordered by an
 *     {@code IsmRecordKeyComparator} built from {@code CODER}
 */
private Map<Integer, SortedSet<IsmRecord<byte[]>>> dataGeneratorPerShard(final int numberOfPrimaryKeys, final int minNumberOfSecondaryKeys, final int maxKeySize, final int maxValueSize) {
    checkState(maxKeySize >= MIN_KEY_SIZE);
    final Random random = new Random(minNumberOfSecondaryKeys);
    // Random.nextInt(bound) throws for a non-positive bound, yet the precondition above allows
    // maxKeySize == MIN_KEY_SIZE. In that case there is only one legal size, so no draw is made.
    final int keySizeRange = maxKeySize - MIN_KEY_SIZE;
    Map<Integer, SortedSet<IsmRecord<byte[]>>> shardToRecordMap = new HashMap<>();
    while (shardToRecordMap.size() < numberOfPrimaryKeys) {
        // Generate the next primary key
        int primaryKeySize = MIN_KEY_SIZE + (keySizeRange > 0 ? random.nextInt(keySizeRange) : 0);
        byte[] primaryKey = new byte[primaryKeySize];
        random.nextBytes(primaryKey);
        int shardId = CODER.hash(ImmutableList.of(primaryKey));
        // Register the shard up front so it is present even if every record generated below
        // turns out to be a metadata record.
        SortedSet<IsmRecord<byte[]>> shardRecords = shardToRecordMap.computeIfAbsent(shardId, unused -> new TreeSet<IsmRecord<byte[]>>(new IsmRecordKeyComparator<byte[]>(CODER)));
        // Generate the requested number of secondary keys using the newly generated primary key.
        byte[] secondaryKey = new byte[maxKeySize];
        for (int j = 0; j < minNumberOfSecondaryKeys; ++j) {
            secondaryKey = generateNextSecondaryKey(random, maxKeySize, secondaryKey);
            // Generate the value bytes.
            byte[] value = new byte[random.nextInt(maxValueSize)];
            random.nextBytes(value);
            // A PERCENT_METADATA_RECORDS fraction of the records are metadata records
            if (random.nextFloat() < PERCENT_METADATA_RECORDS) {
                IsmRecord<byte[]> ismRecord = IsmRecord.meta(ImmutableList.of(IsmFormat.getMetadataKey(), secondaryKey), value);
                // Metadata records are filed under the shard of their own key components,
                // which is computed separately from the primary key's shard.
                int metadataShardId = CODER.hash(ismRecord.getKeyComponents());
                shardToRecordMap.computeIfAbsent(metadataShardId, unused -> new TreeSet<IsmRecord<byte[]>>(new IsmRecordKeyComparator<byte[]>(CODER))).add(ismRecord);
            } else {
                IsmRecord<byte[]> ismRecord = IsmRecord.<byte[]>of(ImmutableList.of(primaryKey, secondaryKey), value);
                shardRecords.add(ismRecord);
            }
        }
    }
    return shardToRecordMap;
}
Also used : Random(java.util.Random) HashMap(java.util.HashMap) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) SortedSet(java.util.SortedSet)

Example 22 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class IsmReaderTest method testReadMissingKeys.

/**
 * Writes two records with secondary keys 0x04 and 0x08 and verifies that starting a prefix
 * iterator on a key absent from the file reports that there is nothing to read.
 */
@Test
public void testReadMissingKeys() throws Exception {
    File tmpFile = tmpFolder.newFile();
    // Secondary keys that are actually stored in the file.
    final byte[] storedKeys = { 0x04, 0x08 };
    List<IsmRecord<byte[]>> data = new ArrayList<>();
    for (byte storedKey : storedKeys) {
        data.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] { storedKey }), EMPTY));
    }
    writeElementsToFile(data, tmpFile);
    IsmReader<byte[]> reader = new IsmReaderImpl<>(FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);
    final byte[] absentKeys = {
        // A key before all keys contained in the file.
        0x02,
        // A key between two other keys contained in the file.
        0x06,
        // A key that is after all keys contained in the file.
        0x10
    };
    // Each lookup must report false because none of these keys was written.
    for (byte absentKey : absentKeys) {
        assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] { absentKey })).start());
    }
}
Also used : ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) File(java.io.File) Test(org.junit.Test)

Example 23 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class IsmReaderTest method writeElementsToFileAndFindLastElementPerPrimaryKey.

/**
 * Writes the elements rejected by {@code EvenFilter} to a new Ism file and verifies that asking
 * the reader for the last record under a primary-key prefix yields the greatest written record
 * for that primary key.
 *
 * <p>Elements accepted by {@code EvenFilter} are never written. For their primary keys the
 * reader may return nothing, or the greatest written record sharing that primary key.
 */
private void writeElementsToFileAndFindLastElementPerPrimaryKey(Iterable<IsmRecord<byte[]>> elements) throws Exception {
    File tmpFile = tmpFolder.newFile();
    Iterable<IsmRecord<byte[]>> written = Iterables.filter(elements, Predicates.not(EvenFilter.INSTANCE));
    Iterable<IsmRecord<byte[]>> notWritten = Iterables.filter(elements, EvenFilter.INSTANCE);
    writeElementsToFile(written, tmpFile);
    IsmReader<byte[]> reader = new IsmReaderImpl<>(FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);
    // Group the written records by encoded primary key; each group is ordered by record key so
    // that last() is the expected answer for that primary key.
    SortedMap<byte[], NavigableSet<IsmRecord<byte[]>>> recordsByPrimaryKey = new TreeMap<>(UnsignedBytes.lexicographicalComparator());
    for (IsmRecord<byte[]> record : written) {
        byte[] primaryKeyBytes = CoderUtils.encodeToByteArray(CODER.getKeyComponentCoder(0), record.getKeyComponent(0));
        NavigableSet<IsmRecord<byte[]>> group = recordsByPrimaryKey.get(primaryKeyBytes);
        if (group == null) {
            group = new TreeSet<>(new IsmRecordKeyComparator<>(CODER));
            recordsByPrimaryKey.put(primaryKeyBytes, group);
        }
        group.add(record);
    }
    // Every written record's primary key must resolve to the last record of its group.
    for (IsmRecord<byte[]> record : written) {
        byte[] primaryKeyBytes = CoderUtils.encodeToByteArray(CODER.getKeyComponentCoder(0), record.getKeyComponent(0));
        IsmRecord<byte[]> actualLast = reader.overKeyComponents(ImmutableList.of(record.getKeyComponent(0))).getLast().getValue();
        IsmRecord<byte[]> expectedLast = recordsByPrimaryKey.get(primaryKeyBytes).last();
        assertIsmEquals(actualLast, expectedLast);
    }
    // For records that were not written, the lookup either finds nothing or finds the last
    // written record that shares the primary key.
    for (IsmRecord<byte[]> record : notWritten) {
        byte[] primaryKeyBytes = CoderUtils.encodeToByteArray(CODER.getKeyComponentCoder(0), record.getKeyComponent(0));
        IsmReader<byte[]>.IsmPrefixReaderIterator prefixIterator = reader.overKeyComponents(ImmutableList.of(record.getKeyComponent(0)));
        WindowedValue<IsmRecord<byte[]>> found = prefixIterator.getLast();
        if (found == null) {
            continue;
        }
        assertIsmEquals(found.getValue(), recordsByPrimaryKey.get(primaryKeyBytes).last());
    }
}
Also used : NavigableSet(java.util.NavigableSet) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) TreeMap(java.util.TreeMap) File(java.io.File)

Example 24 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class IsmReaderTest method writeElementsToFileAndReadInRandomOrder.

/**
 * Writes the elements rejected by {@code EvenFilter} to an Ism file and looks them up through an
 * IsmReader in shuffled order, both by starting a prefix iterator on the full key and by calling
 * {@code get()} on an iterator over the empty prefix. The elements accepted by
 * {@code EvenFilter} are never written and must be reported as missing by both lookup styles.
 */
private void writeElementsToFileAndReadInRandomOrder(Iterable<IsmRecord<byte[]>> elements) throws Exception {
    File tmpFile = tmpFolder.newFile();
    List<IsmRecord<byte[]>> present = new ArrayList<>(ImmutableList.copyOf(Iterables.filter(elements, Predicates.not(EvenFilter.INSTANCE))));
    List<IsmRecord<byte[]>> absent = new ArrayList<>(ImmutableList.copyOf(Iterables.filter(elements, EvenFilter.INSTANCE)));
    writeElementsToFile(present, tmpFile);
    IsmReader<byte[]> reader = new IsmReaderImpl<>(FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);

    // Written records: start() on an iterator over the full key must land on that record.
    Collections.shuffle(present);
    for (IsmRecord<byte[]> wanted : present) {
        IsmReader<byte[]>.IsmPrefixReaderIterator exactKeyIterator = reader.overKeyComponents(wanted.getKeyComponents());
        assertTrue(exactKeyIterator.start());
        assertIsmEquals(exactKeyIterator.getCurrent().getValue(), wanted);
    }

    // Written records: get() on an iterator over the empty prefix must return that record.
    Collections.shuffle(present);
    IsmReader<byte[]>.IsmPrefixReaderIterator presentLookup = reader.overKeyComponents(ImmutableList.of());
    for (IsmRecord<byte[]> wanted : present) {
        assertIsmEquals(presentLookup.get(wanted.getKeyComponents()).getValue(), wanted);
    }

    // Unwritten records: start() on an iterator over the full key must report false.
    Collections.shuffle(absent);
    for (IsmRecord<byte[]> unwritten : absent) {
        assertFalse(reader.overKeyComponents(unwritten.getKeyComponents()).start());
    }

    // Unwritten records: get() on an iterator over the empty prefix must return null.
    Collections.shuffle(absent);
    IsmReader<byte[]>.IsmPrefixReaderIterator absentLookup = reader.overKeyComponents(ImmutableList.of());
    for (IsmRecord<byte[]> unwritten : absent) {
        assertNull(absentLookup.get(unwritten.getKeyComponents()));
    }
}
Also used : ArrayList(java.util.ArrayList) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) File(java.io.File)

Example 25 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class IsmReaderTest method writeElementsToFileAndReadInOrder.

/**
 * Writes elements to an Ism file via {@code writeElementsToFile}, then reads them back in order
 * with an IsmReader, verifying that the records read match those that were written, that the
 * reader returns neither more nor fewer records than were written, and that the attached
 * observer saw at least as many bytes as each record's value (or metadata) occupies.
 *
 * @param elements the records to write, in the order the reader is expected to return them
 */
private void writeElementsToFileAndReadInOrder(Iterable<IsmRecord<byte[]>> elements) throws Exception {
    File tmpFile = tmpFolder.newFile();
    writeElementsToFile(elements, tmpFile);
    IsmReader<byte[]> reader = new IsmReaderImpl<>(FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);
    // Constructing the reader alone must not initialize it.
    assertFalse(reader.isInitialized());
    TestReaderObserver observer = new TestReaderObserver(reader);
    reader.addObserver(observer);
    Iterator<IsmRecord<byte[]>> elementsIterator = elements.iterator();
    try (NativeReader.NativeReaderIterator<WindowedValue<IsmRecord<byte[]>>> iterator = reader.iterator()) {
        boolean more = iterator.start();
        assertTrue(reader.isInitialized());
        // Walk both sequences in lock step until either one is exhausted.
        while (more && elementsIterator.hasNext()) {
            IsmRecord<byte[]> expected = elementsIterator.next();
            IsmRecord<byte[]> actual = iterator.getCurrent().getValue();
            assertIsmEquals(actual, expected);
            // Metadata records carry their payload in the metadata field rather than the value.
            final int expectedLength;
            if (IsmFormat.isMetadataKey(expected.getKeyComponents())) {
                expectedLength = expected.getMetadata().length;
            } else {
                expectedLength = expected.getValue().length;
            }
            // Verify that the observer saw at least as many bytes as the size of the value.
            assertTrue(expectedLength <= observer.getActualSizes().get(observer.getActualSizes().size() - 1));
            more = iterator.advance();
        }
        // If the reader is still positioned on an element here, the expected elements ran out
        // first. Checking the flag directly (instead of advancing again) ensures that a reader
        // returning exactly one surplus element is reported rather than skipped over.
        if (more) {
            fail("Read more elements then expected, did not expect: " + iterator.getCurrent());
        } else if (elementsIterator.hasNext()) {
            fail("Read less elements then expected, expected: " + elementsIterator.next());
        }
        // Verify that we see a {@link NoSuchElementException} if we attempt to go further.
        try {
            iterator.getCurrent();
            fail("Expected a NoSuchElementException to have been thrown.");
        } catch (NoSuchElementException expected) {
            // Expected: the iterator has been exhausted, so there is no current element.
        }
    }
}
Also used : TestReaderObserver(org.apache.beam.runners.dataflow.worker.util.common.worker.ExecutorTestUtils.TestReaderObserver) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) NativeReader(org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader) WindowedValue(org.apache.beam.sdk.util.WindowedValue) File(java.io.File) NoSuchElementException(java.util.NoSuchElementException)

Aggregations

IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord): 26, Test (org.junit.Test): 17, WindowedValue (org.apache.beam.sdk.util.WindowedValue): 16, ArrayList (java.util.ArrayList): 12, File (java.io.File): 8, KV (org.apache.beam.sdk.values.KV): 8, HashMap (java.util.HashMap): 7, IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 7, Instant (org.joda.time.Instant): 7, SortedMap (java.util.SortedMap): 6, TreeMap (java.util.TreeMap): 6, Callable (java.util.concurrent.Callable): 6, Future (java.util.concurrent.Future): 6, Source (com.google.api.services.dataflow.model.Source): 5, Collection (java.util.Collection): 5, Map (java.util.Map): 5, RandomAccessData (org.apache.beam.runners.dataflow.util.RandomAccessData): 5, ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap): 5, BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 3, TransformedMap (org.apache.beam.runners.dataflow.BatchViewOverrides.TransformedMap): 2