use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.
the class IsmReaderTest method dataGeneratorPerShard.
/**
 * Creates a map from Ism shard to a sorted set of IsmRecords.
 *
 * <p>Primary keys are generated until the map holds at least {@code numberOfPrimaryKeys} distinct
 * shard ids; shards that only exist because a metadata record hashed to them count towards that
 * total. The random generator is seeded with {@code minNumberOfSecondaryKeys}.
 *
 * @param numberOfPrimaryKeys number of distinct shard ids to reach before generation stops
 * @param minNumberOfSecondaryKeys number of records generated per primary key; also the seed
 * @param maxKeySize upper bound for key sizes in bytes; must be at least {@code MIN_KEY_SIZE}
 * @param maxValueSize exclusive upper bound for the size of each generated value in bytes
 * @return a map from shard id to the records of that shard, ordered by an
 *     {@code IsmRecordKeyComparator} built from {@code CODER}
 */
private Map<Integer, SortedSet<IsmRecord<byte[]>>> dataGeneratorPerShard(final int numberOfPrimaryKeys, final int minNumberOfSecondaryKeys, final int maxKeySize, final int maxValueSize) {
  checkState(maxKeySize >= MIN_KEY_SIZE);
  final Random random = new Random(minNumberOfSecondaryKeys);
  // Random.nextInt(bound) rejects a bound of zero, yet the precondition above allows
  // maxKeySize == MIN_KEY_SIZE. In that case every primary key is exactly MIN_KEY_SIZE bytes.
  final int keySizeRange = maxKeySize - MIN_KEY_SIZE;
  Map<Integer, SortedSet<IsmRecord<byte[]>>> shardToRecordMap = new HashMap<>();
  while (shardToRecordMap.keySet().size() < numberOfPrimaryKeys) {
    // Generate the next primary key
    int extraKeyBytes = keySizeRange == 0 ? 0 : random.nextInt(keySizeRange);
    byte[] primaryKey = new byte[extraKeyBytes + MIN_KEY_SIZE];
    random.nextBytes(primaryKey);
    int shardId = CODER.hash(ImmutableList.of(primaryKey));
    // Add a sorted set for the shard id if this shard id has never been generated before.
    if (!shardToRecordMap.containsKey(shardId)) {
      shardToRecordMap.put(shardId, new TreeSet<IsmRecord<byte[]>>(new IsmRecordKeyComparator<byte[]>(CODER)));
    }
    // Generate the requested number of secondary keys using the newly generated primary key.
    byte[] secondaryKey = new byte[maxKeySize];
    for (int j = 0; j < minNumberOfSecondaryKeys; ++j) {
      secondaryKey = generateNextSecondaryKey(random, maxKeySize, secondaryKey);
      // Generate the value bytes.
      byte[] value = new byte[random.nextInt(maxValueSize)];
      random.nextBytes(value);
      // A PERCENT_METADATA_RECORDS fraction of the records become metadata records.
      if (random.nextFloat() < PERCENT_METADATA_RECORDS) {
        IsmRecord<byte[]> ismRecord = IsmRecord.meta(ImmutableList.of(IsmFormat.getMetadataKey(), secondaryKey), value);
        // Metadata records are keyed by the metadata key, so they may hash to another shard.
        int metadataShardId = CODER.hash(ismRecord.getKeyComponents());
        // Add a sorted set for the shard id if this shard id has never been generated before.
        if (!shardToRecordMap.containsKey(metadataShardId)) {
          shardToRecordMap.put(metadataShardId, new TreeSet<IsmRecord<byte[]>>(new IsmRecordKeyComparator<byte[]>(CODER)));
        }
        shardToRecordMap.get(metadataShardId).add(ismRecord);
      } else {
        IsmRecord<byte[]> ismRecord = IsmRecord.<byte[]>of(ImmutableList.of(primaryKey, secondaryKey), value);
        shardToRecordMap.get(shardId).add(ismRecord);
      }
    }
  }
  return shardToRecordMap;
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.
the class IsmReaderTest method testReadMissingKeys.
@Test
public void testReadMissingKeys() throws Exception {
  File ismFile = tmpFolder.newFile();
  // The file holds exactly two records, whose second key components are 0x04 and 0x08.
  List<IsmRecord<byte[]>> written = new ArrayList<>();
  for (byte presentKey : new byte[] { 0x04, 0x08 }) {
    written.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] { presentKey }), EMPTY));
  }
  writeElementsToFile(written, ismFile);
  IsmReader<byte[]> reader = new IsmReaderImpl<byte[]>(FileSystems.matchSingleFileSpec(ismFile.getAbsolutePath()).resourceId(), CODER, cache);
  // Probe with keys that were never written: 0x02 lies below both stored keys, 0x06 lies
  // between them and 0x10 lies above both. Each lookup must report that nothing was found.
  for (byte absentKey : new byte[] { 0x02, 0x06, 0x10 }) {
    assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] { absentKey })).start());
  }
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.
the class IsmReaderTest method writeElementsToFileAndFindLastElementPerPrimaryKey.
/**
 * Writes the elements rejected by {@code EvenFilter} to a new Ism file and checks that
 * {@code getLast()} on a reader positioned over a primary key returns the greatest written
 * record for that key. Elements accepted by {@code EvenFilter} are never written; looking up
 * their primary key must either find nothing or find the greatest written record for that key.
 */
private void writeElementsToFileAndFindLastElementPerPrimaryKey(Iterable<IsmRecord<byte[]>> elements) throws Exception {
  File ismFile = tmpFolder.newFile();
  Iterable<IsmRecord<byte[]>> written = Iterables.filter(elements, Predicates.not(EvenFilter.INSTANCE));
  Iterable<IsmRecord<byte[]>> withheld = Iterables.filter(elements, EvenFilter.INSTANCE);
  writeElementsToFile(written, ismFile);
  IsmReader<byte[]> reader = new IsmReaderImpl<>(FileSystems.matchSingleFileSpec(ismFile.getAbsolutePath()).resourceId(), CODER, cache);

  // Group the written records by their encoded first key component; each group is ordered by
  // the record key comparator so that last() yields the expected answer.
  SortedMap<byte[], NavigableSet<IsmRecord<byte[]>>> writtenByPrimaryKey = new TreeMap<>(UnsignedBytes.lexicographicalComparator());
  for (IsmRecord<byte[]> record : written) {
    byte[] primaryKeyBytes = CoderUtils.encodeToByteArray(CODER.getKeyComponentCoder(0), record.getKeyComponent(0));
    NavigableSet<IsmRecord<byte[]>> group = writtenByPrimaryKey.get(primaryKeyBytes);
    if (group == null) {
      group = new TreeSet<IsmRecord<byte[]>>(new IsmRecordKeyComparator<byte[]>(CODER));
      writtenByPrimaryKey.put(primaryKeyBytes, group);
    }
    group.add(record);
  }

  // Every written record's primary key must resolve to the last record of its group.
  for (IsmRecord<byte[]> record : written) {
    byte[] primaryKeyBytes = CoderUtils.encodeToByteArray(CODER.getKeyComponentCoder(0), record.getKeyComponent(0));
    IsmRecord<byte[]> actualLast = reader.overKeyComponents(ImmutableList.of(record.getKeyComponent(0))).getLast().getValue();
    IsmRecord<byte[]> expectedLast = writtenByPrimaryKey.get(primaryKeyBytes).last();
    assertIsmEquals(actualLast, expectedLast);
  }

  // A withheld record's primary key either matches nothing or resolves to the last record of
  // the group written under that same primary key.
  for (IsmRecord<byte[]> record : withheld) {
    byte[] primaryKeyBytes = CoderUtils.encodeToByteArray(CODER.getKeyComponentCoder(0), record.getKeyComponent(0));
    IsmReader<byte[]>.IsmPrefixReaderIterator prefixIterator = reader.overKeyComponents(ImmutableList.of(record.getKeyComponent(0)));
    WindowedValue<IsmRecord<byte[]>> last = prefixIterator.getLast();
    if (last == null) {
      continue;
    }
    assertIsmEquals(last.getValue(), writtenByPrimaryKey.get(primaryKeyBytes).last());
  }
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.
the class IsmReaderTest method writeElementsToFileAndReadInRandomOrder.
/**
 * Writes the elements rejected by {@code EvenFilter} to a new Ism file, then looks records up
 * through an IsmReader in shuffled order. Written records must be found, both through a
 * per-key iterator and through {@code get()} on an iterator opened over an empty key prefix;
 * the elements accepted by {@code EvenFilter}, which were not written, must not be found.
 */
private void writeElementsToFileAndReadInRandomOrder(Iterable<IsmRecord<byte[]>> elements) throws Exception {
  File ismFile = tmpFolder.newFile();
  Iterable<IsmRecord<byte[]>> rejectedByFilter = Iterables.filter(elements, Predicates.not(EvenFilter.INSTANCE));
  List<IsmRecord<byte[]>> present = new ArrayList<>(ImmutableList.copyOf(rejectedByFilter));
  Iterable<IsmRecord<byte[]>> acceptedByFilter = Iterables.filter(elements, EvenFilter.INSTANCE);
  List<IsmRecord<byte[]>> absent = new ArrayList<>(ImmutableList.copyOf(acceptedByFilter));
  writeElementsToFile(present, ismFile);
  IsmReader<byte[]> reader = new IsmReaderImpl<>(FileSystems.matchSingleFileSpec(ismFile.getAbsolutePath()).resourceId(), CODER, cache);

  // Written records: a fresh iterator over the record's full key must start on that record.
  Collections.shuffle(present);
  for (IsmRecord<byte[]> record : present) {
    IsmReader<byte[]>.IsmPrefixReaderIterator lookup = reader.overKeyComponents(record.getKeyComponents());
    assertTrue(lookup.start());
    assertIsmEquals(lookup.getCurrent().getValue(), record);
  }

  // Written records: get() on a single iterator opened over an empty prefix must return them.
  Collections.shuffle(present);
  IsmReader<byte[]>.IsmPrefixReaderIterator presentLookup = reader.overKeyComponents(ImmutableList.of());
  for (IsmRecord<byte[]> record : present) {
    assertIsmEquals(presentLookup.get(record.getKeyComponents()).getValue(), record);
  }

  // Unwritten records: a fresh iterator over the record's full key must not start.
  Collections.shuffle(absent);
  for (IsmRecord<byte[]> record : absent) {
    assertFalse(reader.overKeyComponents(record.getKeyComponents()).start());
  }

  // Unwritten records: get() on an iterator opened over an empty prefix must return null.
  Collections.shuffle(absent);
  IsmReader<byte[]>.IsmPrefixReaderIterator absentLookup = reader.overKeyComponents(ImmutableList.of());
  for (IsmRecord<byte[]> record : absent) {
    assertNull(absentLookup.get(record.getKeyComponents()));
  }
}
use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.
the class IsmReaderTest method writeElementsToFileAndReadInOrder.
/**
 * Writes elements to an Ism file (via {@code writeElementsToFile}), then reads them back with an
 * IsmReader, verifying that the records read match those that were written, in the same order,
 * and that neither side has records left over.
 *
 * <p>Also checks that the reader only reports itself initialized once iteration has started,
 * that the registered observer saw at least as many bytes as each record's payload, and that
 * {@code getCurrent()} throws {@link NoSuchElementException} once the iterator is exhausted.
 *
 * @param elements the records to write, in the order they are expected to be read back
 * @throws Exception if writing or reading the file fails
 */
private void writeElementsToFileAndReadInOrder(Iterable<IsmRecord<byte[]>> elements) throws Exception {
  File tmpFile = tmpFolder.newFile();
  writeElementsToFile(elements, tmpFile);
  IsmReader<byte[]> reader = new IsmReaderImpl<>(FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);
  assertFalse(reader.isInitialized());
  TestReaderObserver observer = new TestReaderObserver(reader);
  reader.addObserver(observer);
  Iterator<IsmRecord<byte[]>> elementsIterator = elements.iterator();
  try (NativeReader.NativeReaderIterator<WindowedValue<IsmRecord<byte[]>>> iterator = reader.iterator()) {
    boolean more = iterator.start();
    assertTrue(reader.isInitialized());
    // Walk both sequences in lock step and stop as soon as either one runs out. If the
    // expected records run out first, `more` stays true and the reader is still positioned on
    // the first unexpected record.
    for (; more && elementsIterator.hasNext(); more = iterator.advance()) {
      IsmRecord<byte[]> expected = elementsIterator.next();
      IsmRecord<byte[]> actual = iterator.getCurrent().getValue();
      assertIsmEquals(actual, expected);
      final int expectedLength;
      if (IsmFormat.isMetadataKey(expected.getKeyComponents())) {
        expectedLength = expected.getMetadata().length;
      } else {
        expectedLength = expected.getValue().length;
      }
      // Verify that the observer saw at least as many bytes as the size of the value.
      assertTrue(expectedLength <= observer.getActualSizes().get(observer.getActualSizes().size() - 1));
    }
    // Check `more` before advancing: advancing first would skip over the unexpected record
    // the reader is currently positioned on, so a single surplus record would go unnoticed.
    // When the reader is already exhausted, advancing once more must keep returning false.
    if (more || iterator.advance()) {
      fail("Read more elements then expected, did not expect: " + iterator.getCurrent());
    } else if (elementsIterator.hasNext()) {
      fail("Read less elements then expected, expected: " + elementsIterator.next());
    }
    // Verify that we see a {@link NoSuchElementException} if we attempt to go further.
    try {
      iterator.getCurrent();
      fail("Expected a NoSuchElementException to have been thrown.");
    } catch (NoSuchElementException expected) {
      // Expected: the exhausted iterator has no current element.
    }
  }
}
Aggregations