Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
From class TestExternalSpillableMap, method testDataCorrectnessWithUpsertsToDataInMapAndOnDisk.
@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
    new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), // 16B in-memory budget, so entries spill to disk quickly
        new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
List<String> recordKeys = new ArrayList<>();
// Ensure we spill to disk
while (records.getDiskBasedMapNumEntries() < 1) {
List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
recordKeys.addAll(SpillableMapTestUtils.upsertRecords(iRecords, records));
}
// Get a record from the in-memory map
String key = recordKeys.get(0);
HoodieAvroRecord record = (HoodieAvroRecord) records.get(key);
List<IndexedRecord> recordsToUpdate = new ArrayList<>();
recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
List<String> keysToBeUpdated = new ArrayList<>();
keysToBeUpdated.add(key);
// Update the commitTime for this record
List<IndexedRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
// Upsert this updated record
SpillableMapTestUtils.upsertRecords(updatedRecords, records);
GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
// The record returned for this key should have the updated commitTime
assert newCommitTime.contentEquals(gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
// Get a record from the disk based map
key = recordKeys.get(recordKeys.size() - 1);
record = (HoodieAvroRecord) records.get(key);
recordsToUpdate = new ArrayList<>();
recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
keysToBeUpdated = new ArrayList<>();
keysToBeUpdated.add(key);
// Update the commitTime for this record
updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
// Upsert this updated record
SpillableMapTestUtils.upsertRecords(updatedRecords, records);
gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
// The record returned for this key should have the updated commitTime
assert newCommitTime.contentEquals(gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
}
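The spill-then-upsert round trip above does not depend on the Avro plumbing. A minimal sketch of the same pattern with plain String values, assuming the basePath, diskMapType, and isCompressionEnabled fixtures from the test above (the constructor throws IOException, so this runs inside a method that declares it); the tiny 16-byte budget all but guarantees spilling:

// Sketch: fill until at least one entry spills, then upsert on both tiers.
ExternalSpillableMap<String, String> map = new ExternalSpillableMap<>(
    16L, basePath, new DefaultSizeEstimator<>(), new DefaultSizeEstimator<>(),
    diskMapType, isCompressionEnabled);
int i = 0;
while (map.getDiskBasedMapNumEntries() < 1) {
  map.put("key-" + i, "value-" + i);
  i++;
}
map.put("key-0", "updated");           // upsert an entry that (likely) stayed in memory
map.put("key-" + (i - 1), "updated");  // upsert an entry that (likely) spilled to disk
assertEquals("updated", map.get("key-0")); // reads reflect the latest write on either tier
assertEquals("updated", map.get("key-" + (i - 1)));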
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
From class TestExternalSpillableMap, method simpleInsertTest.
@ParameterizedTest
@MethodSource("testArguments")
public void simpleInsertTest(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
    new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), // 16B in-memory budget, so entries spill to disk quickly
        new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
assert (recordKeys.size() == 100);
// Test iterator
Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
int cntSize = 0;
while (itr.hasNext()) {
HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
cntSize++;
assert recordKeys.contains(rec.getRecordKey());
}
assertEquals(recordKeys.size(), cntSize);
// Test value stream
List<HoodieRecord<? extends HoodieRecordPayload>> values = records.valueStream().collect(Collectors.toList());
cntSize = 0;
for (HoodieRecord value : values) {
assert recordKeys.contains(value.getRecordKey());
cntSize++;
}
assertEquals(recordKeys.size(), cntSize);
}
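Both iterator() and valueStream() stitch the in-memory and on-disk tiers into a single view. A short sketch of checking that view against the per-tier counters; getDiskBasedMapNumEntries() appears in the tests above, while getInMemoryMapNumEntries() is assumed to be available on the same class:

// Sketch: the unified stream should cover both tiers exactly once.
long inMemory = records.getInMemoryMapNumEntries(); // assumed accessor
long onDisk = records.getDiskBasedMapNumEntries();
assertEquals(inMemory + onDisk, records.valueStream().count());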
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
From class TestExternalSpillableMap, method testDataCorrectnessWithoutHoodieMetadata.
@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithoutHoodieMetadata(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
Schema schema = SchemaTestUtil.getSimpleSchema();
ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
    new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), // 16B in-memory budget, so entries spill to disk quickly
        new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
List<String> recordKeys = new ArrayList<>();
// Ensure we spill to disk
while (records.getDiskBasedMapNumEntries() < 1) {
List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 100);
hoodieRecords.stream().forEach(r -> {
records.put(r.getRecordKey(), r);
recordKeys.add(r.getRecordKey());
});
}
// Get a record from the in-memory map
String key = recordKeys.get(0);
HoodieRecord record = records.get(key);
// Get the field we want to update
String fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING).findAny().get().name();
// Use a new value to update this field
String newValue = "update1";
List<HoodieRecord> recordsToUpdate = new ArrayList<>();
recordsToUpdate.add(record);
List<HoodieRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
// Upsert this updated record
updatedRecords.forEach(r -> {
records.put(r.getRecordKey(), r);
});
GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
// The record returned for this key should have the updated value for the field name
assertEquals(gRecord.get(fieldName).toString(), newValue);
// Get a record from the disk based map
key = recordKeys.get(recordKeys.size() - 1);
record = records.get(key);
// Get the field we want to update
fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING).findAny().get().name();
// Use a new value to update this field
newValue = "update2";
recordsToUpdate = new ArrayList<>();
recordsToUpdate.add(record);
updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
// Upsert this updated record
updatedRecords.forEach(r -> {
records.put(r.getRecordKey(), r);
});
gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
// The record returned for this key should have the updated value for the field name
assertEquals(gRecord.get(fieldName).toString(), newValue);
}
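The any-STRING-field lookup used twice in the test above can be factored into a helper. A sketch using only the standard Avro Schema API; the helper name is ours:

// Hypothetical helper: the first STRING-typed field of an Avro schema, if any.
static Optional<String> anyStringField(Schema schema) {
  return schema.getFields().stream()
      .filter(f -> f.schema().getType() == Schema.Type.STRING)
      .map(Schema.Field::name)
      .findAny();
}
// Usage: String fieldName = anyStringField(schema).orElseThrow(IllegalStateException::new);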
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
From class TestRocksDbDiskMap, method testSimpleInsertSequential.
@Test
public void testSimpleInsertSequential() throws IOException, URISyntaxException {
RocksDbDiskMap<String, HoodieRecord<? extends HoodieRecordPayload>> rocksDBBasedMap = new RocksDbDiskMap<>(basePath);
List<String> recordKeys = setupMapWithRecords(rocksDBBasedMap, 100);
Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = rocksDBBasedMap.iterator();
int cntSize = 0;
while (itr.hasNext()) {
HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
cntSize++;
assert recordKeys.contains(rec.getRecordKey());
}
assertEquals(recordKeys.size(), cntSize);
// Test value stream
List<HoodieRecord<? extends HoodieRecordPayload>> values = rocksDBBasedMap.valueStream().collect(Collectors.toList());
cntSize = 0;
for (HoodieRecord value : values) {
assert recordKeys.contains(value.getRecordKey());
cntSize++;
}
assertEquals(recordKeys.size(), cntSize);
}
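RocksDbDiskMap implements the Map interface, so it can be driven directly without the setupMapWithRecords helper. A minimal sketch, assuming a writable basePath and a generatedRecords list like the one SchemaTestUtil produces above; close() is assumed to be the cleanup hook that releases the underlying RocksDB instance:

RocksDbDiskMap<String, HoodieRecord<? extends HoodieRecordPayload>> map =
    new RocksDbDiskMap<>(basePath);
for (HoodieRecord<? extends HoodieRecordPayload> rec : generatedRecords) {
  map.put(rec.getRecordKey(), rec); // keys and values are serialized into RocksDB
}
assertEquals(generatedRecords.size(), map.size());
map.close(); // assumed cleanup hook; frees the on-disk RocksDB state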
Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.
From class DFSHoodieDatasetInputReader, method readColumnarOrLogFiles.
private Iterator<IndexedRecord> readColumnarOrLogFiles(FileSlice fileSlice) throws IOException {
if (fileSlice.getBaseFile().isPresent()) {
// Read the base files using the latest writer schema.
Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), new Path(fileSlice.getBaseFile().get().getPath()));
return reader.getRecordIterator(schema);
} else {
// If there is no data file, fall back to reading log files
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
    .withFileSystem(metaClient.getFs())
    .withBasePath(metaClient.getBasePath())
    .withLogFilePaths(fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()))
    .withReaderSchema(new Schema.Parser().parse(schemaStr))
    .withLatestInstantTime(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant().get().getTimestamp())
    .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
    .withReadBlocksLazily(true)
    .withReverseReader(false)
    .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
    .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
    .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
    .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
    .build();
// Read the scanned log records back as Avro IndexedRecords
Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
Schema schema = new Schema.Parser().parse(schemaStr);
return StreamSupport.stream(iterable.spliterator(), false).map(e -> {
try {
return (IndexedRecord) e.getData().getInsertValue(schema).get();
} catch (IOException io) {
throw new UncheckedIOException(io);
}
}).iterator();
}
}
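Because both branches return a plain Iterator<IndexedRecord>, callers can consume base-file and log-only slices identically. A hedged usage sketch; fileSlices is an assumed input:

// Sketch: drain each slice the same way regardless of which branch produced it.
long total = 0;
for (FileSlice slice : fileSlices) {
  Iterator<IndexedRecord> it = readColumnarOrLogFiles(slice);
  while (it.hasNext()) {
    IndexedRecord rec = it.next(); // hand rec to downstream processing here
    total++;
  }
}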