use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.
the class TestExternalSpillableMap method testDataCorrectnessWithUpsertsToDataInMapAndOnDisk.
/**
 * Verifies that upserting an already-present key replaces the stored record, checked once for the
 * first key that was inserted and once for the last key that was inserted.
 *
 * <p>Records are inserted in batches until the map reports at least one disk-based entry. Each
 * picked record is then re-upserted with a freshly generated commit time, and the value read back
 * must carry that new commit time.
 *
 * @param diskMapType          disk map implementation under test
 * @param isCompressionEnabled compression flag handed to the spillable map
 * @throws IOException        propagated from the schema/record test utilities
 * @throws URISyntaxException propagated from the schema/record test utilities
 */
@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  // 16B — presumably the max in-memory size; confirm against ExternalSpillableMap's constructor
  ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
  List<String> recordKeys = new ArrayList<>();
  // Ensure we spill to disk
  while (records.getDiskBasedMapNumEntries() < 1) {
    List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
    recordKeys.addAll(SpillableMapTestUtils.upsertRecords(iRecords, records));
  }

  // Get a record from the in-memory map (first key inserted)
  String key = recordKeys.get(0);
  HoodieAvroRecord record = (HoodieAvroRecord) records.get(key);
  List<IndexedRecord> recordsToUpdate = new ArrayList<>();
  recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
  String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
  List<String> keysToBeUpdated = new ArrayList<>();
  keysToBeUpdated.add(key);
  // Update the commit time for this record
  List<IndexedRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
  // Upsert this updated record
  SpillableMapTestUtils.upsertRecords(updatedRecords, records);
  GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
  // The record returned for this key should have the updated commit time.
  // A JUnit assertion is used instead of the `assert` keyword so the check runs even without -ea.
  assertEquals(newCommitTime, gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());

  // Get a record from the disk based map (last key inserted)
  key = recordKeys.get(recordKeys.size() - 1);
  record = (HoodieAvroRecord) records.get(key);
  recordsToUpdate = new ArrayList<>();
  recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
  newCommitTime = HoodieActiveTimeline.createNewInstantTime();
  keysToBeUpdated = new ArrayList<>();
  keysToBeUpdated.add(key);
  // Update the commit time for this record
  updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
  // Upsert this updated record
  SpillableMapTestUtils.upsertRecords(updatedRecords, records);
  gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
  // The record returned for this key should have the updated commit time
  assertEquals(newCommitTime, gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
}
use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.
the class TestExternalSpillableMap method simpleInsertTest.
/**
 * Inserts 100 generated records into the spillable map and verifies that both the iterator and the
 * value stream return exactly as many records as there are inserted keys, and only records whose
 * key was inserted.
 *
 * @param diskMapType          disk map implementation under test
 * @param isCompressionEnabled compression flag handed to the spillable map
 * @throws IOException        propagated from the schema/record test utilities
 * @throws URISyntaxException propagated from the schema/record test utilities
 */
@ParameterizedTest
@MethodSource("testArguments")
public void simpleInsertTest(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  // 16B — presumably the max in-memory size; confirm against ExternalSpillableMap's constructor
  ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
  List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
  // JUnit assertions are used instead of the `assert` keyword so the checks run even without -ea
  assertEquals(100, recordKeys.size());

  // Test iterator
  Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
  int cntSize = 0;
  while (itr.hasNext()) {
    HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
    cntSize++;
    assertTrue(recordKeys.contains(rec.getRecordKey()), "Iterator returned unexpected record key: " + rec.getRecordKey());
  }
  assertEquals(recordKeys.size(), cntSize);

  // Test value stream
  List<HoodieRecord<? extends HoodieRecordPayload>> values = records.valueStream().collect(Collectors.toList());
  cntSize = 0;
  for (HoodieRecord<? extends HoodieRecordPayload> value : values) {
    assertTrue(recordKeys.contains(value.getRecordKey()), "Value stream returned unexpected record key: " + value.getRecordKey());
    cntSize++;
  }
  assertEquals(recordKeys.size(), cntSize);
}
use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.
the class TestExternalSpillableMap method testDataCorrectnessWithoutHoodieMetadata.
/**
 * Verifies that re-putting an updated record under an existing key replaces the stored value when
 * the records are generated without hoodie metadata fields. The check is done once for the first
 * key that was inserted and once for the last key that was inserted.
 *
 * @param diskMapType          disk map implementation under test
 * @param isCompressionEnabled compression flag handed to the spillable map
 * @throws IOException        propagated from the schema/record test utilities
 * @throws URISyntaxException propagated from the schema/record test utilities
 */
@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithoutHoodieMetadata(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
  Schema schema = SchemaTestUtil.getSimpleSchema();
  // 16B — presumably the max in-memory size; confirm against ExternalSpillableMap's constructor
  ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
  List<String> recordKeys = new ArrayList<>();
  // Ensure we spill to disk
  while (records.getDiskBasedMapNumEntries() < 1) {
    List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 100);
    hoodieRecords.forEach(r -> {
      records.put(r.getRecordKey(), r);
      recordKeys.add(r.getRecordKey());
    });
  }

  // Pick a STRING-typed field of the schema to update; the same field is used for both updates
  String fieldName = schema.getFields().stream()
      .filter(field -> field.schema().getType() == Schema.Type.STRING)
      .findAny()
      .orElseThrow(() -> new IllegalStateException("Schema has no STRING field to update"))
      .name();

  // Get a record from the in-memory map (first key inserted)
  String key = recordKeys.get(0);
  HoodieRecord record = records.get(key);
  // Use a new value to update this field
  String newValue = "update1";
  List<HoodieRecord> recordsToUpdate = new ArrayList<>();
  recordsToUpdate.add(record);
  List<HoodieRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
  // Upsert this updated record
  updatedRecords.forEach(r -> records.put(r.getRecordKey(), r));
  GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
  // The record returned for this key should have the updated value for the field name.
  // JUnit's assertEquals takes (expected, actual); keeping that order gives correct failure messages.
  assertEquals(newValue, gRecord.get(fieldName).toString());

  // Get a record from the disk based map (last key inserted)
  key = recordKeys.get(recordKeys.size() - 1);
  record = records.get(key);
  // Use a new value to update this field
  newValue = "update2";
  recordsToUpdate = new ArrayList<>();
  recordsToUpdate.add(record);
  updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
  // Upsert this updated record
  updatedRecords.forEach(r -> records.put(r.getRecordKey(), r));
  gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
  // The record returned for this key should have the updated value for the field name
  assertEquals(newValue, gRecord.get(fieldName).toString());
}
use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.
the class TestBitCaskDiskMap method testSizeEstimatorPerformance.
/**
 * Quick, manually-run performance check for payload size estimation on a single record generated
 * from the simple schema. Left in place for ad-hoc use and kept {@code @Disabled} so it does not
 * run as part of the regular test suite.
 *
 * @throws IOException        propagated from the schema/record test utilities
 * @throws URISyntaxException propagated from the schema/record test utilities
 */
@Disabled
@Test
public void testSizeEstimatorPerformance() throws IOException, URISyntaxException {
  // Test sizeEstimatorPerformance with simpleSchema
  Schema schema = SchemaTestUtil.getSimpleSchema();
  List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema);
  HoodieRecordSizeEstimator sizeEstimator = new HoodieRecordSizeEstimator<>(schema);
  HoodieRecord record = hoodieRecords.remove(0);
  // System.nanoTime() is monotonic; System.currentTimeMillis() follows the wall clock and can
  // jump (e.g. on clock adjustment), which would skew or even negate the measured duration.
  long startNanos = System.nanoTime();
  SpillableMapUtils.computePayloadSize(record, sizeEstimator);
  long timeTaken = (System.nanoTime() - startNanos) / 1_000_000L; // elapsed milliseconds
  System.out.println("Time taken :" + timeTaken);
  assertTrue(timeTaken < 100);
}
use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.
the class TestBitCaskDiskMap method testSizeEstimator.
/**
 * Checks that a strictly positive payload size is computed for four kinds of records: records
 * generated against the simple schema, records generated against the simple schema with hoodie
 * metadata fields added, and {@code AvroBinaryTestPayload}-backed records built from generated
 * records both as-is and rewritten to the metadata-enriched schema.
 *
 * @throws IOException        propagated from the schema/record test utilities
 * @throws URISyntaxException propagated from the schema/record test utilities
 */
@Test
public void testSizeEstimator() throws IOException, URISyntaxException {
  // Case 1: size estimation without hoodie metadata fields
  Schema baseSchema = SchemaTestUtil.getSimpleSchema();
  List<HoodieRecord> generated = SchemaTestUtil.generateHoodieTestRecords(0, 1, baseSchema);
  HoodieRecord sample = generated.remove(0);
  long estimatedSize = SpillableMapUtils.computePayloadSize(sample, new HoodieRecordSizeEstimator(baseSchema));
  assertTrue(estimatedSize > 0);

  // Case 2: size estimation with hoodie metadata fields
  Schema metadataSchema = HoodieAvroUtils.addMetadataFields(baseSchema);
  generated = SchemaTestUtil.generateHoodieTestRecords(0, 1, metadataSchema);
  sample = generated.remove(0);
  estimatedSize = SpillableMapUtils.computePayloadSize(sample, new HoodieRecordSizeEstimator(metadataSchema));
  assertTrue(estimatedSize > 0);

  // The remaining cases cover payloads without an Avro Schema in the record.

  // Case 3: no hoodie metadata fields and no schema object in the payload
  Schema plainSchema = SchemaTestUtil.getSimpleSchema();
  List<IndexedRecord> rawRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
  generated = rawRecords.stream()
      .map(r -> {
        HoodieKey hoodieKey = new HoodieKey(UUID.randomUUID().toString(), "0000/00/00");
        return new HoodieAvroRecord<>(hoodieKey, new AvroBinaryTestPayload(Option.of((GenericRecord) r)));
      })
      .collect(Collectors.toList());
  sample = generated.remove(0);
  estimatedSize = SpillableMapUtils.computePayloadSize(sample, new HoodieRecordSizeEstimator(plainSchema));
  assertTrue(estimatedSize > 0);

  // Case 4: hoodie metadata fields present, still no schema object in the payload
  final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  rawRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
  generated = rawRecords.stream()
      .map(r -> {
        HoodieKey hoodieKey = new HoodieKey(UUID.randomUUID().toString(), "0000/00/00");
        GenericRecord rewritten = HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata);
        return new HoodieAvroRecord<>(hoodieKey, new AvroBinaryTestPayload(Option.of(rewritten)));
      })
      .collect(Collectors.toList());
  sample = generated.remove(0);
  // NOTE(review): the estimator here is built from the plain schema, not simpleSchemaWithMetadata — confirm this is intended
  estimatedSize = SpillableMapUtils.computePayloadSize(sample, new HoodieRecordSizeEstimator(plainSchema));
  assertTrue(estimatedSize > 0);
}
Aggregations