Search in sources :

Example 1 with HoodieRecordSizeEstimator

use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.

the class TestExternalSpillableMap method testDataCorrectnessWithUpsertsToDataInMapAndOnDisk.

@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, // 16B
    isCompressionEnabled);
    List<String> recordKeys = new ArrayList<>();
    // Ensure we spill to disk
    while (records.getDiskBasedMapNumEntries() < 1) {
        List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
        recordKeys.addAll(SpillableMapTestUtils.upsertRecords(iRecords, records));
    }
    // Get a record from the in-Memory map
    String key = recordKeys.get(0);
    HoodieAvroRecord record = (HoodieAvroRecord) records.get(key);
    List<IndexedRecord> recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
    String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    List<String> keysToBeUpdated = new ArrayList<>();
    keysToBeUpdated.add(key);
    // Update the instantTime for this record
    List<IndexedRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
    // Upsert this updated record
    SpillableMapTestUtils.upsertRecords(updatedRecords, records);
    GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated commitTime
    assert newCommitTime.contentEquals(gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
    // Get a record from the disk based map
    key = recordKeys.get(recordKeys.size() - 1);
    record = (HoodieAvroRecord) records.get(key);
    recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
    newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    keysToBeUpdated = new ArrayList<>();
    keysToBeUpdated.add(key);
    // Update the commitTime for this record
    updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
    // Upsert this updated record
    SpillableMapTestUtils.upsertRecords(updatedRecords, records);
    gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated instantTime
    assert newCommitTime.contentEquals(gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) GenericRecord(org.apache.avro.generic.GenericRecord) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 2 with HoodieRecordSizeEstimator

use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.

the class TestExternalSpillableMap method simpleInsertTest.

@ParameterizedTest
@MethodSource("testArguments")
public void simpleInsertTest(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, // 16B
    isCompressionEnabled);
    List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
    List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
    assert (recordKeys.size() == 100);
    // Test iterator
    Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
    int cntSize = 0;
    while (itr.hasNext()) {
        HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
        cntSize++;
        assert recordKeys.contains(rec.getRecordKey());
    }
    assertEquals(recordKeys.size(), cntSize);
    // Test value stream
    List<HoodieRecord<? extends HoodieRecordPayload>> values = records.valueStream().collect(Collectors.toList());
    cntSize = 0;
    for (HoodieRecord value : values) {
        assert recordKeys.contains(value.getRecordKey());
        cntSize++;
    }
    assertEquals(recordKeys.size(), cntSize);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 3 with HoodieRecordSizeEstimator

use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.

the class TestExternalSpillableMap method testDataCorrectnessWithoutHoodieMetadata.

@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithoutHoodieMetadata(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = SchemaTestUtil.getSimpleSchema();
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, // 16B
    isCompressionEnabled);
    List<String> recordKeys = new ArrayList<>();
    // Ensure we spill to disk
    while (records.getDiskBasedMapNumEntries() < 1) {
        List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 100);
        hoodieRecords.stream().forEach(r -> {
            records.put(r.getRecordKey(), r);
            recordKeys.add(r.getRecordKey());
        });
    }
    // Get a record from the in-Memory map
    String key = recordKeys.get(0);
    HoodieRecord record = records.get(key);
    // Get the field we want to update
    String fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING).findAny().get().name();
    // Use a new value to update this field
    String newValue = "update1";
    List<HoodieRecord> recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add(record);
    List<HoodieRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
    // Upsert this updated record
    updatedRecords.forEach(r -> {
        records.put(r.getRecordKey(), r);
    });
    GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated value for the field name
    assertEquals(gRecord.get(fieldName).toString(), newValue);
    // Get a record from the disk based map
    key = recordKeys.get(recordKeys.size() - 1);
    record = records.get(key);
    // Get the field we want to update
    fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING).findAny().get().name();
    // Use a new value to update this field
    newValue = "update2";
    recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add(record);
    updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
    // Upsert this updated record
    updatedRecords.forEach(r -> {
        records.put(r.getRecordKey(), r);
    });
    gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated value for the field name
    assertEquals(gRecord.get(fieldName).toString(), newValue);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) URISyntaxException(java.net.URISyntaxException) Option(org.apache.hudi.common.util.Option) HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) ArrayList(java.util.ArrayList) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) SpillableMapTestUtils(org.apache.hudi.common.testutils.SpillableMapTestUtils) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) MethodSource(org.junit.jupiter.params.provider.MethodSource) Alphanumeric(org.junit.jupiter.api.MethodOrderer.Alphanumeric) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) TestMethodOrder(org.junit.jupiter.api.TestMethodOrder) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) IOException(java.io.IOException) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) Test(org.junit.jupiter.api.Test) UncheckedIOException(java.io.UncheckedIOException) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieKey(org.apache.hudi.common.model.HoodieKey) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) GenericRecord(org.apache.avro.generic.GenericRecord) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 4 with HoodieRecordSizeEstimator

use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.

the class TestBitCaskDiskMap method testSizeEstimatorPerformance.

/**
 * @na: Leaving this test here for a quick performance test
 */
@Disabled
@Test
public void testSizeEstimatorPerformance() throws IOException, URISyntaxException {
    // Test sizeEstimatorPerformance with simpleSchema
    Schema schema = SchemaTestUtil.getSimpleSchema();
    List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema);
    HoodieRecordSizeEstimator sizeEstimator = new HoodieRecordSizeEstimator<>(schema);
    HoodieRecord record = hoodieRecords.remove(0);
    long startTime = System.currentTimeMillis();
    SpillableMapUtils.computePayloadSize(record, sizeEstimator);
    long timeTaken = System.currentTimeMillis() - startTime;
    System.out.println("Time taken :" + timeTaken);
    assertTrue(timeTaken < 100);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) Schema(org.apache.avro.Schema) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Disabled(org.junit.jupiter.api.Disabled)

Example 5 with HoodieRecordSizeEstimator

use of org.apache.hudi.common.util.HoodieRecordSizeEstimator in project hudi by apache.

the class TestBitCaskDiskMap method testSizeEstimator.

@Test
public void testSizeEstimator() throws IOException, URISyntaxException {
    Schema schema = SchemaTestUtil.getSimpleSchema();
    // Test sizeEstimator without hoodie metadata fields
    List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema);
    long payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
    assertTrue(payloadSize > 0);
    // Test sizeEstimator with hoodie metadata fields
    schema = HoodieAvroUtils.addMetadataFields(schema);
    hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema);
    payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
    assertTrue(payloadSize > 0);
    // Following tests payloads without an Avro Schema in the Record
    // Test sizeEstimator without hoodie metadata fields and without schema object in the payload
    schema = SchemaTestUtil.getSimpleSchema();
    List<IndexedRecord> indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
    hoodieRecords = indexedRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList());
    payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
    assertTrue(payloadSize > 0);
    // Test sizeEstimator with hoodie metadata fields and without schema object in the payload
    final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1);
    hoodieRecords = indexedRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new AvroBinaryTestPayload(Option.of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))).collect(Collectors.toList());
    payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema));
    assertTrue(payloadSize > 0);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) AvroBinaryTestPayload(org.apache.hudi.common.testutils.AvroBinaryTestPayload) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) Schema(org.apache.avro.Schema) HoodieKey(org.apache.hudi.common.model.HoodieKey) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Aggregations

HoodieRecordSizeEstimator (org.apache.hudi.common.util.HoodieRecordSizeEstimator)11 Schema (org.apache.avro.Schema)9 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)9 DefaultSizeEstimator (org.apache.hudi.common.util.DefaultSizeEstimator)9 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)9 IndexedRecord (org.apache.avro.generic.IndexedRecord)7 HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload)7 MethodSource (org.junit.jupiter.params.provider.MethodSource)6 IOException (java.io.IOException)5 Test (org.junit.jupiter.api.Test)4 UncheckedIOException (java.io.UncheckedIOException)3 ArrayList (java.util.ArrayList)3 GenericRecord (org.apache.avro.generic.GenericRecord)3 HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)3 HoodieKey (org.apache.hudi.common.model.HoodieKey)3 HoodieAvroPayload (org.apache.hudi.common.model.HoodieAvroPayload)2 SchemaTestUtil.getSimpleSchema (org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema)2 HoodieIOException (org.apache.hudi.exception.HoodieIOException)2 URISyntaxException (java.net.URISyntaxException)1 Iterator (java.util.Iterator)1