Search in sources:

Example 1 with DefaultSizeEstimator

use of org.apache.hudi.common.util.DefaultSizeEstimator in project hudi by apache.

In the class SpillableMapBasedFileSystemView, method createFileIdToBootstrapBaseFileMap.

/**
 * Creates an {@link ExternalSpillableMap} for the file-group-id to bootstrap-base-file mapping
 * and copies every supplied entry into it.
 *
 * @param fileGroupIdBootstrapBaseFileMap entries to copy into the newly created map
 * @return the newly created map, populated with the supplied entries
 * @throws RuntimeException wrapping an {@link IOException} raised while the map is set up
 */
@Override
protected Map<HoodieFileGroupId, BootstrapBaseFileMapping> createFileIdToBootstrapBaseFileMap(Map<HoodieFileGroupId, BootstrapBaseFileMapping> fileGroupIdBootstrapBaseFileMap) {
    String message = "Creating bootstrap base File Map using external spillable Map. Max Mem=" + maxMemoryForBootstrapBaseFile + ", BaseDir=" + baseStoreDir;
    LOG.info(message);
    // Create the base store directory (and any missing parents) before the map is built on it.
    new File(baseStoreDir).mkdirs();
    try {
        Map<HoodieFileGroupId, BootstrapBaseFileMapping> spillableMap = new ExternalSpillableMap<>(maxMemoryForBootstrapBaseFile, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), diskMapType, isBitCaskDiskMapCompressionEnabled);
        spillableMap.putAll(fileGroupIdBootstrapBaseFileMap);
        return spillableMap;
    } catch (IOException ioe) {
        // Callers see an unchecked exception; the original cause is preserved.
        throw new RuntimeException(ioe);
    }
}
Also used : HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) IOException(java.io.IOException) BootstrapBaseFileMapping(org.apache.hudi.common.model.BootstrapBaseFileMapping) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) File(java.io.File)

Example 2 with DefaultSizeEstimator

use of org.apache.hudi.common.util.DefaultSizeEstimator in project hudi by apache.

In the class SpillableMapBasedFileSystemView, method createPartitionToFileGroups.

/**
 * Creates an empty {@link ExternalSpillableMap} to hold the partition to file-groups mapping.
 *
 * @return the newly created, empty map
 * @throws RuntimeException wrapping an {@link IOException} raised while the map is created
 */
@Override
protected Map<String, List<HoodieFileGroup>> createPartitionToFileGroups() {
    String message = "Creating Partition To File groups map using external spillable Map. Max Mem=" + maxMemoryForFileGroupMap + ", BaseDir=" + baseStoreDir;
    LOG.info(message);
    // Create the base store directory (and any missing parents) before the map is built on it.
    new File(baseStoreDir).mkdirs();
    try {
        Map<String, List<HoodieFileGroup>> partitionToFileGroups = (Map<String, List<HoodieFileGroup>>) (new ExternalSpillableMap<>(maxMemoryForFileGroupMap, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), diskMapType, isBitCaskDiskMapCompressionEnabled));
        return partitionToFileGroups;
    } catch (IOException ioe) {
        // Callers see an unchecked exception; the original cause is preserved.
        throw new RuntimeException(ioe);
    }
}
Also used : ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) IOException(java.io.IOException) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) File(java.io.File) Map(java.util.Map) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)

Example 3 with DefaultSizeEstimator

use of org.apache.hudi.common.util.DefaultSizeEstimator in project hudi by apache.

In the class TestExternalSpillableMap, method testDataCorrectnessWithUpsertsToDataInMapAndOnDisk.

/**
 * Upserts test records until the map reports at least one disk-based entry, then updates the
 * first and the last inserted keys in turn and checks that the record read back for each key
 * carries the new commit time.
 *
 * @param diskMapType disk map type passed to the {@link ExternalSpillableMap} under test
 * @param isCompressionEnabled compression flag passed to the {@link ExternalSpillableMap} under test
 */
@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    // 16B in-memory budget
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
    List<String> recordKeys = new ArrayList<>();
    // Keep inserting batches of 100 records until something has been written to disk
    while (records.getDiskBasedMapNumEntries() < 1) {
        List<IndexedRecord> batch = SchemaTestUtil.generateHoodieTestRecords(0, 100);
        List<String> batchKeys = SpillableMapTestUtils.upsertRecords(batch, records);
        recordKeys.addAll(batchKeys);
    }

    // --- First inserted key (expected to be served from the in-memory map) ---
    String firstKey = recordKeys.get(0);
    HoodieAvroRecord firstRecord = (HoodieAvroRecord) records.get(firstKey);
    List<IndexedRecord> firstToUpdate = new ArrayList<>();
    firstToUpdate.add((IndexedRecord) firstRecord.getData().getInsertValue(schema).get());
    String firstCommitTime = HoodieActiveTimeline.createNewInstantTime();
    List<String> firstKeys = new ArrayList<>();
    firstKeys.add(firstKey);
    // Stamp the record with the new instant time and upsert it again
    List<IndexedRecord> firstUpdated = SchemaTestUtil.updateHoodieTestRecords(firstKeys, firstToUpdate, firstCommitTime);
    SpillableMapTestUtils.upsertRecords(firstUpdated, records);
    GenericRecord firstReadBack = (GenericRecord) records.get(firstKey).getData().getInsertValue(schema).get();
    // The stored record must now carry the new commit time
    assert firstCommitTime.contentEquals(firstReadBack.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());

    // --- Last inserted key (expected to be served from the disk based map) ---
    String lastKey = recordKeys.get(recordKeys.size() - 1);
    HoodieAvroRecord lastRecord = (HoodieAvroRecord) records.get(lastKey);
    List<IndexedRecord> lastToUpdate = new ArrayList<>();
    lastToUpdate.add((IndexedRecord) lastRecord.getData().getInsertValue(schema).get());
    String lastCommitTime = HoodieActiveTimeline.createNewInstantTime();
    List<String> lastKeys = new ArrayList<>();
    lastKeys.add(lastKey);
    // Stamp the record with the new instant time and upsert it again
    List<IndexedRecord> lastUpdated = SchemaTestUtil.updateHoodieTestRecords(lastKeys, lastToUpdate, lastCommitTime);
    SpillableMapTestUtils.upsertRecords(lastUpdated, records);
    GenericRecord lastReadBack = (GenericRecord) records.get(lastKey).getData().getInsertValue(schema).get();
    // The stored record must now carry the new commit time
    assert lastCommitTime.contentEquals(lastReadBack.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) GenericRecord(org.apache.avro.generic.GenericRecord) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 4 with DefaultSizeEstimator

use of org.apache.hudi.common.util.DefaultSizeEstimator in project hudi by apache.

In the class TestExternalSpillableMap, method simpleInsertTest.

/**
 * Upserts 100 generated records and checks that both the map's iterator and its value stream
 * yield exactly the inserted record keys.
 *
 * @param diskMapType disk map type passed to the {@link ExternalSpillableMap} under test
 * @param isCompressionEnabled compression flag passed to the {@link ExternalSpillableMap} under test
 */
@ParameterizedTest
@MethodSource("testArguments")
public void simpleInsertTest(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    // 16B in-memory budget
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
    List<IndexedRecord> generated = SchemaTestUtil.generateHoodieTestRecords(0, 100);
    List<String> recordKeys = SpillableMapTestUtils.upsertRecords(generated, records);
    assert recordKeys.size() == 100;

    // Walk the iterator: every element must be a known key, and the element count must match
    int seenViaIterator = 0;
    for (Iterator<HoodieRecord<? extends HoodieRecordPayload>> it = records.iterator(); it.hasNext(); ) {
        HoodieRecord<? extends HoodieRecordPayload> current = it.next();
        seenViaIterator++;
        assert recordKeys.contains(current.getRecordKey());
    }
    assertEquals(recordKeys.size(), seenViaIterator);

    // Walk the value stream: same expectations as for the iterator
    List<HoodieRecord<? extends HoodieRecordPayload>> streamed = records.valueStream().collect(Collectors.toList());
    int seenViaStream = 0;
    for (HoodieRecord current : streamed) {
        assert recordKeys.contains(current.getRecordKey());
        seenViaStream++;
    }
    assertEquals(recordKeys.size(), seenViaStream);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 5 with DefaultSizeEstimator

use of org.apache.hudi.common.util.DefaultSizeEstimator in project hudi by apache.

In the class TestExternalSpillableMap, method testDataCorrectnessWithoutHoodieMetadata.

/**
 * Inserts records built without Hoodie metadata fields until the map reports at least one
 * disk-based entry, then rewrites a string field on the first and on the last inserted key and
 * checks that the value read back for each key reflects the update.
 *
 * @param diskMapType disk map type passed to the {@link ExternalSpillableMap} under test
 * @param isCompressionEnabled compression flag passed to the {@link ExternalSpillableMap} under test
 */
@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithoutHoodieMetadata(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = SchemaTestUtil.getSimpleSchema();
    // 16B in-memory budget
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records = new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
    List<String> recordKeys = new ArrayList<>();
    // Keep inserting batches of 100 records until something has been written to disk
    while (records.getDiskBasedMapNumEntries() < 1) {
        List<HoodieRecord> batch = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 100);
        for (HoodieRecord inserted : batch) {
            records.put(inserted.getRecordKey(), inserted);
            recordKeys.add(inserted.getRecordKey());
        }
    }

    // --- First inserted key (expected to be served from the in-memory map) ---
    String firstKey = recordKeys.get(0);
    HoodieRecord firstRecord = records.get(firstKey);
    // Pick any string-typed field of the schema as the one to overwrite
    String firstField = schema.getFields().stream().filter(f -> f.schema().getType() == Schema.Type.STRING).findAny().get().name();
    String firstValue = "update1";
    List<HoodieRecord> firstToUpdate = new ArrayList<>();
    firstToUpdate.add(firstRecord);
    List<HoodieRecord> firstUpdated = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(firstToUpdate, schema, firstField, firstValue);
    // Write the modified record back under its own key
    for (HoodieRecord updated : firstUpdated) {
        records.put(updated.getRecordKey(), updated);
    }
    GenericRecord firstReadBack = (GenericRecord) records.get(firstKey).getData().getInsertValue(schema).get();
    // The stored record must expose the new field value
    assertEquals(firstReadBack.get(firstField).toString(), firstValue);

    // --- Last inserted key (expected to be served from the disk based map) ---
    String lastKey = recordKeys.get(recordKeys.size() - 1);
    HoodieRecord lastRecord = records.get(lastKey);
    // Pick any string-typed field of the schema as the one to overwrite
    String lastField = schema.getFields().stream().filter(f -> f.schema().getType() == Schema.Type.STRING).findAny().get().name();
    String lastValue = "update2";
    List<HoodieRecord> lastToUpdate = new ArrayList<>();
    lastToUpdate.add(lastRecord);
    List<HoodieRecord> lastUpdated = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(lastToUpdate, schema, lastField, lastValue);
    // Write the modified record back under its own key
    for (HoodieRecord updated : lastUpdated) {
        records.put(updated.getRecordKey(), updated);
    }
    GenericRecord lastReadBack = (GenericRecord) records.get(lastKey).getData().getInsertValue(schema).get();
    // The stored record must expose the new field value
    assertEquals(lastReadBack.get(lastField).toString(), lastValue);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) URISyntaxException(java.net.URISyntaxException) Option(org.apache.hudi.common.util.Option) HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) ArrayList(java.util.ArrayList) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) SpillableMapTestUtils(org.apache.hudi.common.testutils.SpillableMapTestUtils) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) MethodSource(org.junit.jupiter.params.provider.MethodSource) Alphanumeric(org.junit.jupiter.api.MethodOrderer.Alphanumeric) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) TestMethodOrder(org.junit.jupiter.api.TestMethodOrder) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) IOException(java.io.IOException) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) Test(org.junit.jupiter.api.Test) UncheckedIOException(java.io.UncheckedIOException) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) Stream(java.util.stream.Stream) 
Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieKey(org.apache.hudi.common.model.HoodieKey) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) GenericRecord(org.apache.avro.generic.GenericRecord) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Aggregations

DefaultSizeEstimator (org.apache.hudi.common.util.DefaultSizeEstimator)15 IOException (java.io.IOException)10 HoodieRecordSizeEstimator (org.apache.hudi.common.util.HoodieRecordSizeEstimator)9 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)8 Schema (org.apache.avro.Schema)7 IndexedRecord (org.apache.avro.generic.IndexedRecord)7 HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload)7 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)7 MethodSource (org.junit.jupiter.params.provider.MethodSource)6 File (java.io.File)5 ExternalSpillableMap (org.apache.hudi.common.util.collection.ExternalSpillableMap)5 HoodieFileGroupId (org.apache.hudi.common.model.HoodieFileGroupId)4 UncheckedIOException (java.io.UncheckedIOException)3 ArrayList (java.util.ArrayList)3 HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)3 Test (org.junit.jupiter.api.Test)3 GenericRecord (org.apache.avro.generic.GenericRecord)2 HoodieAvroPayload (org.apache.hudi.common.model.HoodieAvroPayload)2 HoodieKey (org.apache.hudi.common.model.HoodieKey)2 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)2