
Example 11 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestExternalSpillableMap, method testDataCorrectnessWithUpsertsToDataInMapAndOnDisk.

@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
        new ExternalSpillableMap<>(16L, // 16B in-memory budget, forces spilling to disk
            basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema),
            diskMapType, isCompressionEnabled);
    List<String> recordKeys = new ArrayList<>();
    // Ensure we spill to disk
    while (records.getDiskBasedMapNumEntries() < 1) {
        List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
        recordKeys.addAll(SpillableMapTestUtils.upsertRecords(iRecords, records));
    }
    // Get a record from the in-Memory map
    String key = recordKeys.get(0);
    HoodieAvroRecord record = (HoodieAvroRecord) records.get(key);
    List<IndexedRecord> recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
    String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    List<String> keysToBeUpdated = new ArrayList<>();
    keysToBeUpdated.add(key);
    // Update the commitTime for this record
    List<IndexedRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
    // Upsert this updated record
    SpillableMapTestUtils.upsertRecords(updatedRecords, records);
    GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated commitTime
    assert newCommitTime.contentEquals(gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
    // Get a record from the disk based map
    key = recordKeys.get(recordKeys.size() - 1);
    record = (HoodieAvroRecord) records.get(key);
    recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get());
    newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    keysToBeUpdated = new ArrayList<>();
    keysToBeUpdated.add(key);
    // Update the commitTime for this record
    updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate, newCommitTime);
    // Upsert this updated record
    SpillableMapTestUtils.upsertRecords(updatedRecords, records);
    gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated commitTime
    assert newCommitTime.contentEquals(gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) GenericRecord(org.apache.avro.generic.GenericRecord) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
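
For context, the spill-then-read pattern the test relies on can be lifted out into a minimal sketch. It reuses only the calls visible above (the 16-byte budget, upsertRecords, getDiskBasedMapNumEntries, get); the BITCASK disk map type is picked here just to make the example concrete, so treat this as an illustration rather than canonical usage.

// Minimal sketch: a tiny in-memory budget forces most entries onto disk,
// after which gets are served transparently from either tier.
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> map =
    new ExternalSpillableMap<>(16L, // 16B in-memory budget
        basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema),
        ExternalSpillableMap.DiskMapType.BITCASK, false);
List<String> keys = new ArrayList<>();
while (map.getDiskBasedMapNumEntries() < 1) { // keep inserting until the map spills
    keys.addAll(SpillableMapTestUtils.upsertRecords(
        SchemaTestUtil.generateHoodieTestRecords(0, 100), map));
}
assert map.get(keys.get(keys.size() - 1)) != null; // disk-resident entry still readable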

Example 12 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestExternalSpillableMap, method simpleInsertTest.

@ParameterizedTest
@MethodSource("testArguments")
public void simpleInsertTest(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
        new ExternalSpillableMap<>(16L, // 16B in-memory budget, forces spilling to disk
            basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema),
            diskMapType, isCompressionEnabled);
    List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
    List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
    assert (recordKeys.size() == 100);
    // Test iterator
    Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
    int cntSize = 0;
    while (itr.hasNext()) {
        HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
        cntSize++;
        assert recordKeys.contains(rec.getRecordKey());
    }
    assertEquals(recordKeys.size(), cntSize);
    // Test value stream
    List<HoodieRecord<? extends HoodieRecordPayload>> values = records.valueStream().collect(Collectors.toList());
    cntSize = 0;
    for (HoodieRecord value : values) {
        assert recordKeys.contains(value.getRecordKey());
        cntSize++;
    }
    assertEquals(recordKeys.size(), cntSize);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
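
Both parameterized tests above pull their (DiskMapType, compression) pairs from a @MethodSource provider named testArguments, which this excerpt never shows. Given the parameter types and the Arguments/Stream imports listed for the file, a plausible reconstruction is the full cross product; this is a hypothetical sketch, not the verbatim provider.

// Hypothetical shape of the testArguments provider referenced by @MethodSource;
// the real implementation is not shown in this excerpt.
private static Stream<Arguments> testArguments() {
    return Stream.of(
        arguments(ExternalSpillableMap.DiskMapType.BITCASK, false),
        arguments(ExternalSpillableMap.DiskMapType.BITCASK, true),
        arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false),
        arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true));
}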

Example 13 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestExternalSpillableMap, method testDataCorrectnessWithoutHoodieMetadata.

@ParameterizedTest
@MethodSource("testArguments")
public void testDataCorrectnessWithoutHoodieMetadata(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = SchemaTestUtil.getSimpleSchema();
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
        new ExternalSpillableMap<>(16L, // 16B in-memory budget, forces spilling to disk
            basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema),
            diskMapType, isCompressionEnabled);
    List<String> recordKeys = new ArrayList<>();
    // Ensure we spill to disk
    while (records.getDiskBasedMapNumEntries() < 1) {
        List<HoodieRecord> hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 100);
        hoodieRecords.forEach(r -> {
            records.put(r.getRecordKey(), r);
            recordKeys.add(r.getRecordKey());
        });
    }
    // Get a record from the in-Memory map
    String key = recordKeys.get(0);
    HoodieRecord record = records.get(key);
    // Get the field we want to update
    String fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING).findAny().get().name();
    // Use a new value to update this field
    String newValue = "update1";
    List<HoodieRecord> recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add(record);
    List<HoodieRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
    // Upsert this updated record
    updatedRecords.forEach(r -> {
        records.put(r.getRecordKey(), r);
    });
    GenericRecord gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated value for the field name
    assertEquals(gRecord.get(fieldName).toString(), newValue);
    // Get a record from the disk based map
    key = recordKeys.get(recordKeys.size() - 1);
    record = records.get(key);
    // Get the field we want to update
    fieldName = schema.getFields().stream().filter(field -> field.schema().getType() == Schema.Type.STRING).findAny().get().name();
    // Use a new value to update this field
    newValue = "update2";
    recordsToUpdate = new ArrayList<>();
    recordsToUpdate.add(record);
    updatedRecords = SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName, newValue);
    // Upsert this updated record
    updatedRecords.forEach(r -> {
        records.put(r.getRecordKey(), r);
    });
    gRecord = (GenericRecord) records.get(key).getData().getInsertValue(schema).get();
    // The record returned for this key should have the updated value for the field name
    assertEquals(gRecord.get(fieldName).toString(), newValue);
}
Also used : HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) URISyntaxException(java.net.URISyntaxException) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) SpillableMapTestUtils(org.apache.hudi.common.testutils.SpillableMapTestUtils) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) MethodSource(org.junit.jupiter.params.provider.MethodSource) Alphanumeric(org.junit.jupiter.api.MethodOrderer.Alphanumeric) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) TestMethodOrder(org.junit.jupiter.api.TestMethodOrder) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) IOException(java.io.IOException) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) Test(org.junit.jupiter.api.Test) UncheckedIOException(java.io.UncheckedIOException) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieKey(org.apache.hudi.common.model.HoodieKey) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow)
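
The helper updateHoodieTestRecordsWithoutHoodieMetadata is not shown in this excerpt. Its essential move, finding a STRING field in an Avro schema and overwriting it on a GenericRecord, is plain Avro, sketched below; the real helper additionally rewraps the result in a HoodieRecord payload.

// Sketch of the field update the test exercises, using only the Avro API.
// GenericData.Record (org.apache.avro.generic.GenericData) stands in for a
// record produced by the test utilities.
Schema schema = SchemaTestUtil.getSimpleSchema();
String fieldName = schema.getFields().stream()
    .filter(f -> f.schema().getType() == Schema.Type.STRING)
    .findAny().get().name();
GenericRecord gRecord = new GenericData.Record(schema);
gRecord.put(fieldName, "update1"); // overwrite the string field in place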

Example 14 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestRocksDbDiskMap, method testSimpleInsertSequential.

@Test
public void testSimpleInsertSequential() throws IOException, URISyntaxException {
    RocksDbDiskMap<String, HoodieRecord<? extends HoodieRecordPayload>> rocksDBBasedMap = new RocksDbDiskMap<>(basePath);
    List<String> recordKeys = setupMapWithRecords(rocksDBBasedMap, 100);
    Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = rocksDBBasedMap.iterator();
    int cntSize = 0;
    while (itr.hasNext()) {
        HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
        cntSize++;
        assert recordKeys.contains(rec.getRecordKey());
    }
    assertEquals(recordKeys.size(), cntSize);
    // Test value stream
    List<HoodieRecord<? extends HoodieRecordPayload>> values = rocksDBBasedMap.valueStream().collect(Collectors.toList());
    cntSize = 0;
    for (HoodieRecord value : values) {
        assert recordKeys.contains(value.getRecordKey());
        cntSize++;
    }
    assertEquals(recordKeys.size(), cntSize);
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) Test(org.junit.jupiter.api.Test)
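
The setupMapWithRecords helper is referenced but not shown. Since SpillableMapTestUtils.upsertRecords accepts a plain Map in the earlier examples, a plausible hypothetical shape is:

// Hypothetical reconstruction of setupMapWithRecords; the actual helper
// in TestRocksDbDiskMap is not part of this excerpt.
private List<String> setupMapWithRecords(
        RocksDbDiskMap<String, HoodieRecord<? extends HoodieRecordPayload>> map,
        int count) throws IOException, URISyntaxException {
    List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, count);
    return SpillableMapTestUtils.upsertRecords(iRecords, map);
}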

Example 15 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class DFSHoodieDatasetInputReader, method readColumnarOrLogFiles.

private Iterator<IndexedRecord> readColumnarOrLogFiles(FileSlice fileSlice) throws IOException {
    if (fileSlice.getBaseFile().isPresent()) {
        // Read the base files using the latest writer schema.
        Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
        HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), new Path(fileSlice.getBaseFile().get().getPath()));
        return reader.getRecordIterator(schema);
    } else {
        // If there is no data file, fall back to reading log files
        HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
            .withFileSystem(metaClient.getFs())
            .withBasePath(metaClient.getBasePath())
            .withLogFilePaths(fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()))
            .withReaderSchema(new Schema.Parser().parse(schemaStr))
            .withLatestInstantTime(metaClient.getActiveTimeline().getCommitsTimeline()
                .filterCompletedInstants().lastInstant().get().getTimestamp())
            .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
            .withReadBlocksLazily(true)
            .withReverseReader(false)
            .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
            .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
            .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
            .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
            .build();
        // Read Avro records from the merged log scan
        Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = scanner::iterator;
        Schema schema = new Schema.Parser().parse(schemaStr);
        return StreamSupport.stream(iterable.spliterator(), false).map(e -> {
            try {
                return (IndexedRecord) e.getData().getInsertValue(schema).get();
            } catch (IOException io) {
                throw new UncheckedIOException(io);
            }
        }).iterator();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSlice(org.apache.hudi.common.model.FileSlice) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) Entry.comparingByValue(java.util.Map.Entry.comparingByValue) LinkedHashMap(java.util.LinkedHashMap) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Collectors.toMap(java.util.stream.Collectors.toMap) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) StreamSupport(java.util.stream.StreamSupport) NoSuchElementException(java.util.NoSuchElementException) IndexedRecord(org.apache.avro.generic.IndexedRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) IOException(java.io.IOException) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) UncheckedIOException(java.io.UncheckedIOException) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils)
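
readColumnarOrLogFiles is private, so a caller inside DFSHoodieDatasetInputReader simply drains whichever iterator it produced (base-file reader or merged log scan). A hypothetical in-class sketch, assuming java.util.ArrayList is also imported:

// Hypothetical caller: materialize the records behind the iterator,
// regardless of whether they came from a base file or merged log files.
Iterator<IndexedRecord> recordItr = readColumnarOrLogFiles(fileSlice);
List<IndexedRecord> loaded = new ArrayList<>();
while (recordItr.hasNext()) {
    loaded.add(recordItr.next());
}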

Aggregations

HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 38 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 30 usages
Schema (org.apache.avro.Schema): 19 usages
IOException (java.io.IOException): 18 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 18 usages
IndexedRecord (org.apache.avro.generic.IndexedRecord): 14 usages
ArrayList (java.util.ArrayList): 12 usages
HashMap (java.util.HashMap): 12 usages
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 12 usages
Option (org.apache.hudi.common.util.Option): 12 usages
Map (java.util.Map): 11 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 11 usages
List (java.util.List): 9 usages
Path (org.apache.hadoop.fs.Path): 9 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 9 usages
Collectors (java.util.stream.Collectors): 8 usages
HoodieRecordSizeEstimator (org.apache.hudi.common.util.HoodieRecordSizeEstimator): 8 usages
Test (org.junit.jupiter.api.Test): 8 usages
UncheckedIOException (java.io.UncheckedIOException): 7 usages
Arrays (java.util.Arrays): 7 usages