Example 16 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestHoodieBackedMetadata, method verifyMetadataMergedRecords.

/**
 * Verify the metadata table in-memory merged records. Irrespective of key deduplication
 * config, the in-memory merged records should always have the key field in the record
 * payload fully materialized.
 *
 * @param metadataMetaClient    - Metadata table meta client
 * @param logFilePaths          - Metadata table log file paths
 * @param latestCommitTimestamp - Latest commit timestamp on the metadata table
 * @param enableMetaFields      - Enable meta fields
 */
private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClient, List<String> logFilePaths, String latestCommitTimestamp, boolean enableMetaFields) {
    Schema schema = HoodieMetadataRecord.getClassSchema();
    if (enableMetaFields) {
        // add the Hudi meta fields to the reader schema only when they are enabled
        schema = HoodieAvroUtils.addMetadataFields(schema);
    }
    HoodieMetadataMergedLogRecordReader logRecordReader = HoodieMetadataMergedLogRecordReader.newBuilder()
            .withFileSystem(metadataMetaClient.getFs())
            .withBasePath(metadataMetaClient.getBasePath())
            .withLogFilePaths(logFilePaths)
            .withLatestInstantTime(latestCommitTimestamp)
            .withPartition(MetadataPartitionType.FILES.getPartitionPath())
            .withReaderSchema(schema)
            .withMaxMemorySizeInBytes(100000L)
            .withBufferSize(4096)
            .withSpillableMapBasePath(tempDir.toString())
            .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK)
            .build();
    assertDoesNotThrow(() -> {
        logRecordReader.scan();
    }, "Metadata log records materialization failed");
    for (Map.Entry<String, HoodieRecord<? extends HoodieRecordPayload>> entry : logRecordReader.getRecords().entrySet()) {
        assertFalse(entry.getKey().isEmpty());
        assertFalse(entry.getValue().getRecordKey().isEmpty());
        assertEquals(entry.getKey(), entry.getValue().getRecordKey());
    }
}
Also used: HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieMetadataMergedLogRecordReader(org.apache.hudi.metadata.HoodieMetadataMergedLogRecordReader) Map(java.util.Map) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) HashMap(java.util.HashMap) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)
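
For context on the enableMetaFields toggle above: HoodieAvroUtils.addMetadataFields prepends the standard Hudi meta columns to a schema. A minimal sketch of its effect, using a made-up Simple schema rather than anything from the Hudi test suite:

Schema base = SchemaBuilder.record("Simple").fields()
        .requiredString("rider")
        .endRecord();
Schema withMeta = HoodieAvroUtils.addMetadataFields(base);
// The meta fields come first, ahead of the original columns:
// _hoodie_commit_time, _hoodie_commit_seqno, _hoodie_record_key,
// _hoodie_partition_path, _hoodie_file_name, rider
withMeta.getFields().forEach(f -> System.out.println(f.name()));

(SchemaBuilder is org.apache.avro.SchemaBuilder; the other names match the import list above.)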

Example 17 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestBulkInsertInternalPartitioner, method testCustomColumnSortPartitioner.

@Test
public void testCustomColumnSortPartitioner() throws Exception {
    String sortColumnString = "rider";
    String[] sortColumns = sortColumnString.split(",");
    Comparator<HoodieRecord<? extends HoodieRecordPayload>> columnComparator = getCustomColumnComparator(HoodieTestDataGenerator.AVRO_SCHEMA, sortColumns);
    JavaRDD<HoodieRecord> records1 = generateTestRecordsForBulkInsert(jsc);
    JavaRDD<HoodieRecord> records2 = generateTripleTestRecordsForBulkInsert(jsc);
    testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(sortColumns, HoodieTestDataGenerator.AVRO_SCHEMA, false),
            records1, true, true, generateExpectedPartitionNumRecords(records1), Option.of(columnComparator));
    testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(sortColumns, HoodieTestDataGenerator.AVRO_SCHEMA, false),
            records2, true, true, generateExpectedPartitionNumRecords(records2), Option.of(columnComparator));
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
            .withPath("/")
            .withSchema(TRIP_EXAMPLE_SCHEMA)
            .withUserDefinedBulkInsertPartitionerClass(RDDCustomColumnsSortPartitioner.class.getName())
            .withUserDefinedBulkInsertPartitionerSortColumns(sortColumnString)
            .build();
    testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(config),
            records1, true, true, generateExpectedPartitionNumRecords(records1), Option.of(columnComparator));
    testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(config),
            records2, true, true, generateExpectedPartitionNumRecords(records2), Option.of(columnComparator));
}
Also used: HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
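
The getCustomColumnComparator helper is not part of this listing. Below is a hedged reconstruction: it materializes each record's payload via getInsertValue and compares the concatenated string form of the sort columns, which is one plausible way to order by "rider"; the upstream helper may differ in detail.

private static Comparator<HoodieRecord<? extends HoodieRecordPayload>> getCustomColumnComparator(Schema schema, String[] sortColumns) {
    // Sketch only: build a sort key from the requested columns of each record.
    return Comparator.comparing(record -> {
        try {
            GenericRecord avro = (GenericRecord) record.getData().getInsertValue(schema).get();
            StringBuilder key = new StringBuilder();
            for (String column : sortColumns) {
                key.append(avro.get(column));
            }
            return key.toString();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    });
}

(Assumes java.util.Comparator, java.io.IOException, java.io.UncheckedIOException, org.apache.avro.Schema and org.apache.avro.generic.GenericRecord on top of the imports listed above.)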

Example 18 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestBulkInsertInternalPartitioner, method testBulkInsertInternalPartitioner.

private void testBulkInsertInternalPartitioner(BulkInsertPartitioner partitioner, JavaRDD<HoodieRecord> records,
        boolean isGloballySorted, boolean isLocallySorted, Map<String, Long> expectedPartitionNumRecords,
        Option<Comparator<HoodieRecord<? extends HoodieRecordPayload>>> comparator) {
    int numPartitions = 2;
    JavaRDD<HoodieRecord<? extends HoodieRecordPayload>> actualRecords =
            (JavaRDD<HoodieRecord<? extends HoodieRecordPayload>>) partitioner.repartitionRecords(records, numPartitions);
    assertEquals(numPartitions, actualRecords.getNumPartitions());
    List<HoodieRecord<? extends HoodieRecordPayload>> collectedActualRecords = actualRecords.collect();
    if (isGloballySorted) {
        // Verify global order
        verifyRecordAscendingOrder(collectedActualRecords, comparator);
    } else if (isLocallySorted) {
        // Verify local order
        actualRecords.mapPartitions(partition -> {
            List<HoodieRecord<? extends HoodieRecordPayload>> partitionRecords = new ArrayList<>();
            partition.forEachRemaining(partitionRecords::add);
            verifyRecordAscendingOrder(partitionRecords, comparator);
            return Collections.emptyList().iterator();
        }).collect();
    }
    // Verify number of records per partition path
    Map<String, Long> actualPartitionNumRecords = new HashMap<>();
    for (HoodieRecord record : collectedActualRecords) {
        String partitionPath = record.getPartitionPath();
        actualPartitionNumRecords.put(partitionPath, actualPartitionNumRecords.getOrDefault(partitionPath, 0L) + 1);
    }
    assertEquals(expectedPartitionNumRecords, actualPartitionNumRecords);
}
Also used: HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) JavaRDD(org.apache.spark.api.java.JavaRDD)
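
verifyRecordAscendingOrder is likewise elided from this listing. One hedged way to write it: sort a copy with the supplied comparator (falling back to record-key order when none is given) and assert the original sequence already matches, relying on List.sort being stable.

private void verifyRecordAscendingOrder(List<HoodieRecord<? extends HoodieRecordPayload>> records,
        Option<Comparator<HoodieRecord<? extends HoodieRecordPayload>>> comparator) {
    // Sketch only: sorting must be a no-op if the records already arrived in order.
    Comparator<HoodieRecord<? extends HoodieRecordPayload>> effective =
            comparator.isPresent() ? comparator.get() : Comparator.comparing(HoodieRecord::getRecordKey);
    List<HoodieRecord<? extends HoodieRecordPayload>> expected = new ArrayList<>(records);
    expected.sort(effective);
    assertEquals(expected, records);
}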

Example 19 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class HoodieWriteableTestTable, method appendRecordsToLogFile.

private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
    String partitionPath = groupedRecords.get(0).getPartitionPath();
    HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
    try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder()
            .onParentPath(new Path(basePath, partitionPath))
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId(location.getFileId())
            .overBaseCommit(location.getInstantTime())
            .withFs(fs)
            .build()) {
        Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
        header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
        logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
            try {
                // materialize the Avro record from the payload and stamp the Hoodie key fields onto it
                GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
                HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
                return (IndexedRecord) val;
            } catch (IOException e) {
                LOG.warn("Failed to convert record " + r, e);
                return null;
            }
        }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
        return Pair.of(partitionPath, logWriter.getLogFile());
    }
}
Also used: Path(org.apache.hadoop.fs.Path) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieAvroParquetConfig(org.apache.hudi.io.storage.HoodieAvroParquetConfig) FileCreateUtils.baseFileName(org.apache.hudi.common.testutils.FileCreateUtils.baseFileName) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieParquetWriter(org.apache.hudi.io.storage.HoodieParquetWriter) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) CompressionKind(org.apache.orc.CompressionKind) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) HoodieOrcWriter(org.apache.hudi.io.storage.HoodieOrcWriter) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieOrcConfig(org.apache.hudi.io.storage.HoodieOrcConfig) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Paths(java.nio.file.Paths) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) LogManager(org.apache.log4j.LogManager) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) Pair(org.apache.hudi.common.util.collection.Pair)
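
Example 19 leans on the payload contract: getInsertValue is what turns the opaque payload back into an Avro record before it is appended to the log block. A hedged minimal sketch of that contract, wrapping a GenericRecord directly (real payloads such as HoodieAvroPayload store Avro-encoded bytes instead, since payloads must be serializable):

public class PassthroughPayload implements HoodieRecordPayload<PassthroughPayload> {

    private final Option<GenericRecord> record;

    public PassthroughPayload(Option<GenericRecord> record) {
        this.record = record;
    }

    @Override
    public PassthroughPayload preCombine(PassthroughPayload oldValue) {
        // keep the incoming value when two records share a key within one batch
        return this;
    }

    @Override
    public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException {
        // updates simply replace whatever is stored for the key
        return getInsertValue(schema);
    }

    @Override
    public Option<IndexedRecord> getInsertValue(Schema schema) throws IOException {
        // the call made on each record in appendRecordsToLogFile above
        return record.map(r -> (IndexedRecord) r);
    }
}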

Example 20 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From class TestExternalSpillableMap, method testAllMapOperations.

@ParameterizedTest
@MethodSource("testArguments")
public void testAllMapOperations(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException {
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    String payloadClazz = HoodieAvroPayload.class.getName();
    // only 16 bytes of in-memory budget, so most values spill to disk
    ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
            new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(),
                    new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled);
    List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
    // insert a bunch of records so that values spill to disk too
    List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
    IndexedRecord inMemoryRecord = iRecords.get(0);
    String ikey = ((GenericRecord) inMemoryRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
    String iPartitionPath = ((GenericRecord) inMemoryRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
    HoodieRecord inMemoryHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(ikey, iPartitionPath), new HoodieAvroPayload(Option.of((GenericRecord) inMemoryRecord)));
    IndexedRecord onDiskRecord = iRecords.get(99);
    String dkey = ((GenericRecord) onDiskRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
    String dPartitionPath = ((GenericRecord) onDiskRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
    HoodieRecord onDiskHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(dkey, dPartitionPath), new HoodieAvroPayload(Option.of((GenericRecord) onDiskRecord)));
    // assert size
    assert records.size() == 100;
    // get should return the same HoodieKey, same location and same value
    assert inMemoryHoodieRecord.getKey().equals(records.get(ikey).getKey());
    assert onDiskHoodieRecord.getKey().equals(records.get(dkey).getKey());
    // compare the member variables of HoodieRecord not set by the constructor
    assert records.get(ikey).getCurrentLocation().getFileId().equals(SpillableMapTestUtils.DUMMY_FILE_ID);
    assert records.get(ikey).getCurrentLocation().getInstantTime().equals(SpillableMapTestUtils.DUMMY_COMMIT_TIME);
    // test contains
    assertTrue(records.containsKey(ikey));
    assertTrue(records.containsKey(dkey));
    // test isEmpty
    assertFalse(records.isEmpty());
    // test containsAll
    assertTrue(records.keySet().containsAll(recordKeys));
    // remove (from inMemory and onDisk)
    HoodieRecord removedRecord = records.remove(ikey);
    assertTrue(removedRecord != null);
    assertFalse(records.containsKey(ikey));
    removedRecord = records.remove(dkey);
    assertTrue(removedRecord != null);
    assertFalse(records.containsKey(dkey));
    // test clear
    records.clear();
    assertTrue(records.size() == 0);
}
Also used: HoodieRecordSizeEstimator(org.apache.hudi.common.util.HoodieRecordSizeEstimator) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
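
Given the deliberately tiny 16-byte in-memory budget above, nearly all of the 100 records should land in the disk-backed map. A hedged sketch of how the spill split could be inspected after the upsert; these accessor names exist in recent Hudi releases, but verify them against your version:

// after SpillableMapTestUtils.upsertRecords(iRecords, records) above
long inMemory = records.getInMemoryMapNumEntries();
long onDisk = records.getDiskBasedMapNumEntries();
assertEquals(100, inMemory + onDisk);
// with a 16-byte budget, the bulk of the entries should have spilled
assertTrue(onDisk > 0);
System.out.println("in-memory entries=" + inMemory
        + ", in-memory bytes=" + records.getCurrentInMemoryMapSize()
        + ", on-disk entries=" + onDisk);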

Aggregations

HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 38 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 30 usages
Schema (org.apache.avro.Schema): 19 usages
IOException (java.io.IOException): 18 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 18 usages
IndexedRecord (org.apache.avro.generic.IndexedRecord): 14 usages
ArrayList (java.util.ArrayList): 12 usages
HashMap (java.util.HashMap): 12 usages
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 12 usages
Option (org.apache.hudi.common.util.Option): 12 usages
Map (java.util.Map): 11 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 11 usages
List (java.util.List): 9 usages
Path (org.apache.hadoop.fs.Path): 9 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 9 usages
Collectors (java.util.stream.Collectors): 8 usages
HoodieRecordSizeEstimator (org.apache.hudi.common.util.HoodieRecordSizeEstimator): 8 usages
Test (org.junit.jupiter.api.Test): 8 usages
UncheckedIOException (java.io.UncheckedIOException): 7 usages
Arrays (java.util.Arrays): 7 usages