
Example 31 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testUpsertsUpdatePartitionPath:

/**
 * This test ensures that for a global index, when update partition path is set to true in the config and an incoming
 * record's partition does not match what is in storage, the appropriate actions are taken: the old record is deleted
 * from the old partition and the new one is inserted into the new partition.
 * Test structure:
 * 1. Insert the 1st batch.
 * 2. Insert a 2nd batch with a larger number of records, so that a new file group is created for the partitions.
 * 3. Issue upserts to records from batch 1 with a different partition path. This should ensure records from batch 1
 * are deleted and new records are upserted to the new partition.
 *
 * @param indexType index type to be tested for
 * @param config instance of {@link HoodieWriteConfig} to use
 * @param writeFn write function to be used for testing
 */
private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConfig config, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
    // instantiate client
    HoodieWriteConfig hoodieWriteConfig = getConfigBuilder()
        .withProps(config.getProps())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(10000).build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(indexType)
            .withBloomIndexUpdatePartitionPath(true)
            .withGlobalSimpleIndexUpdatePartitionPath(true)
            .build())
        .withTimelineLayoutVersion(VERSION_0)
        .build();
    HoodieTableMetaClient.withPropertyBuilder()
        .fromMetaClient(metaClient)
        .setTimelineLayoutVersion(VERSION_0)
        .initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
    // Set the failed-writes cleaning policy to LAZY so that inflight instants are not deleted on rollback
    hoodieWriteConfig.getProps().put(HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY.key(), HoodieFailedWritesCleaningPolicy.LAZY.name());
    SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
    // Write 1
    String newCommitTime = "001";
    int numRecords = 10;
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
    Set<Pair<String, String>> expectedPartitionPathRecKeyPairs = new HashSet<>();
    // populate expected partition path and record keys
    for (HoodieRecord rec : records) {
        expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
    }
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    String[] fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
    // verify there is exactly one base file per partition
    String[] fullExpectedPartitionPaths = getFullPartitionPaths(expectedPartitionPathRecKeyPairs.stream().map(Pair::getLeft).toArray(String[]::new));
    Map<String, Long> baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullExpectedPartitionPaths);
    for (Map.Entry<String, Long> entry : baseFileCounts.entrySet()) {
        assertEquals(1, entry.getValue());
    }
    assertTrue(baseFileCounts.entrySet().stream().allMatch(entry -> entry.getValue() == 1));
    // Write 2
    newCommitTime = "002";
    // so that a new file id is created
    numRecords = 20;
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> recordsSecondBatch = dataGen.generateInserts(newCommitTime, numRecords);
    // populate expected partition path and record keys
    for (HoodieRecord rec : recordsSecondBatch) {
        expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
    }
    writeRecords = jsc.parallelize(recordsSecondBatch, 1);
    result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
    // verify that at least one partition now has more than one base file:
    // we can't guarantee how records are distributed across partitions, so only assert that some partition gained a second file group
    baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullPartitionPaths);
    assertTrue(baseFileCounts.entrySet().stream().filter(entry -> entry.getValue() > 1).count() >= 1, "At least one partition should have more than 1 base file after 2nd batch of writes");
    // Write 3 (upserts to records from batch 1 with a different partition path)
    newCommitTime = "003";
    // move each record from batch 1 to a different partition path
    List<HoodieRecord> recordsToUpsert = new ArrayList<>();
    for (HoodieRecord rec : records) {
        // remove older entry from expected partition path record key pairs
        expectedPartitionPathRecKeyPairs.remove(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
        String partitionPath = rec.getPartitionPath();
        String newPartitionPath = null;
        if (partitionPath.equalsIgnoreCase(DEFAULT_FIRST_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_SECOND_PARTITION_PATH;
        } else if (partitionPath.equalsIgnoreCase(DEFAULT_SECOND_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_THIRD_PARTITION_PATH;
        } else if (partitionPath.equalsIgnoreCase(DEFAULT_THIRD_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_FIRST_PARTITION_PATH;
        } else {
            throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath());
        }
        recordsToUpsert.add(new HoodieAvroRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), (HoodieRecordPayload) rec.getData()));
        // populate expected partition path and record keys
        expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey()));
    }
    writeRecords = jsc.parallelize(recordsToUpsert, 1);
    result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) SparkSingleFileSortPlanStrategy(org.apache.hudi.client.clustering.plan.strategy.SparkSingleFileSortPlanStrategy) SparkTaskContextSupplier(org.apache.hudi.client.SparkTaskContextSupplier) HoodieWriteHelper(org.apache.hudi.table.action.commit.HoodieWriteHelper) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Future(java.util.concurrent.Future) Map(java.util.Map) EAGER(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy.EAGER) Tag(org.junit.jupiter.api.Tag) HoodieWriteResult(org.apache.hudi.client.HoodieWriteResult) REQUESTED(org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) BaseHoodieWriteClient(org.apache.hudi.client.BaseHoodieWriteClient) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) VERSION_0(org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion.VERSION_0) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) Executors(java.util.concurrent.Executors) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) ClusteringUtils(org.apache.hudi.common.util.ClusteringUtils) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) SqlQuerySingleResultPreCommitValidator(org.apache.hudi.client.validator.SqlQuerySingleResultPreCommitValidator) DEFAULT_THIRD_PARTITION_PATH(org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH) Mockito.mock(org.mockito.Mockito.mock) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) Assertions.fail(org.junit.jupiter.api.Assertions.fail) Dataset(org.apache.spark.sql.Dataset) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) DEFAULT_FIRST_PARTITION_PATH(org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) ArrayList(java.util.ArrayList) MarkerType(org.apache.hudi.common.table.marker.MarkerType) StringUtils(org.apache.hudi.common.util.StringUtils) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) Transformations.recordsToRecordKeySet(org.apache.hudi.common.testutils.Transformations.recordsToRecordKeySet) EXECUTION_STRATEGY_CLASS_NAME(org.apache.hudi.config.HoodieClusteringConfig.EXECUTION_STRATEGY_CLASS_NAME) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) TimelineLayoutVersion(org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion) 
ValueSource(org.junit.jupiter.params.provider.ValueSource) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieData(org.apache.hudi.common.data.HoodieData) RDDCustomColumnsSortPartitioner(org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) SqlQueryEqualityPreCommitValidator(org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator) DEFAULT_SECOND_PARTITION_PATH(org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) HoodieCorruptedDataException(org.apache.hudi.exception.HoodieCorruptedDataException) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) COMPLETED(org.apache.hudi.common.table.timeline.HoodieInstant.State.COMPLETED) REPLACE_COMMIT_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) CLEAN_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.CLEAN_ACTION) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) MethodSource(org.junit.jupiter.params.provider.MethodSource) HoodieRollbackException(org.apache.hudi.exception.HoodieRollbackException) SparkSingleFileSortExecutionStrategy(org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy) HoodiePreCommitValidatorConfig(org.apache.hudi.config.HoodiePreCommitValidatorConfig) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) IOType(org.apache.hudi.common.model.IOType) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) Collection(java.util.Collection) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) MarkerUtils(org.apache.hudi.common.util.MarkerUtils) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) List(java.util.List) 
FileCreateUtils.getBaseFileCountsForPaths(org.apache.hudi.common.testutils.FileCreateUtils.getBaseFileCountsForPaths) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) ROLLBACK_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) NotNull(org.jetbrains.annotations.NotNull) HoodieInsertException(org.apache.hudi.exception.HoodieInsertException) Transformations.randomSelectAsHoodieKeys(org.apache.hudi.common.testutils.Transformations.randomSelectAsHoodieKeys) INFLIGHT(org.apache.hudi.common.table.timeline.HoodieInstant.State.INFLIGHT) COMMIT_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ExecutorService(java.util.concurrent.ExecutorService) GenericRecord(org.apache.avro.generic.GenericRecord) ASYNC_CLUSTERING_ENABLE(org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE) TypedProperties(org.apache.hudi.common.config.TypedProperties) NULL_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.NULL_SCHEMA) Mockito.when(org.mockito.Mockito.when) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ClusteringTestUtils(org.apache.hudi.common.testutils.ClusteringTestUtils) SparkPreCommitValidator(org.apache.hudi.client.validator.SparkPreCommitValidator) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
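
The behavior under test hinges on the index configuration built at the top of the method. As a minimal sketch (reusing the test's getConfigBuilder() scaffolding and the same builder calls shown above), enabling partition-path updates for a global index comes down to:

// Minimal config sketch of the knobs the test exercises; all other settings
// keep the test-harness defaults.
HoodieWriteConfig cfg = getConfigBuilder()
    .withIndexConfig(HoodieIndexConfig.newBuilder()
        .withIndexType(IndexType.GLOBAL_BLOOM) // or IndexType.GLOBAL_SIMPLE
        .withBloomIndexUpdatePartitionPath(true)           // applies to the global bloom index
        .withGlobalSimpleIndexUpdatePartitionPath(true)    // applies to the global simple index
        .build())
    .build();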

Example 32 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From the class TestBulkInsertInternalPartitioner, method getCustomColumnComparator:

private Comparator<HoodieRecord<? extends HoodieRecordPayload>> getCustomColumnComparator(Schema schema, String[] sortColumns) {
    Comparator<HoodieRecord<? extends HoodieRecordPayload>> comparator = Comparator.comparing(record -> {
        try {
            GenericRecord genericRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
            StringBuilder sb = new StringBuilder();
            for (String col : sortColumns) {
                sb.append(genericRecord.get(col));
            }
            return sb.toString();
        } catch (IOException e) {
            // surface the cause and print the column names (requires java.util.Arrays)
            throw new HoodieIOException("Unable to read value for " + Arrays.toString(sortColumns), e);
        }
    });
    return comparator;
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)
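
A hypothetical usage of this comparator: order a generated batch by the "rider" column of TRIP_EXAMPLE_SCHEMA before a sort-based bulk insert (the instant time, record count, and column choice are illustrative):

// Hypothetical usage sketch; the raw Comparator cast mirrors how these tests
// handle the generator's raw HoodieRecord list.
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
List<HoodieRecord> batch = dataGen.generateInserts("001", 100);
Schema avroSchema = new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA);
batch.sort((Comparator) getCustomColumnComparator(avroSchema, new String[] {"rider"}));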

Example 33 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From the class HoodieMetadataPayload, method combineAndGetUpdateValue:

@Override
public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRecord, Schema schema, Properties properties) throws IOException {
    HoodieMetadataPayload anotherPayload = new HoodieMetadataPayload(Option.of((GenericRecord) oldRecord));
    HoodieRecordPayload combinedPayload = preCombine(anotherPayload);
    return combinedPayload.getInsertValue(schema, properties);
}
Also used : GenericRecord(org.apache.avro.generic.GenericRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)
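
For context, here is a minimal sketch of the HoodieRecordPayload contract that the override above builds on: preCombine arbitrates between two incoming records, while combineAndGetUpdateValue merges an incoming payload with the record already stored. The class is illustrative ("latest ordering value wins"), not a Hudi-provided payload, and it keeps a plain GenericRecord field for brevity where real payloads hold Avro-serialized bytes to stay Serializable.

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;

public class LatestWinsPayload implements HoodieRecordPayload<LatestWinsPayload> {

    private final GenericRecord record;  // real payloads keep Avro bytes instead
    private final long orderingVal;

    public LatestWinsPayload(GenericRecord record, long orderingVal) {
        this.record = record;
        this.orderingVal = orderingVal;
    }

    @Override
    public LatestWinsPayload preCombine(LatestWinsPayload oldValue) {
        // among records arriving in the same batch, keep the one with the larger ordering value
        return oldValue.orderingVal > this.orderingVal ? oldValue : this;
    }

    @Override
    public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema)
            throws IOException {
        // latest write wins: the stored record is simply replaced
        return getInsertValue(schema);
    }

    @Override
    public Option<IndexedRecord> getInsertValue(Schema schema) throws IOException {
        return Option.of(record);
    }
}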

Example 34 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From the class HoodieBackedTableMetadata, method readFromBaseAndMergeWithLogRecords:

private List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> readFromBaseAndMergeWithLogRecords(HoodieFileReader baseFileReader, List<String> keys, Map<String, Option<HoodieRecord<HoodieMetadataPayload>>> logRecords, List<Long> timings, String partitionName) throws IOException {
    List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> result = new ArrayList<>();
    // merge with base records
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieRecord<HoodieMetadataPayload> hoodieRecord = null;
    // Retrieve record from base file
    if (baseFileReader != null) {
        HoodieTimer readTimer = new HoodieTimer();
        Map<String, GenericRecord> baseFileRecords = baseFileReader.getRecordsByKeys(keys);
        for (String key : keys) {
            readTimer.startTimer();
            if (baseFileRecords.containsKey(key)) {
                hoodieRecord = getRecord(Option.of(baseFileRecords.get(key)), partitionName);
                metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer()));
                // merge base file record w/ log record if present
                if (logRecords.containsKey(key) && logRecords.get(key).isPresent()) {
                    HoodieRecordPayload mergedPayload = logRecords.get(key).get().getData().preCombine(hoodieRecord.getData());
                    result.add(Pair.of(key, Option.of(new HoodieAvroRecord(hoodieRecord.getKey(), mergedPayload))));
                } else {
                    // only base record
                    result.add(Pair.of(key, Option.of(hoodieRecord)));
                }
            } else {
                // only log record
                result.add(Pair.of(key, logRecords.get(key)));
            }
        }
        timings.add(timer.endTimer());
    } else {
        // no base file at all
        timings.add(timer.endTimer());
        for (Map.Entry<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : logRecords.entrySet()) {
            result.add(Pair.of(entry.getKey(), entry.getValue()));
        }
    }
    return result;
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) Option(org.apache.hudi.common.util.Option) GenericRecord(org.apache.avro.generic.GenericRecord) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Pair(org.apache.hudi.common.util.collection.Pair)
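
Each entry of the returned list pairs a key with an Option that may be empty (for example, when the log records for that key resolved to a delete). A hedged sketch of consuming the result:

// Sketch: walk the merged result; names follow the method's local variables.
for (Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : result) {
    if (entry.getRight().isPresent()) {
        HoodieMetadataPayload payload = entry.getRight().get().getData();
        // read metadata from the merged payload here
    }
}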

Example 35 with HoodieRecordPayload

Use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

From the class TestHoodieLogFileCommand, method testShowLogFileRecordsWithMerge:

/**
 * Test case for 'show logfile records' with merge.
 */
@Test
public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedException, URISyntaxException {
    // create commit instant
    HoodieTestCommitMetadataGenerator.createCommitFile(tablePath, INSTANT_TIME, HoodieCLI.conf);
    // write to path '2015/03/16'.
    Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
    partitionPath = tablePath + Path.SEPARATOR + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH;
    Files.createDirectories(Paths.get(partitionPath));
    HoodieLogFormat.Writer writer = null;
    try {
        // set a small size threshold so the log rolls over into multiple files
        writer = HoodieLogFormat.newWriterBuilder()
            .onParentPath(new Path(partitionPath))
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId("test-log-fileid1")
            .overBaseCommit(INSTANT_TIME)
            .withFs(fs)
            .withSizeThreshold(500)
            .build();
        List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
        Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
        header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
        HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
        writer.appendBlock(dataBlock);
    } finally {
        if (writer != null) {
            writer.close();
        }
    }
    CommandResult cr = shell().executeCommand("show logfile records --logFilePathPattern " + partitionPath + "/* --mergeRecords true");
    assertTrue(cr.isSuccess());
    // build the expected output from the first 10 merged records
    List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(partitionPath + "/*"))).map(status -> status.getPath().toString()).collect(Collectors.toList());
    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
        .withFileSystem(fs)
        .withBasePath(tablePath)
        .withLogFilePaths(logFilePaths)
        .withReaderSchema(schema)
        .withLatestInstantTime(INSTANT_TIME)
        .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
        .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
        .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
        .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
        .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
        .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
        .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
        .build();
    Iterator<HoodieRecord<? extends HoodieRecordPayload>> records = scanner.iterator();
    int num = 0;
    int maxSize = 10;
    List<IndexedRecord> indexRecords = new ArrayList<>();
    while (records.hasNext() && num < maxSize) {
        Option<IndexedRecord> hoodieRecord = records.next().getData().getInsertValue(schema);
        indexRecords.add(hoodieRecord.get());
        num++;
    }
    String[][] rows = indexRecords.stream().map(r -> new String[] { r.toString() }).toArray(String[][]::new);
    assertNotNull(rows);
    String expected = HoodiePrintHelper.print(new String[] { HoodieTableHeaderFields.HEADER_RECORDS }, rows);
    expected = removeNonWordAndStripSpace(expected);
    String got = removeNonWordAndStripSpace(cr.getResult().toString());
    assertEquals(expected, got);
}
Also used : Path(org.apache.hadoop.fs.Path) BeforeEach(org.junit.jupiter.api.BeforeEach) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) URISyntaxException(java.net.URISyntaxException) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Map(java.util.Map) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Tag(org.junit.jupiter.api.Tag) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) CLIFunctionalTestHarness(org.apache.hudi.cli.functional.CLIFunctionalTestHarness) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) IndexedRecord(org.apache.avro.generic.IndexedRecord) TimelineLayoutVersion(org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) Files(java.nio.file.Files) TableHeader(org.apache.hudi.cli.TableHeader) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) HoodieTestCommitMetadataGenerator(org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieCLI(org.apache.hudi.cli.HoodieCLI) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Test(org.junit.jupiter.api.Test) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) Paths(java.nio.file.Paths) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) CommandResult(org.springframework.shell.core.CommandResult) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) FSUtils(org.apache.hudi.common.fs.FSUtils)
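
With --mergeRecords true, the CLI collapses the multiple log versions of each key into a single record, which is what the comparison against the scanner output verifies. As a short sketch (using the scanner built in the test above and its getRecords() accessor):

// With merging enabled the scanner exposes at most one record per key, so
// the merged record count is simply the size of its record map.
Map<String, HoodieRecord<? extends HoodieRecordPayload>> merged = scanner.getRecords();
System.out.println("merged records: " + merged.size());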

Aggregations

HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 38 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 30 usages
Schema (org.apache.avro.Schema): 19 usages
IOException (java.io.IOException): 18 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 18 usages
IndexedRecord (org.apache.avro.generic.IndexedRecord): 14 usages
ArrayList (java.util.ArrayList): 12 usages
HashMap (java.util.HashMap): 12 usages
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 12 usages
Option (org.apache.hudi.common.util.Option): 12 usages
Map (java.util.Map): 11 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 11 usages
List (java.util.List): 9 usages
Path (org.apache.hadoop.fs.Path): 9 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 9 usages
Collectors (java.util.stream.Collectors): 8 usages
HoodieRecordSizeEstimator (org.apache.hudi.common.util.HoodieRecordSizeEstimator): 8 usages
Test (org.junit.jupiter.api.Test): 8 usages
UncheckedIOException (java.io.UncheckedIOException): 7 usages
Arrays (java.util.Arrays): 7 usages