
Example 1 with DEFAULT_THIRD_PARTITION_PATH

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH in the Apache Hudi project.
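DEFAULT_THIRD_PARTITION_PATH is one of three partition-path constants (alongside DEFAULT_FIRST_PARTITION_PATH and DEFAULT_SECOND_PARTITION_PATH) that HoodieTestDataGenerator exposes so tests can write data into a small, fixed set of partitions; the test below rotates records across all three to exercise partition-path updates.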

From the class TestHoodieClientOnCopyOnWriteStorage, the method testUpsertsUpdatePartitionPath is shown below.
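Background: with a partition-scoped (non-global) index, a record key is looked up only within the incoming record's own partition, so a record arriving with a changed partition path would be treated as a fresh insert and leave a duplicate behind in the old partition. A global index (GLOBAL_BLOOM or GLOBAL_SIMPLE here) matches keys across all partitions, and the update-partition-path flags set in the config decide whether such a record is migrated to the incoming partition or updated in place in the partition already on storage.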

/**
 * This test ensures that with a global index, when update partition path is set to true in the config and an
 * incoming record's partition differs from what is in storage, the appropriate actions are taken: the old record
 * is deleted from the old partition and a new one is inserted into the new partition.
 * Test structure:
 * 1. Insert a first batch of records.
 * 2. Insert a second batch with a larger number of records so that new file groups are created in the partitions.
 * 3. Issue upserts to the records from batch 1 with different partition paths. This should ensure the records
 * from batch 1 are deleted from their original partitions and upserted into the new partitions.
 *
 * @param indexType index type to be tested
 * @param config instance of {@link HoodieWriteConfig} to use
 * @param writeFn write function to be used for testing
 */
private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConfig config, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
    // instantiate client
    HoodieWriteConfig hoodieWriteConfig = getConfigBuilder()
        .withProps(config.getProps())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .compactionSmallFileSize(10000)
            .build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(indexType)
            .withBloomIndexUpdatePartitionPath(true)
            .withGlobalSimpleIndexUpdatePartitionPath(true)
            .build())
        .withTimelineLayoutVersion(VERSION_0)
        .build();
    HoodieTableMetaClient.withPropertyBuilder()
        .fromMetaClient(metaClient)
        .setTimelineLayoutVersion(VERSION_0)
        .initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
    // Set the failed-writes cleaning policy to LAZY so that inflight instants are not deleted
    hoodieWriteConfig.getProps().put(HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY.key(), HoodieFailedWritesCleaningPolicy.LAZY.name());
    SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
    // Write 1
    String newCommitTime = "001";
    int numRecords = 10;
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
    Set<Pair<String, String>> expectedPartitionPathRecKeyPairs = new HashSet<>();
    // populate expected partition path and record keys
    for (HoodieRecord rec : records) {
        expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
    }
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    String[] fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
    // verify there is exactly one base file per partition
    String[] fullExpectedPartitionPaths = getFullPartitionPaths(expectedPartitionPathRecKeyPairs.stream().map(Pair::getLeft).toArray(String[]::new));
    Map<String, Long> baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullExpectedPartitionPaths);
    for (Map.Entry<String, Long> entry : baseFileCounts.entrySet()) {
        assertEquals(1, entry.getValue(), "Expected exactly one base file per partition after the first batch");
    }
    // Write 2
    newCommitTime = "002";
    // insert a larger batch so that new file ids are created
    numRecords = 20;
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> recordsSecondBatch = dataGen.generateInserts(newCommitTime, numRecords);
    // populate expected partition path and record keys
    for (HoodieRecord rec : recordsSecondBatch) {
        expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
    }
    writeRecords = jsc.parallelize(recordsSecondBatch, 1);
    result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
    // verify that at least one partition has more than one base file
    // we can't control how records are distributed across partitions, so only verify that some partition has more than one base file
    baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullPartitionPaths);
    assertTrue(baseFileCounts.values().stream().anyMatch(count -> count > 1), "At least one partition should have more than 1 base file after 2nd batch of writes");
    // Write 3 (upserts to records from batch 1 with different partition paths)
    newCommitTime = "003";
    client.startCommitWithTime(newCommitTime);
    // update records to different partition paths
    List<HoodieRecord> recordsToUpsert = new ArrayList<>();
    for (HoodieRecord rec : records) {
        // remove older entry from expected partition path record key pairs
        expectedPartitionPathRecKeyPairs.remove(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
        String partitionPath = rec.getPartitionPath();
        String newPartitionPath = null;
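        // rotate the record's partition: first -> second -> third -> first, keeping
        // the record key unchanged so the global index treats it as an update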
        if (partitionPath.equalsIgnoreCase(DEFAULT_FIRST_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_SECOND_PARTITION_PATH;
        } else if (partitionPath.equalsIgnoreCase(DEFAULT_SECOND_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_THIRD_PARTITION_PATH;
        } else if (partitionPath.equalsIgnoreCase(DEFAULT_THIRD_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_FIRST_PARTITION_PATH;
        } else {
            throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath());
        }
        recordsToUpsert.add(new HoodieAvroRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), (HoodieRecordPayload) rec.getData()));
        // populate expected partition path and record keys
        expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey()));
    }
    writeRecords = jsc.parallelize(recordsToUpsert, 1);
    result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
}
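
For reference, here is a minimal sketch of the two configuration switches this test exercises, outside the test harness. The builder methods are the same ones the test uses; the class name, method name, and table path below are illustrative assumptions, not part of the Hudi API.

import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;

public class GlobalIndexConfigExample {

    // Builds a write config that routes cross-partition updates to the new
    // partition instead of updating the record in place in the old one.
    static HoodieWriteConfig buildConfig(String basePath) {
        return HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withIndexConfig(HoodieIndexConfig.newBuilder()
                // A global index matches record keys across all partitions.
                .withIndexType(HoodieIndex.IndexType.GLOBAL_BLOOM)
                // When true, an incoming record whose partition path differs from
                // storage is deleted from the old partition and inserted into the
                // new one; when false, it is updated in place in the old partition.
                .withBloomIndexUpdatePartitionPath(true)
                // The equivalent switch for the GLOBAL_SIMPLE index type.
                .withGlobalSimpleIndexUpdatePartitionPath(true)
                .build())
            .build();
    }
}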