
Example 1 with DEFAULT_SECOND_PARTITION_PATH

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testUpsertsUpdatePartitionPath.
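
For context, all three examples revolve around the fixed partition-path constants defined in HoodieTestDataGenerator. A minimal sketch of those declarations follows; the literal date values are assumptions based on the generator's defaults and are only meant to make the partition-swapping logic below easier to follow.

// Sketch of the constants referenced throughout these examples (values assumed; see HoodieTestDataGenerator)
public static final String DEFAULT_FIRST_PARTITION_PATH = "2016/03/15";
public static final String DEFAULT_SECOND_PARTITION_PATH = "2015/03/16";
public static final String DEFAULT_THIRD_PARTITION_PATH = "2015/03/17";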

/**
 * This test ensures that, for a global index with update partition path enabled in the config, an incoming record whose partition
 * does not match the one in storage is handled correctly, i.e. the old record is deleted from the old partition and the new one is
 * inserted into the new partition.
 * Test structure:
 * 1. Insert a first batch.
 * 2. Insert a second batch with a larger number of records so that a new file group is created per partition.
 * 3. Issue upserts for the records from batch 1 with a different partition path. This should ensure the batch 1 records are deleted
 * and the new records are upserted to the new partition.
 *
 * @param indexType index type to be tested for
 * @param config instance of {@link HoodieWriteConfig} to use
 * @param writeFn write function to be used for testing
 */
private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConfig config, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
    // instantiate client
    HoodieWriteConfig hoodieWriteConfig = getConfigBuilder()
        .withProps(config.getProps())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(10000).build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(indexType)
            .withBloomIndexUpdatePartitionPath(true)
            .withGlobalSimpleIndexUpdatePartitionPath(true)
            .build())
        .withTimelineLayoutVersion(VERSION_0)
        .build();
    HoodieTableMetaClient.withPropertyBuilder().fromMetaClient(metaClient).setTimelineLayoutVersion(VERSION_0).initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
    // Set the failed-writes cleaning policy to LAZY so inflight instants are not deleted
    hoodieWriteConfig.getProps().put(HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY.key(), HoodieFailedWritesCleaningPolicy.LAZY.name());
    SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
    // Write 1
    String newCommitTime = "001";
    int numRecords = 10;
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
    Set<Pair<String, String>> expectedPartitionPathRecKeyPairs = new HashSet<>();
    // populate expected partition path and record keys
    for (HoodieRecord rec : records) {
        expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
    }
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    String[] fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
    // verify one basefile per partition
    String[] fullExpectedPartitionPaths = getFullPartitionPaths(expectedPartitionPathRecKeyPairs.stream().map(Pair::getLeft).toArray(String[]::new));
    Map<String, Long> baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullExpectedPartitionPaths);
    for (Map.Entry<String, Long> entry : baseFileCounts.entrySet()) {
        assertEquals(1, entry.getValue());
    }
    assertTrue(baseFileCounts.entrySet().stream().allMatch(entry -> entry.getValue() == 1));
    // Write 2
    newCommitTime = "002";
    // so that a new file id is created
    numRecords = 20;
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> recordsSecondBatch = dataGen.generateInserts(newCommitTime, numRecords);
    // populate expected partition path and record keys
    for (HoodieRecord rec : recordsSecondBatch) {
        expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
    }
    writeRecords = jsc.parallelize(recordsSecondBatch, 1);
    result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
    // verify that at least one partition has more than one base file
    // records are distributed across partitions non-deterministically, so we can only verify that at least one partition ends up with more than one base file
    baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullPartitionPaths);
    assertTrue(baseFileCounts.entrySet().stream().filter(entry -> entry.getValue() > 1).count() >= 1, "At least one partition should have more than 1 base file after 2nd batch of writes");
    // Write 3 (upserts to records from batch 1 with diff partition path)
    newCommitTime = "003";
    // update to diff partition paths
    List<HoodieRecord> recordsToUpsert = new ArrayList<>();
    for (HoodieRecord rec : records) {
        // remove older entry from expected partition path record key pairs
        expectedPartitionPathRecKeyPairs.remove(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
        String partitionPath = rec.getPartitionPath();
        String newPartitionPath = null;
        if (partitionPath.equalsIgnoreCase(DEFAULT_FIRST_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_SECOND_PARTITION_PATH;
        } else if (partitionPath.equalsIgnoreCase(DEFAULT_SECOND_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_THIRD_PARTITION_PATH;
        } else if (partitionPath.equalsIgnoreCase(DEFAULT_THIRD_PARTITION_PATH)) {
            newPartitionPath = DEFAULT_FIRST_PARTITION_PATH;
        } else {
            throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath());
        }
        recordsToUpsert.add(new HoodieAvroRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), (HoodieRecordPayload) rec.getData()));
        // populate expected partition path and record keys
        expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey()));
    }
    writeRecords = jsc.parallelize(recordsToUpsert, 1);
    result = writeFn.apply(client, writeRecords, newCommitTime);
    result.collect();
    // Check the entire dataset has all records
    fullPartitionPaths = getFullPartitionPaths();
    assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
}
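
A minimal sketch of how this helper might be driven from a parameterized test. The enum source, the config wiring, and the upsert method reference used as writeFn are illustrative assumptions, not necessarily the exact wiring in TestHoodieClientOnCopyOnWriteStorage.

@ParameterizedTest
@EnumSource(value = IndexType.class, names = { "GLOBAL_BLOOM", "GLOBAL_SIMPLE" })
public void testUpsertsUpdatePartitionPathWithGlobalIndexes(IndexType indexType) throws Exception {
    // Assumed wiring: reuse the test's default config builder and route all writes through upsert
    HoodieWriteConfig config = getConfigBuilder().build();
    testUpsertsUpdatePartitionPath(indexType, config, SparkRDDWriteClient::upsert);
}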

Example 2 with DEFAULT_SECOND_PARTITION_PATH

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH in project hudi by apache.

From the class HoodieClientRollbackTestBase, method insertOverwriteCommitDataWithTwoPartitions.

protected void insertOverwriteCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, boolean commitSecondInsertOverwrite) throws IOException {
    // generate data for just two partitions
    dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
    HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    /**
     * Write 1 (upsert)
     */
    String newCommitTime = "001";
    List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    client.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    client.commit(newCommitTime, statuses);
    // get fileIds written
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
    List<HoodieFileGroup> firstPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, firstPartitionCommit1FileGroups.size());
    Set<String> partition1Commit1FileIds = firstPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet());
    List<HoodieFileGroup> secondPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, secondPartitionCommit1FileGroups.size());
    Set<String> partition2Commit1FileIds = secondPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet());
    /**
     * Write 2 (one insert_overwrite)
     */
    String commitActionType = HoodieTimeline.REPLACE_COMMIT_ACTION;
    newCommitTime = "002";
    records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
    writeRecords = jsc.parallelize(records, 1);
    client.startCommitWithTime(newCommitTime, commitActionType);
    HoodieWriteResult result = client.insertOverwrite(writeRecords, newCommitTime);
    statuses = result.getWriteStatuses();
    Assertions.assertNoWriteErrors(statuses.collect());
    if (commitSecondInsertOverwrite) {
        client.commit(newCommitTime, statuses, Option.empty(), commitActionType, result.getPartitionToReplaceFileIds());
    }
    metaClient.reloadActiveTimeline();
    // get new fileIds written as part of insert_overwrite
    fsView = getFileSystemViewWithUnCommittedSlices(metaClient);
    List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).filter(fg -> !partition1Commit1FileIds.contains(fg.getFileGroupId().getFileId())).collect(Collectors.toList());
    firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).filter(fg -> !partition2Commit1FileIds.contains(fg.getFileGroupId().getFileId())).collect(Collectors.toList());
    secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    assertEquals(1, firstPartitionCommit2FileSlices.size());
    assertEquals(1, secondPartitionCommit2FileSlices.size());
}
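
For orientation, a minimal sketch of how a concrete rollback test might call this helper. The @Test wrapper, the getConfigBuilder() call, and the trailing assertions are illustrative assumptions rather than the exact callers in the Hudi test suite.

@Test
public void testRollbackOfInsertOverwrite() throws IOException {
    List<FileSlice> firstPartitionCommit2FileSlices = new ArrayList<>();
    List<FileSlice> secondPartitionCommit2FileSlices = new ArrayList<>();
    HoodieWriteConfig cfg = getConfigBuilder().build(); // assumed config wiring
    // The helper writes commit "001" (upsert) and commit "002" (insert_overwrite) and
    // collects the file slices produced by the second commit into the two lists.
    insertOverwriteCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, true);
    // Each partition should expose exactly one new file slice from the insert_overwrite commit.
    assertEquals(1, firstPartitionCommit2FileSlices.size());
    assertEquals(1, secondPartitionCommit2FileSlices.size());
}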

Example 3 with DEFAULT_SECOND_PARTITION_PATH

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH in project hudi by apache.

From the class TestMergeOnReadRollbackActionExecutor, method testRollbackForCanIndexLogFile.

@Test
public void testRollbackForCanIndexLogFile() throws IOException {
    cleanupResources();
    setUpDFS();
    // 1. prepare data and assert data result
    // just generate one partition
    dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH });
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2)
        .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION)
        .withWriteStatusClass(MetadataMergeWriteStatus.class)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
        .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build())
        .forTable("test-trip-table")
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
        .withEmbeddedTimelineServerEnabled(true)
        .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
            // Fail the test if there is a problem connecting to the timeline server (no backup view)
            .withEnableBackupForRemoteFileSystemView(false)
            .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE)
            .build())
        .withRollbackUsingMarkers(false)
        .withAutoCommit(false)
        .build();
    // 1. prepare data
    new HoodieTestDataGenerator().writePartitionMetadata(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH }, basePath);
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    // Write 1 (only inserts)
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
    org.apache.hudi.testutils.Assertions.assertNoWriteErrors(statuses.collect());
    client.commit(newCommitTime, statuses);
    // check fileSlice
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
    List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, firstPartitionCommit2FileGroups.size());
    assertEquals(1, (int) firstPartitionCommit2FileGroups.get(0).getAllFileSlices().count());
    assertFalse(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getBaseFile().isPresent());
    assertEquals(1, firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getLogFiles().count());
    String generatedFileID = firstPartitionCommit2FileGroups.get(0).getFileGroupId().getFileId();
    // check hoodieCommitMeta
    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "001")).get(), HoodieCommitMetadata.class);
    List<HoodieWriteStat> firstPartitionWriteStat = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH);
    assertEquals(2, firstPartitionWriteStat.size());
    // there is one empty writeStat (no fileId) for the partition
    assert firstPartitionWriteStat.stream().anyMatch(wStat -> StringUtils.isNullOrEmpty(wStat.getFileId()));
    // there is one non-empty writeStat, which must contain inserts or updates
    assertEquals(1, firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).count());
    firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).forEach(wStat -> {
        assert wStat.getNumInserts() > 0;
    });
    // Write 2 (updates and inserts)
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> updateRecords = Collections.singletonList(dataGen.generateUpdateRecord(records.get(0).getKey(), newCommitTime));
    List<HoodieRecord> insertRecordsInSamePartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH);
    List<HoodieRecord> insertRecordsInOtherPartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_SECOND_PARTITION_PATH);
    List<HoodieRecord> recordsToBeWrite = Stream.concat(Stream.concat(updateRecords.stream(), insertRecordsInSamePartition.stream()), insertRecordsInOtherPartition.stream()).collect(Collectors.toList());
    writeRecords = jsc.parallelize(recordsToBeWrite, 1);
    statuses = client.upsert(writeRecords, newCommitTime);
    client.commit(newCommitTime, statuses);
    table = this.getHoodieTable(metaClient, cfg);
    commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime)).get(), HoodieCommitMetadata.class);
    assert commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_FIRST_PARTITION_PATH);
    assert commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_SECOND_PARTITION_PATH);
    List<HoodieWriteStat> hoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH);
    // Both the update and the insert records should land in the same existing file group due to small file handling
    assertEquals(1, hoodieWriteStatOptionList.size());
    assertEquals(generatedFileID, hoodieWriteStatOptionList.get(0).getFileId());
    // check insert and update numbers
    assertEquals(2, hoodieWriteStatOptionList.get(0).getNumInserts());
    assertEquals(1, hoodieWriteStatOptionList.get(0).getNumUpdateWrites());
    List<HoodieWriteStat> secondHoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_SECOND_PARTITION_PATH);
    // All inserts should go into one file group
    assertEquals(1, secondHoodieWriteStatOptionList.size());
    String fileIdInPartitionTwo = secondHoodieWriteStatOptionList.get(0).getFileId();
    assertEquals(2, secondHoodieWriteStatOptionList.get(0).getNumInserts());
    // Rollback
    HoodieInstant rollBackInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002");
    BaseRollbackPlanActionExecutor mergeOnReadRollbackPlanActionExecutor = new BaseRollbackPlanActionExecutor(context, cfg, table, "003", rollBackInstant, false, cfg.shouldRollbackUsingMarkers());
    mergeOnReadRollbackPlanActionExecutor.execute().get();
    MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor(context, cfg, table, "003", rollBackInstant, true, false);
    // 3. assert the rollback stat
    Map<String, HoodieRollbackPartitionMetadata> rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata();
    assertEquals(2, rollbackMetadata.size());
    // 4. assert the file groups after rollback and compare to the rollback stats
    // assert the first partition data and log file size
    HoodieRollbackPartitionMetadata partitionMetadata = rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH);
    assertTrue(partitionMetadata.getSuccessDeleteFiles().isEmpty());
    assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty());
    assertEquals(1, partitionMetadata.getRollbackLogFiles().size());
    // assert the second partition data and log file size
    partitionMetadata = rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH);
    assertEquals(1, partitionMetadata.getSuccessDeleteFiles().size());
    assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty());
    assertTrue(partitionMetadata.getRollbackLogFiles().isEmpty());
    assertEquals(1, partitionMetadata.getSuccessDeleteFiles().size());
}
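
A short note on why the rollback metadata looks asymmetric here: in the first partition, write "002" appended its update and inserts as log files to a file group that already held data from commit "001", so the rollback is recorded as rollback log files rather than deletions; in the second partition, the file group was created entirely by commit "002", so its files can simply be deleted, which is why that partition reports one successful delete and no rollback log files.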
