Search in sources :

Example 6 with HoodieWriteConfig

use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

the class TestCleaner method testKeepLatestFileVersions.

/**
 * Test Hudi COW Table Cleaner - Keep the latest file versions policy.
 *
 * <p>The test writes three commits through a metadata-backed {@code HoodieTestTable}, runs the cleaner
 * after each one with {@code retainFileVersions(1)}, and checks which base files remain. A final step
 * adds a base file for a fourth instant without going through a write operation and verifies that the
 * cleaner reports nothing.
 *
 * @param enableBootstrapSourceClean when true, bootstrap index and source data are generated for both
 *        partitions and the test additionally expects the bootstrap source files to be deleted
 * @throws Exception in case of errors
 */
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()).withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean).withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()).build();
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
    HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
    final String p0 = "2020/01/01";
    final String p1 = "2020/01/02";
    final Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
    // make 1 commit, with 1 file per partition
    final String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString();
    final String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
    c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100)));
    c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200)));
    testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1), c1PartitionToFilesNameLengthMap, false, false);
    List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
    assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next commit, with 1 insert & 1 update per partition
    final String file2P0C1 = UUID.randomUUID().toString();
    final String file2P1C1 = UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c2PartitionToFilesNameLengthMap = new HashMap<>();
    c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100)));
    c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200)));
    testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(), c2PartitionToFilesNameLengthMap, false, false);
    // enableBootstrapSourceClean would delete the bootstrap base file at the same time
    List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, 1);
    HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0);
    assertEquals(enableBootstrapSourceClean ? 2 : 1, countDeletedFiles(cleanStat), "Must clean at least 1 file");
    if (enableBootstrapSourceClean) {
        assertBootstrapBaseFileDeleted(cleanStat, bootstrapMapping.get(p0).get(0));
    }
    cleanStat = getCleanStat(hoodieCleanStatsTwo, p1);
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1));
    assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    assertEquals(enableBootstrapSourceClean ? 2 : 1, countDeletedFiles(cleanStat), "Must clean at least 1 file");
    if (enableBootstrapSourceClean) {
        assertBootstrapBaseFileDeleted(cleanStat, bootstrapMapping.get(p1).get(0));
    }
    // make next commit, with 2 updates to existing files, and 1 insert
    final String file3P0C2 = UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c3PartitionToFilesNameLengthMap = new HashMap<>();
    c3PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 102), Pair.of(file2P0C1, 101), Pair.of(file3P0C2, 100)));
    testTable.doWriteOperation("00000000000003", WriteOperationType.UPSERT, Collections.emptyList(), c3PartitionToFilesNameLengthMap, false, false);
    List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config, 3);
    assertEquals(2, getCleanStat(hoodieCleanStatsThree, p0).getSuccessDeleteFiles().size(), "Must clean two files");
    assertFalse(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
    assertFalse(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
    // No cleaning on partially written file, with no commit.
    testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2);
    List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config);
    assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
}

/**
 * Sums the successfully deleted files and the successfully deleted bootstrap base files of a clean stat.
 * A {@code null} bootstrap list counts as zero.
 */
private static int countDeletedFiles(HoodieCleanStat cleanStat) {
    int deletedFiles = cleanStat.getSuccessDeleteFiles().size();
    int deletedBootstrapFiles = cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size();
    return deletedFiles + deletedBootstrapFiles;
}

/**
 * Asserts that the bootstrap source file of the given mapping is listed in the clean stat and no longer exists on disk.
 */
private static void assertBootstrapBaseFileDeleted(HoodieCleanStat cleanStat, BootstrapFileMapping mapping) {
    HoodieFileStatus fstatus = mapping.getBootstrapFileStatus();
    // This ensures full path is recorded in metadata.
    assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + " but did not contain " + fstatus.getPath().getUri());
    assertFalse(Files.exists(Paths.get(fstatus.getPath().getUri())));
}
Also used : HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) HashMap(java.util.HashMap) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ArrayList(java.util.ArrayList) List(java.util.List) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 7 with HoodieWriteConfig

use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

the class TestCleaner method testKeepLatestVersionsWithPendingCompactions.

/**
 * Runs the shared pending-compaction scenario against a cleaner configured with the
 * KEEP_LATEST_FILE_VERSIONS policy and two retained file versions.
 *
 * @param retryFailure forwarded unchanged to {@code testPendingCompactions}
 * @throws Exception in case of errors
 */
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testKeepLatestVersionsWithPendingCompactions(boolean retryFailure) throws Exception {
    HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
        .withAssumeDatePartitioning(true)
        .build();
    HoodieCompactionConfig cleanerConfig = HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
        .retainFileVersions(2)
        .build();
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withMetadataConfig(metadataConfig)
        .withCompactionConfig(cleanerConfig)
        .build();
    // Expected deletions per file id (the "Total" column adds up to the 36 passed below):
    // . FileId Base Logs Total Retained Commits
    // FileId7 5 10 15 009, 011
    // FileId6 4 8 12 007, 009
    // FileId5 2 4 6 003 005
    // FileId4 1 2 3 001, 003
    // FileId3 0 0 0 000, 001
    // FileId2 0 0 0 000
    // FileId1 0 0 0 000
    testPendingCompactions(config, 36, 9, retryFailure);
}
Also used : HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 8 with HoodieWriteConfig

use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

the class TestCleaner method testKeepLatestCommits.

/**
 * Test HoodieTable.clean() Cleaning by commit logic for COW table.
 *
 * <p>The test completes four commits one at a time under the KEEP_LATEST_COMMITS policy with
 * {@code retainCommits(2)}, running the cleaner after each, and then leaves a fifth commit inflight.
 * Only the clean after the fourth commit is expected to delete anything.
 *
 * @param simulateFailureRetry forwarded to {@code runCleaner} on every invocation
 * @param enableIncrementalClean value passed to {@code withIncrementalCleaningMode}
 * @param enableBootstrapSourceClean when true, bootstrap index and source data are generated and the
 *        bootstrap source file of the first partition is expected to be deleted as well
 * @throws Exception in case of errors
 */
@ParameterizedTest
@MethodSource("argumentsForTestKeepLatestCommits")
public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()).withCompactionConfig(HoodieCompactionConfig.newBuilder().withIncrementalCleaningMode(enableIncrementalClean).withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER).withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean).withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()).build();
    HoodieTestTable testTable = HoodieTestTable.of(metaClient);
    String p0 = "2020/01/01";
    String p1 = "2020/01/02";
    Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
    // make 1 commit, with 1 file per partition
    String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString();
    String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString();
    testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
    // Plain HashMap instead of double-brace initialization: no anonymous subclass that captures the test instance.
    Map<String, List<String>> commit1PartitionToFiles = new HashMap<>();
    commit1PartitionToFiles.put(p0, CollectionUtils.createImmutableList(file1P0C0));
    commit1PartitionToFiles.put(p1, CollectionUtils.createImmutableList(file1P1C0));
    HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001", Collections.unmodifiableMap(commit1PartitionToFiles));
    metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    metaClient = HoodieTableMetaClient.reload(metaClient);
    List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config, simulateFailureRetry);
    assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next commit, with 1 insert & 1 update per partition
    Map<String, String> partitionAndFileId002 = testTable.addInflightCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0, p1);
    String file2P0C1 = partitionAndFileId002.get(p0);
    String file2P1C1 = partitionAndFileId002.get(p1);
    testTable.forCommit("00000000000002").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
    Map<String, List<String>> commit2PartitionToFiles = new HashMap<>();
    commit2PartitionToFiles.put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1));
    commit2PartitionToFiles.put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1));
    commitMetadata = generateCommitMetadata("00000000000002", commit2PartitionToFiles);
    metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000002"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, simulateFailureRetry);
    assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions and clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next commit, with 2 updates to existing files, and 1 insert
    String file3P0C2 = testTable.addInflightCommit("00000000000003").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p0, file2P0C1).getFileIdsWithBaseFilesInPartitions(p0).get(p0);
    commitMetadata = generateCommitMetadata("00000000000003", CollectionUtils.createImmutableMap(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file3P0C2)));
    metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000003"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config, simulateFailureRetry);
    assertEquals(0, hoodieCleanStatsThree.size(), "Must not clean any file. We have to keep 1 version before the latest commit time to keep");
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    // make next commit, with 2 updates to existing files, and 1 insert
    String file4P0C3 = testTable.addInflightCommit("00000000000004").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p0, file2P0C1).getFileIdsWithBaseFilesInPartitions(p0).get(p0);
    commitMetadata = generateCommitMetadata("00000000000004", CollectionUtils.createImmutableMap(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file4P0C3)));
    metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000004"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config, simulateFailureRetry);
    // enableBootstrapSourceClean would delete the bootstrap base file at the same time
    HoodieCleanStat partitionCleanStat = getCleanStat(hoodieCleanStatsFour, p0);
    assertEquals(enableBootstrapSourceClean ? 2 : 1, partitionCleanStat.getSuccessDeleteFiles().size() + (partitionCleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 : partitionCleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least one old file");
    assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
    assertTrue(testTable.baseFileExists(p0, "00000000000003", file1P0C0));
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000003", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
    assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3));
    if (enableBootstrapSourceClean) {
        assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p0).get(0).getBootstrapFileStatus().getPath().getUri())));
    }
    // No cleaning on partially written file, with no commit.
    testTable.forCommit("00000000000005").withBaseFilesInPartition(p0, file3P0C2);
    commitMetadata = generateCommitMetadata("00000000000005", CollectionUtils.createImmutableMap(p0, CollectionUtils.createImmutableList(file3P0C2)));
    // The fifth instant is only moved from requested to inflight; it is never saved as complete.
    metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000005"));
    metaClient.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000005"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    List<HoodieCleanStat> hoodieCleanStatsFive = runCleaner(config, simulateFailureRetry);
    HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsFive, p0);
    assertNull(cleanStat, "Must not clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000005", file3P0C2));
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HashMap(java.util.HashMap) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ArrayList(java.util.ArrayList) List(java.util.List) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 9 with HoodieWriteConfig

use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

the class TestCleaner method testFailedInsertAndCleanByCommits.

/**
 * Test Helper for Cleaning failed commits by commits logic from HoodieWriteClient API perspective.
 *
 * <p>One batch is inserted and committed, then three further batches are inserted through the
 * "failed" helper. Once the heartbeat of the last failed write has expired, the cleaner is run and
 * the timeline is expected to hold three completed rollbacks.
 *
 * @param insertFn Insert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
 *        record generation to also tag the records (de-dupe is implicit as we use uniq record-gen APIs)
 * @throws Exception in case of errors
 */
private void testFailedInsertAndCleanByCommits(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, boolean isPreppedAPI) throws Exception {
    // keep up to 3 commits from the past
    int maxCommits = 3;
    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withHeartbeatIntervalInMs(3000).withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()).withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()).build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
    Pair<String, JavaRDD<WriteStatus>> result = insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    client.commit(result.getLeft(), result.getRight());
    HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient);
    assertTrue(table.getCompletedCleanTimeline().empty());
    insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    Pair<String, JavaRDD<WriteStatus>> ret = insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    // Await till enough time passes such that the last failed commits heartbeats are expired
    await().atMost(10, TimeUnit.SECONDS).until(() -> client.getHeartbeatClient().isHeartbeatExpired(ret.getLeft()));
    List<HoodieCleanStat> cleanStats = runCleaner(cfg);
    assertEquals(0, cleanStats.size(), "Must not clean any files");
    HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
    // Compute the completed-rollback view once and reuse it for both the count and the last instant.
    HoodieTimeline completedRollbacks = timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants();
    // assertEquals (rather than assertTrue on ==) reports the actual count when the check fails.
    assertEquals(3, completedRollbacks.countInstants(), "Must have 3 completed rollbacks");
    Option<HoodieInstant> rollBackInstantForFailedCommit = completedRollbacks.lastInstant();
    HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
    // Rollback of one of the failed writes should have deleted 3 files
    assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ArrayList(java.util.ArrayList) List(java.util.List)

Example 10 with HoodieWriteConfig

use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

the class TestCleaner method testKeepLatestFileVersionsMOR.

/**
 * Exercises HoodieTable.clean() version-based cleaning on a MERGE_ON_READ table that carries log files.
 *
 * <p>Two delta commits are laid down for one file id; with a single retained file version the cleaner
 * is expected to remove the base file and both log files of the first delta commit.
 *
 * @throws Exception in case of errors
 */
@Test
public void testKeepLatestFileVersionsMOR() throws Exception {
    final String partition = "2020/01/01";
    final String firstDeltaCommit = "000";
    final String secondDeltaCommit = "001";

    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
            .retainFileVersions(1)
            .build())
        .build();
    HoodieTableMetaClient morMetaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
    HoodieTestTable morTable = HoodieTestTable.of(morMetaClient);

    // First delta commit: 3 files, one base file plus 2 log files attached to it
    Map<String, String> createdFileIds = morTable.addDeltaCommit(firstDeltaCommit).getFileIdsWithBaseFilesInPartitions(partition);
    String fileId = createdFileIds.get(partition);
    morTable.forDeltaCommit(firstDeltaCommit)
        .withLogFile(partition, fileId, 1)
        .withLogFile(partition, fileId, 2);

    // Second delta commit: 2 files, one base file plus 1 log file attached to it
    morTable.addDeltaCommit(secondDeltaCommit)
        .withBaseFilesInPartition(partition, fileId)
        .withLogFile(partition, fileId, 3);

    List<HoodieCleanStat> cleanStats = runCleaner(writeConfig);
    HoodieCleanStat partitionStat = getCleanStat(cleanStats, partition);
    int deletedCount = partitionStat.getSuccessDeleteFiles().size();
    assertEquals(3, deletedCount, "Must clean three files, one base and 2 log files");

    // Everything from the first delta commit is gone ...
    assertFalse(morTable.baseFileExists(partition, firstDeltaCommit, fileId));
    assertFalse(morTable.logFilesExist(partition, firstDeltaCommit, fileId, 1, 2));
    // ... while the second delta commit is untouched.
    assertTrue(morTable.baseFileExists(partition, secondDeltaCommit, fileId));
    assertTrue(morTable.logFileExists(partition, secondDeltaCommit, fileId, 3));
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)

Aggregations

HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)327 Test (org.junit.jupiter.api.Test)179 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)173 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)169 ArrayList (java.util.ArrayList)136 List (java.util.List)133 SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient)126 HoodieTable (org.apache.hudi.table.HoodieTable)117 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)111 HashMap (java.util.HashMap)93 Path (org.apache.hadoop.fs.Path)92 WriteStatus (org.apache.hudi.client.WriteStatus)86 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)84 Collectors (java.util.stream.Collectors)81 Map (java.util.Map)76 HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)76 Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals)74 Arrays (java.util.Arrays)73 HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable)72 Option (org.apache.hudi.common.util.Option)69