Search in sources :

Example 1 with HoodieCleanStat

use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.

The class TestMetadataConversionUtils defines the method createCleanMetadata.

/**
 * Registers a completed clean action (plan plus clean metadata) for the given instant
 * on one randomly chosen default test partition.
 *
 * @param instantTime instant timestamp to record the clean action under
 * @throws IOException if the test table fails to persist the clean files
 */
private void createCleanMetadata(String instantTime) throws IOException {
    HoodieCleanerPlan plan = new HoodieCleanerPlan(
        new HoodieActionInstant("", "", ""), "", new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
    // Pick an arbitrary default partition so repeated calls spread clean stats across partitions.
    String partitionPath =
        HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)];
    HoodieCleanStat stat = new HoodieCleanStat(
        HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, partitionPath,
        Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), instantTime);
    HoodieCleanMetadata metadata =
        convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(stat));
    HoodieTestTable.of(metaClient).addClean(instantTime, plan, metadata);
}
Also used : HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) Random(java.util.Random) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan)

Example 2 with HoodieCleanStat

use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.

The class TestCleaner defines the method testPendingCompactions.

/**
 * Common test method for validating that the cleaner preserves file slices needed by pending compactions.
 *
 * @param config Hoodie Write Config
 * @param expNumFilesDeleted Number of files deleted
 * @param expNumFilesUnderCompactionDeleted Expected number of deleted files belonging to file groups with a pending compaction
 * @param retryFailure Forwarded to runCleaner; presumably re-attempts the clean to simulate a failure retry — TODO confirm
 * @throws Exception on any failure setting up the test table or running the cleaner
 */
private void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted, int expNumFilesUnderCompactionDeleted, boolean retryFailure) throws Exception {
    HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
    final String partition = "2016/03/15";
    // File groups that have a pending compaction, mapped to the instant time of the requested compaction.
    Map<String, String> expFileIdToPendingCompaction = new HashMap<String, String>() {

        {
            put("fileId2", "004");
            put("fileId3", "006");
            put("fileId4", "008");
            put("fileId5", "010");
        }
    };
    // For groups under compaction (fileId2-5): the base instant of the slice the compaction was requested
    // against. For the rest (fileId1/6/7): the latest commit that wrote their base file.
    Map<String, String> fileIdToLatestInstantBeforeCompaction = new HashMap<String, String>() {

        {
            put("fileId1", "000");
            put("fileId2", "000");
            put("fileId3", "001");
            put("fileId4", "003");
            put("fileId5", "005");
            put("fileId6", "009");
            put("fileId7", "011");
        }
    };
    // Generate 7 file-groups. First one has only one slice and no pending compaction. File Slices (2 - 5) has
    // multiple versions with pending compaction. File Slices (6 - 7) have multiple file-slices but not under
    // compactions
    // FileIds 2-5 will be under compaction
    HoodieTestTable.of(metaClient).addCommit("000").withBaseFilesInPartition(partition, "fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7").withLogFile(partition, "fileId1", 1, 2).withLogFile(partition, "fileId2", 1, 2).withLogFile(partition, "fileId3", 1, 2).withLogFile(partition, "fileId4", 1, 2).withLogFile(partition, "fileId5", 1, 2).withLogFile(partition, "fileId6", 1, 2).withLogFile(partition, "fileId7", 1, 2).addCommit("001").withBaseFilesInPartition(partition, "fileId3", "fileId4", "fileId5", "fileId6", "fileId7").withLogFile(partition, "fileId3", 1, 2).withLogFile(partition, "fileId4", 1, 2).withLogFile(partition, "fileId5", 1, 2).withLogFile(partition, "fileId6", 1, 2).withLogFile(partition, "fileId7", 1, 2).addCommit("003").withBaseFilesInPartition(partition, "fileId4", "fileId5", "fileId6", "fileId7").withLogFile(partition, "fileId4", 1, 2).withLogFile(partition, "fileId5", 1, 2).withLogFile(partition, "fileId6", 1, 2).withLogFile(partition, "fileId7", 1, 2).addRequestedCompaction("004", new FileSlice(partition, "000", "fileId2")).withLogFile(partition, "fileId2", 1, 2).addCommit("005").withBaseFilesInPartition(partition, "fileId5", "fileId6", "fileId7").withLogFile(partition, "fileId5", 1, 2).withLogFile(partition, "fileId6", 1, 2).withLogFile(partition, "fileId7", 1, 2).addRequestedCompaction("006", new FileSlice(partition, "001", "fileId3")).withLogFile(partition, "fileId3", 1, 2).addCommit("007").withBaseFilesInPartition(partition, "fileId6", "fileId7").withLogFile(partition, "fileId6", 1, 2).withLogFile(partition, "fileId7", 1, 2).addRequestedCompaction("008", new FileSlice(partition, "003", "fileId4")).withLogFile(partition, "fileId4", 1, 2).addCommit("009").withBaseFilesInPartition(partition, "fileId6", "fileId7").withLogFile(partition, "fileId6", 1, 2).withLogFile(partition, "fileId7", 1, 2).addRequestedCompaction("010", new FileSlice(partition, "005", "fileId5")).withLogFile(partition, "fileId5", 1, 
2).addCommit("011").withBaseFilesInPartition(partition, "fileId7").withLogFile(partition, "fileId7", 1, 2).addCommit("013");
    // Clean now
    metaClient = HoodieTableMetaClient.reload(metaClient);
    List<HoodieCleanStat> hoodieCleanStats = runCleaner(config, retryFailure);
    // Test for safety: each file group under compaction must still expose the exact slice the
    // compaction was requested against (base file present, both log files intact).
    final HoodieTableMetaClient newMetaClient = HoodieTableMetaClient.reload(metaClient);
    final HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    expFileIdToPendingCompaction.forEach((fileId, value) -> {
        String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
        Option<FileSlice> fileSliceForCompaction = Option.fromJavaOptional(hoodieTable.getSliceView().getLatestFileSlicesBeforeOrOn(partition, baseInstantForCompaction, true).filter(fs -> fs.getFileId().equals(fileId)).findFirst());
        assertTrue(fileSliceForCompaction.isPresent(), "Base Instant for Compaction must be preserved");
        assertTrue(fileSliceForCompaction.get().getBaseFile().isPresent(), "FileSlice has data-file");
        assertEquals(2, fileSliceForCompaction.get().getLogFiles().count(), "FileSlice has log-files");
    });
    // Test for progress (Did we clean some files ?) — count deletions hitting file groups under
    // compaction, asserting every such deleted path predates the pending compaction's base instant.
    long numFilesUnderCompactionDeleted = hoodieCleanStats.stream().flatMap(cleanStat -> convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns()).map(fileIdWithCommitTime -> {
        if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) {
            assertTrue(HoodieTimeline.compareTimestamps(fileIdToLatestInstantBeforeCompaction.get(fileIdWithCommitTime.getKey()), HoodieTimeline.GREATER_THAN, fileIdWithCommitTime.getValue()), "Deleted instant time must be less than pending compaction");
            return true;
        }
        return false;
    })).filter(x -> x).count();
    long numDeleted = hoodieCleanStats.stream().mapToLong(cleanStat -> cleanStat.getDeletePathPatterns().size()).sum();
    // Tighter check for regression
    assertEquals(expNumFilesDeleted, numDeleted, "Correct number of files deleted");
    assertEquals(expNumFilesUnderCompactionDeleted, numFilesUnderCompactionDeleted, "Correct number of files under compaction deleted");
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Arrays(java.util.Arrays) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanPlanner(org.apache.hudi.table.action.clean.CleanPlanner) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Awaitility.await(org.awaitility.Awaitility.await) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) StandardCharsets(java.nio.charset.StandardCharsets) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) CleanPlanMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) CleanPlanV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) StringUtils(org.apache.hudi.common.util.StringUtils) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) 
HoodieTestCommitGenerator.getBaseFilename(org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Paths(java.nio.file.Paths) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieCleanPartitionMetadata(org.apache.hudi.avro.model.HoodieCleanPartitionMetadata) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) HoodieTestTable.makeIncrementalCommitTimes(org.apache.hudi.common.testutils.HoodieTestTable.makeIncrementalCommitTimes) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) 
HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) MethodSource(org.junit.jupiter.params.provider.MethodSource) IOType(org.apache.hudi.common.model.IOType) Predicate(java.util.function.Predicate) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Tuple3(scala.Tuple3) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) Test(org.junit.jupiter.api.Test) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) CleanMetadataMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) DEFAULT_PARTITION_PATHS(org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HashSet(java.util.HashSet) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) TestBootstrapIndex(org.apache.hudi.common.bootstrap.TestBootstrapIndex) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) TimeUnit(java.util.concurrent.TimeUnit) WriteStatus(org.apache.hudi.client.WriteStatus) 
HoodieTestTable.makeNewCommitTime(org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkHoodieIndexFactory(org.apache.hudi.index.SparkHoodieIndexFactory) HoodieSliceInfo(org.apache.hudi.avro.model.HoodieSliceInfo) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HashMap(java.util.HashMap) FileSlice(org.apache.hudi.common.model.FileSlice)

Example 3 with HoodieCleanStat

use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.

The class TestCleaner defines the method testKeepLatestCommitsMOR.

/**
 * Verifies HoodieTable.clean() commit-retention logic for a MERGE_ON_READ table with log files.
 */
@Test
public void testKeepLatestCommitsMOR() throws Exception {
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build())
        .build();
    HoodieTableMetaClient morMetaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
    HoodieTestTable table = HoodieTestTable.of(morMetaClient);
    String partition = "2020/01/01";
    // Delta commit "000": one base file plus two log files attached to it (3 files total).
    String baseFileId = table.addDeltaCommit("000").getFileIdsWithBaseFilesInPartitions(partition).get(partition);
    table.forDeltaCommit("000").withLogFile(partition, baseFileId, 1).withLogFile(partition, baseFileId, 2);
    // Delta commit "001": a new base file version plus one log file (2 files total).
    table.addDeltaCommit("001").withBaseFilesInPartition(partition, baseFileId).withLogFile(partition, baseFileId, 3);
    // Delta commit "002": a new base file version plus one log file (2 files total).
    table.addDeltaCommit("002").withBaseFilesInPartition(partition, baseFileId).withLogFile(partition, baseFileId, 4);
    List<HoodieCleanStat> cleanStats = runCleaner(writeConfig);
    // Only commit "000" falls outside the retained window: its base file and both log files are removed.
    assertEquals(3, getCleanStat(cleanStats, partition).getSuccessDeleteFiles().size(), "Must clean three files, one base and 2 log files");
    assertFalse(table.baseFileExists(partition, "000", baseFileId));
    assertFalse(table.logFilesExist(partition, "000", baseFileId, 1, 2));
    // Files from the retained commits "001" and "002" must survive.
    assertTrue(table.baseFileExists(partition, "001", baseFileId));
    assertTrue(table.logFileExists(partition, "001", baseFileId, 3));
    assertTrue(table.baseFileExists(partition, "002", baseFileId));
    assertTrue(table.logFileExists(partition, "002", baseFileId, 4));
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)

Example 4 with HoodieCleanStat

use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.

The class TestCleaner defines the method testCleaningWithZeroPartitionPaths.

/**
 * Test cleaner stats when there are no partition paths.
 */
@Test
public void testCleaningWithZeroPartitionPaths() throws Exception {
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
        .build();
    // A client may create a table holding only commit metadata with no data/partitionPaths;
    // the cleaner must handle that case gracefully instead of failing.
    HoodieTableMetadataWriter tableMetadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context);
    HoodieTestTable table = HoodieMetadataTestTable.of(metaClient, tableMetadataWriter);
    table.doWriteOperation("001", WriteOperationType.INSERT, Collections.emptyList(), 1);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    List<HoodieCleanStat> cleanStats = runCleaner(writeConfig);
    assertTrue(cleanStats.isEmpty(), "HoodieCleanStats should be empty for a table with empty partitionPaths");
}
Also used : HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)

Example 5 with HoodieCleanStat

use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.

The class TestCleaner defines the method testKeepLatestFileVersions.

/**
 * Test Hudi COW Table Cleaner - Keep the latest file versions policy.
 *
 * @param enableBootstrapSourceClean when true, the cleaner also deletes the bootstrap source base file
 *                                   backing each cleaned file (withCleanBootstrapBaseFileEnabled)
 * @throws Exception on any failure setting up the test table or running the cleaner
 */
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()).withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean).withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()).build();
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
    HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
    final String p0 = "2020/01/01";
    final String p1 = "2020/01/02";
    final Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
    // make 1 commit, with 1 file per partition
    // When bootstrap cleaning is enabled, reuse the file ids of the bootstrapped files so the
    // cleaner later ties them back to their source files.
    final String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString();
    final String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
    c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100)));
    c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200)));
    testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1), c1PartitionToFilesNameLengthMap, false, false);
    // With only a single version of each file, retainFileVersions(1) must not delete anything.
    List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
    assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next commit, with 1 insert & 1 update per partition
    final String file2P0C1 = UUID.randomUUID().toString();
    final String file2P1C1 = UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c2PartitionToFilesNameLengthMap = new HashMap<>();
    c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100)));
    c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200)));
    testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(), c2PartitionToFilesNameLengthMap, false, false);
    // enableBootstrapSourceClean would delete the bootstrap base file at the same time
    List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, 1);
    HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0);
    assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file");
    if (enableBootstrapSourceClean) {
        HoodieFileStatus fstatus = bootstrapMapping.get(p0).get(0).getBootstrapFileStatus();
        // This ensures full path is recorded in metadata.
        assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + " but did not contain " + fstatus.getPath().getUri());
        assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p0).get(0).getBootstrapFileStatus().getPath().getUri())));
    }
    // Repeat the validation for the second partition.
    cleanStat = getCleanStat(hoodieCleanStatsTwo, p1);
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1));
    assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file");
    if (enableBootstrapSourceClean) {
        HoodieFileStatus fstatus = bootstrapMapping.get(p1).get(0).getBootstrapFileStatus();
        // This ensures full path is recorded in metadata.
        assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + " but did not contain " + fstatus.getPath().getUri());
        assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p1).get(0).getBootstrapFileStatus().getPath().getUri())));
    }
    // make next commit, with 2 updates to existing files, and 1 insert
    final String file3P0C2 = UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c3PartitionToFilesNameLengthMap = new HashMap<>();
    c3PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 102), Pair.of(file2P0C1, 101), Pair.of(file3P0C2, 100)));
    testTable.doWriteOperation("00000000000003", WriteOperationType.UPSERT, Collections.emptyList(), c3PartitionToFilesNameLengthMap, false, false);
    List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config, 3);
    assertEquals(2, getCleanStat(hoodieCleanStatsThree, p0).getSuccessDeleteFiles().size(), "Must clean two files");
    assertFalse(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
    assertFalse(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
    // No cleaning on partially written file, with no commit.
    testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2);
    List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config);
    assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
}
Also used : HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) HashMap(java.util.HashMap) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ArrayList(java.util.ArrayList) List(java.util.List) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Aggregations

HoodieCleanStat (org.apache.hudi.common.HoodieCleanStat)22 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)14 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)12 HashMap (java.util.HashMap)11 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)11 ArrayList (java.util.ArrayList)10 List (java.util.List)10 HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata)10 Test (org.junit.jupiter.api.Test)10 HoodieActionInstant (org.apache.hudi.avro.model.HoodieActionInstant)7 HoodieCleanerPlan (org.apache.hudi.avro.model.HoodieCleanerPlan)7 HoodieTestTable (org.apache.hudi.common.testutils.HoodieTestTable)7 HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata)6 IOException (java.io.IOException)5 Map (java.util.Map)5 HoodieRollbackMetadata (org.apache.hudi.avro.model.HoodieRollbackMetadata)5 HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline)5 HoodieIOException (org.apache.hudi.exception.HoodieIOException)5 Path (org.apache.hadoop.fs.Path)4 SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient)4