Use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.
The class TestMetadataConversionUtils, method createCleanMetadata.
private void createCleanMetadata(String instantTime) throws IOException {
HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "",
    new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
HoodieCleanStat cleanStats = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS,
    HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)],
    Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), instantTime);
HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats));
HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata);
}
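The helper above writes one completed clean instant per call. As a usage sketch (the loop bounds and the zero-padded instant-time format below are illustrative assumptions, not part of the original test), a test could seed several clean instants before exercising the metadata conversion utilities:
// Illustrative only: seed three clean instants "001".."003" on the test timeline.
for (int i = 1; i <= 3; i++) {
  createCleanMetadata(String.format("%03d", i));
}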
Use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.
The class TestCleaner, method testPendingCompactions.
/**
* Common test method for validating pending compactions.
*
* @param config Hoodie Write Config
* @param expNumFilesDeleted Expected number of files deleted by the cleaner
* @param expNumFilesUnderCompactionDeleted Expected number of deleted files that belong to file groups under pending compaction
* @param retryFailure Whether to retry the cleaner run on failure
*/
private void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted, int expNumFilesUnderCompactionDeleted, boolean retryFailure) throws Exception {
HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
final String partition = "2016/03/15";
Map<String, String> expFileIdToPendingCompaction = new HashMap<String, String>() {
{
put("fileId2", "004");
put("fileId3", "006");
put("fileId4", "008");
put("fileId5", "010");
}
};
Map<String, String> fileIdToLatestInstantBeforeCompaction = new HashMap<String, String>() {
{
put("fileId1", "000");
put("fileId2", "000");
put("fileId3", "001");
put("fileId4", "003");
put("fileId5", "005");
put("fileId6", "009");
put("fileId7", "011");
}
};
// Generate 7 file groups. The first has only one slice and no pending compaction. File groups 2-5 have
// multiple versions with pending compactions. File groups 6-7 have multiple file slices but are not under
// compaction.
// FileIds 2-5 will be under compaction.
HoodieTestTable.of(metaClient)
    .addCommit("000")
    .withBaseFilesInPartition(partition, "fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId1", 1, 2)
    .withLogFile(partition, "fileId2", 1, 2)
    .withLogFile(partition, "fileId3", 1, 2)
    .withLogFile(partition, "fileId4", 1, 2)
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addCommit("001")
    .withBaseFilesInPartition(partition, "fileId3", "fileId4", "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId3", 1, 2)
    .withLogFile(partition, "fileId4", 1, 2)
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addCommit("003")
    .withBaseFilesInPartition(partition, "fileId4", "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId4", 1, 2)
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("004", new FileSlice(partition, "000", "fileId2"))
    .withLogFile(partition, "fileId2", 1, 2)
    .addCommit("005")
    .withBaseFilesInPartition(partition, "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("006", new FileSlice(partition, "001", "fileId3"))
    .withLogFile(partition, "fileId3", 1, 2)
    .addCommit("007")
    .withBaseFilesInPartition(partition, "fileId6", "fileId7")
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("008", new FileSlice(partition, "003", "fileId4"))
    .withLogFile(partition, "fileId4", 1, 2)
    .addCommit("009")
    .withBaseFilesInPartition(partition, "fileId6", "fileId7")
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("010", new FileSlice(partition, "005", "fileId5"))
    .withLogFile(partition, "fileId5", 1, 2)
    .addCommit("011")
    .withBaseFilesInPartition(partition, "fileId7")
    .withLogFile(partition, "fileId7", 1, 2)
    .addCommit("013");
// Clean now
metaClient = HoodieTableMetaClient.reload(metaClient);
List<HoodieCleanStat> hoodieCleanStats = runCleaner(config, retryFailure);
// Test for safety
final HoodieTableMetaClient newMetaClient = HoodieTableMetaClient.reload(metaClient);
final HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
expFileIdToPendingCompaction.forEach((fileId, value) -> {
String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
Option<FileSlice> fileSliceForCompaction = Option.fromJavaOptional(hoodieTable.getSliceView()
    .getLatestFileSlicesBeforeOrOn(partition, baseInstantForCompaction, true)
    .filter(fs -> fs.getFileId().equals(fileId))
    .findFirst());
assertTrue(fileSliceForCompaction.isPresent(), "Base Instant for Compaction must be preserved");
assertTrue(fileSliceForCompaction.get().getBaseFile().isPresent(), "FileSlice has data-file");
assertEquals(2, fileSliceForCompaction.get().getLogFiles().count(), "FileSlice has log-files");
});
// Test for progress (did we clean some files?)
long numFilesUnderCompactionDeleted = hoodieCleanStats.stream()
    .flatMap(cleanStat -> convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns())
        .map(fileIdWithCommitTime -> {
          if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) {
            assertTrue(HoodieTimeline.compareTimestamps(fileIdToLatestInstantBeforeCompaction.get(fileIdWithCommitTime.getKey()),
                HoodieTimeline.GREATER_THAN, fileIdWithCommitTime.getValue()),
                "Deleted instant time must be less than pending compaction");
            return true;
          }
          return false;
        }))
    .filter(x -> x).count();
long numDeleted = hoodieCleanStats.stream().mapToLong(cleanStat -> cleanStat.getDeletePathPatterns().size()).sum();
// Tighter check for regression
assertEquals(expNumFilesDeleted, numDeleted, "Correct number of files deleted");
assertEquals(expNumFilesUnderCompactionDeleted, numFilesUnderCompactionDeleted, "Correct number of files under compaction deleted");
}
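convertPathToFileIdWithCommitTime is called above but not listed in this excerpt. A simplified sketch (an assumption: it covers only base-file delete patterns and leans on FSUtils file-name parsing; the real helper would also need to map log-file paths, which is why it takes the meta client) could look like:
// Sketch only: map base-file delete patterns to (fileId, commitTime) pairs via file-name parsing.
// Assumes java.nio.file.Paths, java.util.stream.Stream, org.apache.hudi.common.fs.FSUtils and Pair are imported.
private Stream<Pair<String, String>> convertPathToFileIdWithCommitTime(HoodieTableMetaClient metaClient, List<String> paths) {
  // metaClient would only be needed to tell base files from log files by the table's base file format.
  return paths.stream().map(fullPath -> {
    String fileName = Paths.get(fullPath).getFileName().toString();
    return Pair.of(FSUtils.getFileId(fileName), FSUtils.getCommitTime(fileName));
  });
}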
Use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.
The class TestCleaner, method testKeepLatestCommitsMOR.
/**
* Test HoodieTable.clean() cleaning-by-commits logic for a MERGE_ON_READ table with log files.
*/
@Test
public void testKeepLatestCommitsMOR() throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build())
    .build();
HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
HoodieTestTable testTable = HoodieTestTable.of(metaClient);
String p0 = "2020/01/01";
// Make 3 files: one base file and 2 log files associated with the base file
String file1P0 = testTable.addDeltaCommit("000").getFileIdsWithBaseFilesInPartitions(p0).get(p0);
testTable.forDeltaCommit("000").withLogFile(p0, file1P0, 1).withLogFile(p0, file1P0, 2);
// Make 2 files: one base file and 1 log file associated with the base file
testTable.addDeltaCommit("001").withBaseFilesInPartition(p0, file1P0).withLogFile(p0, file1P0, 3);
// Make 2 files: one base file and 1 log file associated with the base file
testTable.addDeltaCommit("002").withBaseFilesInPartition(p0, file1P0).withLogFile(p0, file1P0, 4);
List<HoodieCleanStat> hoodieCleanStats = runCleaner(config);
assertEquals(3, getCleanStat(hoodieCleanStats, p0).getSuccessDeleteFiles().size(), "Must clean three files, one base and 2 log files");
assertFalse(testTable.baseFileExists(p0, "000", file1P0));
assertFalse(testTable.logFilesExist(p0, "000", file1P0, 1, 2));
assertTrue(testTable.baseFileExists(p0, "001", file1P0));
assertTrue(testTable.logFileExists(p0, "001", file1P0, 3));
assertTrue(testTable.baseFileExists(p0, "002", file1P0));
assertTrue(testTable.logFileExists(p0, "002", file1P0, 4));
}
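getCleanStat is referenced throughout these tests but not shown in this excerpt. A plausible sketch (assuming HoodieCleanStat exposes getPartitionPath(), consistent with how per-partition stats are used above) simply selects the stat recorded for one partition:
// Sketch: return the clean stat for the given partition, or null if that partition was not cleaned.
private static HoodieCleanStat getCleanStat(List<HoodieCleanStat> hoodieCleanStats, String partitionPath) {
  return hoodieCleanStats.stream()
      .filter(stat -> stat.getPartitionPath().equals(partitionPath))
      .findFirst()
      .orElse(null);
}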
Use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.
The class TestCleaner, method testCleaningWithZeroPartitionPaths.
/**
* Test cleaner stats when there are no partition paths.
*/
@Test
public void testCleaningWithZeroPartitionPaths() throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
    .build();
// Make a commit, although there are no partitionPaths.
// An example use case is a client that wants to create a table
// with just some commit metadata, but no data or partitionPaths.
HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
testTable.doWriteOperation("001", WriteOperationType.INSERT, Collections.emptyList(), 1);
metaClient = HoodieTableMetaClient.reload(metaClient);
List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
assertTrue(hoodieCleanStatsOne.isEmpty(), "HoodieCleanStats should be empty for a table with empty partitionPaths");
}
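Beyond asserting the empty stats list, one extra check a test could make (an illustrative addition; it assumes the standard timeline accessors getCleanerTimeline() and filterCompletedInstants(), and that a clean with nothing to delete writes no completed clean instant) is that the timeline stays free of clean instants:
// Illustrative only: a no-op clean should leave no completed clean instant on the timeline.
assertEquals(0, HoodieTableMetaClient.reload(metaClient).getActiveTimeline()
    .getCleanerTimeline().filterCompletedInstants().countInstants());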
Use of org.apache.hudi.common.HoodieCleanStat in project hudi by apache.
The class TestCleaner, method testKeepLatestFileVersions.
/**
* Test the Hudi COW table cleaner with the keep-latest-file-versions policy.
*/
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean)
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
    .build();
HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
final String p0 = "2020/01/01";
final String p1 = "2020/01/02";
final Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
// make 1 commit, with 1 file per partition
final String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString();
final String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString();
Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100)));
c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200)));
testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1), c1PartitionToFilesNameLengthMap, false, false);
List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files");
assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next commit, with 1 insert & 1 update per partition
final String file2P0C1 = UUID.randomUUID().toString();
final String file2P1C1 = UUID.randomUUID().toString();
Map<String, List<Pair<String, Integer>>> c2PartitionToFilesNameLengthMap = new HashMap<>();
c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100)));
c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200)));
testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(), c2PartitionToFilesNameLengthMap, false, false);
// enableBootstrapSourceClean would delete the bootstrap base file at the same time
List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, 1);
HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0);
assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file");
if (enableBootstrapSourceClean) {
HoodieFileStatus fstatus = bootstrapMapping.get(p0).get(0).getBootstrapFileStatus();
// This ensures the full path is recorded in the metadata.
assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + " but did not contain " + fstatus.getPath().getUri());
assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p0).get(0).getBootstrapFileStatus().getPath().getUri())));
}
cleanStat = getCleanStat(hoodieCleanStatsTwo, p1);
assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1));
assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file");
if (enableBootstrapSourceClean) {
HoodieFileStatus fstatus = bootstrapMapping.get(p1).get(0).getBootstrapFileStatus();
// This ensures the full path is recorded in the metadata.
assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + " but did not contain " + fstatus.getPath().getUri());
assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p1).get(0).getBootstrapFileStatus().getPath().getUri())));
}
// make next commit, with 2 updates to existing files, and 1 insert
final String file3P0C2 = UUID.randomUUID().toString();
Map<String, List<Pair<String, Integer>>> c3PartitionToFilesNameLengthMap = new HashMap<>();
c3PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 102), Pair.of(file2P0C1, 101), Pair.of(file3P0C2, 100)));
testTable.doWriteOperation("00000000000003", WriteOperationType.UPSERT, Collections.emptyList(), c3PartitionToFilesNameLengthMap, false, false);
List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config, 3);
assertEquals(2, getCleanStat(hoodieCleanStatsThree, p0).getSuccessDeleteFiles().size(), "Must clean two files");
assertFalse(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
assertFalse(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
// No cleaning of a partially written file that has no completed commit.
testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2);
List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config);
assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files");
assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
}
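The retention arithmetic exercised above is small enough to state on its own. The sketch below is plain Java rather than a Hudi API and mirrors KEEP_LATEST_FILE_VERSIONS with retainFileVersions(1): of a file group's versions written at "00000000000001", "00000000000002" and "00000000000003", only the newest survives:
// Pure-Java illustration of the keep-latest-file-versions retention rule.
static List<String> retainLatestVersions(List<String> commitTimesOldestFirst, int retainFileVersions) {
  int from = Math.max(0, commitTimesOldestFirst.size() - retainFileVersions);
  return commitTimesOldestFirst.subList(from, commitTimesOldestFirst.size());
}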