Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
In class TestCleaner, method testKeepLatestFileVersions:
/**
* Test Hudi COW Table Cleaner - Keep the latest file versions policy.
*/
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean)
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
        .retainFileVersions(1)
        .build())
    .build();
HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
final String p0 = "2020/01/01";
final String p1 = "2020/01/02";
final Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
// make 1 commit, with 1 file per partition
final String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString();
final String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString();
Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100)));
c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200)));
testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1), c1PartitionToFilesNameLengthMap, false, false);
List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files");
assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next commit, with 1 insert & 1 update per partition
final String file2P0C1 = UUID.randomUUID().toString();
final String file2P1C1 = UUID.randomUUID().toString();
Map<String, List<Pair<String, Integer>>> c2PartitionToFilesNameLengthMap = new HashMap<>();
c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100)));
c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200)));
testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(), c2PartitionToFilesNameLengthMap, false, false);
// With enableBootstrapSourceClean, the cleaner also deletes the bootstrap base file at the same time
List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, 1);
HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0);
assertEquals(enableBootstrapSourceClean ? 2 : 1,
    cleanStat.getSuccessDeleteFiles().size()
        + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null
            ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()),
    "Must clean at least 1 file");
if (enableBootstrapSourceClean) {
HoodieFileStatus fstatus = bootstrapMapping.get(p0).get(0).getBootstrapFileStatus();
// This ensures the full path is recorded in metadata.
assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + " but did not contain " + fstatus.getPath().getUri());
assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p0).get(0).getBootstrapFileStatus().getPath().getUri())));
}
cleanStat = getCleanStat(hoodieCleanStatsTwo, p1);
assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1));
assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
assertEquals(enableBootstrapSourceClean ? 2 : 1,
    cleanStat.getSuccessDeleteFiles().size()
        + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null
            ? 0 : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()),
    "Must clean at least 1 file");
if (enableBootstrapSourceClean) {
HoodieFileStatus fstatus = bootstrapMapping.get(p1).get(0).getBootstrapFileStatus();
// This ensures the full path is recorded in metadata.
assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + " but did not contain " + fstatus.getPath().getUri());
assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p1).get(0).getBootstrapFileStatus().getPath().getUri())));
}
// make next commit, with 2 updates to existing files, and 1 insert
final String file3P0C2 = UUID.randomUUID().toString();
Map<String, List<Pair<String, Integer>>> c3PartitionToFilesNameLengthMap = new HashMap<>();
c3PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 102), Pair.of(file2P0C1, 101), Pair.of(file3P0C2, 100)));
testTable.doWriteOperation("00000000000003", WriteOperationType.UPSERT, Collections.emptyList(), c3PartitionToFilesNameLengthMap, false, false);
List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config, 3);
assertEquals(2, getCleanStat(hoodieCleanStatsThree, p0).getSuccessDeleteFiles().size(), "Must clean two files");
assertFalse(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
assertFalse(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
// No cleaning of a partially written file that has no completed commit.
testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2);
List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config);
assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files");
assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
}
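Distilled out of the builder chain above, the retention setup reduces to a minimal sketch (the "/tmp/hudi-table" path is a placeholder, not a value from the test; every builder call appears in the test config):

HoodieWriteConfig cleanerConfig = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hudi-table") // placeholder; the test uses its own basePath
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
        .retainFileVersions(1) // keep only the newest version of each file group
        .build())
    .build();

With retainFileVersions(1), each write that replaces a file makes the previous version eligible for cleaning, which is exactly what the assertions after commit "00000000000002" verify.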
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
In class TestCleaner, method testKeepLatestVersionsWithPendingCompactions:
/**
* Test Keep Latest Versions when there are pending compactions.
*/
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testKeepLatestVersionsWithPendingCompactions(boolean retryFailure) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
        .retainFileVersions(2)
        .build())
    .build();
// Deletions:
// FileId   Base  Logs  Total  Retained Commits
// FileId7     5    10     15  009, 011
// FileId6     4     8     12  007, 009
// FileId5     2     4      6  003, 005
// FileId4     1     2      3  001, 003
// FileId3     0     0      0  000, 001
// FileId2     0     0      0  000
// FileId1     0     0      0  000
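// Expected total: 15 + 12 + 6 + 3 = 36 file deletions, matching the 36 passed below.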
testPendingCompactions(config, 36, 9, retryFailure);
}
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
In class TestCleaner, method testKeepLatestCommits:
/**
 * Test HoodieTable.clean() cleaning-by-commits logic for a COW table.
*/
@ParameterizedTest
@MethodSource("argumentsForTestKeepLatestCommits")
public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withIncrementalCleaningMode(enableIncrementalClean)
        .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER)
        .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean)
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
        .retainCommits(2)
        .build())
    .build();
HoodieTestTable testTable = HoodieTestTable.of(metaClient);
String p0 = "2020/01/01";
String p1 = "2020/01/02";
Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
// make 1 commit, with 1 file per partition
String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString();
String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString();
testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001",
    Collections.unmodifiableMap(new HashMap<String, List<String>>() {
      {
        put(p0, CollectionUtils.createImmutableList(file1P0C0));
        put(p1, CollectionUtils.createImmutableList(file1P1C0));
      }
    }));
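// saveAsComplete transitions the inflight instant to a completed commit on the timeline.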
metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
metaClient = HoodieTableMetaClient.reload(metaClient);
List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config, simulateFailureRetry);
assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files");
assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next commit, with 1 insert & 1 update per partition
Map<String, String> partitionAndFileId002 = testTable.addInflightCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0, p1);
String file2P0C1 = partitionAndFileId002.get(p0);
String file2P1C1 = partitionAndFileId002.get(p1);
testTable.forCommit("00000000000002").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
commitMetadata = generateCommitMetadata("00000000000002",
    new HashMap<String, List<String>>() {
      {
        put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1));
        put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1));
      }
    });
metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000002"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, simulateFailureRetry);
assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions and clean any files");
assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1));
assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
// make next commit, with 2 updates to existing files, and 1 insert
String file3P0C2 = testTable.addInflightCommit("00000000000003")
    .withBaseFilesInPartition(p0, file1P0C0)
    .withBaseFilesInPartition(p0, file2P0C1)
    .getFileIdsWithBaseFilesInPartitions(p0).get(p0);
commitMetadata = generateCommitMetadata("00000000000003", CollectionUtils.createImmutableMap(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file3P0C2)));
metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000003"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config, simulateFailureRetry);
assertEquals(0, hoodieCleanStatsThree.size(), "Must not clean any file. We have to keep 1 version before the latest commit time");
assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
// make next commit, with 2 updates to existing files, and 1 insert
String file4P0C3 = testTable.addInflightCommit("00000000000004")
    .withBaseFilesInPartition(p0, file1P0C0)
    .withBaseFilesInPartition(p0, file2P0C1)
    .getFileIdsWithBaseFilesInPartitions(p0).get(p0);
commitMetadata = generateCommitMetadata("00000000000004", CollectionUtils.createImmutableMap(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file4P0C3)));
metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000004"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config, simulateFailureRetry);
// With enableBootstrapSourceClean, the cleaner also deletes the bootstrap base file at the same time
HoodieCleanStat partitionCleanStat = getCleanStat(hoodieCleanStatsFour, p0);
assertEquals(enableBootstrapSourceClean ? 2 : 1,
    partitionCleanStat.getSuccessDeleteFiles().size()
        + (partitionCleanStat.getSuccessDeleteBootstrapBaseFiles() == null
            ? 0 : partitionCleanStat.getSuccessDeleteBootstrapBaseFiles().size()),
    "Must clean at least one old file");
assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
assertTrue(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
assertTrue(testTable.baseFileExists(p0, "00000000000003", file1P0C0));
assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
assertTrue(testTable.baseFileExists(p0, "00000000000003", file2P0C1));
assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2));
assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3));
if (enableBootstrapSourceClean) {
assertFalse(Files.exists(Paths.get(bootstrapMapping.get(p0).get(0).getBootstrapFileStatus().getPath().getUri())));
}
// No cleaning of a partially written file that has no completed commit.
testTable.forCommit("00000000000005").withBaseFilesInPartition(p0, file3P0C2);
commitMetadata = generateCommitMetadata("00000000000005", CollectionUtils.createImmutableMap(p0, CollectionUtils.createImmutableList(file3P0C2)));
metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000005"));
metaClient.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000005"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
List<HoodieCleanStat> hoodieCleanStatsFive = runCleaner(config, simulateFailureRetry);
HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsFive, p0);
assertNull(cleanStat, "Must not clean any files");
assertTrue(testTable.baseFileExists(p0, "00000000000002", file1P0C0));
assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
assertTrue(testTable.baseFileExists(p0, "00000000000005", file3P0C2));
}
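For contrast with the file-versions policy in the first example, the commit-retention setup above distills to this minimal sketch (placeholder path; all builder calls appear in the test config above):

HoodieWriteConfig commitsConfig = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hudi-table") // placeholder for a real table path
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
        .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER)
        .retainCommits(2) // keep files needed to serve the two latest commits
        .build())
    .build();

retainCommits(2) explains the timeline above: cleaning first fires at commit "00000000000004", once the file versions written at "00000000000001" are no longer needed to read the retained commits.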
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
In class TestCleaner, method testFailedInsertAndCleanByCommits:
/**
 * Test helper for cleaning failed commits by the commits-retention logic, from the HoodieWriteClient API perspective.
*
* @param insertFn Insert API to be tested
* @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
 * record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
* @throws Exception in case of errors
*/
private void testFailedInsertAndCleanByCommits(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, boolean isPreppedAPI) throws Exception {
// keep up to 3 commits from the past
int maxCommits = 3;
HoodieWriteConfig cfg = getConfigBuilder()
    .withAutoCommit(false)
    .withHeartbeatIntervalInMs(3000)
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
        .retainCommits(maxCommits)
        .build())
    .withParallelism(1, 1)
    .withBulkInsertParallelism(1)
    .withFinalizeWriteParallelism(1)
    .withDeleteParallelism(1)
    .withConsistencyGuardConfig(
        ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
    .build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
Pair<String, JavaRDD<WriteStatus>> result = insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
client.commit(result.getLeft(), result.getRight());
HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient);
assertTrue(table.getCompletedCleanTimeline().empty());
insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
Pair<String, JavaRDD<WriteStatus>> ret = insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
// Await till enough time passes such that the last failed commits heartbeats are expired
await().atMost(10, TimeUnit.SECONDS).until(() -> client.getHeartbeatClient().isHeartbeatExpired(ret.getLeft()));
List<HoodieCleanStat> cleanStats = runCleaner(cfg);
assertEquals(0, cleanStats.size(), "Must not clean any files");
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
assertEquals(3, timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION))
    .filterCompletedInstants().countInstants());
Option<HoodieInstant> rollBackInstantForFailedCommit = timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant();
HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
// Rollback of one of the failed writes should have deleted 3 files
assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
}
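A typical invocation of this helper, sketched under the assumption that the SparkRDDWriteClient::insert method reference matches the Function3 signature above:

// Exercise the non-prepped insert API against the lazy-clean configuration.
testFailedInsertAndCleanByCommits(SparkRDDWriteClient::insert, false);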
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
In class TestCleaner, method testKeepLatestFileVersionsMOR:
/**
 * Test HoodieTable.clean() cleaning-by-versions logic for a MOR table with log files.
*/
@Test
public void testKeepLatestFileVersionsMOR() throws Exception {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
        .retainFileVersions(1)
        .build())
    .build();
HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
HoodieTestTable testTable = HoodieTestTable.of(metaClient);
String p0 = "2020/01/01";
// Make 3 files: one base file and 2 log files associated with it
String file1P0 = testTable.addDeltaCommit("000").getFileIdsWithBaseFilesInPartitions(p0).get(p0);
testTable.forDeltaCommit("000").withLogFile(p0, file1P0, 1).withLogFile(p0, file1P0, 2);
// Make 2 files: one base file and 1 log file associated with it
testTable.addDeltaCommit("001").withBaseFilesInPartition(p0, file1P0).withLogFile(p0, file1P0, 3);
List<HoodieCleanStat> hoodieCleanStats = runCleaner(config);
assertEquals(3, getCleanStat(hoodieCleanStats, p0).getSuccessDeleteFiles().size(), "Must clean three files: one base file and two log files");
assertFalse(testTable.baseFileExists(p0, "000", file1P0));
assertFalse(testTable.logFilesExist(p0, "000", file1P0, 1, 2));
assertTrue(testTable.baseFileExists(p0, "001", file1P0));
assertTrue(testTable.logFileExists(p0, "001", file1P0, 3));
}