Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
From the class TestCleaner, method testFailedInsertAndCleanByCommits.
/**
* Test helper for cleaning failed commits via the commits-based cleaning logic, from the HoodieWriteClient API perspective.
*
* @param insertFn Insert API to be tested
* @param isPreppedAPI Flag to indicate if a prepped version is used. If true, a wrapper function will be used during
* record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
* @throws Exception in case of errors
*/
private void testFailedInsertAndCleanByCommits(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, boolean isPreppedAPI) throws Exception {
// keep up to 3 commits from the past
int maxCommits = 3;
HoodieWriteConfig cfg = getConfigBuilder()
    .withAutoCommit(false)
    .withHeartbeatIntervalInMs(3000)
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
        .retainCommits(maxCommits)
        .build())
    .withParallelism(1, 1)
    .withBulkInsertParallelism(1)
    .withFinalizeWriteParallelism(1)
    .withDeleteParallelism(1)
    .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
        .withConsistencyCheckEnabled(true)
        .build())
    .build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
Pair<String, JavaRDD<WriteStatus>> result = insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
client.commit(result.getLeft(), result.getRight());
HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient);
assertTrue(table.getCompletedCleanTimeline().empty());
insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
Pair<String, JavaRDD<WriteStatus>> ret = insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
// Wait until enough time passes for the last failed commit's heartbeat to expire
await().atMost(10, TimeUnit.SECONDS).until(() -> client.getHeartbeatClient().isHeartbeatExpired(ret.getLeft()));
List<HoodieCleanStat> cleanStats = runCleaner(cfg);
assertEquals(0, cleanStats.size(), "Must not clean any files");
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
assertEquals(3, timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION))
    .filterCompletedInstants().countInstants());
Option<HoodieInstant> rollBackInstantForFailedCommit = timeline
    .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION))
    .filterCompletedInstants().lastInstant();
HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(
    timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
// Rollback of one of the failed writes should have deleted 3 files
assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
}
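The timeline-inspection pattern at the end of this test can be read on its own as the sketch below. It reuses only the calls shown above; metaClient is assumed to be an already initialized HoodieTableMetaClient, the variable names are illustrative, and the surrounding method is assumed to declare throws Exception as the test does.
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
// Keep only the completed rollback instants on the active timeline.
HoodieTimeline completedRollbacks = timeline
    .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION))
    .filterCompletedInstants();
Option<HoodieInstant> lastRollback = completedRollbacks.lastInstant();
if (lastRollback.isPresent()) {
  // Deserialize the rollback metadata of the latest rollback and inspect its stats.
  HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(
      timeline.getInstantDetails(lastRollback.get()).get(), HoodieRollbackMetadata.class);
  long totalFilesDeleted = rollbackMetadata.getTotalFilesDeleted();
}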
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
From the class TestHoodieMergeOnReadTable, method testMetadataStatsOnCommit.
/**
* Test to ensure metadata stats are correctly written to the metadata file.
*/
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMetadataStatsOnCommit(Boolean rollbackUsingMarkers) throws Exception {
HoodieWriteConfig cfg = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY).withAutoCommit(false).build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient);
// Create a commit without metadata stats in metadata to test backwards compatibility
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
String commitActionType = table.getMetaClient().getCommitActionType();
HoodieInstant instant = new HoodieInstant(State.REQUESTED, commitActionType, "000");
activeTimeline.createNewInstant(instant);
activeTimeline.transitionRequestedToInflight(instant, Option.empty());
instant = new HoodieInstant(State.INFLIGHT, commitActionType, "000");
activeTimeline.saveAsComplete(instant, Option.empty());
String instantTime = "001";
client.startCommitWithTime(instantTime);
List<HoodieRecord> records = dataGen.generateInserts(instantTime, 200);
JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
JavaRDD<WriteStatus> statuses = client.insert(writeRecords, instantTime);
assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
// Read from commit file
table = HoodieSparkTable.create(cfg, context());
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(
    table.getActiveTimeline().getInstantDetails(
        table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(),
    HoodieCommitMetadata.class);
int inserts = 0;
for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
for (HoodieWriteStat stat : pstat.getValue()) {
inserts += stat.getNumInserts();
}
}
assertEquals(200, inserts);
instantTime = "002";
client.startCommitWithTime(instantTime);
records = dataGen.generateUpdates(instantTime, records);
writeRecords = jsc().parallelize(records, 1);
statuses = client.upsert(writeRecords, instantTime);
// assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
inserts = 0;
int upserts = 0;
List<WriteStatus> writeStatusList = statuses.collect();
for (WriteStatus ws : writeStatusList) {
inserts += ws.getStat().getNumInserts();
upserts += ws.getStat().getNumUpdateWrites();
}
// Verify stats from the collected write statuses (the commit has not been completed)
assertEquals(0, inserts);
assertEquals(200, upserts);
client.rollback(instantTime);
// Read from commit file
table = HoodieSparkTable.create(cfg, context());
metadata = HoodieCommitMetadata.fromBytes(
    table.getActiveTimeline().getInstantDetails(
        table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(),
    HoodieCommitMetadata.class);
inserts = 0;
upserts = 0;
for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
for (HoodieWriteStat stat : pstat.getValue()) {
inserts += stat.getNumInserts();
upserts += stat.getNumUpdateWrites();
}
}
assertEquals(200, inserts);
assertEquals(0, upserts);
}
}
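The long one-liner used twice above to read the commit metadata back from the timeline can be unpacked as the sketch below. It reuses only calls from the test; table is assumed to be a freshly created HoodieSparkTable and the surrounding method is assumed to declare throws Exception.
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
// Locate the last delta commit instant on the active timeline.
HoodieInstant lastDeltaCommit = activeTimeline.getDeltaCommitTimeline().lastInstant().get();
// Deserialize its commit metadata to access the per-partition write stats.
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
    activeTimeline.getInstantDetails(lastDeltaCommit).get(), HoodieCommitMetadata.class);
Map<String, List<HoodieWriteStat>> partitionToWriteStats = commitMetadata.getPartitionToWriteStats();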
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
From the class TestHoodieTimelineArchiver, method testMergeSmallArchiveFilesRecoverFromMergeFailed.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMergeSmallArchiveFilesRecoverFromMergeFailed(boolean enableArchiveMerge) throws Exception {
HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);
// do ingestion and trigger archive actions here.
for (int i = 1; i < 8; i++) {
testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
archiveAndGetCommitsList(writeConfig);
}
// do a single merge of the small archive files
HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList());
archiver.reOpenWriter();
archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1");
archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList()));
HoodieLogFormat.Writer writer = archiver.reOpenWriter();
// check that loading the archived and active timelines succeeds
HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload();
assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.reload().countInstants());
String s = "Dummy Content";
// corrupt the current merged archive file.
FileIOUtils.createFileInPath(metaClient.getFs(), writer.getLogFile().getPath(), Option.of(s.getBytes()));
// do more archive actions that merge small archive files.
for (int i = 1; i < 10; i++) {
testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
archiveAndGetCommitsList(writeConfig);
}
// check result:
// the archived timeline should load successfully, ignoring the exception from parsing the damaged merged archive file.
HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false);
HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload();
assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants());
// if there is a damaged merged archive file together with another ordinary damaged archive file,
// hoodie needs to throw an exception while loading the archived timeline because of parsing the damaged archive file.
Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1");
FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes()));
assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload());
}
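The load check used in this test reduces to the sketch below: construct the active timeline the same way the test does, reload the archived timeline, and add up their instant counts. This is a minimal sketch; metaClient is assumed to be an initialized HoodieTableMetaClient.
// Load the active timeline as the test does, plus a freshly reloaded archived timeline,
// and count the instants visible across both.
HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline().reload();
int totalInstants = rawActiveTimeline.countInstants() + archivedTimeline.countInstants();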
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
From the class TestHoodieTimelineArchiver, method testLoadArchiveTimelineWithDamagedPlanFile.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testLoadArchiveTimelineWithDamagedPlanFile(boolean enableArchiveMerge) throws Exception {
HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);
// do ingestion and trigger archive actions here.
for (int i = 1; i < 8; i++) {
testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
archiveAndGetCommitsList(writeConfig);
}
Path plan = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME);
String s = "Dummy Content";
// corrupt the current merge plan file.
FileIOUtils.createFileInPath(metaClient.getFs(), plan, Option.of(s.getBytes()));
// check that the damaged plan file does not block archived timeline loading.
HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload();
assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants());
// if there are damaged archive files in addition to the damaged plan, hoodie needs to throw an exception while loading the archived timeline.
Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1");
FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes()));
assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload());
}
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
From the class HoodieRepairTool, method doRepair.
/**
* Does repair, either in REPAIR or DRY_RUN mode.
*
* @param startingInstantOption {@link Option} of starting instant for scanning, can be empty.
* @param endingInstantOption {@link Option} of ending instant for scanning, can be empty.
* @param isDryRun Whether to run in dry-run mode, i.e., only report dangling files without deleting anything.
* @return {@code true} if the repair (or dry run) succeeds, {@code false} otherwise.
* @throws IOException upon errors.
*/
boolean doRepair(Option<String> startingInstantOption, Option<String> endingInstantOption, boolean isDryRun) throws IOException {
// Scans all partitions to find base and log files in the base path
List<Path> allFilesInPartitions = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
// Buckets the files based on instant time
// instant time -> relative paths of base and log files to base path
Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(metaClient.getBasePath(), allFilesInPartitions);
List<String> instantTimesToRepair = instantToFilesMap.keySet().stream()
    .filter(instant -> (!startingInstantOption.isPresent() || instant.compareTo(startingInstantOption.get()) >= 0)
        && (!endingInstantOption.isPresent() || instant.compareTo(endingInstantOption.get()) <= 0))
    .collect(Collectors.toList());
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline();
// This assumes that the archived timeline only has completed instants so this is safe
archivedTimeline.loadCompletedInstantDetailsInMemory();
List<ImmutablePair<String, List<String>>> instantFilesToRemove = context.parallelize(instantTimesToRepair)
    .map(instantToRepair -> new ImmutablePair<>(instantToRepair,
        RepairUtils.findInstantFilesToRemove(instantToRepair, instantToFilesMap.get(instantToRepair), activeTimeline, archivedTimeline)))
    .collectAsList();
List<ImmutablePair<String, List<String>>> instantsWithDanglingFiles = instantFilesToRemove.stream().filter(e -> !e.getValue().isEmpty()).collect(Collectors.toList());
printRepairInfo(instantTimesToRepair, instantsWithDanglingFiles);
if (!isDryRun) {
List<String> relativeFilePathsToDelete = instantsWithDanglingFiles.stream().flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
if (relativeFilePathsToDelete.size() > 0) {
if (!backupFiles(relativeFilePathsToDelete)) {
LOG.error("Error backing up dangling files. Exiting...");
return false;
}
return deleteFiles(context, cfg.basePath, relativeFilePathsToDelete);
}
LOG.info(String.format("Table repair on %s is successful", cfg.basePath));
}
return true;
}
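The inclusive instant-time range filter applied when building instantTimesToRepair can be expressed as a standalone predicate, shown below. This is an illustrative restructuring only; the helper name isInRepairRange is hypothetical and not part of HoodieRepairTool.
// Hypothetical helper: true when instantTime falls inside the optional
// [startingInstant, endingInstant] range, treating an absent bound as unbounded.
private static boolean isInRepairRange(String instantTime, Option<String> startingInstantOption, Option<String> endingInstantOption) {
  return (!startingInstantOption.isPresent() || instantTime.compareTo(startingInstantOption.get()) >= 0)
      && (!endingInstantOption.isPresent() || instantTime.compareTo(endingInstantOption.get()) <= 0);
}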