Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project Hudi by Apache.
The class AbstractTableFileSystemView, method sync().
@Override
public void sync() {
  HoodieTimeline oldTimeline = getTimeline();
  HoodieTimeline newTimeline = metaClient.reloadActiveTimeline().filterCompletedAndCompactionInstants();
  try {
    writeLock.lock();
    runSync(oldTimeline, newTimeline);
  } finally {
    writeLock.unlock();
  }
}
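The view syncs against the active timeline restricted to completed and pending-compaction instants. As a minimal sketch (a hypothetical helper, not part of Hudi, assuming an already-built HoodieTableMetaClient and using only the calls shown above), the same filtering can be used to peek at the instant a freshly synced view would be caught up to:

// Hypothetical helper: report the last instant a file-system view built from this
// metaClient would observe after sync(); relies only on calls shown in the snippet above.
static Option<HoodieInstant> latestSyncableInstant(HoodieTableMetaClient metaClient) {
  HoodieTimeline timeline = metaClient.reloadActiveTimeline().filterCompletedAndCompactionInstants();
  return timeline.lastInstant();
}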
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project Hudi by Apache.
The class HoodieMetadataTableValidator, method doMetadataTableValidation().
public void doMetadataTableValidation() {
  boolean finalResult = true;
  metaClient.reloadActiveTimeline();
  String basePath = metaClient.getBasePath();
  Set<String> baseFilesForCleaning = Collections.emptySet();
  if (cfg.skipDataFilesForCleaning) {
    HoodieTimeline inflightCleaningTimeline = metaClient.getActiveTimeline().getCleanerTimeline().filterInflights();
    baseFilesForCleaning = inflightCleaningTimeline.getInstants().flatMap(instant -> {
      try {
        // Convert the inflight instant to its REQUESTED form and read the clean plan.
        instant = new HoodieInstant(HoodieInstant.State.REQUESTED, instant.getAction(), instant.getTimestamp());
        HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(metaClient, instant);
        return cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream()
            .flatMap(cleanerFileInfoList -> cleanerFileInfoList.stream().map(fileInfo -> new Path(fileInfo.getFilePath()).getName()));
      } catch (IOException e) {
        throw new HoodieIOException("Error reading cleaner metadata for " + instant, e);
      }
    }).filter(path -> {
      // Only take care of base files here.
      String fileExtension = FSUtils.getFileExtension(path);
      return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(fileExtension);
    }).collect(Collectors.toSet());
  }
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  List<String> allPartitions = validatePartitions(engineContext, basePath);
  HoodieMetadataValidationContext metadataTableBasedContext =
      new HoodieMetadataValidationContext(engineContext, cfg, metaClient, true);
  HoodieMetadataValidationContext fsBasedContext =
      new HoodieMetadataValidationContext(engineContext, cfg, metaClient, false);
  Set<String> finalBaseFilesForCleaning = baseFilesForCleaning;
  List<Boolean> result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> {
    try {
      validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning);
      LOG.info("Metadata table validation succeeded for " + partitionPath);
      return true;
    } catch (HoodieValidationException e) {
      LOG.error("Metadata table validation failed for " + partitionPath + " due to HoodieValidationException", e);
      if (!cfg.ignoreFailed) {
        throw e;
      }
      return false;
    }
  }).collectAsList();
  for (Boolean res : result) {
    finalResult &= res;
  }
  if (finalResult) {
    LOG.info("Metadata table validation succeeded.");
  } else {
    LOG.warn("Metadata table validation failed.");
  }
}
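The inflight-to-requested conversion above reflects the fact that a clean action's plan is serialized against its REQUESTED instant. A hedged sketch of that lookup as a standalone helper (hypothetical, not part of Hudi; the HoodieInstant constructor and CleanerUtils call are taken directly from the snippet above):

// Hypothetical helper mirroring the conversion above: clean plans are keyed by the
// REQUESTED instant, so an inflight clean instant is rewritten before the plan is read.
static HoodieCleanerPlan readPlanForInflightClean(HoodieTableMetaClient metaClient, HoodieInstant inflight) throws IOException {
  HoodieInstant requested = new HoodieInstant(HoodieInstant.State.REQUESTED, inflight.getAction(), inflight.getTimestamp());
  return CleanerUtils.getCleanerPlan(metaClient, requested);
}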
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project Hudi by Apache.
The class RequestHandler, method syncIfLocalViewBehind().
/**
 * Syncs the data-set view if the local view is behind.
 */
private boolean syncIfLocalViewBehind(Context ctx) {
  if (isLocalViewBehind(ctx)) {
    String basePath = ctx.queryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM);
    String lastKnownInstantFromClient =
        ctx.queryParam(RemoteHoodieTableFileSystemView.LAST_INSTANT_TS, HoodieTimeline.INVALID_INSTANT_TS);
    SyncableFileSystemView view = viewManager.getFileSystemView(basePath);
    synchronized (view) {
      // Double-checked under the view lock so concurrent requests trigger only one sync.
      if (isLocalViewBehind(ctx)) {
        HoodieTimeline localTimeline = viewManager.getFileSystemView(basePath).getTimeline();
        LOG.info("Syncing view as client passed last known instant " + lastKnownInstantFromClient
            + " but server has the following last instant on timeline: " + localTimeline.lastInstant());
        view.sync();
        return true;
      }
    }
  }
  return false;
}
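The actual isLocalViewBehind implementation is not shown in this snippet. As a hedged sketch under that caveat, its staleness check can be thought of as comparing the client's last known instant against the last instant on the server's local timeline (the real check may consult additional state, such as a timeline hash); the Option and HoodieTimeline calls used here all appear elsewhere on this page:

// Hedged sketch only; not the real isLocalViewBehind. Treats any mismatch between the
// client-reported instant and the server's last instant as a signal to re-sync.
private boolean viewLooksBehind(HoodieTimeline localTimeline, String lastKnownInstantFromClient) {
  Option<HoodieInstant> localLastInstant = localTimeline.lastInstant();
  String localTs = localLastInstant.isPresent()
      ? localLastInstant.get().getTimestamp()
      : HoodieTimeline.INVALID_INSTANT_TS;
  return !localTs.equals(lastKnownInstantFromClient);
}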
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project Hudi by Apache.
The class TestCleaner, method testInsertAndCleanByCommits().
/**
 * Test helper for the clean-by-commits logic from the HoodieWriteClient API perspective.
 *
 * @param insertFn Insert API to be tested
 * @param upsertFn Upsert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
 *        record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
 * @throws Exception in case of errors
 */
private void testInsertAndCleanByCommits(
    Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
    Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
    boolean isPreppedAPI) throws Exception {
  // Keep up to 3 commits from the past.
  int maxCommits = 3;
  HoodieWriteConfig cfg = getConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build())
      .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1)
      .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
      .build();
  SparkRDDWriteClient client = getHoodieWriteClient(cfg);
  final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction =
      generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
  final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction =
      generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
  insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn,
      HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
  // Keep doing some writes and clean inline. Make sure we have the expected number of files remaining.
  makeIncrementalCommitTimes(8).forEach(newCommitTime -> {
    try {
      client.startCommitWithTime(newCommitTime);
      List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100);
      List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
      // Verify there are no write errors.
      assertNoWriteErrors(statuses);
      metaClient = HoodieTableMetaClient.reload(metaClient);
      HoodieTable table1 = HoodieSparkTable.create(cfg, context, metaClient);
      HoodieTimeline activeTimeline = table1.getCompletedCommitsTimeline();
      // NOTE: See CleanPlanner#getFilesToCleanKeepingLatestCommits. We explicitly keep one commit before the
      // earliest retained commit.
      Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits);
      Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
      if (earliestRetainedCommit.isPresent()) {
        acceptableCommits.removeAll(activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
            .getInstants().collect(Collectors.toSet()));
        acceptableCommits.add(earliestRetainedCommit.get());
      }
      TableFileSystemView fsView = table1.getFileSystemView();
      // Ensure every file group only contains base files written by the acceptable commits.
      for (String partitionPath : dataGen.getPartitionPaths()) {
        List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
        for (HoodieFileGroup fileGroup : fileGroups) {
          Set<String> commitTimes = new HashSet<>();
          fileGroup.getAllBaseFiles().forEach(value -> {
            LOG.debug("Data File - " + value);
            commitTimes.add(value.getCommitTime());
          });
          assertEquals(acceptableCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()),
              commitTimes, "Only acceptable versions of the files should be present");
        }
      }
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
  });
}
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project Hudi by Apache.
The class TestCleaner, method testInsertAndCleanByVersions().
/**
 * Test helper for the clean-by-versions logic from the HoodieWriteClient API perspective.
 *
 * @param insertFn Insert API to be tested
 * @param upsertFn Upsert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
 *        record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
 * @throws Exception in case of errors
 */
private void testInsertAndCleanByVersions(
    Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
    Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> upsertFn,
    boolean isPreppedAPI) throws Exception {
  // Keep up to 2 versions of each file.
  int maxVersions = 2;
  HoodieWriteConfig cfg = getConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build())
      .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1)
      .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
      .build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction =
        generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
    final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction =
        generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
    insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn,
        HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS);
    Map<HoodieFileGroupId, FileSlice> compactionFileIdToLatestFileSlice = new HashMap<>();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(getConfig(), context, metaClient);
    for (String partitionPath : dataGen.getPartitionPaths()) {
      TableFileSystemView fsView = table.getFileSystemView();
      Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> {
        fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
        return true;
      }));
      if (added.isPresent()) {
        // Select only one file group for compaction.
        break;
      }
    }
    // Create a compaction workload with the selected file slices.
    List<Pair<String, FileSlice>> partitionFileSlicePairs = compactionFileIdToLatestFileSlice.entrySet().stream()
        .map(e -> Pair.of(e.getKey().getPartitionPath(), e.getValue())).collect(Collectors.toList());
    HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicePairs, Option.empty(), Option.empty());
    List<String> instantTimes = makeIncrementalCommitTimes(9, 1, 10);
    String compactionTime = instantTimes.get(0);
    table.getActiveTimeline().saveToCompactionRequested(
        new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime),
        TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
    instantTimes = instantTimes.subList(1, instantTimes.size());
    // Keep doing some writes and clean inline. Make sure we have the expected number of files remaining.
    for (String newInstantTime : instantTimes) {
      try {
        client.startCommitWithTime(newInstantTime);
        List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
        List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
        // Verify there are no write errors.
        assertNoWriteErrors(statuses);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        table = HoodieSparkTable.create(getConfig(), context, metaClient);
        HoodieTimeline timeline = table.getMetaClient().getCommitsTimeline();
        TableFileSystemView fsView = table.getFileSystemView();
        // Ensure the compaction candidate is retained and every other file group respects maxVersions.
        for (String partitionPath : dataGen.getPartitionPaths()) {
          // Compute all versions of all files, from time 0.
          HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
          for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
            HoodieCommitMetadata commitMetadata =
                HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
            for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
              if (!fileIdToVersions.containsKey(wstat.getFileId())) {
                fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
              }
              fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
            }
          }
          List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
          for (HoodieFileGroup fileGroup : fileGroups) {
            if (compactionFileIdToLatestFileSlice.containsKey(fileGroup.getFileGroupId())) {
              // Ensure the latest file slice selected for compaction is retained.
              Option<HoodieBaseFile> dataFileForCompactionPresent = Option.fromJavaOptional(fileGroup.getAllBaseFiles()
                  .filter(df -> compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime().equals(df.getCommitTime()))
                  .findAny());
              assertTrue(dataFileForCompactionPresent.isPresent(), "Data file selected for compaction should be retained");
            } else {
              // Every other file group has no more than maxVersions base files.
              String fileId = fileGroup.getFileGroupId().getFileId();
              List<HoodieBaseFile> dataFiles = fileGroup.getAllBaseFiles().collect(Collectors.toList());
              assertTrue(dataFiles.size() <= maxVersions, "fileId " + fileId + " has more than " + maxVersions + " versions");
              // Each file group retains the latest N versions (i.e., cleaning removes older versions).
              List<String> committedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
              for (int i = 0; i < dataFiles.size(); i++) {
                assertEquals(dataFiles.get(i).getCommitTime(), committedVersions.get(committedVersions.size() - 1 - i),
                    "File " + fileId + " does not have the latest versions among commits " + committedVersions);
              }
            }
          }
        }
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }
  }
}