Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
The class BucketStreamWriteFunction, method bootstrapIndex.
/**
* Get the partition_bucket -> fileID mapping from the existing hudi table.
* This is required on each restart to avoid assigning duplicate file IDs to a bucket.
*/
private void bootstrapIndex() throws IOException {
  Option<HoodieInstant> latestCommitTime = table.getFileSystemView().getTimeline().filterCompletedInstants().lastInstant();
  if (!latestCommitTime.isPresent()) {
    return;
  }
  // Bootstrap bucket info from the existing file system:
  // buckets whose (bucketNum % parallelism) equals this taskID belong to this task.
  HashSet<Integer> bucketToLoad = new HashSet<>();
  for (int i = 0; i < bucketNum; i++) {
    int partitionOfBucket = BucketIdentifier.mod(i, parallelism);
    if (partitionOfBucket == taskID) {
      LOG.info(String.format("Bootstrapping index. Adding bucket %s, current parallelism: %s, max parallelism: %s, current task id: %s",
          i, parallelism, maxParallelism, taskID));
      bucketToLoad.add(i);
    }
  }
  bucketToLoad.forEach(bucket -> LOG.info(String.format("bucketToLoad contains %s", bucket)));
  LOG.info(String.format("Loading Hoodie table %s with path %s",
      table.getMetaClient().getTableConfig().getTableName(), table.getMetaClient().getBasePath()));
  // Iterate through all existing partitions to load the existing file IDs that belong to this task.
  List<String> partitions = table.getMetadata().getAllPartitionPaths();
  for (String partitionPath : partitions) {
    List<FileSlice> latestFileSlices = table.getSliceView().getLatestFileSlices(partitionPath).collect(toList());
    for (FileSlice fileSlice : latestFileSlices) {
      String fileID = fileSlice.getFileId();
      int bucketNumber = BucketIdentifier.bucketIdFromFileId(fileID);
      if (bucketToLoad.contains(bucketNumber)) {
        String partitionBucketId = BucketIdentifier.partitionBucketIdStr(partitionPath, bucketNumber);
        LOG.info(String.format("Should load this partition bucket %s with fileID %s", partitionBucketId, fileID));
        if (bucketToFileIDMap.containsKey(partitionBucketId)) {
          throw new RuntimeException(String.format("Duplicate fileID %s from partitionBucket %s found during the BucketStreamWriteFunction index bootstrap.",
              fileID, partitionBucketId));
        } else {
          LOG.info(String.format("Adding fileID %s to the partition bucket %s.", fileID, partitionBucketId));
          bucketToFileIDMap.put(partitionBucketId, fileID);
        }
      }
    }
  }
}
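The assignment above is plain modulo partitioning: every subtask scans all bucket IDs and claims the ones whose ID modulo the write parallelism equals its own task ID. Below is a minimal, self-contained sketch of that assignment in plain Java with no Hudi dependency; BucketAssignmentDemo and its helper are hypothetical, and mod is assumed to behave like a simple non-negative modulo, as BucketIdentifier.mod does for non-negative inputs.

import java.util.HashSet;
import java.util.Set;

public class BucketAssignmentDemo {

  // Assumed stand-in for BucketIdentifier.mod: a non-negative modulo over the bucket index.
  static int mod(int bucketId, int parallelism) {
    return Math.abs(bucketId % parallelism);
  }

  // Each of `parallelism` subtasks claims the buckets where mod(bucketId, parallelism) == taskId.
  static Set<Integer> bucketsForTask(int taskId, int parallelism, int bucketNum) {
    Set<Integer> bucketToLoad = new HashSet<>();
    for (int i = 0; i < bucketNum; i++) {
      if (mod(i, parallelism) == taskId) {
        bucketToLoad.add(i);
      }
    }
    return bucketToLoad;
  }

  public static void main(String[] args) {
    // With 8 buckets and parallelism 4, task 1 owns buckets {1, 5}.
    System.out.println(bucketsForTask(1, 4, 8)); // [1, 5]
  }
}

Because every subtask runs the same deterministic loop, no coordination is needed and each bucket ends up owned by exactly one task.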
Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
The class DeltaWriteProfile, method smallFilesProfile.
@Override
protected List<SmallFile> smallFilesProfile(String partitionPath) {
  // Collect small files only for the given partitionPath.
  List<SmallFile> smallFileLocations = new ArrayList<>();
  // Initialize here since this class and its member variables might not have been initialized yet.
  HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
  // Find all eligible small file slices.
  if (!commitTimeline.empty()) {
    HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
    // Collect the small file slices in this partition so that new inserts can be appended to them.
    List<FileSlice> allSmallFileSlices = new ArrayList<>();
    // If we can index log files, we can add more inserts to log files for fileIds, including those under
    // pending compaction.
    List<FileSlice> allFileSlices = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false)
        .collect(Collectors.toList());
    for (FileSlice fileSlice : allFileSlices) {
      if (isSmallFile(fileSlice)) {
        allSmallFileSlices.add(fileSlice);
      }
    }
    // Create SmallFiles from the eligible file slices.
    for (FileSlice smallFileSlice : allSmallFileSlices) {
      SmallFile sf = new SmallFile();
      if (smallFileSlice.getBaseFile().isPresent()) {
        // TODO: Move the logic of file name, file id and base commit time handling inside FileSlice.
        String filename = smallFileSlice.getBaseFile().get().getFileName();
        sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
        sf.sizeBytes = getTotalFileSize(smallFileSlice);
        smallFileLocations.add(sf);
      } else {
        // Use ifPresent to guard against a corrupted file slice that has no log files at all.
        smallFileSlice.getLogFiles().findFirst().ifPresent(logFile -> {
          sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()),
              FSUtils.getFileIdFromLogPath(logFile.getPath()));
          sf.sizeBytes = getTotalFileSize(smallFileSlice);
          smallFileLocations.add(sf);
        });
      }
    }
  }
  return smallFileLocations;
}
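The else branch above handles merge-on-read file slices that have log files but no base file yet: the record location is then derived from the first log file instead. Below is a minimal sketch of that fallback pattern under assumed names; FileSliceView, RecordLocation, and the toy file-name scheme are illustrative only, not Hudi APIs (Hudi parses its real file names via FSUtils, which carry more fields).

import java.util.List;
import java.util.Optional;

public class SmallFileLocationDemo {

  record RecordLocation(String instantTime, String fileId) {}

  // Hypothetical simplified view of a file slice: maybe a base file, plus log files.
  record FileSliceView(Optional<String> baseFileName, List<String> logFileNames) {}

  // Derive a record location, preferring the base file and falling back to the
  // first log file; an empty Optional models a corrupted slice with neither.
  static Optional<RecordLocation> locate(FileSliceView slice) {
    if (slice.baseFileName().isPresent()) {
      String name = slice.baseFileName().get();
      return Optional.of(new RecordLocation(commitTimeOf(name), fileIdOf(name)));
    }
    return slice.logFileNames().stream().findFirst()
        .map(log -> new RecordLocation(commitTimeOf(log), fileIdOf(log)));
  }

  // Assumed toy naming scheme "<fileId>_<commitTime>" for illustration only.
  static String fileIdOf(String name) { return name.split("_")[0]; }
  static String commitTimeOf(String name) { return name.split("_")[1]; }

  public static void main(String[] args) {
    FileSliceView logOnly = new FileSliceView(Optional.empty(), List.of("f1_20240101"));
    System.out.println(locate(logOnly)); // Optional[RecordLocation[instantTime=20240101, fileId=f1]]
  }
}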
Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
The class WriteProfile, method smallFilesProfile.
/**
* Returns a list of small files in the given partition path from the latest filesystem view.
*/
protected List<SmallFile> smallFilesProfile(String partitionPath) {
  // Collect small files only for the given partitionPath.
  List<SmallFile> smallFileLocations = new ArrayList<>();
  HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
  if (!commitTimeline.empty()) {
    // There are completed commits on the timeline.
    HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
    List<HoodieBaseFile> allFiles = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
        .collect(Collectors.toList());
    for (HoodieBaseFile file : allFiles) {
      // Filter out the corrupted files.
      if (file.getFileSize() < config.getParquetSmallFileLimit() && file.getFileSize() > 0) {
        String filename = file.getFileName();
        SmallFile sf = new SmallFile();
        sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
        sf.sizeBytes = file.getFileSize();
        smallFileLocations.add(sf);
      }
    }
  }
  return smallFileLocations;
}
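The filter above admits a base file as a small-file candidate only when its size is strictly positive (zero-length files are treated as corrupted) and below the configured Parquet small-file limit. Here is a standalone sketch of that predicate, assuming an illustrative 100 MB limit in place of config.getParquetSmallFileLimit(); SmallFileFilterDemo and its names are hypothetical.

import java.util.List;
import java.util.stream.Collectors;

public class SmallFileFilterDemo {

  // Stand-in for config.getParquetSmallFileLimit(); 100 MB is an illustrative value.
  static final long SMALL_FILE_LIMIT_BYTES = 100L * 1024 * 1024;

  static boolean isCandidateSmallFile(long fileSizeBytes) {
    // Zero-length files are treated as corrupted and skipped.
    return fileSizeBytes > 0 && fileSizeBytes < SMALL_FILE_LIMIT_BYTES;
  }

  public static void main(String[] args) {
    List<Long> sizes = List.of(0L, 4_096L, 500L * 1024 * 1024);
    List<Long> smallFiles = sizes.stream()
        .filter(SmallFileFilterDemo::isCandidateSmallFile)
        .collect(Collectors.toList());
    System.out.println(smallFiles); // [4096]
  }
}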
Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
The class HoodieHiveClient, method updateLastCommitTimeSynced.
@Override
public void updateLastCommitTimeSynced(String tableName) {
  // Read the last synced commit time from the active timeline and persist it into the table's TBLPROPERTIES.
  Option<String> lastCommitSynced = activeTimeline.lastInstant().map(HoodieInstant::getTimestamp);
  if (lastCommitSynced.isPresent()) {
    try {
      Table table = client.getTable(syncConfig.databaseName, tableName);
      table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced.get());
      client.alter_table(syncConfig.databaseName, tableName, table);
    } catch (Exception e) {
      throw new HoodieHiveSyncException("Failed to update last commit time synced to " + lastCommitSynced, e);
    }
  }
}
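The method above is a guarded write: it only touches the Hive table parameters when the active timeline actually has a last completed instant. A minimal sketch of the same pattern, with a plain map standing in for Hive TBLPROPERTIES; the property key and all names here are illustrative (the real key is Hudi's HOODIE_LAST_COMMIT_TIME_SYNC constant).

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

public class LastCommitSyncDemo {

  // Illustrative key; Hudi defines the real one as HOODIE_LAST_COMMIT_TIME_SYNC.
  static final String LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";

  static void updateLastCommitTimeSynced(Map<String, String> tblProperties,
                                         Optional<String> lastCommitSynced) {
    // No completed instant on the timeline: leave the property untouched.
    lastCommitSynced.ifPresent(ts -> tblProperties.put(LAST_COMMIT_TIME_SYNC, ts));
  }

  public static void main(String[] args) {
    Map<String, String> props = new HashMap<>();
    updateLastCommitTimeSynced(props, Optional.of("20240101120000"));
    System.out.println(props); // {last_commit_time_sync=20240101120000}
  }
}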
Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
The class HoodieBackedTableMetadataWriter, method update.
/**
* Update from {@code HoodieRollbackMetadata}.
*
* @param rollbackMetadata {@code HoodieRollbackMetadata}
* @param instantTime Timestamp at which the rollback was performed
*/
@Override
public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) {
  if (enabled && metadata != null) {
    // Is this a rollback of an instant that has been synced to the metadata table?
    String rollbackInstant = rollbackMetadata.getCommitsRollback().get(0);
    boolean wasSynced = metadataMetaClient.getActiveTimeline()
        .containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, rollbackInstant));
    if (!wasSynced) {
      // A compaction may have taken place on the metadata table which would have included the instant being rolled back.
      // Revisit this logic to relax the compaction fencing: https://issues.apache.org/jira/browse/HUDI-2458
      Option<String> latestCompaction = metadata.getLatestCompactionTime();
      if (latestCompaction.isPresent()) {
        wasSynced = HoodieTimeline.compareTimestamps(rollbackInstant, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCompaction.get());
      }
    }
    Map<MetadataPartitionType, HoodieData<HoodieRecord>> records =
        HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, metadataMetaClient.getActiveTimeline(), rollbackMetadata,
            getRecordsGenerationParams(), instantTime, metadata.getSyncedInstantTime(), wasSynced);
    commit(instantTime, records, false);
  }
}
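The compaction-fencing check above relies on the fact that Hudi instant times are fixed-width timestamp strings (e.g. yyyyMMddHHmmss), so lexicographic comparison matches chronological order. A minimal sketch of that comparison under assumed names; lesserThanOrEquals mirrors what HoodieTimeline.compareTimestamps with LESSER_THAN_OR_EQUALS evaluates.

public class CompactionFencingDemo {

  // Fixed-width timestamp strings compare lexicographically in chronological order.
  static boolean lesserThanOrEquals(String ts1, String ts2) {
    return ts1.compareTo(ts2) <= 0;
  }

  // The rollback target counts as synced if a metadata-table compaction at or
  // after it has already folded the instant in.
  static boolean wasSyncedByCompaction(String rollbackInstant, String latestCompaction) {
    return lesserThanOrEquals(rollbackInstant, latestCompaction);
  }

  public static void main(String[] args) {
    System.out.println(wasSyncedByCompaction("20240101090000", "20240101120000")); // true
    System.out.println(wasSyncedByCompaction("20240102090000", "20240101120000")); // false
  }
}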