Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class HoodieMergeOnReadTableInputFormat, method collectAllIncrementalFiles.
private static List<FileStatus> collectAllIncrementalFiles(List<HoodieFileGroup> fileGroups, String maxCommitTime, String basePath, Map<String, FileStatus> candidateFileStatus, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
  List<FileStatus> result = new ArrayList<>();
  fileGroups.stream().forEach(f -> {
    try {
      List<FileSlice> baseFiles = f.getAllFileSlices().filter(slice -> slice.getBaseFile().isPresent()).collect(Collectors.toList());
      if (!baseFiles.isEmpty()) {
        FileStatus baseFileStatus = HoodieInputFormatUtils.getFileStatus(baseFiles.get(0).getBaseFile().get());
        String baseFilePath = baseFileStatus.getPath().toUri().toString();
        if (!candidateFileStatus.containsKey(baseFilePath)) {
          throw new HoodieException("Error obtaining fileStatus for file: " + baseFilePath);
        }
        List<HoodieLogFile> deltaLogFiles = f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList());
        // We cannot use baseFileStatus directly here, since it is missing the file size information.
        // candidateFileStatus.get(baseFilePath) returns the FileStatus with the correct size.
        RealtimeFileStatus fileStatus = new RealtimeFileStatus(candidateFileStatus.get(baseFilePath), basePath, deltaLogFiles, true, virtualKeyInfoOpt);
        fileStatus.setMaxCommitTime(maxCommitTime);
        if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
          fileStatus.setBootStrapFileStatus(baseFileStatus);
        }
        result.add(fileStatus);
      }
      // Add file groups which have only log files.
      if (f.getLatestFileSlice().isPresent() && baseFiles.isEmpty()) {
        List<FileStatus> logFileStatus = f.getLatestFileSlice().get().getLogFiles().map(logFile -> logFile.getFileStatus()).collect(Collectors.toList());
        if (logFileStatus.size() > 0) {
          List<HoodieLogFile> deltaLogFiles = logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList());
          RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0), basePath, deltaLogFiles, true, virtualKeyInfoOpt);
          fileStatus.setMaxCommitTime(maxCommitTime);
          result.add(fileStatus);
        }
      }
    } catch (IOException e) {
      throw new HoodieException("Error obtaining data file/log file grouping ", e);
    }
  });
  return result;
}
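For orientation, the method above handles two shapes of file group: slices whose latest version has a base file (plus optional delta logs), and log-only slices. Below is a minimal sketch of that distinction using only the FileSlice accessors already shown above; the class and method names of the sketch itself are illustrative and not part of Hudi.

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.model.HoodieLogFile;

public class FileGroupInspector {

  /** Prints the base file (if any) and the delta log files of the latest slice of a file group. */
  static void describeLatestSlice(HoodieFileGroup fileGroup) {
    fileGroup.getLatestFileSlice().ifPresent(slice -> {
      List<HoodieLogFile> logFiles = slice.getLogFiles().collect(Collectors.toList());
      if (slice.getBaseFile().isPresent()) {
        // Case 1: base file + delta logs -> becomes a RealtimeFileStatus with the logs attached.
        System.out.println("base: " + slice.getBaseFile().get().getPath()
            + ", logs: " + logFiles.size());
      } else if (!logFiles.isEmpty()) {
        // Case 2: log-only file group -> the first log file stands in for the base file.
        System.out.println("log-only group " + slice.getFileId()
            + ", logs: " + logFiles.size());
      }
    });
  }
}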
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class FileSystemViewCommand, method showLatestFileSlices.
@CliCommand(value = "show fsview latest", help = "Show latest file-system view")
public String showLatestFileSlices(
    @CliOption(key = { "partitionPath" }, help = "A valid partition path", mandatory = true) String partition,
    @CliOption(key = { "baseFileOnly" }, help = "Only display base file view", unspecifiedDefaultValue = "false") boolean baseFileOnly,
    @CliOption(key = { "maxInstant" }, help = "File-Slices upto this instant are displayed", unspecifiedDefaultValue = "") String maxInstant,
    @CliOption(key = { "merge" }, help = "Merge File Slices due to pending compaction", unspecifiedDefaultValue = "true") final boolean merge,
    @CliOption(key = { "includeMax" }, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant,
    @CliOption(key = { "includeInflight" }, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") boolean includeInflight,
    @CliOption(key = { "excludeCompaction" }, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") boolean excludeCompaction,
    @CliOption(key = { "limit" }, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
    @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
    @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
  HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, baseFileOnly, includeMaxInstant, includeInflight, excludeCompaction);
  List<Comparable[]> rows = new ArrayList<>();
  final Stream<FileSlice> fileSliceStream;
  if (!merge) {
    fileSliceStream = fsView.getLatestFileSlices(partition);
  } else {
    if (maxInstant.isEmpty()) {
      maxInstant = HoodieCLI.getTableMetaClient().getActiveTimeline().filterCompletedAndCompactionInstants().lastInstant().get().getTimestamp();
    }
    fileSliceStream = fsView.getLatestMergedFileSlicesBeforeOrOn(partition, maxInstant);
  }
  fileSliceStream.forEach(fs -> {
    int idx = 0;
    Comparable[] row = new Comparable[baseFileOnly ? 5 : 13];
    row[idx++] = partition;
    row[idx++] = fs.getFileId();
    row[idx++] = fs.getBaseInstantTime();
    row[idx++] = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getPath() : "";
    long dataFileSize = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getFileSize() : -1;
    row[idx++] = dataFileSize;
    if (!baseFileOnly) {
      row[idx++] = fs.getLogFiles().count();
      row[idx++] = fs.getLogFiles().mapToLong(HoodieLogFile::getFileSize).sum();
      long logFilesScheduledForCompactionTotalSize = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).mapToLong(HoodieLogFile::getFileSize).sum();
      row[idx++] = logFilesScheduledForCompactionTotalSize;
      long logFilesUnscheduledTotalSize = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).mapToLong(HoodieLogFile::getFileSize).sum();
      row[idx++] = logFilesUnscheduledTotalSize;
      double logSelectedForCompactionToBaseRatio = dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1;
      row[idx++] = logSelectedForCompactionToBaseRatio;
      double logUnscheduledToBaseRatio = dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
      row[idx++] = logUnscheduledToBaseRatio;
      row[idx++] = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).collect(Collectors.toList()).toString();
      row[idx++] = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).collect(Collectors.toList()).toString();
    }
    rows.add(row);
  });
  Function<Object, String> converterFunction = entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())));
  Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
  fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE, converterFunction);
  if (!baseFileOnly) {
    fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_SIZE, converterFunction);
    fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DELTA_SIZE_SCHEDULED, converterFunction);
    fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DELTA_SIZE_UNSCHEDULED, converterFunction);
  }
  TableHeader header = new TableHeader()
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_INSTANT)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE);
  if (!baseFileOnly) {
    header = header.addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELTA_FILES)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_SIZE)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_SIZE_SCHEDULED)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_SIZE_UNSCHEDULED)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_BASE_SCHEDULED)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_BASE_UNSCHEDULED)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_SCHEDULED)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_UNSCHEDULED);
  }
  return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
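As a usage note, this command runs from the hudi-cli shell, e.g. show fsview latest --partitionPath <partition> --baseFileOnly false (the option values are placeholders). The per-slice numbers it prints can be reproduced straight from a FileSlice with the same accessors; here is a minimal sketch, assuming a FileSlice obtained from a file-system view as above.

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;

public class FileSliceStats {

  /** Reproduces the delta log count, total delta size, and scheduled-delta-to-base ratio for one slice. */
  static void printSliceStats(FileSlice fs) {
    long baseFileSize = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getFileSize() : -1;
    long deltaCount = fs.getLogFiles().count();
    long totalDeltaSize = fs.getLogFiles().mapToLong(HoodieLogFile::getFileSize).sum();
    // Log files whose base commit matches the slice's base instant are the ones a scheduled
    // compaction would merge into the base file.
    long scheduledDeltaSize = fs.getLogFiles()
        .filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime()))
        .mapToLong(HoodieLogFile::getFileSize).sum();
    double scheduledToBaseRatio = baseFileSize > 0 ? scheduledDeltaSize / (baseFileSize * 1.0) : -1;
    System.out.println(fs.getFileId() + ": deltas=" + deltaCount
        + ", deltaBytes=" + totalDeltaSize + ", scheduledToBaseRatio=" + scheduledToBaseRatio);
  }
}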
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class HoodieTableMetadataUtil, method getPartitionFileSlices.
/**
 * Get the latest file slices for a given partition.
 *
 * @param metaClient      - Instance of {@link HoodieTableMetaClient}.
 * @param fileSystemView  - Optional file-system view to reuse; a new one is built from the metaClient when absent.
 * @param partition       - The name of the partition whose file groups are to be loaded.
 * @param mergeFileSlices - When enabled, merges the latest file slices up to the last known completed
 *                          instant, which is useful for readers when there are pending compactions.
 *                          When disabled, returns the latest file slices without any merging, which is
 *                          what the writers need.
 * @return List of latest file slices for all file groups in the given partition.
 */
private static List<FileSlice> getPartitionFileSlices(HoodieTableMetaClient metaClient, Option<HoodieTableFileSystemView> fileSystemView, String partition, boolean mergeFileSlices) {
  HoodieTableFileSystemView fsView = fileSystemView.orElse(getFileSystemView(metaClient));
  Stream<FileSlice> fileSliceStream;
  if (mergeFileSlices) {
    fileSliceStream = fsView.getLatestMergedFileSlicesBeforeOrOn(partition, metaClient.getActiveTimeline().filterCompletedInstants().lastInstant().get().getTimestamp());
  } else {
    fileSliceStream = fsView.getLatestFileSlices(partition);
  }
  return fileSliceStream.sorted(Comparator.comparing(FileSlice::getFileId)).collect(Collectors.toList());
}
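Since the method above is private, here is a hedged caller-side sketch of the same reader/writer split, assuming an already-built HoodieTableFileSystemView and at least one completed instant on the timeline; the class and method names of the sketch itself are illustrative.

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class PartitionFileSliceExample {

  /** Readers merge pending-compaction slices up to the last completed instant. */
  static List<FileSlice> latestSlicesForReader(HoodieTableMetaClient metaClient,
                                               HoodieTableFileSystemView fsView,
                                               String partition) {
    String lastCompleted = metaClient.getActiveTimeline()
        .filterCompletedInstants().lastInstant().get().getTimestamp();
    return fsView.getLatestMergedFileSlicesBeforeOrOn(partition, lastCompleted)
        .sorted(Comparator.comparing(FileSlice::getFileId))
        .collect(Collectors.toList());
  }

  /** Writers take the latest file slices as-is, without merging. */
  static List<FileSlice> latestSlicesForWriter(HoodieTableFileSystemView fsView, String partition) {
    return fsView.getLatestFileSlices(partition)
        .sorted(Comparator.comparing(FileSlice::getFileId))
        .collect(Collectors.toList());
  }
}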
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class CleanPlanner, method getFilesToCleanKeepingLatestVersions.
/**
 * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This
 * policy is useful if you are simply interested in querying the table and do not want too many versions for a
 * single file (i.e. run it with versionsRetained = 1).
 */
private List<CleanFileInfo> getFilesToCleanKeepingLatestVersions(String partitionPath) {
  LOG.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained() + " file versions. ");
  List<CleanFileInfo> deletePaths = new ArrayList<>();
  // Collect all the data files savepointed by all the savepoints
  List<String> savepointedFiles = hoodieTable.getSavepoints().stream().flatMap(this::getSavepointedDataFiles).collect(Collectors.toList());
  // In this scenario, we assume that once replaced, a file group automatically becomes eligible for cleaning completely.
  // In other words, the file versions only apply to the active file groups.
  deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, Option.empty()));
  List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
  for (HoodieFileGroup fileGroup : fileGroups) {
    int keepVersions = config.getCleanerFileVersionsRetained();
    // do not clean up the slice required for a pending compaction
    Iterator<FileSlice> fileSliceIterator = fileGroup.getAllFileSlices().filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
    if (isFileGroupInPendingCompaction(fileGroup)) {
      // We have already saved the last version of file groups for the pending compaction id
      keepVersions--;
    }
    while (fileSliceIterator.hasNext() && keepVersions > 0) {
      // Skip this most recent version
      FileSlice nextSlice = fileSliceIterator.next();
      Option<HoodieBaseFile> dataFile = nextSlice.getBaseFile();
      if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) {
        // do not clean up a savepointed data file
        continue;
      }
      keepVersions--;
    }
    // Delete the remaining files
    while (fileSliceIterator.hasNext()) {
      FileSlice nextSlice = fileSliceIterator.next();
      deletePaths.addAll(getCleanFileInfoForSlice(nextSlice));
    }
  }
  return deletePaths;
}
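The retention countdown is easier to see in isolation. The following self-contained sketch (plain strings standing in for file slices, purely illustrative) mirrors the two loops above: keep the N newest versions, never counting savepointed ones against the quota, then mark everything older for deletion.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

public class KeepLatestVersionsSketch {

  /** Versions are ordered newest-first, mirroring getAllFileSlices(). */
  static List<String> selectForDeletion(List<String> versionsNewestFirst,
                                        Set<String> savepointed,
                                        int versionsRetained) {
    List<String> toDelete = new ArrayList<>();
    int keepVersions = versionsRetained;
    Iterator<String> it = versionsNewestFirst.iterator();
    // Walk past the newest versions we want to keep; savepointed versions are kept
    // as well but do not count against the retention quota.
    while (it.hasNext() && keepVersions > 0) {
      String version = it.next();
      if (savepointed.contains(version)) {
        continue;
      }
      keepVersions--;
    }
    // Everything older than the retained window is eligible for cleaning.
    while (it.hasNext()) {
      toDelete.add(it.next());
    }
    return toDelete;
  }
}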
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class CleanPlanner, method getFilesToCleanKeepingLatestCommits.
/**
 * Selects the versions of each file for cleaning, such that it
 * <p>
 * - Leaves the latest version of the file untouched.
 * - For older versions:
 *   - Leaves untouched all versions committed within the last <code>config.getCleanerCommitsRetained()</code> commits.
 *   - Leaves ONE commit before this window. We assume that max(query execution time) == commit_batch_time *
 *     config.getCleanerCommitsRetained(). This is 5 hours by default (assuming ingestion runs every 30 minutes).
 *     This is essential so that the file used by the longest-running query is left in place.
 * <p>
 * This provides the effect of having a lookback into all changes that happened in the last X commits (e.g. if you
 * retain 10 commits and the commit batch time is 30 minutes, you have 5 hours of lookback).
 * <p>
 * This policy is the default.
 */
private List<CleanFileInfo> getFilesToCleanKeepingLatestCommits(String partitionPath, int commitsRetained, HoodieCleaningPolicy policy) {
  LOG.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
  List<CleanFileInfo> deletePaths = new ArrayList<>();
  // Collect all the data files savepointed by all the savepoints
  List<String> savepointedFiles = hoodieTable.getSavepoints().stream().flatMap(this::getSavepointedDataFiles).collect(Collectors.toList());
  // determine if we have enough commits to start cleaning
  if (commitTimeline.countInstants() > commitsRetained) {
    Option<HoodieInstant> earliestCommitToRetainOption = getEarliestCommitToRetain();
    HoodieInstant earliestCommitToRetain = earliestCommitToRetainOption.get();
    // all replaced file groups before earliestCommitToRetain are eligible to clean
    deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, earliestCommitToRetainOption));
    // add active files
    List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
    for (HoodieFileGroup fileGroup : fileGroups) {
      List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
      if (fileSliceList.isEmpty()) {
        continue;
      }
      String lastVersion = fileSliceList.get(0).getBaseInstantTime();
      String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain);
      // i.e. always spare the latest commit of each file group
      for (FileSlice aSlice : fileSliceList) {
        Option<HoodieBaseFile> aFile = aSlice.getBaseFile();
        String fileCommitTime = aSlice.getBaseInstantTime();
        if (aFile.isPresent() && savepointedFiles.contains(aFile.get().getFileName())) {
          // do not clean up a savepointed data file
          continue;
        }
        if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) {
          // Do not delete the latest version, nor the last version before the earliest commit to retain;
          // a query running within the retention window may still use this file.
          if (fileCommitTime.equals(lastVersion) || (fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) {
            // move on to the next file
            continue;
          }
        } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) {
          // Do not delete the latest commit.
          if (fileCommitTime.equals(lastVersion)) {
            // move on to the next file
            continue;
          }
        }
        // Always keep the last commit
        if (!isFileSliceNeededForPendingCompaction(aSlice) && HoodieTimeline.compareTimestamps(earliestCommitToRetain.getTimestamp(), HoodieTimeline.GREATER_THAN, fileCommitTime)) {
          // this is a commit that should be cleaned
          aFile.ifPresent(hoodieDataFile -> {
            deletePaths.add(new CleanFileInfo(hoodieDataFile.getPath(), false));
            if (hoodieDataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) {
              deletePaths.add(new CleanFileInfo(hoodieDataFile.getBootstrapBaseFile().get().getPath(), true));
            }
          });
          if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
            // For merge-on-read tables, clean the log files for these commits as well
            deletePaths.addAll(aSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false)).collect(Collectors.toList()));
          }
        }
      }
    }
  }
  return deletePaths;
}
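To make the javadoc's lookback arithmetic concrete: with the values it mentions (ingestion every 30 minutes and 10 retained commits), a query can rely on 30 min x 10 = 5 hours of lookback. Below is a small sketch of that calculation and of the commit-time comparison that gates deletion; the timestamp strings are illustrative only.

public class RetentionWindowSketch {

  public static void main(String[] args) {
    // Lookback guaranteed to a running query: commit batch time * commits retained.
    int commitBatchMinutes = 30;      // assumed ingestion cadence from the javadoc
    int cleanerCommitsRetained = 10;  // example value from the javadoc
    int lookbackMinutes = commitBatchMinutes * cleanerCommitsRetained;
    System.out.println("Lookback window: " + (lookbackMinutes / 60.0) + " hours"); // 5.0 hours

    // A file version becomes a deletion candidate only when its commit time is strictly
    // older than the earliest commit to retain (Hudi instant times are sortable strings).
    String earliestCommitToRetain = "20220105120000"; // illustrative instant time
    String fileCommitTime = "20220104090000";         // illustrative instant time
    boolean eligibleForCleaning = earliestCommitToRetain.compareTo(fileCommitTime) > 0;
    System.out.println("eligibleForCleaning = " + eligibleForCleaning); // true
  }
}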