
Example 1 with CleanFileInfo

Use of org.apache.hudi.common.model.CleanFileInfo in project hudi by apache.

From the class CleanPlanner, the method getFilesToCleanKeepingLatestVersions.

/**
 * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This
 * policy is useful if you are simply interested in querying the table and you don't want too many versions for a
 * single file (i.e., run it with versionsRetained = 1).
 */
private List<CleanFileInfo> getFilesToCleanKeepingLatestVersions(String partitionPath) {
    LOG.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained() + " file versions. ");
    List<CleanFileInfo> deletePaths = new ArrayList<>();
    // Collect all the datafiles savepointed by all the savepoints
    List<String> savepointedFiles = hoodieTable.getSavepoints().stream().flatMap(this::getSavepointedDataFiles).collect(Collectors.toList());
    // In this scenario, we assume that once replaced, a file group automatically becomes eligible for cleaning in its entirety.
    // In other words, the file versions only apply to the active file groups.
    deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, Option.empty()));
    List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
    for (HoodieFileGroup fileGroup : fileGroups) {
        int keepVersions = config.getCleanerFileVersionsRetained();
        // do not cleanup slice required for pending compaction
        Iterator<FileSlice> fileSliceIterator = fileGroup.getAllFileSlices().filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
        if (isFileGroupInPendingCompaction(fileGroup)) {
            // We have already saved the last version of file-groups for pending compaction Id
            keepVersions--;
        }
        while (fileSliceIterator.hasNext() && keepVersions > 0) {
            // Skip this most recent version
            FileSlice nextSlice = fileSliceIterator.next();
            Option<HoodieBaseFile> dataFile = nextSlice.getBaseFile();
            if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) {
                // do not clean up a savepoint data file
                continue;
            }
            keepVersions--;
        }
        // Delete the remaining files
        while (fileSliceIterator.hasNext()) {
            FileSlice nextSlice = fileSliceIterator.next();
            deletePaths.addAll(getCleanFileInfoForSlice(nextSlice));
        }
    }
    return deletePaths;
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) Date(java.util.Date) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) ZonedDateTime(java.time.ZonedDateTime) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) CleanPlanV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler) ArrayList(java.util.ArrayList) HoodieSavepointMetadata(org.apache.hudi.avro.model.HoodieSavepointMetadata) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanPlanV2MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler) Map(java.util.Map) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieSavepointException(org.apache.hudi.exception.HoodieSavepointException) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Stream(java.util.stream.Stream) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
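
The retention loop is easier to follow without the Hudi types. Below is a minimal, self-contained sketch of the same keep-latest-N selection over plain file names, omitting the pending-compaction bookkeeping; selectForDeletion and the sample data are illustrative stand-ins, not Hudi API.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

public class KeepLatestVersionsSketch {

    /**
     * Given file versions sorted newest-first, retain the latest keepVersions
     * (savepointed files are kept without consuming the budget, mirroring the
     * continue above) and return everything older for deletion.
     */
    static List<String> selectForDeletion(List<String> newestFirst, Set<String> savepointed, int keepVersions) {
        List<String> deletePaths = new ArrayList<>();
        Iterator<String> it = newestFirst.iterator();
        while (it.hasNext() && keepVersions > 0) {
            String version = it.next();
            if (savepointed.contains(version)) {
                continue; // savepointed: keep, but don't count against the budget
            }
            keepVersions--; // this version is retained
        }
        while (it.hasNext()) {
            deletePaths.add(it.next()); // older than the retained window: delete
        }
        return deletePaths;
    }

    public static void main(String[] args) {
        List<String> versions = List.of("f1_v4", "f1_v3", "f1_v2", "f1_v1");
        // Retaining 2 versions keeps v4 and v3 and prints [f1_v2, f1_v1].
        System.out.println(selectForDeletion(versions, Set.of(), 2));
    }
}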

Example 2 with CleanFileInfo

Use of org.apache.hudi.common.model.CleanFileInfo in project hudi by apache.

From the class CleanPlanner, the method getFilesToCleanKeepingLatestCommits.

/**
 * Selects the versions of files for cleaning, such that it
 * <p>
 * - Leaves the latest version of each file untouched.
 * - For older versions:
 *   - Leaves untouched all the commits that occurred in the last <code>config.getCleanerCommitsRetained()</code>
 *     commits.
 *   - Leaves ONE commit before this window. We assume that max(query execution time) ==
 *     commit_batch_time * config.getCleanerCommitsRetained(). This is 5 hours by default (assuming ingestion runs
 *     every 30 minutes), and is essential to keep alive the file used by the longest-running query.
 * <p>
 * This provides the effect of having a lookback into all changes that happened in the last X commits (e.g. if you
 * retain 10 commits and the commit batch time is 30 mins, you have 5 hours of lookback).
 * <p>
 * This policy is the default.
 */
private List<CleanFileInfo> getFilesToCleanKeepingLatestCommits(String partitionPath, int commitsRetained, HoodieCleaningPolicy policy) {
    LOG.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
    List<CleanFileInfo> deletePaths = new ArrayList<>();
    // Collect all the datafiles savepointed by all the savepoints
    List<String> savepointedFiles = hoodieTable.getSavepoints().stream().flatMap(this::getSavepointedDataFiles).collect(Collectors.toList());
    // determine if we have enough commits to start cleaning
    if (commitTimeline.countInstants() > commitsRetained) {
        Option<HoodieInstant> earliestCommitToRetainOption = getEarliestCommitToRetain();
        HoodieInstant earliestCommitToRetain = earliestCommitToRetainOption.get();
        // all replaced file groups before earliestCommitToRetain are eligible to clean
        deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, earliestCommitToRetainOption));
        // add active files
        List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
        for (HoodieFileGroup fileGroup : fileGroups) {
            List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
            if (fileSliceList.isEmpty()) {
                continue;
            }
            String lastVersion = fileSliceList.get(0).getBaseInstantTime();
            String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain);
            // Ensure there is more than one version of the file (we only clean older versions), i.e. always spare the last commit.
            for (FileSlice aSlice : fileSliceList) {
                Option<HoodieBaseFile> aFile = aSlice.getBaseFile();
                String fileCommitTime = aSlice.getBaseInstantTime();
                if (aFile.isPresent() && savepointedFiles.contains(aFile.get().getFileName())) {
                    // do not clean up a savepoint data file
                    continue;
                }
                if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) {
                    // Don't delete the latest commit, nor the last commit before the earliest commit we retain:
                    // the retention window equals the max query run time, so a running query may still use this file.
                    if (fileCommitTime.equals(lastVersion) || (fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) {
                        // move on to the next file
                        continue;
                    }
                } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) {
                    // Do not delete the latest commit.
                    if (fileCommitTime.equals(lastVersion)) {
                        // move on to the next file
                        continue;
                    }
                }
                // Always keep the last commit
                if (!isFileSliceNeededForPendingCompaction(aSlice) && HoodieTimeline.compareTimestamps(earliestCommitToRetain.getTimestamp(), HoodieTimeline.GREATER_THAN, fileCommitTime)) {
                    // This is a commit that should be cleaned.
                    aFile.ifPresent(hoodieDataFile -> {
                        deletePaths.add(new CleanFileInfo(hoodieDataFile.getPath(), false));
                        if (hoodieDataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) {
                            deletePaths.add(new CleanFileInfo(hoodieDataFile.getBootstrapBaseFile().get().getPath(), true));
                        }
                    });
                    if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
                        // If merge on read, then clean the log files for the commits as well
                        deletePaths.addAll(aSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false)).collect(Collectors.toList()));
                    }
                }
            }
        }
    }
    return deletePaths;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)
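
The arithmetic behind the javadoc's 5-hour figure above can be checked directly. The numbers below are the documented defaults (10 retained commits, one commit every 30 minutes) and are illustrative only.

import java.time.Duration;

public class CleanerLookbackSketch {
    public static void main(String[] args) {
        int commitsRetained = 10;                          // default number of retained commits
        Duration commitBatchTime = Duration.ofMinutes(30); // assumed ingestion interval

        // max(query execution time) == commit_batch_time * commitsRetained
        Duration lookback = commitBatchTime.multipliedBy(commitsRetained);
        System.out.println("Lookback window: " + lookback.toHours() + " hours"); // prints 5
    }
}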

Example 3 with CleanFileInfo

Use of org.apache.hudi.common.model.CleanFileInfo in project hudi by apache.

From the class CleanPlanner, the method getDeletePaths.

/**
 * Returns files to be cleaned for the given partitionPath based on cleaning policy.
 */
public List<CleanFileInfo> getDeletePaths(String partitionPath) {
    HoodieCleaningPolicy policy = config.getCleanerPolicy();
    List<CleanFileInfo> deletePaths;
    if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) {
        deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath);
    } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
        deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath);
    } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) {
        deletePaths = getFilesToCleanKeepingLatestHours(partitionPath);
    } else {
        throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name());
    }
    LOG.info(deletePaths.size() + " patterns used to delete in partition path:" + partitionPath);
    return deletePaths;
}
Also used : CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy)
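
On Java 14+, the same dispatch could be written as a switch expression; a behavior-preserving sketch of the body above, assuming HoodieCleaningPolicy has exactly the three policies shown.

List<CleanFileInfo> deletePaths = switch (config.getCleanerPolicy()) {
    case KEEP_LATEST_COMMITS -> getFilesToCleanKeepingLatestCommits(partitionPath);
    case KEEP_LATEST_FILE_VERSIONS -> getFilesToCleanKeepingLatestVersions(partitionPath);
    case KEEP_LATEST_BY_HOURS -> getFilesToCleanKeepingLatestHours(partitionPath);
    // default kept in case the enum grows, matching the original else branch
    default -> throw new IllegalArgumentException("Unknown cleaning policy : " + config.getCleanerPolicy().name());
};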

Example 4 with CleanFileInfo

Use of org.apache.hudi.common.model.CleanFileInfo in project hudi by apache.

From the class CleanPlanner, the method getCleanFileInfoForSlice.

private List<CleanFileInfo> getCleanFileInfoForSlice(FileSlice nextSlice) {
    List<CleanFileInfo> cleanPaths = new ArrayList<>();
    if (nextSlice.getBaseFile().isPresent()) {
        HoodieBaseFile dataFile = nextSlice.getBaseFile().get();
        cleanPaths.add(new CleanFileInfo(dataFile.getPath(), false));
        if (dataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) {
            cleanPaths.add(new CleanFileInfo(dataFile.getBootstrapBaseFile().get().getPath(), true));
        }
    }
    if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
        // If merge on read, then clean the log files for the commits as well
        cleanPaths.addAll(nextSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false)).collect(Collectors.toList()));
    }
    return cleanPaths;
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) ArrayList(java.util.ArrayList)
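
The boolean passed to the CleanFileInfo constructor marks whether the path is a bootstrap base file, which is why the bootstrap entry above is created with true. A short illustration of the two kinds of entries this method emits; the paths are made up.

// Regular base file inside the table path: second argument false.
CleanFileInfo regular = new CleanFileInfo("/tbl/2021/01/01/base_v2.parquet", false);
// Bootstrap base file living outside the table path: second argument true.
CleanFileInfo bootstrap = new CleanFileInfo("/external/source-data/part-00000.parquet", true);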

Example 5 with CleanFileInfo

Use of org.apache.hudi.common.model.CleanFileInfo in project hudi by apache.

From the class CleanActionExecutor, the method clean.

/**
 * Performs cleaning of partition paths according to the cleaning policy and returns the number of files cleaned.
 * Handles skew in the partitions to clean by using individual files to clean, rather than partitions, as the unit
 * of task distribution.
 *
 * @throws IllegalArgumentException if unknown cleaning policy is provided
 */
List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
    // Parallelism is bounded by the total number of files to delete across all partitions.
    int cleanerParallelism = Math.min(cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).sum(), config.getCleanerParallelism());
    LOG.info("Using cleanerParallelism: " + cleanerParallelism);
    context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
    Stream<Pair<String, CleanFileInfo>> filesToBeDeletedPerPartition =
        cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
            .flatMap(x -> x.getValue().stream()
                .map(y -> new ImmutablePair<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))));
    Stream<ImmutablePair<String, PartitionCleanStat>> partitionCleanStats =
        context.mapPartitionsToPairAndReduceByKey(filesToBeDeletedPerPartition,
            iterator -> deleteFilesFunc(iterator, table), PartitionCleanStat::merge, cleanerParallelism);
    Map<String, PartitionCleanStat> partitionCleanStatsMap =
        partitionCleanStats.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Return PartitionCleanStat for each partition passed.
    return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
        PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath)
            ? partitionCleanStatsMap.get(partitionPath)
            : new PartitionCleanStat(partitionPath);
        HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
        return HoodieCleanStat.newBuilder()
            .withPolicy(config.getCleanerPolicy())
            .withPartitionPath(partitionPath)
            .withEarliestCommitRetained(Option.ofNullable(actionInstant != null
                ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()),
                    actionInstant.getAction(), actionInstant.getTimestamp())
                : null))
            .withDeletePathPattern(partitionCleanStat.deletePathPatterns())
            .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
            .withFailedDeletes(partitionCleanStat.failedDeleteFiles())
            .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
            .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
            .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
            .build();
    }).collect(Collectors.toList());
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) BaseActionExecutor(org.apache.hudi.table.action.BaseActionExecutor) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) TransactionManager(org.apache.hudi.client.transaction.TransactionManager) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Stream(java.util.stream.Stream) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
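
mapPartitionsToPairAndReduceByKey is the engine-abstracted form of a distributed reduce-by-key. The merge shape can be demonstrated locally with Collectors.toMap; Stat below is an illustrative stand-in for PartitionCleanStat, not Hudi API.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class ReduceByKeySketch {

    // Per-partition counter standing in for PartitionCleanStat.
    record Stat(String partition, int deleted) {
        Stat merge(Stat other) { return new Stat(partition, deleted + other.deleted); }
    }

    public static void main(String[] args) {
        // One Stat per deleted file, keyed by its partition.
        List<Stat> perFileResults = List.of(
            new Stat("2021/01/01", 1), new Stat("2021/01/01", 1), new Stat("2021/01/02", 1));

        // Local analogue of the reduce-by-key: group by partition and fold with merge.
        Map<String, Stat> byPartition = perFileResults.stream()
            .collect(Collectors.toMap(Stat::partition, s -> s, Stat::merge));

        // e.g. {2021/01/01=Stat[partition=2021/01/01, deleted=2], 2021/01/02=Stat[partition=2021/01/02, deleted=1]}
        System.out.println(byPartition);
    }
}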

Aggregations

CleanFileInfo (org.apache.hudi.common.model.CleanFileInfo) 5
ArrayList (java.util.ArrayList) 4
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 3
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 3
IOException (java.io.IOException) 2
Iterator (java.util.Iterator) 2
List (java.util.List) 2
Map (java.util.Map) 2
Collectors (java.util.stream.Collectors) 2
Stream (java.util.stream.Stream) 2
HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata) 2
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) 2
FileSlice (org.apache.hudi.common.model.FileSlice) 2
HoodieCleaningPolicy (org.apache.hudi.common.model.HoodieCleaningPolicy) 2
HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup) 2
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload) 2
TimelineMetadataUtils (org.apache.hudi.common.table.timeline.TimelineMetadataUtils) 2
Option (org.apache.hudi.common.util.Option) 2
Pair (org.apache.hudi.common.util.collection.Pair) 2
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 2