
Example 26 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class TestClusteringUtils, method generateFileSlice.

private FileSlice generateFileSlice(String partitionPath, String fileId, String baseInstant) {
    FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant);
    fs.setBaseFile(new HoodieBaseFile(FSUtils.makeDataFileName(baseInstant, "1-0-1", fileId)));
    return fs;
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) FileSlice(org.apache.hudi.common.model.FileSlice)
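Below is a minimal, hypothetical JUnit 5 test showing how a helper like generateFileSlice might be exercised; the partition, instant, and assertion values are illustrative and are not taken from TestClusteringUtils (it assumes java.util.UUID and the usual org.junit.jupiter.api.Assertions static imports).

@Test
public void testGenerateFileSliceSketch() {
    String fileId = UUID.randomUUID().toString();
    FileSlice slice = generateFileSlice("2021/01/01", fileId, "001");
    // The slice is keyed by (partitionPath, fileId) and carries a base file named for the instant.
    assertEquals("001", slice.getBaseInstantTime());
    assertTrue(slice.getBaseFile().isPresent());
    assertTrue(slice.getBaseFile().get().getFileName().contains(fileId));
}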

Example 27 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class HoodieInputFormatUtils, method refreshFileStatus.

/**
 * Checks the file status for a race condition which can set the file size to 0:
 * 1. HiveInputFormat does super.listStatus() and gets back a FileStatus[].
 * 2. Then it creates the HoodieTableMetaClient for the paths listed.
 * 3. Generation of splits looks at the FileStatus size to create splits, which skips this file.
 * @param conf the Hadoop configuration used to resolve the file system
 * @param dataFile the base file whose status may need refreshing
 * @return the original base file, or a new HoodieBaseFile with a refreshed FileStatus if the recorded size was 0
 */
private static HoodieBaseFile refreshFileStatus(Configuration conf, HoodieBaseFile dataFile) {
    Path dataPath = dataFile.getFileStatus().getPath();
    try {
        if (dataFile.getFileSize() == 0) {
            FileSystem fs = dataPath.getFileSystem(conf);
            LOG.info("Refreshing file status " + dataFile.getPath());
            return new HoodieBaseFile(fs.getFileStatus(dataPath), dataFile.getBootstrapBaseFile().orElse(null));
        }
        return dataFile;
    } catch (IOException e) {
        throw new HoodieIOException("Could not get FileStatus on path " + dataPath, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieIOException(org.apache.hudi.exception.HoodieIOException) FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException)
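A hedged sketch of how a caller could apply refreshFileStatus across a collection of base files before split sizes are computed; the refreshAll helper below is hypothetical and not part of HoodieInputFormatUtils.

// Hypothetical helper: refresh any zero-length file statuses before they are used for split generation.
private static List<HoodieBaseFile> refreshAll(Configuration conf, List<HoodieBaseFile> baseFiles) {
    return baseFiles.stream()
        .map(baseFile -> refreshFileStatus(conf, baseFile))
        .collect(Collectors.toList());
}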

Example 28 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class HoodieRealtimeInputFormatUtils, method groupLogsByBaseFile.

// Returns each base (parquet) file paired with the list of log files in the same file group.
public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {
    Set<Path> partitionSet = new HashSet<>(partitionPaths);
    // TODO(vc): Should we handle also non-hoodie splits here?
    Map<Path, HoodieTableMetaClient> partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet);
    // Get all the base file and log file pairs in the required partition paths.
    List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList = new ArrayList<>();
    partitionSet.forEach(partitionPath -> {
        // For each partition path, obtain the data & log file groupings, then map back to input splits
        HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
        HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline());
        String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
        try {
            // Both commit and delta-commits are included - pick the latest completed one
            Option<HoodieInstant> latestCompletedInstant = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
            Stream<FileSlice> latestFileSlices = latestCompletedInstant.map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())).orElse(Stream.empty());
            latestFileSlices.forEach(fileSlice -> {
                List<HoodieLogFile> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
                baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths));
            });
        } catch (Exception e) {
            throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e);
        }
    });
    return baseAndLogsList;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) ColumnProjectionUtils(org.apache.hadoop.hive.serde2.ColumnProjectionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieRealtimeBootstrapBaseFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit) Logger(org.apache.log4j.Logger) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) RealtimeSplit(org.apache.hudi.hadoop.realtime.RealtimeSplit) Map(java.util.Map) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) TypeUtils.unsafeCast(org.apache.hudi.TypeUtils.unsafeCast) HoodieVirtualKeyInfo(org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Set(java.util.Set) HoodieRealtimeFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) JobConf(org.apache.hadoop.mapred.JobConf) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
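A minimal sketch of consuming the returned pairs; the partition paths and log output are illustrative only, and it assumes a Hadoop Configuration named conf, a log4j Logger named LOG, and java.util.Arrays are in scope.

// Hypothetical usage: inspect the shape of each file group under two partitions.
List<Path> partitions = Arrays.asList(
    new Path("/tmp/hudi_trips/2021/01/01"),
    new Path("/tmp/hudi_trips/2021/01/02"));
for (Pair<Option<HoodieBaseFile>, List<HoodieLogFile>> grouping
        : HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, partitions)) {
    String baseFileName = grouping.getLeft().map(HoodieBaseFile::getFileName).orElse("<log-only file group>");
    LOG.info("Base file " + baseFileName + " has " + grouping.getRight().size() + " log file(s)");
}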

Example 29 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class CleanPlanner, method getFilesToCleanKeepingLatestVersions.

/**
 * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This
 * policy is useful if you are simply interested in querying the table and don't want too many versions for a
 * single file (i.e., run it with versionsRetained = 1).
 */
private List<CleanFileInfo> getFilesToCleanKeepingLatestVersions(String partitionPath) {
    LOG.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained() + " file versions. ");
    List<CleanFileInfo> deletePaths = new ArrayList<>();
    // Collect all the data files savepointed by all the savepoints
    List<String> savepointedFiles = hoodieTable.getSavepoints().stream().flatMap(this::getSavepointedDataFiles).collect(Collectors.toList());
    // In this scenario, we will assume that once replaced a file group automatically becomes eligible for cleaning completely
    // In other words, the file versions only apply to the active file groups.
    deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, Option.empty()));
    List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
    for (HoodieFileGroup fileGroup : fileGroups) {
        int keepVersions = config.getCleanerFileVersionsRetained();
        // Do not clean up slices required for pending compaction
        Iterator<FileSlice> fileSliceIterator = fileGroup.getAllFileSlices().filter(fs -> !isFileSliceNeededForPendingCompaction(fs)).iterator();
        if (isFileGroupInPendingCompaction(fileGroup)) {
            // We have already saved the last version of file-groups for pending compaction Id
            keepVersions--;
        }
        while (fileSliceIterator.hasNext() && keepVersions > 0) {
            // Skip this most recent version
            FileSlice nextSlice = fileSliceIterator.next();
            Option<HoodieBaseFile> dataFile = nextSlice.getBaseFile();
            if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) {
                // do not clean up a savepoint data file
                continue;
            }
            keepVersions--;
        }
        // Delete the remaining files
        while (fileSliceIterator.hasNext()) {
            FileSlice nextSlice = fileSliceIterator.next();
            deletePaths.addAll(getCleanFileInfoForSlice(nextSlice));
        }
    }
    return deletePaths;
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) Date(java.util.Date) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) ZonedDateTime(java.time.ZonedDateTime) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) CleanPlanV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler) ArrayList(java.util.ArrayList) HoodieSavepointMetadata(org.apache.hudi.avro.model.HoodieSavepointMetadata) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanPlanV2MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler) Map(java.util.Map) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieSavepointException(org.apache.hudi.exception.HoodieSavepointException) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Stream(java.util.stream.Stream) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
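For context, a hedged sketch of enabling this policy through the write config; it assumes a Hudi 0.x release in which cleaner settings are exposed on HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig), so builder method names may differ in newer versions, and the table path is illustrative.

// Illustrative only: retain a single version of each file group (KEEP_LATEST_FILE_VERSIONS).
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hudi_trips")
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
        .retainFileVersions(1)
        .build())
    .build();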

Example 30 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From the class CleanPlanner, method getFilesToCleanKeepingLatestCommits.

/**
 * Selects the versions of files for cleaning, such that it:
 * <p>
 * - Leaves the latest version of each file untouched.
 * - For older versions, it leaves untouched all the commits that occurred in the last
 * <code>config.getCleanerCommitsRetained()</code> commits, plus ONE commit before this window. We assume that
 * max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 5 hours by default
 * (assuming ingestion runs every 30 minutes), and is essential to keep the file used by the longest-running query.
 * <p>
 * This provides the effect of having a lookback into all changes that happened in the last X commits (e.g. if you
 * retain 10 commits and the commit batch time is 30 minutes, you have 5 hours of lookback).
 * <p>
 * This policy is the default.
 */
private List<CleanFileInfo> getFilesToCleanKeepingLatestCommits(String partitionPath, int commitsRetained, HoodieCleaningPolicy policy) {
    LOG.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
    List<CleanFileInfo> deletePaths = new ArrayList<>();
    // Collect all the data files savepointed by all the savepoints
    List<String> savepointedFiles = hoodieTable.getSavepoints().stream().flatMap(this::getSavepointedDataFiles).collect(Collectors.toList());
    // Determine if we have enough commits to start cleaning.
    if (commitTimeline.countInstants() > commitsRetained) {
        Option<HoodieInstant> earliestCommitToRetainOption = getEarliestCommitToRetain();
        HoodieInstant earliestCommitToRetain = earliestCommitToRetainOption.get();
        // all replaced file groups before earliestCommitToRetain are eligible to clean
        deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, earliestCommitToRetainOption));
        // add active files
        List<HoodieFileGroup> fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList());
        for (HoodieFileGroup fileGroup : fileGroups) {
            List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
            if (fileSliceList.isEmpty()) {
                continue;
            }
            String lastVersion = fileSliceList.get(0).getBaseInstantTime();
            String lastVersionBeforeEarliestCommitToRetain = getLatestVersionBeforeCommit(fileSliceList, earliestCommitToRetain);
            // i.e., always spare the last commit.
            for (FileSlice aSlice : fileSliceList) {
                Option<HoodieBaseFile> aFile = aSlice.getBaseFile();
                String fileCommitTime = aSlice.getBaseInstantTime();
                if (aFile.isPresent() && savepointedFiles.contains(aFile.get().getFileName())) {
                    // do not clean up a savepoint data file
                    continue;
                }
                if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) {
                    // Don't delete the latest version, or the last version before the earliest retained commit; a still-running query may use this file.
                    if (fileCommitTime.equals(lastVersion) || (fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) {
                        // move on to the next file
                        continue;
                    }
                } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) {
                    // Do not delete the latest commit.
                    if (fileCommitTime.equals(lastVersion)) {
                        // move on to the next file
                        continue;
                    }
                }
                // Always keep the last commit
                if (!isFileSliceNeededForPendingCompaction(aSlice) && HoodieTimeline.compareTimestamps(earliestCommitToRetain.getTimestamp(), HoodieTimeline.GREATER_THAN, fileCommitTime)) {
                    // This is a commit that should be cleaned.
                    aFile.ifPresent(hoodieDataFile -> {
                        deletePaths.add(new CleanFileInfo(hoodieDataFile.getPath(), false));
                        if (hoodieDataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) {
                            deletePaths.add(new CleanFileInfo(hoodieDataFile.getBootstrapBaseFile().get().getPath(), true));
                        }
                    });
                    if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
                        // If merge on read, then clean the log files for the commits as well
                        deletePaths.addAll(aSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false)).collect(Collectors.toList()));
                    }
                }
            }
        }
    }
    return deletePaths;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)
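A small, self-contained sketch of the lookback arithmetic described in the javadoc above (commit batch time multiplied by the number of retained commits); the values are the documented defaults from that javadoc, not numbers read from any config object.

// Worked example of the retention window: 10 retained commits * 30-minute ingestion cadence = 5 hours.
int commitsRetained = 10;                                               // retained commits (the javadoc example and default)
java.time.Duration commitBatchTime = java.time.Duration.ofMinutes(30); // assumed ingestion interval
java.time.Duration lookback = commitBatchTime.multipliedBy(commitsRetained);
System.out.println("Max safe query runtime: " + lookback.toHours() + " hours");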

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 71
Path (org.apache.hadoop.fs.Path) 40
ArrayList (java.util.ArrayList) 33
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 31
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 31
FileSlice (org.apache.hudi.common.model.FileSlice) 29
List (java.util.List) 27
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 27
IOException (java.io.IOException) 26
FileStatus (org.apache.hadoop.fs.FileStatus) 25
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 24
Pair (org.apache.hudi.common.util.collection.Pair) 24
Option (org.apache.hudi.common.util.Option) 23
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 23
Collectors (java.util.stream.Collectors) 21
Test (org.junit.jupiter.api.Test) 21
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 21
Map (java.util.Map) 20
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 20
HoodieTable (org.apache.hudi.table.HoodieTable) 20