Example 16 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class DirectWriteMarkers, method createdAndMergedDataPaths.

@Override
public Set<String> createdAndMergedDataPaths(HoodieEngineContext context, int parallelism) throws IOException {
    Set<String> dataFiles = new HashSet<>();
    // Files at the top level of the marker directory are markers themselves;
    // subdirectories are collected for a recursive, parallel scan below.
    FileStatus[] topLevelStatuses = fs.listStatus(markerDirPath);
    List<String> subDirectories = new ArrayList<>();
    for (FileStatus topLevelStatus : topLevelStatuses) {
        if (topLevelStatus.isFile()) {
            String pathStr = topLevelStatus.getPath().toString();
            // Only CREATE/MERGE markers map to base data files; APPEND markers belong
            // to log files, which are already fail-safe and therefore skipped.
            if (pathStr.contains(HoodieTableMetaClient.MARKER_EXTN) && !pathStr.endsWith(IOType.APPEND.name())) {
                dataFiles.add(translateMarkerToDataPath(pathStr));
            }
        } else {
            subDirectories.add(topLevelStatus.getPath().toString());
        }
    }
    if (!subDirectories.isEmpty()) {
        parallelism = Math.min(subDirectories.size(), parallelism);
        // Hadoop's Configuration is not Serializable, so wrap it before shipping it to executors.
        SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
        context.setJobStatus(this.getClass().getSimpleName(), "Obtaining marker files for all created, merged paths");
        dataFiles.addAll(context.flatMap(subDirectories, directory -> {
            Path path = new Path(directory);
            FileSystem fileSystem = path.getFileSystem(serializedConf.get());
            RemoteIterator<LocatedFileStatus> itr = fileSystem.listFiles(path, true);
            List<String> result = new ArrayList<>();
            while (itr.hasNext()) {
                FileStatus status = itr.next();
                String pathStr = status.getPath().toString();
                if (pathStr.contains(HoodieTableMetaClient.MARKER_EXTN) && !pathStr.endsWith(IOType.APPEND.name())) {
                    result.add(translateMarkerToDataPath(pathStr));
                }
            }
            return result.stream();
        }, parallelism));
    }
    return dataFiles;
}
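
The translateMarkerToDataPath call above recovers the relative data-file path from a marker path. A minimal sketch of that mapping, assuming the standard "<data file>.marker.<IOType>" naming; this is illustrative, not the exact Hudi implementation:

// Illustrative sketch only: a marker path such as
//   "2020/01/01/f1_0-0-1_20220101000000.parquet.marker.CREATE"
// maps back to "2020/01/01/f1_0-0-1_20220101000000.parquet" by cutting at MARKER_EXTN.
static String stripMarkerSuffix(String markerPath) {
    int idx = markerPath.indexOf(HoodieTableMetaClient.MARKER_EXTN); // MARKER_EXTN is ".marker"
    return idx < 0 ? markerPath : markerPath.substring(0, idx);
}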

Example 17 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class HoodieTable, method reconcileAgainstMarkers.

/**
 * Reconciles WriteStats and marker files to detect and safely delete duplicate data files created because of Spark
 * retries.
 *
 * @param context HoodieEngineContext
 * @param instantTs Instant timestamp
 * @param stats List of write stats for the commit
 * @param consistencyCheckEnabled Whether the consistency check is enabled
 * @throws HoodieIOException if reconciling or deleting files fails with an I/O error
 */
protected void reconcileAgainstMarkers(HoodieEngineContext context, String instantTs, List<HoodieWriteStat> stats, boolean consistencyCheckEnabled) throws HoodieIOException {
    try {
        // Reconcile marker and data files with WriteStats so that partially written data-files due to failed
        // (but succeeded on retry) tasks are removed.
        String basePath = getMetaClient().getBasePath();
        WriteMarkers markers = WriteMarkersFactory.get(config.getMarkersType(), this, instantTs);
        if (!markers.doesMarkerDirExist()) {
            // This can happen if it was an empty write, for example.
            return;
        }
        // Log appends are not included here, since they are already fail-safe.
        Set<String> invalidDataPaths = getInvalidDataPaths(markers);
        Set<String> validDataPaths = stats.stream().map(HoodieWriteStat::getPath).filter(p -> p.endsWith(this.getBaseFileExtension())).collect(Collectors.toSet());
        // After removing the valid paths, invalidDataPaths contains only partially created files; these need to be cleaned up.
        invalidDataPaths.removeAll(validDataPaths);
        if (!invalidDataPaths.isEmpty()) {
            LOG.info("Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths);
            Map<String, List<Pair<String, String>>> invalidPathsByPartition = invalidDataPaths.stream().map(dp -> Pair.of(new Path(basePath, dp).getParent().toString(), new Path(basePath, dp).toString())).collect(Collectors.groupingBy(Pair::getKey));
            // If the consistency check is enabled, first make sure every file to be deleted is
            // actually visible; otherwise some of them could be missed. If files are still not
            // found after retries, the commit fails.
            if (consistencyCheckEnabled) {
                waitForAllFiles(context, invalidPathsByPartition, FileVisibility.APPEAR);
            }
            // Now delete partially written files
            context.setJobStatus(this.getClass().getSimpleName(), "Delete all partially written files");
            deleteInvalidFilesByPartitions(context, invalidPathsByPartition);
            // Finally, ensure the deleted files have actually disappeared.
            if (consistencyCheckEnabled) {
                waitForAllFiles(context, invalidPathsByPartition, FileVisibility.DISAPPEAR);
            }
        }
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
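
The getInvalidDataPaths call above ties this method back to Example 16: every CREATE or MERGE marker names a base file that was at least partially written. A minimal sketch of what it could look like, assuming HoodieTable keeps the engine context in a context field and exposes the parallelism through a write-config accessor (both named here for illustration only):

// Hedged sketch: collect the data paths behind all CREATE/MERGE markers of this instant.
protected Set<String> getInvalidDataPaths(WriteMarkers markers) throws IOException {
    // createdAndMergedDataPaths is the method shown in Example 16.
    return markers.createdAndMergedDataPaths(context, config.getFinalizeWriteParallelism()); // assumed accessor
}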

Example 18 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class RollbackUtils, method generateRollbackRequestsUsingFileListingMOR.

/**
 * Generates all rollback requests needed to roll back the given instant on a
 * MERGE_ON_READ table, without actually performing the rollback.
 *
 * @param instantToRollback Instant to roll back
 * @param table Instance of {@link HoodieTable} to use.
 * @param context Instance of {@link HoodieEngineContext} to use.
 * @return List of rollback requests
 */
public static List<ListingBasedRollbackRequest> generateRollbackRequestsUsingFileListingMOR(HoodieInstant instantToRollback, HoodieTable table, HoodieEngineContext context) throws IOException {
    String commit = instantToRollback.getTimestamp();
    HoodieWriteConfig config = table.getConfig();
    List<String> partitions = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getBasePath(), false, false);
    if (partitions.isEmpty()) {
        return new ArrayList<>();
    }
    int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1);
    context.setJobStatus(RollbackUtils.class.getSimpleName(), "Generate all rollback requests");
    return context.flatMap(partitions, partitionPath -> {
        HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline();
        List<ListingBasedRollbackRequest> partitionRollbackRequests = new ArrayList<>();
        switch(instantToRollback.getAction()) {
            case HoodieTimeline.COMMIT_ACTION:
            case HoodieTimeline.REPLACE_COMMIT_ACTION:
                LOG.info("Rolling back commit action.");
                partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
                break;
            case HoodieTimeline.COMPACTION_ACTION:
                // If there is no delta commit after this compaction commit, nothing extra is
                // needed; otherwise the compaction rollback must also delete any log files
                // written as part of the succeeding deltacommits.
                boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants().findInstantsAfter(commit, 1).empty();
                if (higherDeltaCommits) {
                    // There are completed delta commits after this compaction commit, so updates
                    // have already been written to log files that use this compaction as their
                    // base commit. Delete only the newly created base files and leave those log
                    // files intact.
                    LOG.info("Rolling back compaction. There are higher delta commits. So only deleting data files");
                    partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataFilesOnlyAction(partitionPath));
                } else {
                    // No deltacommits present after this compaction commit (inflight or requested). In this case, we
                    // can also delete any log files that were created with this compaction commit as base
                    // commit.
                    LOG.info("Rolling back compaction plan. There are NO higher delta commits. So deleting both data and" + " log files");
                    partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
                }
                break;
            case HoodieTimeline.DELTA_COMMIT_ACTION:
                // --------------------------------------------------------------------------------------------------
                // (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal
                // --------------------------------------------------------------------------------------------------
                // (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In
                // this scenario we would want to delete these log files.
                // (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario,
                // HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks.
                // (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is
                // being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime,
                // and hence we will end up deleting these log files. This is done so there are no orphan log files
                // lying around.
                // (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions
                // taken in this scenario is a combination of (A.2) and (A.3)
                // ---------------------------------------------------------------------------------------------------
                // (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal
                // ---------------------------------------------------------------------------------------------------
                // (B.1) Failed first commit - Inserts were written to base files and HoodieWriteStat has no entries.
                // In this scenario, we delete all the base files written for the failed commit.
                // (B.2) Failed recurring commits - Inserts were written to base files and updates to log files. In
                // this scenario, perform (A.1) and for updates written to log files, write rollback blocks.
                // (B.3) Rollback triggered for first commit - Same as (B.1)
                // (B.4) Rollback triggered for recurring commits - Same as (B.2), plus we need to delete the log files
                // as well if the base file gets deleted.
                HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(instantToRollback).get(), HoodieCommitMetadata.class);
                // In case all data was inserts and the commit failed, delete the file belonging to that commit
                // We do not know fileIds for inserts (first inserts are either log files or base files),
                // delete all files for the corresponding failed commit, if present (same as COW)
                partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
                // append rollback blocks for updates and inserts as A.2 and B.2
                if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
                    partitionRollbackRequests.addAll(generateAppendRollbackBlocksAction(partitionPath, instantToRollback, commitMetadata, table));
                }
                break;
            default:
                break;
        }
        return partitionRollbackRequests.stream();
    }, Math.min(partitions.size(), sparkPartitions)).stream().filter(Objects::nonNull).collect(Collectors.toList());
}
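
A hedged usage sketch for the method above; the way the pending instant is selected below is illustrative and assumes such an instant exists on the timeline:

// Illustrative only: pick the earliest pending (non-compaction) instant and
// generate the listing-based rollback requests for it.
HoodieInstant toRollback = table.getActiveTimeline()
    .filterPendingExcludingCompaction()
    .firstInstant()
    .get(); // assumes a pending instant is present
List<ListingBasedRollbackRequest> requests =
    RollbackUtils.generateRollbackRequestsUsingFileListingMOR(toRollback, table, context);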

Example 19 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class TwoToOneDowngradeHandler, method deleteTimelineBasedMarkerFiles.

private void deleteTimelineBasedMarkerFiles(HoodieEngineContext context, String markerDir, FileSystem fileSystem, int parallelism) throws IOException {
    // Delete timeline-server-based marker files, if any. They are recognized by the
    // MARKERS_FILENAME_PREFIX on the file name.
    Predicate<FileStatus> prefixFilter = fileStatus -> fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX);
    FSUtils.parallelizeSubPathProcess(context, fileSystem, new Path(markerDir), parallelism, prefixFilter,
        pairOfSubPathAndConf -> FSUtils.deleteSubPath(pairOfSubPathAndConf.getKey(), pairOfSubPathAndConf.getValue(), false));
}
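
A hedged invocation sketch; the marker-directory accessor and the parallelism option named below are assumptions for illustration, not confirmed APIs:

// Illustrative only: clear timeline-based markers for one pending instant during the downgrade.
String markerDir = table.getMetaClient().getMarkerFolderPath(instantTime); // assumed accessor
deleteTimelineBasedMarkerFiles(context, markerDir, table.getMetaClient().getFs(),
    config.getMarkersDeleteParallelism()); // assumed config option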

Example 20 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class HoodieCompactor, method generateCompactionPlan.

/**
 * Generate a new compaction plan for scheduling.
 *
 * @param context                               HoodieEngineContext
 * @param hoodieTable                           Hoodie Table
 * @param config                                Hoodie Write Configuration
 * @param compactionCommitTime                  scheduled compaction commit time
 * @param fgIdsInPendingCompactionAndClustering partition-fileId pairs for which compaction is pending
 * @return Compaction Plan
 * @throws IOException when encountering errors
 */
HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, HoodieTable<T, I, K, O> hoodieTable, HoodieWriteConfig config, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactionAndClustering) throws IOException {
    // Accumulator to keep track of total log files for a table
    HoodieAccumulator totalLogFiles = context.newAccumulator();
    // Accumulator to keep track of total file slices for a table
    HoodieAccumulator totalFileSlices = context.newAccumulator();
    ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient().getTableType().name());
    // TODO : check if maxMemory is not greater than JVM or executor memory
    // TODO - rollback any compactions in flight
    HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
    LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
    List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath());
    // filter the partition paths if needed to reduce list status
    partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
    if (partitionPaths.isEmpty()) {
        // In case no partitions could be picked, return no compaction plan
        return null;
    }
    SliceView fileSystemView = hoodieTable.getSliceView();
    LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
    context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact");
    List<HoodieCompactionOperation> operations = context.flatMap(partitionPaths, partitionPath ->
        fileSystemView.getLatestFileSlices(partitionPath)
            .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId()))
            .map(s -> {
                List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(toList());
                totalLogFiles.add(logFiles.size());
                totalFileSlices.add(1L);
                // Avro-generated classes do not implement Serializable; use the CompactionOperation
                // POJO for map-side operations and convert to the Avro-generated classes only when
                // storing the plan into meta files.
                Option<HoodieBaseFile> dataFile = s.getBaseFile();
                return new CompactionOperation(dataFile, partitionPath, logFiles, config.getCompactionStrategy().captureMetrics(config, s));
            })
            .filter(c -> !c.getDeltaFileNames().isEmpty()),
        partitionPaths.size()
    ).stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
    LOG.info("Total of " + operations.size() + " compactions are retrieved");
    LOG.info("Total number of latest files slices " + totalFileSlices.value());
    LOG.info("Total number of log files " + totalLogFiles.value());
    LOG.info("Total number of file slices " + totalFileSlices.value());
    // Filter the candidate operations through the compaction strategy so that only the most
    // effective compactions are selected.
    HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
    ValidationUtils.checkArgument(compactionPlan.getOperations().stream().noneMatch(op -> fgIdsInPendingCompactionAndClustering.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " + "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactionAndClustering + ", Selected workload :" + compactionPlan);
    if (compactionPlan.getOperations().isEmpty()) {
        LOG.warn("After filtering, nothing to compact for " + metaClient.getBasePath());
    }
    return compactionPlan;
}
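
The two accumulators above abstract engine-specific counters behind HoodieEngineContext (on Spark they would be backed by Spark accumulators). A minimal sketch of the pattern, using only the calls that already appear in this method:

// Engine-agnostic counting: the same code runs on Spark, Flink, or the local engine.
HoodieAccumulator counter = context.newAccumulator();
counter.add(1L);              // safe to call from inside distributed flatMap closures
long total = counter.value(); // read back on the driver once the job completes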

Aggregations

HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 36
List (java.util.List): 29
ArrayList (java.util.ArrayList): 27
IOException (java.io.IOException): 25
LogManager (org.apache.log4j.LogManager): 25
Logger (org.apache.log4j.Logger): 25
Map (java.util.Map): 23
Collectors (java.util.stream.Collectors): 23
Path (org.apache.hadoop.fs.Path): 23
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 23
Option (org.apache.hudi.common.util.Option): 23
FileSystem (org.apache.hadoop.fs.FileSystem): 21
Pair (org.apache.hudi.common.util.collection.Pair): 19
FSUtils (org.apache.hudi.common.fs.FSUtils): 18
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 18
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 18
HashMap (java.util.HashMap): 16
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16
HoodieTable (org.apache.hudi.table.HoodieTable): 15
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 14