Search in sources :

Example 16 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class CompactionAdminClient method getRenamingActionsToAlignWithCompactionOperation.

/**
 * Get Renaming actions to ensure the log-files of merged file-slices is aligned with compaction operation. This
 * method is used to recover from failures during unschedule compaction operations.
 *
 * @param metaClient Hoodie Table Meta Client
 * @param compactionInstant Compaction Instant
 * @param op Compaction Operation
 * @param fsViewOpt File System View
 */
protected static List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsToAlignWithCompactionOperation(HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation op, Option<HoodieTableFileSystemView> fsViewOpt) {
    HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
    HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
    FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp()).filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get();
    final int maxVersion = op.getDeltaFileNames().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf))).reduce((x, y) -> x > y ? x : y).orElse(0);
    List<HoodieLogFile> logFilesToBeMoved = merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList());
    return logFilesToBeMoved.stream().map(lf -> {
        ValidationUtils.checkArgument(lf.getLogVersion() - maxVersion > 0, "Expect new log version to be sane");
        HoodieLogFile newLogFile = new HoodieLogFile(new Path(lf.getPath().getParent(), FSUtils.makeLogFileName(lf.getFileId(), "." + FSUtils.getFileExtensionFromLog(lf.getPath()), compactionInstant, lf.getLogVersion() - maxVersion, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
        return Pair.of(lf, newLogFile);
    }).collect(Collectors.toList());
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) OperationResult(org.apache.hudi.table.action.compact.OperationResult) FileStatus(org.apache.hadoop.fs.FileStatus) COMPACTION_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)

Example 17 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class AbstractTableFileSystemView method getLatestUnCompactedFileSlices.

@Override
public final Stream<FileSlice> getLatestUnCompactedFileSlices(String partitionStr) {
    try {
        readLock.lock();
        String partitionPath = formatPartitionKey(partitionStr);
        ensurePartitionLoadedCorrectly(partitionPath);
        return fetchAllStoredFileGroups(partitionPath).filter(fg -> !isFileGroupReplaced(fg.getFileGroupId())).map(fileGroup -> {
            FileSlice fileSlice = fileGroup.getLatestFileSlice().get();
            // if the file-group is under compaction, pick the latest before compaction instant time.
            Option<Pair<String, CompactionOperation>> compactionWithInstantPair = getPendingCompactionOperationWithInstant(fileSlice.getFileGroupId());
            if (compactionWithInstantPair.isPresent()) {
                String compactionInstantTime = compactionWithInstantPair.get().getLeft();
                return fileGroup.getLatestFileSliceBefore(compactionInstantTime);
            }
            return Option.of(fileSlice);
        }).map(Option::get).map(this::addBootstrapBaseFileIfPresent);
    } finally {
        readLock.unlock();
    }
}
Also used : BootstrapBaseFileMapping(org.apache.hudi.common.model.BootstrapBaseFileMapping) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) ReadLock(java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) WriteLock(java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock) Predicate(java.util.function.Predicate) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) AbstractMap(java.util.AbstractMap) List(java.util.List) GREATER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS) Stream(java.util.stream.Stream) ClusteringUtils(org.apache.hudi.common.util.ClusteringUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) METADATA_BOOTSTRAP_INSTANT_TS(org.apache.hudi.common.table.timeline.HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) GREATER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) FileSlice(org.apache.hudi.common.model.FileSlice) Pair(org.apache.hudi.common.util.collection.Pair)

Example 18 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class AbstractTableFileSystemView method filterBaseFileAfterPendingCompaction.

/**
 * With async compaction, it is possible to see partial/complete base-files due to inflight-compactions, Ignore those
 * base-files.
 *
 * @param fileSlice File Slice
 */
protected FileSlice filterBaseFileAfterPendingCompaction(FileSlice fileSlice) {
    if (isFileSliceAfterPendingCompaction(fileSlice)) {
        LOG.debug("File Slice (" + fileSlice + ") is in pending compaction");
        // Base file is filtered out of the file-slice as the corresponding compaction
        // instant not completed yet.
        FileSlice transformed = new FileSlice(fileSlice.getPartitionPath(), fileSlice.getBaseInstantTime(), fileSlice.getFileId());
        fileSlice.getLogFiles().forEach(transformed::addLogFile);
        return transformed;
    }
    return fileSlice;
}
Also used : FileSlice(org.apache.hudi.common.model.FileSlice)

Example 19 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class ClusteringUtils method buildMetrics.

private static Map<String, Double> buildMetrics(List<FileSlice> fileSlices) {
    int numLogFiles = 0;
    long totalLogFileSize = 0;
    long totalIORead = 0;
    for (FileSlice slice : fileSlices) {
        numLogFiles += slice.getLogFiles().count();
        // Total size of all the log files
        totalLogFileSize += slice.getLogFiles().map(HoodieLogFile::getFileSize).filter(size -> size >= 0).reduce(Long::sum).orElse(0L);
        // Total read will be the base file + all the log files
        totalIORead = FSUtils.getSizeInMB((slice.getBaseFile().isPresent() ? slice.getBaseFile().get().getFileSize() : 0L) + totalLogFileSize);
    }
    Map<String, Double> metrics = new HashMap<>();
    metrics.put(TOTAL_IO_READ_MB, (double) totalIORead);
    metrics.put(TOTAL_LOG_FILE_SIZE, (double) totalLogFileSize);
    metrics.put(TOTAL_LOG_FILES, (double) numLogFiles);
    return metrics;
}
Also used : Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) HashMap(java.util.HashMap) Logger(org.apache.log4j.Logger) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) AbstractMap(java.util.AbstractMap) List(java.util.List) Stream(java.util.stream.Stream) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieSliceInfo(org.apache.hudi.avro.model.HoodieSliceInfo) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)

Example 20 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class BootstrapOperator method loadRecords.

/**
 * Loads all the indices of give partition path into the backup state.
 *
 * @param partitionPath The partition path
 */
@SuppressWarnings("unchecked")
protected void loadRecords(String partitionPath) throws Exception {
    long start = System.currentTimeMillis();
    final int parallelism = getRuntimeContext().getNumberOfParallelSubtasks();
    final int maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks();
    final int taskID = getRuntimeContext().getIndexOfThisSubtask();
    HoodieTimeline commitsTimeline = this.hoodieTable.getMetaClient().getCommitsTimeline();
    if (!StringUtils.isNullOrEmpty(lastInstantTime)) {
        commitsTimeline = commitsTimeline.findInstantsAfter(lastInstantTime);
    }
    Option<HoodieInstant> latestCommitTime = commitsTimeline.filterCompletedInstants().lastInstant();
    if (latestCommitTime.isPresent()) {
        BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat());
        Schema schema = new TableSchemaResolver(this.hoodieTable.getMetaClient()).getTableAvroSchema();
        List<FileSlice> fileSlices = this.hoodieTable.getSliceView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp(), true).collect(toList());
        for (FileSlice fileSlice : fileSlices) {
            if (!shouldLoadFile(fileSlice.getFileId(), maxParallelism, parallelism, taskID)) {
                continue;
            }
            LOG.info("Load records from {}.", fileSlice);
            // load parquet records
            fileSlice.getBaseFile().ifPresent(baseFile -> {
                // filter out crushed files
                if (!isValidFile(baseFile.getFileStatus())) {
                    return;
                }
                try (ClosableIterator<HoodieKey> iterator = fileUtils.getHoodieKeyIterator(this.hadoopConf, new Path(baseFile.getPath()))) {
                    iterator.forEachRemaining(hoodieKey -> {
                        output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice))));
                    });
                }
            });
            // load avro log records
            List<String> logPaths = fileSlice.getLogFiles().filter(logFile -> isValidFile(logFile.getFileStatus())).map(logFile -> logFile.getPath().toString()).collect(toList());
            HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(logPaths, schema, latestCommitTime.get().getTimestamp(), writeConfig, hadoopConf);
            try {
                for (String recordKey : scanner.getRecords().keySet()) {
                    output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(new HoodieKey(recordKey, partitionPath), fileSlice))));
                }
            } catch (Exception e) {
                throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e);
            } finally {
                scanner.close();
            }
        }
    }
    long cost = System.currentTimeMillis() - start;
    LOG.info("Task [{}}:{}}] finish loading the index under partition {} and sending them to downstream, time cost: {} milliseconds.", this.getClass().getSimpleName(), taskID, partitionPath, cost);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) BootstrapAggFunction(org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction) CkpMetadata(org.apache.hudi.sink.meta.CkpMetadata) ListState(org.apache.flink.api.common.state.ListState) StringUtils(org.apache.hudi.common.util.StringUtils) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) FlinkTables(org.apache.hudi.util.FlinkTables) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) Path(org.apache.hadoop.fs.Path) StreamerUtil(org.apache.hudi.util.StreamerUtil) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Types(org.apache.flink.api.common.typeinfo.Types) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) StateSnapshotContext(org.apache.flink.runtime.state.StateSnapshotContext) HoodieRecordGlobalLocation(org.apache.hudi.common.model.HoodieRecordGlobalLocation) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Configuration(org.apache.flink.configuration.Configuration) StreamerUtil.isValidFile(org.apache.hudi.util.StreamerUtil.isValidFile) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) AbstractStreamOperator(org.apache.flink.streaming.api.operators.AbstractStreamOperator) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) GlobalAggregateManager(org.apache.flink.runtime.taskexecutor.GlobalAggregateManager) HoodieKey(org.apache.hudi.common.model.HoodieKey) Pattern(java.util.regex.Pattern) OneInputStreamOperator(org.apache.flink.streaming.api.operators.OneInputStreamOperator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) FlinkOptions(org.apache.hudi.configuration.FlinkOptions) FormatUtils(org.apache.hudi.table.format.FormatUtils) KeyGroupRangeAssignment(org.apache.flink.runtime.state.KeyGroupRangeAssignment) StateInitializationContext(org.apache.flink.runtime.state.StateInitializationContext) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieException(org.apache.hudi.exception.HoodieException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieKey(org.apache.hudi.common.model.HoodieKey) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils)

Aggregations

FileSlice (org.apache.hudi.common.model.FileSlice)87 List (java.util.List)51 ArrayList (java.util.ArrayList)45 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)45 Map (java.util.Map)44 Collectors (java.util.stream.Collectors)43 IOException (java.io.IOException)39 Path (org.apache.hadoop.fs.Path)39 HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)39 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)38 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)38 Option (org.apache.hudi.common.util.Option)37 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)36 Pair (org.apache.hudi.common.util.collection.Pair)35 HashMap (java.util.HashMap)32 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)32 FSUtils (org.apache.hudi.common.fs.FSUtils)29 Stream (java.util.stream.Stream)28 Test (org.junit.jupiter.api.Test)27 HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup)26