
Example 51 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class DFSHoodieDatasetInputReader method getSingleSliceFromRDD.

private FileSlice getSingleSliceFromRDD(JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice) {
    return partitionToFileSlice.map(f -> {
        // Copy the first slice of each partition into a fresh FileSlice, keeping
        // the base file when present and the log files otherwise.
        FileSlice slice = f._2.next();
        FileSlice newSlice = new FileSlice(slice.getFileGroupId(), slice.getBaseInstantTime());
        if (slice.getBaseFile().isPresent()) {
            newSlice.setBaseFile(slice.getBaseFile().get());
        } else {
            slice.getLogFiles().forEach(newSlice::addLogFile);
        }
        return newSlice;
    }).take(1).get(0);
}
Also used : FileSlice(org.apache.hudi.common.model.FileSlice)
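
For context, a minimal standalone sketch of the FileSlice API this mapper relies on; the file-group id, instant time, and log-file path below are hypothetical, and no Hudi table is read.

// Hypothetical values only: exercises the accessors used in the mapper above.
FileSlice slice = new FileSlice(new HoodieFileGroupId("2022/01/01", "file-1"), "001");
System.out.println(slice.getBaseFile().isPresent()); // false: no base file set yet
// Log-file name follows Hudi's ".<fileId>_<baseInstant>.log.<version>_<writeToken>" pattern.
slice.addLogFile(new HoodieLogFile(new Path("/tmp/.file-1_001.log.1_0-1-0")));
System.out.println(slice.getLogFiles().count());     // 1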

Example 52 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class TimelineServerPerf method runOneRound.

private static PerfStats runOneRound(SyncableFileSystemView fsView, String partition, String fileId, int id, int numIterations) {
    Histogram latencyHistogram = new Histogram(new UniformReservoir(10000));
    for (int i = 0; i < numIterations; i++) {
        // Time a single point lookup of the latest file slice for this file group.
        long beginTs = System.currentTimeMillis();
        Option<FileSlice> c = fsView.getLatestFileSlice(partition, fileId);
        long endTs = System.currentTimeMillis();
        System.out.println("Latest File Slice for part=" + partition + ", fileId=" + fileId + ", Slice=" + c + ", Time=" + (endTs - beginTs));
        latencyHistogram.update(endTs - beginTs);
    }
    return new PerfStats(partition, id, latencyHistogram.getSnapshot());
}
Also used : Histogram(com.codahale.metrics.Histogram) FileSlice(org.apache.hudi.common.model.FileSlice) UniformReservoir(com.codahale.metrics.UniformReservoir)
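
The histogram pattern above is plain Dropwizard/Codahale metrics. Here is a minimal self-contained sketch of the same pattern, with Thread.sleep standing in for the file-system-view lookup (no Hudi table is involved, and the summary printing is illustrative, not Hudi's PerfStats):

import com.codahale.metrics.Histogram;
import com.codahale.metrics.Snapshot;
import com.codahale.metrics.UniformReservoir;

public class LatencyHistogramDemo {
    public static void main(String[] args) throws InterruptedException {
        // A uniform reservoir keeps an unbiased sample of up to 10000 measurements.
        Histogram latencyHistogram = new Histogram(new UniformReservoir(10000));
        for (int i = 0; i < 50; i++) {
            long beginTs = System.currentTimeMillis();
            Thread.sleep(1 + (i % 5)); // stand-in for fsView.getLatestFileSlice(partition, fileId)
            latencyHistogram.update(System.currentTimeMillis() - beginTs);
        }
        // The snapshot is what runOneRound hands to PerfStats.
        Snapshot s = latencyHistogram.getSnapshot();
        System.out.println("median=" + s.getMedian() + "ms, p95=" + s.get95thPercentile() + "ms, max=" + s.getMax() + "ms");
    }
}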

Example 53 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class CompactionAdminClient method getRenamingActionsForUnschedulingCompactionOperation.

/**
 * Generate renaming actions for unscheduling a compaction operation. NOTE: Can only be used safely when no writer
 * (ingestion/compaction) is running.
 *
 * @param metaClient Hoodie Table MetaClient
 * @param compactionInstant Compaction Instant
 * @param operation Compaction Operation
 * @param fsViewOpt Cached File System View
 * @param skipValidation Skip Validation
 * @return list of (old, new) log-file pairs; each pair must be renamed to successfully unschedule the
 *         compaction.
 */
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionOperation(HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation, Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
    List<Pair<HoodieLogFile, HoodieLogFile>> result = new ArrayList<>();
    HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
    if (!skipValidation) {
        validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView));
    }
    HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
    // The merged view folds log files written against the pending compaction back into the slice.
    FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp()).filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
    // Log files whose base commit is the compaction instant must be renamed onto the old base instant.
    List<HoodieLogFile> logFilesToRepair = merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant)).sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
    FileSlice fileSliceForCompaction = fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true).filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
    // Continue log-version numbering from the highest version already present in the pre-compaction slice.
    int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
    String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension()).orElse(HoodieLogFile.DELTA_EXTENSION);
    String parentPath = fileSliceForCompaction.getBaseFile().map(df -> new Path(df.getPath()).getParent().toString()).orElse(fileSliceForCompaction.getLogFiles().findFirst().map(lf -> lf.getPath().getParent().toString()).get());
    for (HoodieLogFile toRepair : logFilesToRepair) {
        int version = maxUsedVersion + 1;
        HoodieLogFile newLf = new HoodieLogFile(new Path(parentPath, FSUtils.makeLogFileName(operation.getFileId(), logExtn, operation.getBaseInstantTime(), version, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
        result.add(Pair.of(toRepair, newLf));
        maxUsedVersion = version;
    }
    return result;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) OperationResult(org.apache.hudi.table.action.compact.OperationResult) FileStatus(org.apache.hadoop.fs.FileStatus) COMPACTION_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
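
To make the rename targets concrete, here is a small hypothetical sketch of the version-bumping name generation from the loop above; the file id, base instant, starting version, and count are invented, and the exact printed names depend on the Hudi version's log-file naming.

// Invented inputs: pretend three log files written against the compaction
// instant must be renamed back onto base instant "001".
String fileId = "file-1";
String oldBaseInstant = "001";
int maxUsedVersion = 2; // highest log version already present in the old slice
for (int i = 0; i < 3; i++) {
    maxUsedVersion += 1;
    System.out.println(FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION,
        oldBaseInstant, maxUsedVersion, HoodieLogFormat.UNKNOWN_WRITE_TOKEN));
}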

Example 54 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class FileSliceMetricUtils method addFileSliceCommonMetrics.

public static void addFileSliceCommonMetrics(List<FileSlice> fileSlices, Map<String, Double> metrics, long defaultBaseFileSize) {
    int numLogFiles = 0;
    long totalLogFileSize = 0;
    long totalIORead = 0;
    long totalIOWrite = 0;
    long totalIO = 0;
    for (FileSlice slice : fileSlices) {
        numLogFiles += slice.getLogFiles().count();
        // Total size of all the log files
        totalLogFileSize += slice.getLogFiles().map(HoodieLogFile::getFileSize).filter(size -> size >= 0).reduce(Long::sum).orElse(0L);
        long baseFileSize = slice.getBaseFile().isPresent() ? slice.getBaseFile().get().getFileSize() : 0L;
        totalIORead += baseFileSize;
        // Total write will be similar to the size of the base file
        totalIOWrite += baseFileSize > 0 ? baseFileSize : defaultBaseFileSize;
    }
    // Total read will be the base file + all the log files
    totalIORead = FSUtils.getSizeInMB(totalIORead + totalLogFileSize);
    totalIOWrite = FSUtils.getSizeInMB(totalIOWrite);
    // Total IO will be the IO for read + write
    totalIO = totalIORead + totalIOWrite;
    metrics.put(TOTAL_IO_READ_MB, (double) totalIORead);
    metrics.put(TOTAL_IO_WRITE_MB, (double) totalIOWrite);
    metrics.put(TOTAL_IO_MB, (double) totalIO);
    metrics.put(TOTAL_LOG_FILE_SIZE, (double) totalLogFileSize);
    metrics.put(TOTAL_LOG_FILES, (double) numLogFiles);
}
Also used : List(java.util.List) Map(java.util.Map) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) FSUtils(org.apache.hudi.common.fs.FSUtils)
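
A minimal hypothetical invocation of the helper above; the single empty slice and the 120 MB default are made-up values, so totalIORead comes out as 0 and totalIOWrite falls back to the default.

// Hypothetical inputs: one brand-new slice with neither base nor log files.
Map<String, Double> metrics = new HashMap<>();
List<FileSlice> slices = Collections.singletonList(
    new FileSlice(new HoodieFileGroupId("2022/01/01", "file-1"), "001"));
long defaultBaseFileSize = 120L * 1024 * 1024; // assumed target base-file size
FileSliceMetricUtils.addFileSliceCommonMetrics(slices, metrics, defaultBaseFileSize);
metrics.forEach((k, v) -> System.out.println(k + " = " + v)); // TOTAL_IO_WRITE_MB = 120.0, etc.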

Example 55 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class IncrementalTimelineSyncFileSystemView method applyDeltaFileSlicesToPartitionView.

/**
 * Apply changes to the partition file-system view. The base implementation overwrites the entire partition's view,
 * assuming some sort of map (in-memory/disk-based) is used. For view implementations that support fine-grained
 * updates (e.g. RocksDB), override this method.
 *
 * @param partition PartitionPath
 * @param deltaFileGroups Changed file-slices aggregated as file-groups
 * @param mode Delta Apply mode
 */
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups, DeltaApplyMode mode) {
    if (deltaFileGroups.isEmpty()) {
        LOG.info("No delta file groups for partition :" + partition);
        return;
    }
    List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
    /**
     * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing the
     * base-path, scheme, and authority. Ensure the matching process takes care of this discrepancy.
     */
    Map<String, HoodieBaseFile> viewDataFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices).map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get).map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df)).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Note: Delta Log Files and Data Files can be empty when adding/removing pending compactions
    Map<String, HoodieBaseFile> deltaDataFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices).map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get).map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df)).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices).flatMap(FileSlice::getLogFiles).map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices).flatMap(FileSlice::getLogFiles).map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf)).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    switch(mode) {
        case ADD:
            viewDataFiles.putAll(deltaDataFiles);
            viewLogFiles.putAll(deltaLogFiles);
            break;
        case REMOVE:
            deltaDataFiles.keySet().forEach(viewDataFiles::remove);
            deltaLogFiles.keySet().forEach(viewLogFiles::remove);
            break;
        default:
            throw new IllegalStateException("Unknown diff apply mode=" + mode);
    }
    HoodieTimeline timeline = deltaFileGroups.stream().map(df -> df.getTimeline()).findAny().get();
    List<HoodieFileGroup> fgs = buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(), timeline, true);
    storePartitionView(partition, fgs);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) TimelineDiffHelper(org.apache.hudi.common.table.timeline.TimelineDiffHelper) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) TimelineDiffResult(org.apache.hudi.common.table.timeline.TimelineDiffHelper.TimelineDiffResult) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
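
The path normalization the in-code comment warns about is plain Hadoop Path handling; a tiny sketch with a made-up URI:

// Stripping scheme and authority so metadata-relative paths and full view
// paths compare equal, as done for viewDataFiles/deltaDataFiles above.
Path full = new Path("hdfs://namenode:8020/warehouse/tbl/2022/01/01/base.parquet");
System.out.println(Path.getPathWithoutSchemeAndAuthority(full));
// prints: /warehouse/tbl/2022/01/01/base.parquet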

Aggregations

FileSlice (org.apache.hudi.common.model.FileSlice): 87
List (java.util.List): 51
ArrayList (java.util.ArrayList): 45
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 45
Map (java.util.Map): 44
Collectors (java.util.stream.Collectors): 43
IOException (java.io.IOException): 39
Path (org.apache.hadoop.fs.Path): 39
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 39
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 38
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 38
Option (org.apache.hudi.common.util.Option): 37
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 36
Pair (org.apache.hudi.common.util.collection.Pair): 35
HashMap (java.util.HashMap): 32
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 32
FSUtils (org.apache.hudi.common.fs.FSUtils): 29
Stream (java.util.stream.Stream): 28
Test (org.junit.jupiter.api.Test): 27
HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup): 26