Example 16 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class FileGroupDTO, method toFileGroup:

public static HoodieFileGroup toFileGroup(FileGroupDTO dto, HoodieTableMetaClient metaClient) {
    HoodieFileGroup fileGroup =
        new HoodieFileGroup(dto.partition, dto.id, TimelineDTO.toTimeline(dto.timeline, metaClient));
    dto.slices.stream().map(FileSliceDTO::toFileSlice).forEach(fileGroup::addFileSlice);
    return fileGroup;
}
Also used : HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)
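
The conversion above boils down to constructing a HoodieFileGroup from the DTO fields and re-attaching each slice. Below is a minimal stand-alone sketch of the same pattern; the helper class and method names are hypothetical, and it assumes the caller already holds a timeline and materialized FileSlice objects rather than DTOs.

import java.util.List;

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

public class FileGroupRebuildSketch {

    // Rebuild a file group the same way FileGroupDTO.toFileGroup does:
    // construct the group for (partition, fileId, timeline) and add every slice.
    public static HoodieFileGroup rebuild(String partition, String fileId,
                                          HoodieTimeline timeline, List<FileSlice> slices) {
        HoodieFileGroup fileGroup = new HoodieFileGroup(partition, fileId, timeline);
        slices.forEach(fileGroup::addFileSlice);
        return fileGroup;
    }
}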

Example 17 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class IncrementalTimelineSyncFileSystemView, method updatePartitionWriteFileGroups:

private void updatePartitionWriteFileGroups(Map<String, List<HoodieWriteStat>> partitionToWriteStats, HoodieTimeline timeline, HoodieInstant instant) {
    partitionToWriteStats.forEach((partition, writeStats) -> {
        if (isPartitionAvailableInStore(partition)) {
            LOG.info("Syncing partition (" + partition + ") of instant (" + instant + ")");
            // Wrap each write-stat as a minimal FileStatus (only size and full path matter here)
            // so that the generic buildFileGroups routine can be reused for incremental sync.
            FileStatus[] statuses = writeStats.stream()
                .map(p -> new FileStatus(p.getFileSizeInBytes(), false, 0, 0, 0, 0, null, null, null,
                    new Path(String.format("%s/%s", metaClient.getBasePath(), p.getPath()))))
                .toArray(FileStatus[]::new);
            List<HoodieFileGroup> fileGroups = buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(), false);
            applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.ADD);
        } else {
            LOG.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded");
        }
    });
    LOG.info("Done syncing committed instant (" + instant + ")");
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) TimelineDiffHelper(org.apache.hudi.common.table.timeline.TimelineDiffHelper) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) TimelineDiffResult(org.apache.hudi.common.table.timeline.TimelineDiffHelper.TimelineDiffResult) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)
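
The core of the method is turning each HoodieWriteStat into just enough of a FileStatus for buildFileGroups to work with: the file length and the full path under the table base path, with everything else left at placeholder values. A small sketch of that conversion in isolation follows; the class and method names are invented for illustration, and basePath stands in for metaClient.getBasePath().

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieWriteStat;

public final class WriteStatToFileStatusSketch {

    private WriteStatToFileStatusSketch() {
    }

    // Only the length and the absolute path are meaningful to the file-system view;
    // replication, block size, times, permission, owner and group are placeholders,
    // exactly as in updatePartitionWriteFileGroups above.
    public static FileStatus toFileStatus(HoodieWriteStat stat, String basePath) {
        Path fullPath = new Path(String.format("%s/%s", basePath, stat.getPath()));
        return new FileStatus(stat.getFileSizeInBytes(), false, 0, 0, 0, 0, null, null, null, fullPath);
    }
}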

Example 18 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class IncrementalTimelineSyncFileSystemView, method applyDeltaFileSlicesToPartitionView:

/**
 * Apply changes to the partition file-system view. The base implementation overwrites the entire partition's view,
 * assuming some sort of map (in-memory/disk-based) is used. View implementations which support fine-granular
 * updates (e.g. RocksDB) should override this method.
 *
 * @param partition Partition path
 * @param deltaFileGroups Changed file-slices aggregated as file-groups
 * @param mode Delta apply mode
 */
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups, DeltaApplyMode mode) {
    if (deltaFileGroups.isEmpty()) {
        LOG.info("No delta file groups for partition :" + partition);
        return;
    }
    List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
    /*
     * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing the
     * base-path, scheme and authority. Ensure the matching process takes care of this discrepancy.
     */
    Map<String, HoodieBaseFile> viewDataFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
        .map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get)
        .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Note: delta log files and data files can be empty when adding/removing pending compactions
    Map<String, HoodieBaseFile> deltaDataFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
        .map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get)
        .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
        .flatMap(FileSlice::getLogFiles)
        .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
        .flatMap(FileSlice::getLogFiles)
        .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    switch(mode) {
        case ADD:
            viewDataFiles.putAll(deltaDataFiles);
            viewLogFiles.putAll(deltaLogFiles);
            break;
        case REMOVE:
            deltaDataFiles.keySet().forEach(viewDataFiles::remove);
            deltaLogFiles.keySet().forEach(viewLogFiles::remove);
            break;
        default:
            throw new IllegalStateException("Unknown diff apply mode=" + mode);
    }
    HoodieTimeline timeline = deltaFileGroups.stream().map(HoodieFileGroup::getTimeline).findAny().get();
    List<HoodieFileGroup> fgs = buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(), timeline, true);
    storePartitionView(partition, fgs);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) TimelineDiffHelper(org.apache.hudi.common.table.timeline.TimelineDiffHelper) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) TimelineDiffResult(org.apache.hudi.common.table.timeline.TimelineDiffHelper.TimelineDiffResult) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Option(org.apache.hudi.common.util.Option) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
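
The detail worth calling out above is the map key: both the stored view and the delta are keyed by Path.getPathWithoutSchemeAndAuthority, so a fully qualified path already in the view and a scheme-less path coming from commit metadata resolve to the same entry. Below is a self-contained sketch of that normalization and the ADD/REMOVE merge, using plain strings as values; the class and method names are invented for illustration.

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;

public final class PathKeyedMergeSketch {

    private PathKeyedMergeSketch() {
    }

    // Strip scheme and authority so "hdfs://nn:8020/base/p/f1.parquet" and
    // "/base/p/f1.parquet" produce the same key.
    public static String normalize(String rawPath) {
        return Path.getPathWithoutSchemeAndAuthority(new Path(rawPath)).toString();
    }

    public static Map<String, String> applyDelta(Map<String, String> view,
                                                 Map<String, String> delta,
                                                 boolean remove) {
        Map<String, String> merged = new HashMap<>(view);
        if (remove) {
            delta.keySet().forEach(merged::remove); // REMOVE: drop matching paths
        } else {
            merged.putAll(delta);                   // ADD: delta entries win
        }
        return merged;
    }

    public static void main(String[] args) {
        Map<String, String> view = new HashMap<>();
        view.put(normalize("hdfs://nn:8020/base/p/f1.parquet"), "f1");
        Map<String, String> delta = new HashMap<>();
        delta.put(normalize("/base/p/f1.parquet"), "f1-updated");
        // Prints a single merged entry keyed by /base/p/f1.parquet
        System.out.println(applyDelta(view, delta, false));
    }
}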

Example 19 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class RocksDbBasedFileSystemView, method applyDeltaFileSlicesToPartitionView:

/*
 * This is overridden to incrementally apply file-slices to RocksDB.
 */
@Override
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups, DeltaApplyMode mode) {
    rocksDB.writeBatch(batch -> deltaFileGroups.forEach(fg -> fg.getAllRawFileSlices().map(fs -> {
        FileSlice oldSlice = getFileSlice(partition, fs.getFileId(), fs.getBaseInstantTime());
        if (null == oldSlice) {
            return fs;
        } else {
            // First remove the file-slice
            LOG.info("Removing old Slice in DB. FS=" + oldSlice);
            rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, oldSlice));
            rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, oldSlice));
            Map<String, HoodieLogFile> logFiles = oldSlice.getLogFiles()
                .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
            Map<String, HoodieLogFile> deltaLogFiles = fs.getLogFiles()
                .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
            switch(mode) {
                case ADD:
                    {
                        FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
                        oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
                        fs.getBaseFile().ifPresent(newFileSlice::setBaseFile);
                        Map<String, HoodieLogFile> newLogFiles = new HashMap<>(logFiles);
                        deltaLogFiles.entrySet().stream().filter(e -> !logFiles.containsKey(e.getKey())).forEach(p -> newLogFiles.put(p.getKey(), p.getValue()));
                        newLogFiles.values().forEach(newFileSlice::addLogFile);
                        LOG.info("Adding back new File Slice after add FS=" + newFileSlice);
                        return newFileSlice;
                    }
                case REMOVE:
                    {
                        LOG.info("Removing old File Slice =" + fs);
                        FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
                        // Keep the old base file only when the delta slice does not carry one
                        if (!fs.getBaseFile().isPresent()) {
                            oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
                        }
                        deltaLogFiles.keySet().forEach(logFiles::remove);
                        // Add remaining log files back
                        logFiles.values().forEach(newFileSlice::addLogFile);
                        if (newFileSlice.getBaseFile().isPresent() || (newFileSlice.getLogFiles().count() > 0)) {
                            LOG.info("Adding back new file-slice after remove FS=" + newFileSlice);
                            return newFileSlice;
                        }
                        return null;
                    }
                default:
                    throw new IllegalStateException("Unknown diff apply mode=" + mode);
            }
        }
    }).filter(Objects::nonNull).forEach(fs -> {
        rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, fs), fs);
        fs.getBaseFile().ifPresent(df -> rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, fs), df));
    })));
}
Also used : BootstrapBaseFileMapping(org.apache.hudi.common.model.BootstrapBaseFileMapping) RocksDBDAO(org.apache.hudi.common.util.collection.RocksDBDAO) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) RocksDBSchemaHelper(org.apache.hudi.common.util.RocksDBSchemaHelper) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) Set(java.util.Set) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) Objects(java.util.Objects) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HashMap(java.util.HashMap) Map(java.util.Map)
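
Unlike the base implementation, which rebuilds the whole partition view, this version reconciles one stored slice with one incoming slice at a time. The ADD branch is essentially a union of the two slices: the delta's base file wins if present, and log files are de-duplicated by normalized path. Here is a RocksDB-free sketch of that per-slice merge; the class and method names are hypothetical.

import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;

public final class SliceAddMergeSketch {

    private SliceAddMergeSketch() {
    }

    // Index a slice's log files by path with scheme/authority stripped.
    private static Map<String, HoodieLogFile> byPath(FileSlice slice) {
        return slice.getLogFiles()
            .collect(Collectors.toMap(
                lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(),
                lf -> lf));
    }

    // ADD-mode merge of the slice already stored in the view with the incoming delta slice
    // for the same file group and base instant.
    public static FileSlice mergeAdd(FileSlice oldSlice, FileSlice deltaSlice) {
        FileSlice merged = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
        oldSlice.getBaseFile().ifPresent(merged::setBaseFile);
        deltaSlice.getBaseFile().ifPresent(merged::setBaseFile); // delta base file wins if present
        Map<String, HoodieLogFile> logFiles = new HashMap<>(byPath(oldSlice));
        byPath(deltaSlice).forEach(logFiles::putIfAbsent);       // keep existing, add new log files
        logFiles.values().forEach(merged::addLogFile);
        return merged;
    }
}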

Example 20 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class RocksDbBasedFileSystemView, method getFileGroups:

private Stream<HoodieFileGroup> getFileGroups(Stream<FileSlice> sliceStream) {
    return sliceStream.map(s -> Pair.of(Pair.of(s.getPartitionPath(), s.getFileId()), s))
        .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream()
        .map(slicePair -> {
            HoodieFileGroup fg = new HoodieFileGroup(slicePair.getKey().getKey(), slicePair.getKey().getValue(),
                getVisibleCommitsAndCompactionTimeline());
            slicePair.getValue().forEach(e -> fg.addFileSlice(e.getValue()));
            return fg;
        });
}
Also used : HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Pair(org.apache.hudi.common.util.collection.Pair)
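
The grouping key here is a composite of partition path and file id, which is what uniquely identifies a file group. The same Collectors.groupingBy pattern can be shown with plain JDK types, using Map.Entry as the composite key and a small stand-in for FileSlice; all names below are invented for illustration.

import java.util.AbstractMap.SimpleEntry;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public final class GroupByCompositeKeySketch {

    // Stand-in for FileSlice, carrying only the fields used to form the grouping key.
    static final class Slice {
        final String partitionPath;
        final String fileId;
        final String baseInstant;

        Slice(String partitionPath, String fileId, String baseInstant) {
            this.partitionPath = partitionPath;
            this.fileId = fileId;
            this.baseInstant = baseInstant;
        }
    }

    public static void main(String[] args) {
        Stream<Slice> slices = Stream.of(
            new Slice("2024/01/01", "fg-1", "c1"),
            new Slice("2024/01/01", "fg-1", "c2"),
            new Slice("2024/01/02", "fg-2", "c1"));

        // Group by the composite (partitionPath, fileId) key; each value list holds all
        // slices that would belong to a single HoodieFileGroup.
        Map<SimpleEntry<String, String>, List<Slice>> grouped = slices.collect(
            Collectors.groupingBy(s -> new SimpleEntry<>(s.partitionPath, s.fileId)));

        grouped.forEach((key, group) ->
            System.out.println(key.getKey() + " / " + key.getValue() + " -> " + group.size() + " slice(s)"));
    }
}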

Aggregations

HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup): 38
FileSlice (org.apache.hudi.common.model.FileSlice): 29
IOException (java.io.IOException): 27
Map (java.util.Map): 27
List (java.util.List): 26
Collectors (java.util.stream.Collectors): 25
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 24
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 22
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 22
Option (org.apache.hudi.common.util.Option): 22
Path (org.apache.hadoop.fs.Path): 21
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 21
ArrayList (java.util.ArrayList): 20
FileStatus (org.apache.hadoop.fs.FileStatus): 19
Pair (org.apache.hudi.common.util.collection.Pair): 19
LogManager (org.apache.log4j.LogManager): 18
Logger (org.apache.log4j.Logger): 18
Set (java.util.Set): 17
Stream (java.util.stream.Stream): 17
FSUtils (org.apache.hudi.common.fs.FSUtils): 17