
Example 86 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From the class IncrementalTimelineSyncFileSystemView, method applyDeltaFileSlicesToPartitionView.

/**
 * Apply changes to the partition file-system view. The base implementation overwrites the entire partition's view,
 * assuming some sort of map (in-memory/disk-based) is used. For view implementations that support fine-grained
 * updates (e.g., RocksDB), override this method.
 *
 * @param partition PartitionPath
 * @param deltaFileGroups Changed file-slices aggregated as file-groups
 * @param mode Delta Apply mode
 */
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups, DeltaApplyMode mode) {
    if (deltaFileGroups.isEmpty()) {
        LOG.info("No delta file groups for partition :" + partition);
        return;
    }
    List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
    /**
     * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing the
     * base-path, scheme and authority. Ensure the matching process takes care of this discrepancy.
     */
    Map<String, HoodieBaseFile> viewDataFiles = fileGroups.stream()
        .flatMap(HoodieFileGroup::getAllRawFileSlices)
        .map(FileSlice::getBaseFile)
        .filter(Option::isPresent)
        .map(Option::get)
        .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Note: Delta log files and data files can be empty when adding/removing pending compactions
    Map<String, HoodieBaseFile> deltaDataFiles = deltaFileGroups.stream()
        .flatMap(HoodieFileGroup::getAllRawFileSlices)
        .map(FileSlice::getBaseFile)
        .filter(Option::isPresent)
        .map(Option::get)
        .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream()
        .flatMap(HoodieFileGroup::getAllRawFileSlices)
        .flatMap(FileSlice::getLogFiles)
        .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream()
        .flatMap(HoodieFileGroup::getAllRawFileSlices)
        .flatMap(FileSlice::getLogFiles)
        .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    switch(mode) {
        case ADD:
            viewDataFiles.putAll(deltaDataFiles);
            viewLogFiles.putAll(deltaLogFiles);
            break;
        case REMOVE:
            deltaDataFiles.keySet().stream().forEach(p -> viewDataFiles.remove(p));
            deltaLogFiles.keySet().stream().forEach(p -> viewLogFiles.remove(p));
            break;
        default:
            throw new IllegalStateException("Unknown diff apply mode=" + mode);
    }
    HoodieTimeline timeline = deltaFileGroups.stream().map(df -> df.getTimeline()).findAny().get();
    List<HoodieFileGroup> fgs = buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(), timeline, true);
    storePartitionView(partition, fgs);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) TimelineDiffHelper(org.apache.hudi.common.table.timeline.TimelineDiffHelper) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) TimelineDiffResult(org.apache.hudi.common.table.timeline.TimelineDiffHelper.TimelineDiffResult) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Option(org.apache.hudi.common.util.Option) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
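
The snippet above repeatedly builds path-keyed lookup maps with Pair. Below is a minimal standalone sketch of that pattern, assuming hypothetical input paths; only Pair, Path and Collectors mirror the APIs used in the example.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.collection.Pair;

public class PathKeyedMapSketch {
    public static void main(String[] args) {
        // Hypothetical fully-qualified file paths, standing in for base/log file locations.
        List<String> fullPaths = Arrays.asList(
            "hdfs://namenode:8020/tbl/2021/01/file-1.parquet",
            "hdfs://namenode:8020/tbl/2021/01/file-2.parquet");
        // Key each file by its path without scheme and authority, as the view code does,
        // so that delta entries can be matched against stored entries.
        Map<String, String> byRelativePath = fullPaths.stream()
            .map(p -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(p)).toString(), p))
            .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
        byRelativePath.forEach((k, v) -> System.out.println(k + " -> " + v));
    }
}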

Example 87 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From the class RocksDbBasedFileSystemView, method applyDeltaFileSlicesToPartitionView.

/*
 * This is overridden to incrementally apply file-slices to RocksDB.
 */
@Override
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups, DeltaApplyMode mode) {
    rocksDB.writeBatch(batch -> deltaFileGroups.forEach(fg -> fg.getAllRawFileSlices().map(fs -> {
        FileSlice oldSlice = getFileSlice(partition, fs.getFileId(), fs.getBaseInstantTime());
        if (null == oldSlice) {
            return fs;
        } else {
            // First remove the file-slice
            LOG.info("Removing old Slice in DB. FS=" + oldSlice);
            rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, oldSlice));
            rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, oldSlice));
            Map<String, HoodieLogFile> logFiles = oldSlice.getLogFiles()
                .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
            Map<String, HoodieLogFile> deltaLogFiles = fs.getLogFiles()
                .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
            switch(mode) {
                case ADD:
                    {
                        FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
                        oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
                        fs.getBaseFile().ifPresent(newFileSlice::setBaseFile);
                        Map<String, HoodieLogFile> newLogFiles = new HashMap<>(logFiles);
                        deltaLogFiles.entrySet().stream().filter(e -> !logFiles.containsKey(e.getKey())).forEach(p -> newLogFiles.put(p.getKey(), p.getValue()));
                        newLogFiles.values().forEach(newFileSlice::addLogFile);
                        LOG.info("Adding back new File Slice after add FS=" + newFileSlice);
                        return newFileSlice;
                    }
                case REMOVE:
                    {
                        LOG.info("Removing old File Slice =" + fs);
                        FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
                        fs.getBaseFile().orElseGet(() -> {
                            oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
                            return null;
                        });
                        deltaLogFiles.keySet().forEach(logFiles::remove);
                        // Add remaining log files back
                        logFiles.values().forEach(newFileSlice::addLogFile);
                        if (newFileSlice.getBaseFile().isPresent() || (newFileSlice.getLogFiles().count() > 0)) {
                            LOG.info("Adding back new file-slice after remove FS=" + newFileSlice);
                            return newFileSlice;
                        }
                        return null;
                    }
                default:
                    throw new IllegalStateException("Unknown diff apply mode=" + mode);
            }
        }
    }).filter(Objects::nonNull).forEach(fs -> {
        rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, fs), fs);
        fs.getBaseFile().ifPresent(df -> rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, fs), df));
    })));
}
Also used : BootstrapBaseFileMapping(org.apache.hudi.common.model.BootstrapBaseFileMapping) RocksDBDAO(org.apache.hudi.common.util.collection.RocksDBDAO) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) RocksDBSchemaHelper(org.apache.hudi.common.util.RocksDBSchemaHelper) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) Set(java.util.Set) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) Objects(java.util.Objects) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HashMap(java.util.HashMap) Map(java.util.Map)
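
Both file-system views apply the same ADD/REMOVE merge to their path-keyed maps. The sketch below isolates that switch on plain java.util maps; Mode is a local stand-in for the views' DeltaApplyMode enum, and the entries are hypothetical.

import java.util.HashMap;
import java.util.Map;

public class DeltaApplySketch {
    enum Mode { ADD, REMOVE }

    static <V> Map<String, V> apply(Map<String, V> view, Map<String, V> delta, Mode mode) {
        Map<String, V> merged = new HashMap<>(view);
        switch (mode) {
            case ADD:
                merged.putAll(delta);                    // new/updated files overwrite existing entries
                break;
            case REMOVE:
                delta.keySet().forEach(merged::remove);  // removed files drop out of the view
                break;
            default:
                throw new IllegalStateException("Unknown diff apply mode=" + mode);
        }
        return merged;
    }

    public static void main(String[] args) {
        Map<String, String> view = new HashMap<>();
        view.put("/tbl/p1/log.1", "v1");
        Map<String, String> delta = new HashMap<>();
        delta.put("/tbl/p1/log.2", "v2");
        System.out.println(apply(view, delta, Mode.ADD));    // both entries present
        System.out.println(apply(view, delta, Mode.REMOVE)); // only log.1 remains
    }
}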

Example 88 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From the class RocksDbBasedFileSystemView, method getFileGroups.

private Stream<HoodieFileGroup> getFileGroups(Stream<FileSlice> sliceStream) {
    return sliceStream
        .map(s -> Pair.of(Pair.of(s.getPartitionPath(), s.getFileId()), s))
        .collect(Collectors.groupingBy(Pair::getKey))
        .entrySet().stream()
        .map(slicePair -> {
          HoodieFileGroup fg = new HoodieFileGroup(slicePair.getKey().getKey(), slicePair.getKey().getValue(),
              getVisibleCommitsAndCompactionTimeline());
          slicePair.getValue().forEach(e -> fg.addFileSlice(e.getValue()));
          return fg;
        });
}
Also used : HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Pair(org.apache.hudi.common.util.collection.Pair)
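
getFileGroups groups file slices by the composite key (partitionPath, fileId) built from a nested Pair. A minimal sketch of grouping a stream by a Pair key follows; the records below are hypothetical.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hudi.common.util.collection.Pair;

public class CompositeKeyGroupingSketch {
    public static void main(String[] args) {
        // Hypothetical (partition, fileId, sliceName) records.
        List<String[]> slices = Arrays.asList(
            new String[] {"2021/01", "fg-1", "slice-a"},
            new String[] {"2021/01", "fg-1", "slice-b"},
            new String[] {"2021/02", "fg-2", "slice-c"});
        // Group slices by (partition, fileId); each group would back one HoodieFileGroup.
        Map<Pair<String, String>, List<String>> grouped = slices.stream()
            .map(s -> Pair.of(Pair.of(s[0], s[1]), s[2]))
            .collect(Collectors.groupingBy(Pair::getKey,
                Collectors.mapping(Pair::getValue, Collectors.toList())));
        grouped.forEach((k, v) -> System.out.println(k + " -> " + v));
    }
}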

Example 89 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From the class SpillableMapBasedFileSystemView, method createFileIdToPendingCompactionMap.

@Override
protected Map<HoodieFileGroupId, Pair<String, CompactionOperation>> createFileIdToPendingCompactionMap(Map<HoodieFileGroupId, Pair<String, CompactionOperation>> fgIdToPendingCompaction) {
    try {
        LOG.info("Creating Pending Compaction map using external spillable Map. Max Mem=" + maxMemoryForPendingCompaction + ", BaseDir=" + baseStoreDir);
        new File(baseStoreDir).mkdirs();
        Map<HoodieFileGroupId, Pair<String, CompactionOperation>> pendingMap = new ExternalSpillableMap<>(maxMemoryForPendingCompaction, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), diskMapType, isBitCaskDiskMapCompressionEnabled);
        pendingMap.putAll(fgIdToPendingCompaction);
        return pendingMap;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) IOException(java.io.IOException) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) File(java.io.File) Pair(org.apache.hudi.common.util.collection.Pair)
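
The view wraps the pending-compaction map in an ExternalSpillableMap so entries beyond the in-memory budget spill to local disk. Below is a minimal sketch using the same six-argument constructor shown above; the memory budget, spill directory and the DiskMapType.BITCASK choice are illustrative assumptions rather than values from the example.

import java.io.IOException;
import java.util.Map;
import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;

public class SpillableMapSketch {
    public static void main(String[] args) throws IOException {
        Map<String, String> pending = new ExternalSpillableMap<>(
            16 * 1024 * 1024L,                        // in-memory budget in bytes (assumed)
            System.getProperty("java.io.tmpdir"),     // spill directory (assumed)
            new DefaultSizeEstimator<>(),             // key size estimator
            new DefaultSizeEstimator<>(),             // value size estimator
            ExternalSpillableMap.DiskMapType.BITCASK, // disk map flavor (assumed)
            false);                                   // compression disabled (assumed)
        // Behaves like a regular Map; large contents transparently spill to disk.
        pending.put("fg-1", "pending-compaction-instant-001");
        System.out.println(pending.get("fg-1"));
    }
}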

Example 90 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

From the class CleanMetadataV2MigrationHandler, method upgradeFrom.

@Override
public HoodieCleanMetadata upgradeFrom(HoodieCleanMetadata input) {
    ValidationUtils.checkArgument(input.getVersion() == 1, "Input version is " + input.getVersion() + ". Must be 1");
    HoodieCleanMetadata metadata = new HoodieCleanMetadata();
    metadata.setEarliestCommitToRetain(input.getEarliestCommitToRetain());
    metadata.setTimeTakenInMillis(input.getTimeTakenInMillis());
    metadata.setStartCleanTime(input.getStartCleanTime());
    metadata.setTotalFilesDeleted(input.getTotalFilesDeleted());
    metadata.setVersion(getManagedVersion());
    Map<String, HoodieCleanPartitionMetadata> partitionMetadataMap = input.getPartitionMetadata().entrySet().stream().map(entry -> {
        final String partitionPath = entry.getKey();
        final HoodieCleanPartitionMetadata partitionMetadata = entry.getValue();
        final List<String> deletePathPatterns = convertToV2Path(partitionMetadata.getDeletePathPatterns());
        final List<String> successDeleteFiles = convertToV2Path(partitionMetadata.getSuccessDeleteFiles());
        final List<String> failedDeleteFiles = convertToV2Path(partitionMetadata.getFailedDeleteFiles());
        final HoodieCleanPartitionMetadata cleanPartitionMetadata = HoodieCleanPartitionMetadata.newBuilder()
            .setPolicy(partitionMetadata.getPolicy())
            .setPartitionPath(partitionMetadata.getPartitionPath())
            .setDeletePathPatterns(deletePathPatterns)
            .setSuccessDeleteFiles(successDeleteFiles)
            .setFailedDeleteFiles(failedDeleteFiles)
            .build();
        return Pair.of(partitionPath, cleanPartitionMetadata);
    }).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    return HoodieCleanMetadata.newBuilder()
        .setEarliestCommitToRetain(input.getEarliestCommitToRetain())
        .setStartCleanTime(input.getStartCleanTime())
        .setTimeTakenInMillis(input.getTimeTakenInMillis())
        .setTotalFilesDeleted(input.getTotalFilesDeleted())
        .setPartitionMetadata(partitionMetadataMap)
        .setVersion(getManagedVersion())
        .build();
}
Also used : AbstractMigratorBase(org.apache.hudi.common.table.timeline.versioning.AbstractMigratorBase) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieCleanPartitionMetadata(org.apache.hudi.avro.model.HoodieCleanPartitionMetadata) Collectors(java.util.stream.Collectors) Pair(org.apache.hudi.common.util.collection.Pair) HoodieCleanPartitionMetadata(org.apache.hudi.avro.model.HoodieCleanPartitionMetadata) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) List(java.util.List)
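
The migration handler rewrites a map by transforming each entry into a Pair and collecting back with Collectors.toMap. A minimal sketch of that rewrite pattern follows; the upper-casing transform and the sample entries are hypothetical.

import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hudi.common.util.collection.Pair;

public class MapRewriteSketch {
    public static void main(String[] args) {
        Map<String, String> input = new HashMap<>();
        input.put("2021/01", "delete-pattern-a");
        input.put("2021/02", "delete-pattern-b");
        // Transform each value, keep the key, and rebuild the map, as upgradeFrom does
        // when converting partition metadata from version 1 to version 2.
        Map<String, String> upgraded = input.entrySet().stream()
            .map(e -> Pair.of(e.getKey(), e.getValue().toUpperCase()))
            .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
        System.out.println(upgraded);
    }
}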

Aggregations

Pair (org.apache.hudi.common.util.collection.Pair): 147
List (java.util.List): 98
Map (java.util.Map): 91
IOException (java.io.IOException): 89
Collectors (java.util.stream.Collectors): 87
Option (org.apache.hudi.common.util.Option): 87
ArrayList (java.util.ArrayList): 85
Path (org.apache.hadoop.fs.Path): 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 66
HashMap (java.util.HashMap): 65
LogManager (org.apache.log4j.LogManager): 64
Logger (org.apache.log4j.Logger): 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 54
Arrays (java.util.Arrays): 48
HoodieTable (org.apache.hudi.table.HoodieTable): 46
Test (org.junit.jupiter.api.Test): 46