Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class IncrementalTimelineSyncFileSystemView, method applyDeltaFileSlicesToPartitionView.
/**
 * Apply changes to the partition file-system view. The base implementation overwrites the entire partition's view,
 * assuming some sort of map (in-memory/disk-based) is used. View implementations that support fine-granular updates
 * (e.g. RocksDB) should override this method.
 *
 * @param partition PartitionPath
 * @param deltaFileGroups Changed file-slices aggregated as file-groups
 * @param mode Delta Apply mode
 */
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups, DeltaApplyMode mode) {
  if (deltaFileGroups.isEmpty()) {
    LOG.info("No delta file groups for partition :" + partition);
    return;
  }

  List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
  /*
   * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing the
   * base-path, scheme and authority. Ensure the matching process takes care of this discrepancy.
   */
  Map<String, HoodieBaseFile> viewDataFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .map(FileSlice::getBaseFile)
      .filter(Option::isPresent)
      .map(Option::get)
      .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  // Note: Delta Log Files and Data Files can be empty when adding/removing pending compactions
  Map<String, HoodieBaseFile> deltaDataFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .map(FileSlice::getBaseFile)
      .filter(Option::isPresent)
      .map(Option::get)
      .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

  Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .flatMap(FileSlice::getLogFiles)
      .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .flatMap(FileSlice::getLogFiles)
      .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

  switch (mode) {
    case ADD:
      viewDataFiles.putAll(deltaDataFiles);
      viewLogFiles.putAll(deltaLogFiles);
      break;
    case REMOVE:
      deltaDataFiles.keySet().forEach(viewDataFiles::remove);
      deltaLogFiles.keySet().forEach(viewLogFiles::remove);
      break;
    default:
      throw new IllegalStateException("Unknown diff apply mode=" + mode);
  }

  HoodieTimeline timeline = deltaFileGroups.stream().map(df -> df.getTimeline()).findAny().get();
  List<HoodieFileGroup> fgs = buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(), timeline, true);
  storePartitionView(partition, fgs);
}
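
The core of the method above is a map diff keyed by path with scheme and authority stripped, so that fully qualified storage paths and the relative paths stored in metadata compare equal. Below is a minimal, self-contained sketch of that pattern, not Hudi code: plain Strings stand in for HoodieBaseFile/HoodieLogFile, java.net.URI stands in for Hadoop's Path.getPathWithoutSchemeAndAuthority, and the nested DeltaApplyMode enum is a local stand-in for the view's own enum.

import java.net.URI;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class DeltaApplySketch {

  // Local stand-in for the view's DeltaApplyMode.
  enum DeltaApplyMode { ADD, REMOVE }

  // Strip scheme and authority so "hdfs://nn:8020/tbl/f1.parquet" and "/tbl/f1.parquet" match.
  static String pathKey(String fullPath) {
    return URI.create(fullPath).getPath();
  }

  // Index files by their scheme/authority-stripped path, mirroring the viewDataFiles/deltaDataFiles maps.
  static Map<String, String> toPathKeyedMap(List<String> files) {
    return files.stream()
        .map(f -> Pair.of(pathKey(f), f))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  }

  static Map<String, String> applyDelta(List<String> view, List<String> delta, DeltaApplyMode mode) {
    Map<String, String> viewFiles = new HashMap<>(toPathKeyedMap(view));
    Map<String, String> deltaFiles = toPathKeyedMap(delta);
    switch (mode) {
      case ADD:
        viewFiles.putAll(deltaFiles);
        break;
      case REMOVE:
        deltaFiles.keySet().forEach(viewFiles::remove);
        break;
      default:
        throw new IllegalStateException("Unknown diff apply mode=" + mode);
    }
    return viewFiles;
  }

  public static void main(String[] args) {
    List<String> view = Arrays.asList("hdfs://nn:8020/tbl/2020/01/f1.parquet");
    List<String> delta = Arrays.asList("/tbl/2020/01/f1.parquet"); // same file, path only
    System.out.println(applyDelta(view, delta, DeltaApplyMode.REMOVE)); // {} -- the keys matched
  }
}

The keying step is what makes the diff robust: without stripping scheme and authority, the REMOVE above would silently miss the file because the stored path and the metadata path differ textually.
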
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class RocksDbBasedFileSystemView, method applyDeltaFileSlicesToPartitionView.
/*
 * This is overridden to incrementally apply file-slices to RocksDB.
 */
@Override
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups,
    DeltaApplyMode mode) {
  rocksDB.writeBatch(batch ->
      deltaFileGroups.forEach(fg -> fg.getAllRawFileSlices().map(fs -> {
        FileSlice oldSlice = getFileSlice(partition, fs.getFileId(), fs.getBaseInstantTime());
        if (null == oldSlice) {
          return fs;
        } else {
          // First remove the file-slice
          LOG.info("Removing old Slice in DB. FS=" + oldSlice);
          rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, oldSlice));
          rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, oldSlice));

          // Index both the stored and the delta log files by their scheme/authority-stripped paths
          Map<String, HoodieLogFile> logFiles = oldSlice.getLogFiles()
              .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
              .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
          Map<String, HoodieLogFile> deltaLogFiles = fs.getLogFiles()
              .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
              .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

          switch (mode) {
            case ADD: {
              FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
              oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
              fs.getBaseFile().ifPresent(newFileSlice::setBaseFile);
              Map<String, HoodieLogFile> newLogFiles = new HashMap<>(logFiles);
              deltaLogFiles.entrySet().stream()
                  .filter(e -> !logFiles.containsKey(e.getKey()))
                  .forEach(p -> newLogFiles.put(p.getKey(), p.getValue()));
              newLogFiles.values().forEach(newFileSlice::addLogFile);
              LOG.info("Adding back new File Slice after add FS=" + newFileSlice);
              return newFileSlice;
            }
            case REMOVE: {
              LOG.info("Removing old File Slice =" + fs);
              FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
              // Keep the old base file only when the delta slice does not carry one to remove
              fs.getBaseFile().orElseGet(() -> {
                oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
                return null;
              });
              deltaLogFiles.keySet().forEach(logFiles::remove);
              // Add remaining log files back
              logFiles.values().forEach(newFileSlice::addLogFile);
              if (newFileSlice.getBaseFile().isPresent() || (newFileSlice.getLogFiles().count() > 0)) {
                LOG.info("Adding back new file-slice after remove FS=" + newFileSlice);
                return newFileSlice;
              }
              return null;
            }
            default:
              throw new IllegalStateException("Unknown diff apply mode=" + mode);
          }
        }
      }).filter(Objects::nonNull).forEach(fs -> {
        rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, fs), fs);
        fs.getBaseFile().ifPresent(df ->
            rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, fs), df));
      })));
}
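
Inside both branches the real work is merging two path-keyed log-file maps for a single file slice. A minimal sketch of just that merge, not Hudi code, with plain Strings standing in for HoodieLogFile and keys assumed to be scheme/authority-stripped paths:

import java.util.HashMap;
import java.util.Map;

public class LogFileMergeSketch {

  // ADD: keep every existing log file and append only the delta files not already present.
  static Map<String, String> mergeOnAdd(Map<String, String> existing, Map<String, String> delta) {
    Map<String, String> merged = new HashMap<>(existing);
    delta.entrySet().stream()
        .filter(e -> !existing.containsKey(e.getKey()))
        .forEach(e -> merged.put(e.getKey(), e.getValue()));
    return merged;
  }

  // REMOVE: drop the delta log files and keep whatever remains.
  static Map<String, String> mergeOnRemove(Map<String, String> existing, Map<String, String> delta) {
    Map<String, String> remaining = new HashMap<>(existing);
    delta.keySet().forEach(remaining::remove);
    return remaining;
  }
}

If mergeOnRemove leaves nothing and there is no base file left either, the slice is dropped entirely, which is what returning null in the REMOVE branch above achieves.
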
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class RocksDbBasedFileSystemView, method getFileGroups.
private Stream<HoodieFileGroup> getFileGroups(Stream<FileSlice> sliceStream) {
  return sliceStream
      .map(s -> Pair.of(Pair.of(s.getPartitionPath(), s.getFileId()), s))
      .collect(Collectors.groupingBy(Pair::getKey))
      .entrySet().stream()
      .map(slicePair -> {
        HoodieFileGroup fg = new HoodieFileGroup(slicePair.getKey().getKey(), slicePair.getKey().getValue(),
            getVisibleCommitsAndCompactionTimeline());
        slicePair.getValue().forEach(e -> fg.addFileSlice(e.getValue()));
        return fg;
      });
}
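
The grouping key here is a Pair of (partitionPath, fileId), so Collectors.groupingBy buckets all slices of the same file group together, relying on Pair's equals/hashCode. A standalone sketch of the same idiom, not Hudi code, with Strings standing in for FileSlice:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class GroupByPairKeySketch {

  public static void main(String[] args) {
    // Each entry pairs a composite (partitionPath, fileId) key with one "slice".
    List<Pair<Pair<String, String>, String>> slices = Arrays.asList(
        Pair.of(Pair.of("2020/01/01", "f1"), "slice-001"),
        Pair.of(Pair.of("2020/01/01", "f1"), "slice-002"),
        Pair.of(Pair.of("2020/01/02", "f2"), "slice-003"));

    // Identical (partitionPath, fileId) pairs collapse into one bucket -- one bucket per file group.
    Map<Pair<String, String>, List<String>> grouped = slices.stream()
        .collect(Collectors.groupingBy(Pair::getKey,
            Collectors.mapping(Pair::getValue, Collectors.toList())));

    grouped.forEach((key, group) ->
        System.out.println(key.getKey() + "/" + key.getValue() + " -> " + group));
  }
}
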
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class SpillableMapBasedFileSystemView, method createFileIdToPendingCompactionMap.
@Override
protected Map<HoodieFileGroupId, Pair<String, CompactionOperation>> createFileIdToPendingCompactionMap(
    Map<HoodieFileGroupId, Pair<String, CompactionOperation>> fgIdToPendingCompaction) {
  try {
    LOG.info("Creating Pending Compaction map using external spillable Map. Max Mem=" + maxMemoryForPendingCompaction
        + ", BaseDir=" + baseStoreDir);
    new File(baseStoreDir).mkdirs();
    Map<HoodieFileGroupId, Pair<String, CompactionOperation>> pendingMap = new ExternalSpillableMap<>(
        maxMemoryForPendingCompaction, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(),
        diskMapType, isBitCaskDiskMapCompressionEnabled);
    pendingMap.putAll(fgIdToPendingCompaction);
    return pendingMap;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
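
Stripped of the Hudi types, the method is just "allocate a backing map, then bulk-copy the pending compactions into it"; the only thing that differs between the in-memory and spillable views is which Map implementation gets allocated. A sketch of that reading, not Hudi code: copyIntoBackingMap and the String value type are hypothetical, and a real caller would supply the ExternalSpillableMap constructor shown above instead of HashMap::new.

import java.util.HashMap;
import java.util.Map;
import java.util.function.Supplier;

public class SpillableMapPatternSketch {

  // Allocate whatever Map the factory supplies and bulk-load the source entries into it.
  static <K, V> Map<K, V> copyIntoBackingMap(Map<K, V> source, Supplier<Map<K, V>> mapFactory) {
    Map<K, V> target = mapFactory.get();
    target.putAll(source);
    return target;
  }

  public static void main(String[] args) {
    Map<String, String> pending = new HashMap<>();
    pending.put("partition/f1", "compaction-instant-001");
    System.out.println(copyIntoBackingMap(pending, HashMap::new));
  }
}
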
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class CleanMetadataV2MigrationHandler, method upgradeFrom.
@Override
public HoodieCleanMetadata upgradeFrom(HoodieCleanMetadata input) {
  ValidationUtils.checkArgument(input.getVersion() == 1, "Input version is " + input.getVersion() + ". Must be 1");
  HoodieCleanMetadata metadata = new HoodieCleanMetadata();
  metadata.setEarliestCommitToRetain(input.getEarliestCommitToRetain());
  metadata.setTimeTakenInMillis(input.getTimeTakenInMillis());
  metadata.setStartCleanTime(input.getStartCleanTime());
  metadata.setTotalFilesDeleted(input.getTotalFilesDeleted());
  metadata.setVersion(getManagedVersion());

  Map<String, HoodieCleanPartitionMetadata> partitionMetadataMap = input.getPartitionMetadata().entrySet().stream()
      .map(entry -> {
        final String partitionPath = entry.getKey();
        final HoodieCleanPartitionMetadata partitionMetadata = entry.getValue();
        final List<String> deletePathPatterns = convertToV2Path(partitionMetadata.getDeletePathPatterns());
        final List<String> successDeleteFiles = convertToV2Path(partitionMetadata.getSuccessDeleteFiles());
        final List<String> failedDeleteFiles = convertToV2Path(partitionMetadata.getFailedDeleteFiles());
        final HoodieCleanPartitionMetadata cleanPartitionMetadata = HoodieCleanPartitionMetadata.newBuilder()
            .setPolicy(partitionMetadata.getPolicy())
            .setPartitionPath(partitionMetadata.getPartitionPath())
            .setDeletePathPatterns(deletePathPatterns)
            .setSuccessDeleteFiles(successDeleteFiles)
            .setFailedDeleteFiles(failedDeleteFiles)
            .build();
        return Pair.of(partitionPath, cleanPartitionMetadata);
      })
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

  return HoodieCleanMetadata.newBuilder()
      .setEarliestCommitToRetain(input.getEarliestCommitToRetain())
      .setStartCleanTime(input.getStartCleanTime())
      .setTimeTakenInMillis(input.getTimeTakenInMillis())
      .setTotalFilesDeleted(input.getTotalFilesDeleted())
      .setPartitionMetadata(partitionMetadataMap)
      .setVersion(getManagedVersion())
      .build();
}
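
The partition-metadata rewrite above uses a common idiom: stream the entries of the input map, rebuild each value, re-pair it with its key via Pair.of, and collect back with Collectors.toMap. A standalone sketch of that idiom, not Hudi code: Strings stand in for HoodieCleanPartitionMetadata, and stripLeadingSlash is a hypothetical stand-in for convertToV2Path.

import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class MapRewriteSketch {

  // Hypothetical value conversion, playing the role of convertToV2Path.
  static String stripLeadingSlash(String path) {
    return path.startsWith("/") ? path.substring(1) : path;
  }

  public static void main(String[] args) {
    Map<String, String> v1 = new HashMap<>();
    v1.put("2020/01/01", "/2020/01/01/file1.parquet");
    v1.put("2020/01/02", "/2020/01/02/file2.parquet");

    // Rebuild each value, keep the key, and collect back into a Map.
    Map<String, String> v2 = v1.entrySet().stream()
        .map(e -> Pair.of(e.getKey(), stripLeadingSlash(e.getValue())))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

    System.out.println(v2);
  }
}
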