Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.
The class FileGroupDTO, method toFileGroup.
public static HoodieFileGroup toFileGroup(FileGroupDTO dto, HoodieTableMetaClient metaClient) {
  HoodieFileGroup fileGroup =
      new HoodieFileGroup(dto.partition, dto.id, TimelineDTO.toTimeline(dto.timeline, metaClient));
  dto.slices.stream().map(FileSliceDTO::toFileSlice).forEach(fileGroup::addFileSlice);
  return fileGroup;
}
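This conversion is the deserialization half of how a remote (timeline-server backed) file-system view rebuilds file groups on the client side. Below is a minimal sketch of consuming the result; the dtos list, metaClient and LOG are assumed to come from the surrounding class, and the HoodieFileGroup accessors used are those available in current Hudi releases.
// Hypothetical driver code: rebuild file groups from DTOs received over the wire
// and log the latest file slice of each group (assumes java.util and java.util.stream imports).
List<HoodieFileGroup> fileGroups = dtos.stream()
    .map(dto -> FileGroupDTO.toFileGroup(dto, metaClient))
    .collect(Collectors.toList());
fileGroups.forEach(fg -> fg.getLatestFileSlice()
    .ifPresent(slice -> LOG.info("Group " + fg.getFileGroupId() + " latest slice at " + slice.getBaseInstantTime())));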
Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.
The class IncrementalTimelineSyncFileSystemView, method updatePartitionWriteFileGroups.
private void updatePartitionWriteFileGroups(Map<String, List<HoodieWriteStat>> partitionToWriteStats,
    HoodieTimeline timeline, HoodieInstant instant) {
  partitionToWriteStats.entrySet().stream().forEach(entry -> {
    String partition = entry.getKey();
    if (isPartitionAvailableInStore(partition)) {
      LOG.info("Syncing partition (" + partition + ") of instant (" + instant + ")");
      FileStatus[] statuses = entry.getValue().stream().map(p -> {
        FileStatus status = new FileStatus(p.getFileSizeInBytes(), false, 0, 0, 0, 0, null, null, null,
            new Path(String.format("%s/%s", metaClient.getBasePath(), p.getPath())));
        return status;
      }).toArray(FileStatus[]::new);
      List<HoodieFileGroup> fileGroups =
          buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(), false);
      applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.ADD);
    } else {
      LOG.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded");
    }
  });
  LOG.info("Done Syncing committed instant (" + instant + ")");
}
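The partitionToWriteStats map passed in above is typically derived from the metadata of the instant being synced. A minimal sketch of that step follows; it assumes metaClient, timeline and instant are available as in the method above and relies on the standard HoodieCommitMetadata accessors.
// Hedged sketch: obtain the partition -> write-stats map for a completed commit
// before applying it to the incremental view.
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
    metaClient.getActiveTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class);
Map<String, List<HoodieWriteStat>> partitionToWriteStats = commitMetadata.getPartitionToWriteStats();
updatePartitionWriteFileGroups(partitionToWriteStats, timeline, instant);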
Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.
The class IncrementalTimelineSyncFileSystemView, method applyDeltaFileSlicesToPartitionView.
/**
 * Apply changes to partition file-system view. Base implementation overwrites the entire partition's view assuming
 * some sort of map (in-mem/disk-based) is used. For View implementations that support fine-grained updates (e.g.
 * RocksDB), override this method.
 *
 * @param partition PartitionPath
 * @param deltaFileGroups Changed file-slices aggregated as file-groups
 * @param mode Delta Apply mode
 */
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups,
    DeltaApplyMode mode) {
  if (deltaFileGroups.isEmpty()) {
    LOG.info("No delta file groups for partition :" + partition);
    return;
  }
  List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
  /**
   * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing the
   * base-path, scheme and authority. Ensure the matching process takes care of this discrepancy.
   */
  Map<String, HoodieBaseFile> viewDataFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get)
      .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  // Note: Delta Log Files and Data Files can be empty when adding/removing pending compactions
  Map<String, HoodieBaseFile> deltaDataFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get)
      .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .flatMap(FileSlice::getLogFiles)
      .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .flatMap(FileSlice::getLogFiles)
      .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  switch (mode) {
    case ADD:
      viewDataFiles.putAll(deltaDataFiles);
      viewLogFiles.putAll(deltaLogFiles);
      break;
    case REMOVE:
      deltaDataFiles.keySet().forEach(viewDataFiles::remove);
      deltaLogFiles.keySet().forEach(viewLogFiles::remove);
      break;
    default:
      throw new IllegalStateException("Unknown diff apply mode=" + mode);
  }
  HoodieTimeline timeline = deltaFileGroups.stream().map(HoodieFileGroup::getTimeline).findAny().get();
  List<HoodieFileGroup> fgs =
      buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(), timeline, true);
  storePartitionView(partition, fgs);
}
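The inline note about the missing base-path, scheme and authority is why both sides of the diff are keyed on Path.getPathWithoutSchemeAndAuthority. The standalone illustration below (with made-up file names) is not Hudi code; it only demonstrates why the two forms of the same path collide on one map key.
// Illustration only: the fully-qualified and the scheme-less form of the same file
// normalize to an identical key, so delta entries line up with stored view entries.
// (org.apache.hadoop.fs.Path assumed imported)
Path qualified = new Path("hdfs://namenode:8020/tmp/hoodie_table/2021/01/01/file1_1-0-1_001.parquet");
Path schemeLess = new Path("/tmp/hoodie_table/2021/01/01/file1_1-0-1_001.parquet");
String k1 = Path.getPathWithoutSchemeAndAuthority(qualified).toString();
String k2 = Path.getPathWithoutSchemeAndAuthority(schemeLess).toString();
// k1.equals(k2) is true, so ADD overwrites and REMOVE deletes the matching view entry.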
Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.
The class RocksDbBasedFileSystemView, method applyDeltaFileSlicesToPartitionView.
/*
 * This is overridden to incrementally apply file-slices to RocksDB.
 */
@Override
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups,
    DeltaApplyMode mode) {
  rocksDB.writeBatch(batch -> deltaFileGroups.forEach(fg -> fg.getAllRawFileSlices().map(fs -> {
    FileSlice oldSlice = getFileSlice(partition, fs.getFileId(), fs.getBaseInstantTime());
    if (null == oldSlice) {
      return fs;
    } else {
      // First remove the file-slice
      LOG.info("Removing old Slice in DB. FS=" + oldSlice);
      rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, oldSlice));
      rocksDB.deleteInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, oldSlice));
      Map<String, HoodieLogFile> logFiles = oldSlice.getLogFiles()
          .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
          .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
      Map<String, HoodieLogFile> deltaLogFiles = fs.getLogFiles()
          .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
          .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
      switch (mode) {
        case ADD: {
          FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
          oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
          fs.getBaseFile().ifPresent(newFileSlice::setBaseFile);
          Map<String, HoodieLogFile> newLogFiles = new HashMap<>(logFiles);
          deltaLogFiles.entrySet().stream().filter(e -> !logFiles.containsKey(e.getKey()))
              .forEach(p -> newLogFiles.put(p.getKey(), p.getValue()));
          newLogFiles.values().forEach(newFileSlice::addLogFile);
          LOG.info("Adding back new File Slice after add FS=" + newFileSlice);
          return newFileSlice;
        }
        case REMOVE: {
          LOG.info("Removing old File Slice =" + fs);
          FileSlice newFileSlice = new FileSlice(oldSlice.getFileGroupId(), oldSlice.getBaseInstantTime());
          fs.getBaseFile().orElseGet(() -> {
            oldSlice.getBaseFile().ifPresent(newFileSlice::setBaseFile);
            return null;
          });
          deltaLogFiles.keySet().forEach(logFiles::remove);
          // Add remaining log files back
          logFiles.values().forEach(newFileSlice::addLogFile);
          if (newFileSlice.getBaseFile().isPresent() || (newFileSlice.getLogFiles().count() > 0)) {
            LOG.info("Adding back new file-slice after remove FS=" + newFileSlice);
            return newFileSlice;
          }
          return null;
        }
        default:
          throw new IllegalStateException("Unknown diff apply mode=" + mode);
      }
    }
  }).filter(Objects::nonNull).forEach(fs -> {
    rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForSliceView(fg, fs), fs);
    fs.getBaseFile().ifPresent(df ->
        rocksDB.putInBatch(batch, schemaHelper.getColFamilyForView(), schemaHelper.getKeyForDataFileView(fg, fs), df));
  })));
}
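The ADD and REMOVE branches above boil down to two map-level rules: union the delta's log files into the old slice without duplicating keys, and drop a slice entirely once neither a base file nor any log file remains. A condensed, hypothetical restatement of those two rules (not part of Hudi, using the variable names from the snippet above) is below.
// ADD: merge delta log files into the existing slice's log files, keeping existing entries.
Map<String, HoodieLogFile> merged = new HashMap<>(logFiles);
deltaLogFiles.forEach(merged::putIfAbsent);

// REMOVE: the rebuilt slice is written back only if something is left in it.
boolean keepSlice = newFileSlice.getBaseFile().isPresent() || newFileSlice.getLogFiles().count() > 0;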
Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.
The class RocksDbBasedFileSystemView, method getFileGroups.
private Stream<HoodieFileGroup> getFileGroups(Stream<FileSlice> sliceStream) {
  return sliceStream.map(s -> Pair.of(Pair.of(s.getPartitionPath(), s.getFileId()), s))
      .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream().map(slicePair -> {
        HoodieFileGroup fg = new HoodieFileGroup(slicePair.getKey().getKey(), slicePair.getKey().getValue(),
            getVisibleCommitsAndCompactionTimeline());
        slicePair.getValue().forEach(e -> fg.addFileSlice(e.getValue()));
        return fg;
      });
}
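getFileGroups regroups a flat stream of slices back into file groups keyed by (partition path, file id). An equivalent formulation that skips the intermediate key/value Pair is sketched below; it assumes the same inputs and the same getVisibleCommitsAndCompactionTimeline() helper, and is shown only to make the grouping step explicit.
// Illustration: group slices directly by (partitionPath, fileId), then fold each
// group's slices into a HoodieFileGroup.
Map<Pair<String, String>, List<FileSlice>> byGroup =
    sliceStream.collect(Collectors.groupingBy(s -> Pair.of(s.getPartitionPath(), s.getFileId())));
Stream<HoodieFileGroup> groups = byGroup.entrySet().stream().map(e -> {
  HoodieFileGroup fg = new HoodieFileGroup(e.getKey().getKey(), e.getKey().getValue(),
      getVisibleCommitsAndCompactionTimeline());
  e.getValue().forEach(fg::addFileSlice);
  return fg;
});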