Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class DFSHoodieDatasetInputReader, method getSingleSliceFromRDD.
private FileSlice getSingleSliceFromRDD(JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice) {
  return partitionToFileSlice.map(f -> {
    FileSlice slice = f._2.next();
    FileSlice newSlice = new FileSlice(slice.getFileGroupId(), slice.getBaseInstantTime());
    if (slice.getBaseFile().isPresent()) {
      newSlice.setBaseFile(slice.getBaseFile().get());
    } else {
      slice.getLogFiles().forEach(newSlice::addLogFile);
    }
    return newSlice;
  }).take(1).get(0);
}
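For context, a hedged sketch of how a caller might assemble the pair RDD this helper expects; jsc, partitions, and latestSlicesFor are illustrative placeholders, not names from the Hudi source.

// Illustrative only: build partition -> Iterator<FileSlice> pairs and pull one
// trimmed slice back to the driver. latestSlicesFor is a hypothetical helper
// standing in for however the reader obtains slices inside the closure.
// Requires scala.Tuple2 on the classpath (it ships with Spark).
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice = jsc
    .parallelize(partitions)
    .mapToPair(p -> new Tuple2<>(p, latestSlicesFor(p).iterator()));
FileSlice sample = getSingleSliceFromRDD(partitionToFileSlice);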
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class TimelineServerPerf, method runOneRound.
private static PerfStats runOneRound(SyncableFileSystemView fsView, String partition, String fileId,
    int id, int numIterations) {
  Histogram latencyHistogram = new Histogram(new UniformReservoir(10000));
  for (int i = 0; i < numIterations; i++) {
    long beginTs = System.currentTimeMillis();
    Option<FileSlice> c = fsView.getLatestFileSlice(partition, fileId);
    long endTs = System.currentTimeMillis();
    System.out.println("Latest File Slice for part=" + partition + ", fileId=" + fileId
        + ", Slice=" + c + ", Time=" + (endTs - beginTs));
    latencyHistogram.update(endTs - beginTs);
  }
  return new PerfStats(partition, id, latencyHistogram.getSnapshot());
}
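Histogram and UniformReservoir come from the Dropwizard Metrics library, so each round yields a latency snapshot for one file group. A hedged sketch of a driver loop; fileIds and the iteration count of 100 are made up for illustration:

// Illustrative only: benchmark latest-file-slice lookups for several file
// groups and collect one PerfStats per group. fileIds is a hypothetical list
// of file-group ids discovered elsewhere.
List<PerfStats> stats = new ArrayList<>();
int id = 0;
for (String fileId : fileIds) {
  stats.add(runOneRound(fsView, partition, fileId, id++, 100));
}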
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class CompactionAdminClient, method getRenamingActionsForUnschedulingCompactionOperation.
/**
 * Generate renaming actions for unscheduling a compaction operation. NOTE: Can only be used safely when no writer
 * (ingestion/compaction) is running.
 *
 * @param metaClient Hoodie Table MetaClient
 * @param compactionInstant Compaction Instant
 * @param operation Compaction Operation
 * @param fsViewOpt Cached File System View
 * @param skipValidation Skip Validation
 * @return list of pairs of log-files (old, new); for each pair, the rename must be done to successfully unschedule
 *         the compaction.
 */
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionOperation(
    HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation,
    Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
  List<Pair<HoodieLogFile, HoodieLogFile>> result = new ArrayList<>();
  HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get()
      : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
  if (!skipValidation) {
    validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView));
  }
  HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
  FileSlice merged = fileSystemView
      .getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp())
      .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
  List<HoodieLogFile> logFilesToRepair = merged.getLogFiles()
      .filter(lf -> lf.getBaseCommitTime().equals(compactionInstant))
      .sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
  FileSlice fileSliceForCompaction = fileSystemView
      .getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true)
      .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
  int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst()
      .map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
  String logExtn = fileSliceForCompaction.getLogFiles().findFirst()
      .map(lf -> "." + lf.getFileExtension()).orElse(HoodieLogFile.DELTA_EXTENSION);
  String parentPath = fileSliceForCompaction.getBaseFile()
      .map(df -> new Path(df.getPath()).getParent().toString())
      .orElse(fileSliceForCompaction.getLogFiles().findFirst().map(lf -> lf.getPath().getParent().toString()).get());
  for (HoodieLogFile toRepair : logFilesToRepair) {
    int version = maxUsedVersion + 1;
    HoodieLogFile newLf = new HoodieLogFile(new Path(parentPath, FSUtils.makeLogFileName(
        operation.getFileId(), logExtn, operation.getBaseInstantTime(), version, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
    result.add(Pair.of(toRepair, newLf));
    maxUsedVersion = version;
  }
  return result;
}
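A hedged sketch of how the returned plan might be consumed; client and the renameLogFile helper shown here are illustrative stand-ins, not confirmed API:

// Illustrative only: compute the rename plan for one pending compaction
// operation and apply it pair by pair, with no concurrent writers running
// (see the javadoc note above). renameLogFile is a hypothetical helper.
List<Pair<HoodieLogFile, HoodieLogFile>> plan =
    client.getRenamingActionsForUnschedulingCompactionOperation(
        metaClient, compactionInstant, operation, Option.empty(), false);
for (Pair<HoodieLogFile, HoodieLogFile> rename : plan) {
  renameLogFile(metaClient, rename.getLeft(), rename.getRight());
}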
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class FileSliceMetricUtils, method addFileSliceCommonMetrics.
public static void addFileSliceCommonMetrics(List<FileSlice> fileSlices, Map<String, Double> metrics, long defaultBaseFileSize) {
  int numLogFiles = 0;
  long totalLogFileSize = 0;
  long totalIORead = 0;
  long totalIOWrite = 0;
  long totalIO = 0;
  for (FileSlice slice : fileSlices) {
    numLogFiles += slice.getLogFiles().count();
    // Total size of all the log files
    totalLogFileSize += slice.getLogFiles().map(HoodieLogFile::getFileSize)
        .filter(size -> size >= 0).reduce(Long::sum).orElse(0L);
    long baseFileSize = slice.getBaseFile().isPresent() ? slice.getBaseFile().get().getFileSize() : 0L;
    totalIORead += baseFileSize;
    // Total write will be similar to the size of the base file
    totalIOWrite += baseFileSize > 0 ? baseFileSize : defaultBaseFileSize;
  }
  // Total read will be the base file plus all the log files
  totalIORead = FSUtils.getSizeInMB(totalIORead + totalLogFileSize);
  totalIOWrite = FSUtils.getSizeInMB(totalIOWrite);
  // Total IO will be the IO for read + write
  totalIO = totalIORead + totalIOWrite;
  metrics.put(TOTAL_IO_READ_MB, (double) totalIORead);
  metrics.put(TOTAL_IO_WRITE_MB, (double) totalIOWrite);
  metrics.put(TOTAL_IO_MB, (double) totalIO);
  metrics.put(TOTAL_LOG_FILE_SIZE, (double) totalLogFileSize);
  metrics.put(TOTAL_LOG_FILES, (double) numLogFiles);
}
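A hedged usage sketch; the 120 MB default below is an arbitrary stand-in for whatever base-file size the caller's write config targets:

// Illustrative only: gather the common compaction metrics for a set of slices.
Map<String, Double> metrics = new HashMap<>();
FileSliceMetricUtils.addFileSliceCommonMetrics(fileSlices, metrics, 120 * 1024 * 1024L);
// metrics now holds entries for TOTAL_IO_READ_MB, TOTAL_IO_WRITE_MB,
// TOTAL_IO_MB, TOTAL_LOG_FILE_SIZE and TOTAL_LOG_FILES.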
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
The class IncrementalTimelineSyncFileSystemView, method applyDeltaFileSlicesToPartitionView.
/**
 * Apply changes to the partition file-system view. The base implementation overwrites the entire partition's view,
 * assuming some sort of map (in-mem/disk-based) is used. For view implementations which support fine-granular
 * updates (e.g. RocksDB), override this method.
 *
 * @param partition PartitionPath
 * @param deltaFileGroups Changed file-slices aggregated as file-groups
 * @param mode Delta Apply mode
 */
protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups, DeltaApplyMode mode) {
  if (deltaFileGroups.isEmpty()) {
    LOG.info("No delta file groups for partition: " + partition);
    return;
  }
  List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
  /**
   * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing the
   * base-path, scheme and authority. Ensure the matching process takes care of this discrepancy.
   */
  Map<String, HoodieBaseFile> viewDataFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get)
      .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  // Note: delta log files and data files can be empty when adding/removing pending compactions
  Map<String, HoodieBaseFile> deltaDataFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get)
      .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .flatMap(FileSlice::getLogFiles)
      .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
      .flatMap(FileSlice::getLogFiles)
      .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  switch (mode) {
    case ADD:
      viewDataFiles.putAll(deltaDataFiles);
      viewLogFiles.putAll(deltaLogFiles);
      break;
    case REMOVE:
      deltaDataFiles.keySet().forEach(viewDataFiles::remove);
      deltaLogFiles.keySet().forEach(viewLogFiles::remove);
      break;
    default:
      throw new IllegalStateException("Unknown diff apply mode=" + mode);
  }
  HoodieTimeline timeline = deltaFileGroups.stream().map(HoodieFileGroup::getTimeline).findAny().get();
  List<HoodieFileGroup> fgs = buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(), timeline, true);
  storePartitionView(partition, fgs);
}
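The in-code note about the missing base-path/scheme is why every map above is keyed through Path.getPathWithoutSchemeAndAuthority. A small illustration with a made-up path:

// Illustrative only: a fully-qualified view path and a relative metadata path
// normalize to the same key, so ADD/REMOVE matching works across both.
Path qualified = new Path("hdfs://nn:8020/tbl/2024/01/01/abc_1-0-1_001.parquet");
String key = Path.getPathWithoutSchemeAndAuthority(qualified).toString();
// key -> "/tbl/2024/01/01/abc_1-0-1_001.parquet"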