use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class HoodieCompactor method compact.
/**
* Execute compaction operations and report back status.
*/
public HoodieData<WriteStatus> compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan,
    HoodieTable table, HoodieWriteConfig config, String compactionInstantTime,
    HoodieCompactionHandler compactionHandler) {
  if (compactionPlan == null || (compactionPlan.getOperations() == null) || (compactionPlan.getOperations().isEmpty())) {
    return context.emptyHoodieData();
  }
  HoodieActiveTimeline timeline = table.getActiveTimeline();
  HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
  // Mark instant as compaction inflight
  timeline.transitionCompactionRequestedToInflight(instant);
  table.getMetaClient().reloadActiveTimeline();
  HoodieTableMetaClient metaClient = table.getMetaClient();
  TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
  // Use the latest table schema as the reader schema, since the schema in the write config
  // may not be the same as the table schema.
  try {
    Schema readerSchema = schemaResolver.getTableAvroSchema(false);
    config.setSchema(readerSchema.toString());
  } catch (Exception e) {
    // If there is no commit in the table, just ignore the exception.
  }
  // Compacting is very similar to applying updates to existing files
  List<CompactionOperation> operations = compactionPlan.getOperations().stream()
      .map(CompactionOperation::convertFromAvroRecordInstance)
      .collect(toList());
  LOG.info("Compactor compacting " + operations + " files");
  context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices");
  TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier();
  return context.parallelize(operations)
      .map(operation -> compact(compactionHandler, metaClient, config, operation, compactionInstantTime, taskContextSupplier))
      .flatMap(List::iterator);
}
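For orientation, here is a minimal self-contained sketch of the engine-agnostic fan-out pattern used above: hand a list of work items to the HoodieEngineContext and collect the results. The HoodieLocalEngineContext, the toy string payload, and the class name are illustrative assumptions rather than part of the Hudi snippet; a real compaction job would use an engine-specific context such as a Spark-backed one.
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;

public class EngineContextFanOutSketch {
  public static void main(String[] args) {
    // Local (non-Spark) engine context, used here only for illustration.
    HoodieEngineContext context = new HoodieLocalEngineContext(new Configuration());
    // Toy "operations" standing in for the CompactionOperation list above.
    List<String> operations = Arrays.asList("file-slice-1", "file-slice-2", "file-slice-3");
    context.setJobStatus(EngineContextFanOutSketch.class.getSimpleName(), "Compacting file slices");
    // map(data, func, parallelism): apply func to every element with the given parallelism,
    // mirroring how compact() fans out one task per compaction operation.
    List<Integer> results = context.map(operations, op -> op.length(), operations.size());
    System.out.println(results);
  }
}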
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class BootstrapUtils method getAllLeafFoldersWithFiles.
/**
* Returns leaf folders with files under a path.
* @param metaClient Hoodie table metadata client
* @param fs File System
* @param basePathStr Base path under which to search
* @param context HoodieEngineContext
* @return list of partition paths with files under them.
* @throws IOException
*/
public static List<Pair<String, List<HoodieFileStatus>>> getAllLeafFoldersWithFiles(HoodieTableMetaClient metaClient,
    FileSystem fs, String basePathStr, HoodieEngineContext context) throws IOException {
  final Path basePath = new Path(basePathStr);
  final String baseFileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
  final Map<Integer, List<String>> levelToPartitions = new HashMap<>();
  final Map<String, List<HoodieFileStatus>> partitionToFiles = new HashMap<>();
  PathFilter filePathFilter = getFilePathFilter(baseFileExtension);
  PathFilter metaPathFilter = getExcludeMetaPathFilter();
  FileStatus[] topLevelStatuses = fs.listStatus(basePath);
  List<String> subDirectories = new ArrayList<>();
  List<Pair<HoodieFileStatus, Pair<Integer, String>>> result = new ArrayList<>();
  for (FileStatus topLevelStatus : topLevelStatuses) {
    if (topLevelStatus.isFile() && filePathFilter.accept(topLevelStatus.getPath())) {
      String relativePath = FSUtils.getRelativePartitionPath(basePath, topLevelStatus.getPath().getParent());
      Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
      HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(topLevelStatus);
      result.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
    } else if (topLevelStatus.isDirectory() && metaPathFilter.accept(topLevelStatus.getPath())) {
      subDirectories.add(topLevelStatus.getPath().toString());
    }
  }
  if (subDirectories.size() > 0) {
    result.addAll(context.flatMap(subDirectories, directory -> {
      PathFilter pathFilter = getFilePathFilter(baseFileExtension);
      Path path = new Path(directory);
      FileSystem fileSystem = path.getFileSystem(new Configuration());
      RemoteIterator<LocatedFileStatus> itr = fileSystem.listFiles(path, true);
      List<Pair<HoodieFileStatus, Pair<Integer, String>>> res = new ArrayList<>();
      while (itr.hasNext()) {
        FileStatus status = itr.next();
        if (pathFilter.accept(status.getPath())) {
          String relativePath = FSUtils.getRelativePartitionPath(new Path(basePathStr), status.getPath().getParent());
          Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
          HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(status);
          res.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
        }
      }
      return res.stream();
    }, subDirectories.size()));
  }
  result.forEach(val -> {
    String relativePath = val.getRight().getRight();
    List<HoodieFileStatus> statusList = partitionToFiles.get(relativePath);
    if (null == statusList) {
      Integer level = val.getRight().getLeft();
      List<String> dirs = levelToPartitions.get(level);
      if (null == dirs) {
        dirs = new ArrayList<>();
        levelToPartitions.put(level, dirs);
      }
      dirs.add(relativePath);
      statusList = new ArrayList<>();
      partitionToFiles.put(relativePath, statusList);
    }
    statusList.add(val.getLeft());
  });
  OptionalInt maxLevelOpt = levelToPartitions.keySet().stream().mapToInt(x -> x).max();
  int maxLevel = maxLevelOpt.orElse(-1);
  return maxLevel >= 0
      ? levelToPartitions.get(maxLevel).stream().map(d -> Pair.of(d, partitionToFiles.get(d))).collect(Collectors.toList())
      : new ArrayList<>();
}
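A hedged caller sketch for getAllLeafFoldersWithFiles, showing how the metadata client, file system, and engine context might be wired together before listing leaf folders. The paths, the Hadoop configuration, the choice of HoodieLocalEngineContext, and the import locations are assumptions for illustration, not taken from the snippet.
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.table.action.bootstrap.BootstrapUtils;

public class LeafFolderListingSketch {
  public static void main(String[] args) throws IOException {
    String tableBasePath = "/data/hudi_table";   // placeholder Hudi table path
    String srcBasePath = "/data/source_table";   // placeholder source data path to scan
    Configuration hadoopConf = new Configuration();
    HoodieEngineContext context = new HoodieLocalEngineContext(hadoopConf);
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(hadoopConf)
        .setBasePath(tableBasePath)
        .build();
    FileSystem fs = FSUtils.getFs(srcBasePath, hadoopConf);
    // One pair per leaf partition path, carrying the file statuses found under it.
    List<Pair<String, List<HoodieFileStatus>>> partitions =
        BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, fs, srcBasePath, context);
    partitions.forEach(p -> System.out.println(p.getLeft() + " -> " + p.getRight().size() + " files"));
  }
}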
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class CleanActionExecutor method clean.
/**
* Performs cleaning of partition paths according to the cleaning policy and returns the clean stats for each
* partition. Handles skew across partitions by using individual files to clean as the unit of task distribution.
*
* @throws IllegalArgumentException if an unknown cleaning policy is provided
*/
List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
  int cleanerParallelism = Math.min(
      (int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()),
      config.getCleanerParallelism());
  LOG.info("Using cleanerParallelism: " + cleanerParallelism);
  context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
  Stream<Pair<String, CleanFileInfo>> filesToBeDeletedPerPartition =
      cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
          .flatMap(x -> x.getValue().stream()
              .map(y -> new ImmutablePair<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))));
  Stream<ImmutablePair<String, PartitionCleanStat>> partitionCleanStats =
      context.mapPartitionsToPairAndReduceByKey(filesToBeDeletedPerPartition,
          iterator -> deleteFilesFunc(iterator, table), PartitionCleanStat::merge, cleanerParallelism);
  Map<String, PartitionCleanStat> partitionCleanStatsMap =
      partitionCleanStats.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  // Return PartitionCleanStat for each partition passed.
  return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath)
        ? partitionCleanStatsMap.get(partitionPath)
        : new PartitionCleanStat(partitionPath);
    HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
    return HoodieCleanStat.newBuilder()
        .withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(Option.ofNullable(actionInstant != null
            ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), actionInstant.getAction(), actionInstant.getTimestamp())
            : null))
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns())
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles())
        .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
        .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
        .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
        .build();
  }).collect(Collectors.toList());
}
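To make the skew handling concrete, here is a small self-contained sketch in plain Java (no Hudi classes): flatten a per-partition map of files into individual (partition, file) pairs so that work is distributed per file rather than per partition, then regroup the results by partition, analogous to reducing PartitionCleanStat objects by key. The sample partitions and file names are invented.
import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class CleanSkewSketch {
  public static void main(String[] args) {
    // Per-partition file lists; "2021/01/01" is heavily skewed.
    Map<String, List<String>> filesPerPartition = new HashMap<>();
    filesPerPartition.put("2021/01/01", Arrays.asList("f1", "f2", "f3", "f4", "f5"));
    filesPerPartition.put("2021/01/02", Arrays.asList("f6"));

    // Flatten to (partition, file) pairs: the unit of distribution is a file, not a partition.
    List<SimpleEntry<String, String>> workItems = filesPerPartition.entrySet().stream()
        .flatMap(e -> e.getValue().stream().map(f -> new SimpleEntry<>(e.getKey(), f)))
        .collect(Collectors.toList());

    // After the (simulated) deletes, reduce results back to one count per partition,
    // analogous to merging PartitionCleanStat objects by key.
    Map<String, Long> deletedPerPartition = workItems.stream()
        .collect(Collectors.groupingBy(SimpleEntry::getKey, Collectors.counting()));
    System.out.println(deletedPerPartition); // e.g. {2021/01/01=5, 2021/01/02=1}; map order may vary
  }
}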
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class HoodieTimelineArchiver method mergeArchiveFilesIfNecessary.
/**
* Merges small archive files into a new, larger one.
* Only used for filesystems that do not support the append operation.
* The merge of small archive files has four stages:
* 1. Build the merge plan with the merge candidates and the merged file name info.
* 2. Do the merge.
* 3. Delete all the candidates.
* 4. Delete the merge plan.
* @param context HoodieEngineContext
* @throws IOException
*/
private void mergeArchiveFilesIfNecessary(HoodieEngineContext context) throws IOException {
  Path planPath = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME);
  // Flush any remaining content and open a new writer
  reOpenWriter();
  // List all archive files
  FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
  // Sort files by version suffix in reverse (implies reverse chronological order)
  Arrays.sort(fsStatuses, new HoodieArchivedTimeline.ArchiveFileVersionComparator());
  int archiveMergeFilesBatchSize = config.getArchiveMergeFilesBatchSize();
  long smallFileLimitBytes = config.getArchiveMergeSmallFileLimitBytes();
  List<FileStatus> mergeCandidate = getMergeCandidates(smallFileLimitBytes, fsStatuses);
  if (mergeCandidate.size() >= archiveMergeFilesBatchSize) {
    List<String> candidateFiles = mergeCandidate.stream().map(fs -> fs.getPath().toString()).collect(Collectors.toList());
    // Build the merge plan before merging the archive files
    String logFileName = computeLogFileName();
    buildArchiveMergePlan(candidateFiles, planPath, logFileName);
    // Merge the archive files
    mergeArchiveFiles(mergeCandidate);
    // After the merge, delete the small archive files that were replaced
    deleteFilesParallelize(metaClient, candidateFiles, context, true);
    LOG.info("Success to delete replaced small archive files.");
    // Finally, delete the merge plan, which marks the merge of small archive files as succeeded
    metaClient.getFs().delete(planPath, false);
    LOG.info("Success to merge small archive files.");
  }
}
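The gate above only triggers a merge once enough small archive files have accumulated. Below is a simplified, self-contained sketch of that selection logic in plain Java over file sizes; the threshold, batch size, and sample sizes are made-up values, and the real getMergeCandidates may apply additional rules beyond a plain size filter.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class ArchiveMergeGateSketch {
  public static void main(String[] args) {
    long smallFileLimitBytes = 20L * 1024 * 1024;   // e.g. 20 MB small-file threshold
    int archiveMergeFilesBatchSize = 3;             // merge only once >= 3 candidates exist
    // Sizes of existing archive files, newest first (sample data).
    List<Long> archiveFileSizes = Arrays.asList(5L << 20, 8L << 20, 120L << 20, 6L << 20);

    // Candidate selection: keep files under the small-file limit.
    List<Long> mergeCandidates = archiveFileSizes.stream()
        .filter(size -> size < smallFileLimitBytes)
        .collect(Collectors.toList());

    // Only merge once a full batch of small files has accumulated; otherwise do nothing.
    if (mergeCandidates.size() >= archiveMergeFilesBatchSize) {
      System.out.println("Merging " + mergeCandidates.size() + " small archive files");
    } else {
      System.out.println("Not enough small archive files to merge yet");
    }
  }
}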
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class HoodieRepairTool method listFilesFromBasePath.
/**
* Lists all Hoodie files from the table base path.
*
* @param context {@link HoodieEngineContext} instance.
* @param basePathStr Table base path.
* @param expectedLevel Expected level in the directory hierarchy to include the file status.
* @param parallelism Parallelism for the file listing.
* @return A list of absolute file paths of all Hoodie files.
* @throws IOException upon errors.
*/
static List<String> listFilesFromBasePath(HoodieEngineContext context, String basePathStr, int expectedLevel, int parallelism) {
  FileSystem fs = FSUtils.getFs(basePathStr, context.getHadoopConf().get());
  Path basePath = new Path(basePathStr);
  return FSUtils.getFileStatusAtLevel(context, fs, basePath, expectedLevel, parallelism).stream()
      .filter(fileStatus -> {
        if (!fileStatus.isFile()) {
          return false;
        }
        return FSUtils.isDataFile(fileStatus.getPath());
      })
      .map(fileStatus -> fileStatus.getPath().toString())
      .collect(Collectors.toList());
}
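FSUtils.getFileStatusAtLevel is a Hudi-internal helper, so as a rough approximation of what the method above does, here is a plain-Hadoop sketch that lists files at a given depth below the base path and keeps only those with a data-file extension. It runs sequentially (no engine-context parallelism), and the depth semantics, the path, and the extension are assumptions made for illustration.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesAtLevelSketch {
  // Recursively list files under basePath and keep only those sitting exactly
  // `expectedLevel` levels below it (assumed semantics of expectedLevel).
  static List<String> listFilesAtLevel(Configuration conf, String basePathStr, int expectedLevel,
      String dataFileExtension) throws IOException {
    Path basePath = new Path(basePathStr);
    FileSystem fs = basePath.getFileSystem(conf);
    int baseDepth = basePath.depth();
    List<String> files = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> itr = fs.listFiles(basePath, true);
    while (itr.hasNext()) {
      LocatedFileStatus status = itr.next();
      int level = status.getPath().depth() - baseDepth;
      if (level == expectedLevel && status.getPath().getName().endsWith(dataFileExtension)) {
        files.add(status.getPath().toString());
      }
    }
    return files;
  }

  public static void main(String[] args) throws IOException {
    // Placeholder base path and a Parquet extension as the "data file" check.
    List<String> dataFiles = listFilesAtLevel(new Configuration(), "/tmp/hudi_table", 3, ".parquet");
    dataFiles.forEach(System.out::println);
  }
}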