Example usage of org.apache.hudi.common.table.view.HoodieTableFileSystemView in the Apache Hudi project.
The following is the filterIncrementalFileStatus method of the HoodieInputFormatUtils class.
/**
 * Filters the given file statuses down to the latest base files written by the commits in
 * {@code commitsToCheck}, for serving an incremental view of the table.
 *
 * @param job             Hadoop job whose configuration is used when refreshing file statuses
 * @param tableMetaClient meta client of the Hoodie table being queried
 * @param timeline        timeline backing the file-system view
 * @param fileStatuses    candidate file statuses to filter
 * @param commitsToCheck  instants whose data files should be returned
 * @return file statuses of the latest base files belonging to the given commits
 * @throws IOException if refreshing a file's status from storage fails
 */
public static List<FileStatus> filterIncrementalFileStatus(Job job, HoodieTableMetaClient tableMetaClient,
    HoodieTimeline timeline, FileStatus[] fileStatuses, List<HoodieInstant> commitsToCheck) throws IOException {
  TableFileSystemView.BaseFileOnlyView roView =
      new HoodieTableFileSystemView(tableMetaClient, timeline, fileStatuses);
  // Timestamps of the commits the caller wants data for.
  List<String> commitTimestamps = commitsToCheck.stream()
      .map(HoodieInstant::getTimestamp)
      .collect(Collectors.toList());
  List<HoodieBaseFile> latestBaseFiles =
      roView.getLatestBaseFilesInRange(commitTimestamps).collect(Collectors.toList());
  List<FileStatus> result = new ArrayList<>(latestBaseFiles.size());
  for (HoodieBaseFile baseFile : latestBaseFiles) {
    LOG.debug("Processing incremental hoodie file - " + baseFile.getPath());
    // Re-resolve the status so the returned entry reflects the file currently on storage.
    HoodieBaseFile refreshed = refreshFileStatus(job.getConfiguration(), baseFile);
    result.add(getFileStatus(refreshed));
  }
  LOG.info("Total paths to process after hoodie incremental filter " + latestBaseFiles.size());
  return result;
}
Example usage of org.apache.hudi.common.table.view.HoodieTableFileSystemView in the Apache Hudi project.
The following is the accept method of the HoodieROTablePathFilter class.
/**
 * Accepts {@code path} iff it is either (a) not under a Hoodie table at all, or (b) one of the
 * latest base files of its Hoodie table partition. Results are memoized per parent folder in
 * {@code hoodiePathCache} / {@code nonHoodiePathCache} so the table is not re-listed for every
 * file in the same directory.
 */
@Override
public boolean accept(Path path) {
  // Lazily create the engine context on first use — NOTE(review): assumes conf has been
  // injected before accept() is called; confirm against the filter's lifecycle.
  if (engineContext == null) {
    this.engineContext = new HoodieLocalEngineContext(this.conf.get());
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Checking acceptance for path " + path);
  }
  Path folder = null;
  try {
    // Lazily resolve the filesystem from the first path seen.
    if (fs == null) {
      fs = path.getFileSystem(conf.get());
    }
    // Assumes path is a file; get the immediate parent.
    folder = path.getParent();
    // Try to use the caches: a folder already known to be non-hoodie accepts everything.
    if (nonHoodiePathCache.contains(folder.toString())) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Accepting non-hoodie path from cache: " + path);
      }
      return true;
    }
    // A folder already known to be hoodie accepts only its cached latest base files.
    if (hoodiePathCache.containsKey(folder.toString())) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n",
            path, hoodiePathCache.get(folder.toString()).contains(path)));
      }
      return hoodiePathCache.get(folder.toString()).contains(path);
    }
    // Skip all files that are descendants of .hoodie in its path.
    String filePath = path.toString();
    if (filePath.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/")
        || filePath.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME)) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Skipping Hoodie Metadata file %s \n", filePath));
      }
      return false;
    }
    // Perform actual checking: locate the table base dir, either from partition metadata
    // (which records the partition depth) or by assuming a two-level partition layout.
    Path baseDir;
    if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) {
      HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder);
      metadata.readFromFS();
      baseDir = HoodieHiveUtils.getNthParent(folder, metadata.getPartitionDepth());
    } else {
      baseDir = safeGetParentsParent(folder);
    }
    if (baseDir != null) {
      // Check whether baseDir is already known to be non-hoodie.
      if (nonHoodiePathCache.contains(baseDir.toString())) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Accepting non-hoodie path from cache: " + path);
        }
        return true;
      }
      HoodieTableFileSystemView fsView = null;
      try {
        // Build (or reuse) a meta client for the candidate table base path; a
        // TableNotFoundException here means the folder is not a Hoodie table.
        HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString());
        if (null == metaClient) {
          metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf())
              .setBasePath(baseDir.toString()).setLoadActiveTimelineOnLoad(true).build();
          metaClientCache.put(baseDir.toString(), metaClient);
        }
        fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, metaClient,
            HoodieInputFormatUtils.buildMetadataConfig(getConf()));
        String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder);
        List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());
        // populate the cache with every latest base file under this folder
        if (!hoodiePathCache.containsKey(folder.toString())) {
          hoodiePathCache.put(folder.toString(), new HashSet<>());
        }
        LOG.info("Based on hoodie metadata from base path: " + baseDir.toString()
            + ", caching " + latestFiles.size() + " files under " + folder);
        for (HoodieBaseFile lfile : latestFiles) {
          hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath()));
        }
        // accept the path, if its among the latest files.
        if (LOG.isDebugEnabled()) {
          LOG.debug(String.format("%s checked after cache population, accept => %s \n",
              path, hoodiePathCache.get(folder.toString()).contains(path)));
        }
        return hoodiePathCache.get(folder.toString()).contains(path);
      } catch (TableNotFoundException e) {
        // Non-hoodie path, accept it; remember both folder and baseDir so neither is re-probed.
        if (LOG.isDebugEnabled()) {
          LOG.debug(String.format("(1) Caching non-hoodie path under %s with basePath %s \n",
              folder.toString(), baseDir.toString()));
        }
        nonHoodiePathCache.add(folder.toString());
        nonHoodiePathCache.add(baseDir.toString());
        return true;
      } finally {
        // Always release the view's resources, whether or not the table lookup succeeded.
        if (fsView != null) {
          fsView.close();
        }
      }
    } else {
      // files is at < 3 level depth in FS tree, can't be hoodie dataset
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folder.toString()));
      }
      nonHoodiePathCache.add(folder.toString());
      return true;
    }
  } catch (Exception e) {
    // Any unexpected failure is fatal for the query, not silently swallowed.
    String msg = "Error checking path :" + path + ", under folder: " + folder;
    LOG.error(msg, e);
    throw new HoodieException(msg, e);
  }
}
Aggregations