
Example 41 with HoodieTableFileSystemView

use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

The class HoodieInputFormatUtils, method filterIncrementalFileStatus:

/**
 * Filters a list of FileStatus down to the latest base files written by the given commits,
 * for the incremental view.
 *
 * @param job the MapReduce job whose configuration is used to refresh file statuses
 * @param tableMetaClient meta client for the Hudi table being queried
 * @param timeline timeline backing the file system view
 * @param fileStatuses candidate file statuses to filter
 * @param commitsToCheck commits whose latest base files should be returned
 * @return the filtered list of FileStatus for the incremental view
 * @throws IOException if refreshing a file status fails
 */
public static List<FileStatus> filterIncrementalFileStatus(Job job, HoodieTableMetaClient tableMetaClient, HoodieTimeline timeline, FileStatus[] fileStatuses, List<HoodieInstant> commitsToCheck) throws IOException {
    TableFileSystemView.BaseFileOnlyView roView = new HoodieTableFileSystemView(tableMetaClient, timeline, fileStatuses);
    List<String> commitsList = commitsToCheck.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    List<HoodieBaseFile> filteredFiles = roView.getLatestBaseFilesInRange(commitsList).collect(Collectors.toList());
    List<FileStatus> returns = new ArrayList<>();
    for (HoodieBaseFile filteredFile : filteredFiles) {
        LOG.debug("Processing incremental hoodie file - " + filteredFile.getPath());
        filteredFile = refreshFileStatus(job.getConfiguration(), filteredFile);
        returns.add(getFileStatus(filteredFile));
    }
    LOG.info("Total paths to process after hoodie incremental filter " + filteredFiles.size());
    return returns;
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ArrayList(java.util.ArrayList) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView)
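
Below is a minimal sketch, not taken from the Hudi sources, of how this utility might be driven end to end: build a meta client, derive the completed commits after a begin instant, list a partition directory, and pass everything to filterIncrementalFileStatus. The base path, begin instant time, partition directory, and the IncrementalFilterSketch/listIncrementalFiles names are hypothetical placeholders, and getInstants() is assumed to return a Stream as in the Hudi version these examples come from.

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;

public class IncrementalFilterSketch {

    // Lists a (hypothetical) partition directory and keeps only the latest base files
    // written by commits after a given begin instant.
    public static List<FileStatus> listIncrementalFiles(Job job, FileSystem fs) throws IOException {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
            .setConf(job.getConfiguration())
            .setBasePath("/tmp/hudi_table")                 // hypothetical base path
            .setLoadActiveTimelineOnLoad(true)
            .build();
        HoodieTimeline commitTimeline = metaClient.getActiveTimeline()
            .getCommitsTimeline()
            .filterCompletedInstants();
        // Completed commits after a (hypothetical) last-consumed instant time.
        List<HoodieInstant> commitsToCheck = commitTimeline
            .findInstantsAfter("20220101000000", Integer.MAX_VALUE)
            .getInstants()                                  // Stream<HoodieInstant> in this Hudi version
            .collect(Collectors.toList());
        FileStatus[] statuses = fs.listStatus(new Path("/tmp/hudi_table/2022/01/01")); // hypothetical partition
        return HoodieInputFormatUtils.filterIncrementalFileStatus(
            job, metaClient, commitTimeline, statuses, commitsToCheck);
    }
}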

Example 42 with HoodieTableFileSystemView

use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

The class HoodieROTablePathFilter, method accept:

@Override
public boolean accept(Path path) {
    if (engineContext == null) {
        this.engineContext = new HoodieLocalEngineContext(this.conf.get());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Checking acceptance for path " + path);
    }
    Path folder = null;
    try {
        if (fs == null) {
            fs = path.getFileSystem(conf.get());
        }
        // Assumes path is a file
        // get the immediate parent.
        folder = path.getParent();
        // Try to use the caches.
        if (nonHoodiePathCache.contains(folder.toString())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Accepting non-hoodie path from cache: " + path);
            }
            return true;
        }
        if (hoodiePathCache.containsKey(folder.toString())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n", path, hoodiePathCache.get(folder.toString()).contains(path)));
            }
            return hoodiePathCache.get(folder.toString()).contains(path);
        }
        // Skip all files that are descendants of .hoodie in its path.
        String filePath = path.toString();
        if (filePath.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/") || filePath.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Skipping Hoodie Metadata file  %s \n", filePath));
            }
            return false;
        }
        // Perform actual checking.
        Path baseDir;
        if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) {
            HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder);
            metadata.readFromFS();
            baseDir = HoodieHiveUtils.getNthParent(folder, metadata.getPartitionDepth());
        } else {
            baseDir = safeGetParentsParent(folder);
        }
        if (baseDir != null) {
            // Check whether baseDir in nonHoodiePathCache
            if (nonHoodiePathCache.contains(baseDir.toString())) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Accepting non-hoodie path from cache: " + path);
                }
                return true;
            }
            HoodieTableFileSystemView fsView = null;
            try {
                HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString());
                if (null == metaClient) {
                    metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir.toString()).setLoadActiveTimelineOnLoad(true).build();
                    metaClientCache.put(baseDir.toString(), metaClient);
                }
                fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf()));
                String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder);
                List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());
                // populate the cache
                if (!hoodiePathCache.containsKey(folder.toString())) {
                    hoodiePathCache.put(folder.toString(), new HashSet<>());
                }
                LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + ", caching " + latestFiles.size() + " files under " + folder);
                for (HoodieBaseFile lfile : latestFiles) {
                    hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath()));
                }
                // accept the path, if its among the latest files.
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("%s checked after cache population, accept => %s \n", path, hoodiePathCache.get(folder.toString()).contains(path)));
                }
                return hoodiePathCache.get(folder.toString()).contains(path);
            } catch (TableNotFoundException e) {
                // Non-hoodie path, accept it.
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("(1) Caching non-hoodie path under %s with basePath %s \n", folder.toString(), baseDir.toString()));
                }
                nonHoodiePathCache.add(folder.toString());
                nonHoodiePathCache.add(baseDir.toString());
                return true;
            } finally {
                if (fsView != null) {
                    fsView.close();
                }
            }
        } else {
            // files is at < 3 level depth in FS tree, can't be hoodie dataset
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folder.toString()));
            }
            nonHoodiePathCache.add(folder.toString());
            return true;
        }
    } catch (Exception e) {
        String msg = "Error checking path :" + path + ", under folder: " + folder;
        LOG.error(msg, e);
        throw new HoodieException(msg, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) TableNotFoundException(org.apache.hudi.exception.TableNotFoundException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)
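
A short sketch, not from the Hudi sources, of how this filter might be exercised: either registered through the standard Hadoop mapreduce.input.pathFilter.class property so that plain Parquet reads over a copy-on-write table see only the latest base files, or called directly via accept(). The PathFilterSketch class name and the table/file paths are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hudi.hadoop.HoodieROTablePathFilter;

public class PathFilterSketch {

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Register the filter for jobs that list input paths through FileInputFormat.
        conf.setClass("mapreduce.input.pathFilter.class",
            HoodieROTablePathFilter.class, PathFilter.class);

        // Or call accept() directly; the filter is Configurable, so hand it the conf via setConf().
        HoodieROTablePathFilter filter = new HoodieROTablePathFilter();
        filter.setConf(conf);
        boolean visible = filter.accept(
            new Path("/tmp/hudi_table/2022/01/01/abc123_0-1-0_20220101000000.parquet")); // hypothetical file
        System.out.println("Latest base file? " + visible);
    }
}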

Aggregations

HoodieTableFileSystemView (org.apache.hudi.common.table.view.HoodieTableFileSystemView): 42 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 29 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 25 usages
Path (org.apache.hadoop.fs.Path): 24 usages
IOException (java.io.IOException): 22 usages
ArrayList (java.util.ArrayList): 22 usages
FileSlice (org.apache.hudi.common.model.FileSlice): 22 usages
List (java.util.List): 21 usages
Collectors (java.util.stream.Collectors): 20 usages
Option (org.apache.hudi.common.util.Option): 20 usages
Map (java.util.Map): 19 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 19 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 18 usages
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 17 usages
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 16 usages
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 16 usages
HoodieException (org.apache.hudi.exception.HoodieException): 15 usages
Stream (java.util.stream.Stream): 14 usages
Test (org.junit.jupiter.api.Test): 13 usages
HashMap (java.util.HashMap): 12 usages