Search in sources :

Example 71 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in the Apache Hudi project.

The example is the accept method of the class HoodieROTablePathFilter.

/**
 * Decides whether the given path passes this filter.
 *
 * <p>The path is treated as a file and its immediate parent as the folder to inspect.
 * Outcomes, in the order they are evaluated:
 * <ul>
 *   <li>{@code true} if the folder is already recorded in {@code nonHoodiePathCache};</li>
 *   <li>the cached membership result if the folder is already present in {@code hoodiePathCache};</li>
 *   <li>{@code false} if the path lies inside (or ends with) the Hoodie meta folder;</li>
 *   <li>{@code true} if no base directory can be derived for the folder, or the derived base
 *       directory is in {@code nonHoodiePathCache}, or a {@link TableNotFoundException} is raised
 *       while resolving the table (the folder/base directory are then remembered as non-hoodie);</li>
 *   <li>otherwise, whether the path is one of the latest base files that the file system view
 *       reports for the folder's partition; that set is stored in {@code hoodiePathCache}.</li>
 * </ul>
 *
 * @param path the path to check
 * @return {@code true} if the path is accepted, {@code false} otherwise
 * @throws HoodieException wrapping any exception raised while checking the path
 */
@Override
public boolean accept(Path path) {
    if (engineContext == null) {
        this.engineContext = new HoodieLocalEngineContext(this.conf.get());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Checking acceptance for path " + path);
    }
    Path folder = null;
    try {
        if (fs == null) {
            fs = path.getFileSystem(conf.get());
        }
        // The path is assumed to be a file, so its immediate parent is the folder of interest.
        folder = path.getParent();
        final String folderKey = folder.toString();

        // Fast path 1: folder already known to be outside any hoodie table.
        if (nonHoodiePathCache.contains(folderKey)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Accepting non-hoodie path from cache: " + path);
            }
            return true;
        }

        // Fast path 2: latest files for this folder were already computed.
        if (hoodiePathCache.containsKey(folderKey)) {
            final boolean cachedVerdict = hoodiePathCache.get(folderKey).contains(path);
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n", path, cachedVerdict));
            }
            return cachedVerdict;
        }

        // Anything that lives under the .hoodie meta folder is rejected.
        final String filePath = path.toString();
        final String metaFolderSegment = "/" + HoodieTableMetaClient.METAFOLDER_NAME;
        if (filePath.contains(metaFolderSegment + "/") || filePath.endsWith(metaFolderSegment)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Skipping Hoodie Metadata file  %s \n", filePath));
            }
            return false;
        }

        // Slow path: work out the candidate table base directory for this folder.
        final Path baseDir;
        if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) {
            HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, folder);
            partitionMetadata.readFromFS();
            baseDir = HoodieHiveUtils.getNthParent(folder, partitionMetadata.getPartitionDepth());
        } else {
            baseDir = safeGetParentsParent(folder);
        }

        if (baseDir == null) {
            // File sits at less than 3 levels of depth in the FS tree, so it can't be a hoodie dataset.
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folderKey));
            }
            nonHoodiePathCache.add(folderKey);
            return true;
        }

        final String baseKey = baseDir.toString();
        // The base directory itself may already be known as non-hoodie.
        if (nonHoodiePathCache.contains(baseKey)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Accepting non-hoodie path from cache: " + path);
            }
            return true;
        }

        HoodieTableFileSystemView fsView = null;
        try {
            // Reuse the meta client for this base directory when one was built earlier.
            HoodieTableMetaClient metaClient = metaClientCache.get(baseKey);
            if (metaClient == null) {
                metaClient = HoodieTableMetaClient.builder()
                        .setConf(fs.getConf())
                        .setBasePath(baseKey)
                        .setLoadActiveTimelineOnLoad(true)
                        .build();
                metaClientCache.put(baseKey, metaClient);
            }
            fsView = FileSystemViewManager.createInMemoryFileSystemView(
                    engineContext, metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf()));
            String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder);
            List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());

            // Make sure there is a cache entry for the folder, then fill it with the latest files.
            if (!hoodiePathCache.containsKey(folderKey)) {
                hoodiePathCache.put(folderKey, new HashSet<>());
            }
            LOG.info("Based on hoodie metadata from base path: " + baseKey + ", caching " + latestFiles.size() + " files under " + folder);
            for (HoodieBaseFile latest : latestFiles) {
                hoodiePathCache.get(folderKey).add(new Path(latest.getPath()));
            }

            // The path is accepted only when it is one of the latest files.
            final boolean verdict = hoodiePathCache.get(folderKey).contains(path);
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("%s checked after cache population, accept => %s \n", path, verdict));
            }
            return verdict;
        } catch (TableNotFoundException e) {
            // Not a hoodie table: remember both locations and accept the path.
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("(1) Caching non-hoodie path under %s with basePath %s \n", folderKey, baseKey));
            }
            nonHoodiePathCache.add(folderKey);
            nonHoodiePathCache.add(baseKey);
            return true;
        } finally {
            if (fsView != null) {
                fsView.close();
            }
        }
    } catch (Exception e) {
        String msg = "Error checking path :" + path + ", under folder: " + folder;
        LOG.error(msg, e);
        throw new HoodieException(msg, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) TableNotFoundException(org.apache.hudi.exception.TableNotFoundException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)71 Path (org.apache.hadoop.fs.Path)40 ArrayList (java.util.ArrayList)33 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)31 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)31 FileSlice (org.apache.hudi.common.model.FileSlice)29 List (java.util.List)27 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)27 IOException (java.io.IOException)26 FileStatus (org.apache.hadoop.fs.FileStatus)25 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)24 Pair (org.apache.hudi.common.util.collection.Pair)24 Option (org.apache.hudi.common.util.Option)23 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)23 Collectors (java.util.stream.Collectors)21 Test (org.junit.jupiter.api.Test)21 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)21 Map (java.util.Map)20 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)20 HoodieTable (org.apache.hudi.table.HoodieTable)20