Example 16 with HoodieLocalEngineContext

Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in the Apache Hudi project.

From the class MetadataCommand, the method listFiles:

@CliCommand(value = "metadata list-files", help = "Print a list of all files in a partition from the metadata")
public String listFiles(@CliOption(key = { "partition" }, help = "Name of the partition to list files", mandatory = true) final String partition) throws IOException {
    HoodieCLI.getTableMetaClient();
    HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
    // The last argument ("/tmp" here) is the spillable map directory used by the metadata reader.
    HoodieBackedTableMetadata metaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
    if (!metaReader.enabled()) {
        return "[ERROR] Metadata Table not enabled/initialized\n\n";
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    FileStatus[] statuses = metaReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
    LOG.debug("Took " + timer.endTimer() + " ms");
    final List<Comparable[]> rows = new ArrayList<>();
    // Sort by file name, descending; FileStatus is Comparable, so each status can back a one-column row.
    Arrays.stream(statuses).sorted((p1, p2) -> p2.getPath().getName().compareTo(p1.getPath().getName())).forEach(f -> {
        Comparable[] row = new Comparable[1];
        row[0] = f;
        rows.add(row);
    });
    TableHeader header = new TableHeader().addTableHeaderField("file path");
    return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
Also used : Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) HoodieBackedTableMetadata(org.apache.hudi.metadata.HoodieBackedTableMetadata) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Map(java.util.Map) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) CommandMarker(org.springframework.shell.core.CommandMarker) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) CliCommand(org.springframework.shell.core.annotation.CliCommand) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) TableHeader(org.apache.hudi.cli.TableHeader) Set(java.util.Set) IOException(java.io.IOException) SparkUtil(org.apache.hudi.cli.utils.SparkUtil) FileNotFoundException(java.io.FileNotFoundException) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) List(java.util.List) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections)
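
In the Spring Shell based hudi-cli, the command registered above takes its mandatory option by key, so the invocation looks like this (the partition value is a hypothetical example):

metadata list-files --partition 2020/01/01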

Example 17 with HoodieLocalEngineContext

Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in the Apache Hudi project.

From the class MetadataCommand, the method stats:

@CliCommand(value = "metadata stats", help = "Print stats about the metadata")
public String stats() throws IOException {
    HoodieCLI.getTableMetaClient();
    HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
    HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
    Map<String, String> stats = metadata.stats();
    final List<Comparable[]> rows = new ArrayList<>();
    for (Map.Entry<String, String> entry : stats.entrySet()) {
        Comparable[] row = new Comparable[2];
        row[0] = entry.getKey();
        row[1] = entry.getValue();
        rows.add(row);
    }
    TableHeader header = new TableHeader().addTableHeaderField("stat key").addTableHeaderField("stat value");
    return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
Also used : TableHeader(org.apache.hudi.cli.TableHeader) ArrayList(java.util.ArrayList) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieBackedTableMetadata(org.apache.hudi.metadata.HoodieBackedTableMetadata) HashMap(java.util.HashMap) Map(java.util.Map) CliCommand(org.springframework.shell.core.annotation.CliCommand)
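
Since stats() declares no options, the hudi-cli invocation is simply:

metadata stats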

Example 18 with HoodieLocalEngineContext

Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in the Apache Hudi project.

From the class MetadataCommand, the method validateFiles:

@CliCommand(value = "metadata validate-files", help = "Validate all files in all partitions from the metadata")
public String validateFiles(@CliOption(key = { "verbose" }, help = "Print all file details", unspecifiedDefaultValue = "false") final boolean verbose) throws IOException {
    HoodieCLI.getTableMetaClient();
    HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
    HoodieBackedTableMetadata metadataReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
    if (!metadataReader.enabled()) {
        return "[ERROR] Metadata Table not enabled/initialized\n\n";
    }
    HoodieMetadataConfig fsConfig = HoodieMetadataConfig.newBuilder().enable(false).build();
    HoodieBackedTableMetadata fsMetaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), fsConfig, HoodieCLI.basePath, "/tmp");
    HoodieTimer timer = new HoodieTimer().startTimer();
    List<String> metadataPartitions = metadataReader.getAllPartitionPaths();
    LOG.debug("Listing partitions Took " + timer.endTimer() + " ms");
    List<String> fsPartitions = fsMetaReader.getAllPartitionPaths();
    Collections.sort(fsPartitions);
    Collections.sort(metadataPartitions);
    Set<String> allPartitions = new HashSet<>();
    allPartitions.addAll(fsPartitions);
    allPartitions.addAll(metadataPartitions);
    if (!fsPartitions.equals(metadataPartitions)) {
        LOG.error("FS partition listing is not matching with metadata partition listing!");
        LOG.error("All FS partitions: " + Arrays.toString(fsPartitions.toArray()));
        LOG.error("All Metadata partitions: " + Arrays.toString(metadataPartitions.toArray()));
    }
    final List<Comparable[]> rows = new ArrayList<>();
    for (String partition : allPartitions) {
        Map<String, FileStatus> fileStatusMap = new HashMap<>();
        Map<String, FileStatus> metadataFileStatusMap = new HashMap<>();
        FileStatus[] metadataStatuses = metadataReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
        Arrays.stream(metadataStatuses).forEach(entry -> metadataFileStatusMap.put(entry.getPath().getName(), entry));
        FileStatus[] fsStatuses = fsMetaReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
        Arrays.stream(fsStatuses).forEach(entry -> fileStatusMap.put(entry.getPath().getName(), entry));
        Set<String> allFiles = new HashSet<>();
        allFiles.addAll(fileStatusMap.keySet());
        allFiles.addAll(metadataFileStatusMap.keySet());
        for (String file : allFiles) {
            Comparable[] row = new Comparable[6];
            row[0] = partition;
            FileStatus fsFileStatus = fileStatusMap.get(file);
            FileStatus metaFileStatus = metadataFileStatusMap.get(file);
            boolean doesFsFileExists = fsFileStatus != null;
            boolean doesMetadataFileExists = metaFileStatus != null;
            long fsFileLength = doesFsFileExists ? fsFileStatus.getLen() : 0;
            long metadataFileLength = doesMetadataFileExists ? metaFileStatus.getLen() : 0;
            row[1] = file;
            row[2] = doesFsFileExists;
            row[3] = doesMetadataFileExists;
            row[4] = fsFileLength;
            row[5] = metadataFileLength;
            if (verbose) {
                // if verbose, print all files
                rows.add(row);
            } else if ((doesFsFileExists != doesMetadataFileExists) || (fsFileLength != metadataFileLength)) {
                // if not verbose, print only mismatching files
                rows.add(row);
            }
        }
        if (metadataStatuses.length != fsStatuses.length) {
            LOG.error(" FS and metadata files count not matching for " + partition + ". FS files count " + fsStatuses.length + ", metadata base files count " + metadataStatuses.length);
        }
        for (Map.Entry<String, FileStatus> entry : fileStatusMap.entrySet()) {
            if (!metadataFileStatusMap.containsKey(entry.getKey())) {
                LOG.error("FS file not found in metadata " + entry.getKey());
            } else {
                if (entry.getValue().getLen() != metadataFileStatusMap.get(entry.getKey()).getLen()) {
                    LOG.error(" FS file size mismatch " + entry.getKey() + ", size equality " + (entry.getValue().getLen() == metadataFileStatusMap.get(entry.getKey()).getLen()) + ". FS size " + entry.getValue().getLen() + ", metadata size " + metadataFileStatusMap.get(entry.getKey()).getLen());
                }
            }
        }
        for (Map.Entry<String, FileStatus> entry : metadataFileStatusMap.entrySet()) {
            if (!fileStatusMap.containsKey(entry.getKey())) {
                LOG.error("Metadata file not found in FS " + entry.getKey());
            } else {
                if (entry.getValue().getLen() != fileStatusMap.get(entry.getKey()).getLen()) {
                    LOG.error(" Metadata file size mismatch " + entry.getKey() + ", size equality " + (entry.getValue().getLen() == fileStatusMap.get(entry.getKey()).getLen()) + ". Metadata size " + entry.getValue().getLen() + ", FS size " + metadataFileStatusMap.get(entry.getKey()).getLen());
                }
            }
        }
    }
    TableHeader header = new TableHeader().addTableHeaderField("Partition").addTableHeaderField("File Name").addTableHeaderField("Is Present in FS").addTableHeaderField("Is Present in Metadata").addTableHeaderField("FS size").addTableHeaderField("Metadata size");
    return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) TableHeader(org.apache.hudi.cli.TableHeader) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieBackedTableMetadata(org.apache.hudi.metadata.HoodieBackedTableMetadata) Map(java.util.Map) HashSet(java.util.HashSet) CliCommand(org.springframework.shell.core.annotation.CliCommand)
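
From hudi-cli, the optional verbose flag (defaulting to false via unspecifiedDefaultValue) switches the report from mismatching files only to all files:

metadata validate-files --verbose true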

Example 19 with HoodieLocalEngineContext

Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in the Apache Hudi project.

From the class HoodieCopyOnWriteTableInputFormat, the method listStatusForSnapshotMode:

@Nonnull
private List<FileStatus> listStatusForSnapshotMode(JobConf job, Map<String, HoodieTableMetaClient> tableMetaClientMap, List<Path> snapshotPaths) throws IOException {
    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(job);
    List<FileStatus> targetFiles = new ArrayList<>();
    TypedProperties props = new TypedProperties(new Properties());
    Map<HoodieTableMetaClient, List<Path>> groupedPaths = HoodieInputFormatUtils.groupSnapshotPathsByMetaClient(tableMetaClientMap.values(), snapshotPaths);
    for (Map.Entry<HoodieTableMetaClient, List<Path>> entry : groupedPaths.entrySet()) {
        HoodieTableMetaClient tableMetaClient = entry.getKey();
        List<Path> partitionPaths = entry.getValue();
        // A Hive job might specify a max commit instant up to which the table's state
        // should be examined. We simply pass it as the query instant to the file index.
        Option<String> queryCommitInstant = HoodieHiveUtils.getMaxCommit(job, tableMetaClient.getTableConfig().getTableName());
        boolean shouldIncludePendingCommits = HoodieHiveUtils.shouldIncludePendingCommits(job, tableMetaClient.getTableConfig().getTableName());
        HiveHoodieTableFileIndex fileIndex = new HiveHoodieTableFileIndex(engineContext, tableMetaClient, props, HoodieTableQueryType.SNAPSHOT, partitionPaths, queryCommitInstant, shouldIncludePendingCommits);
        Map<String, List<FileSlice>> partitionedFileSlices = fileIndex.listFileSlices();
        Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient);
        targetFiles.addAll(partitionedFileSlices.values().stream().flatMap(Collection::stream).map(fileSlice -> createFileStatusUnchecked(fileSlice, fileIndex, virtualKeyInfoOpt)).collect(Collectors.toList()));
    }
    return targetFiles;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) TypedProperties(org.apache.hudi.common.config.TypedProperties) Properties(java.util.Properties) HoodieVirtualKeyInfo(org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) List(java.util.List) Map(java.util.Map) Nonnull(javax.annotation.Nonnull)
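
The construction at the top of this method is the pattern every example on this page shares: wrap a plain Hadoop Configuration (a JobConf is one subclass) in a HoodieLocalEngineContext so that engine-agnostic Hudi components can run without a Spark context. A minimal standalone sketch of that pattern; the class name and the base path "/tmp/hoodie_table" are hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;

public class LocalEngineContextSketch {
    public static void main(String[] args) {
        // Any Hadoop Configuration works here; the example above passes the Hive JobConf.
        Configuration conf = new Configuration();
        HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(conf);
        // "/tmp/hoodie_table" is a hypothetical base path; it must point at an existing Hudi table.
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
                .setConf(conf)
                .setBasePath("/tmp/hoodie_table")
                .build();
        // engineContext and metaClient can now feed components such as the HiveHoodieTableFileIndex above.
        System.out.println("Table name: " + metaClient.getTableConfig().getTableName());
    }
}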

Example 20 with HoodieLocalEngineContext

Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in the Apache Hudi project.

From the class HoodieROTablePathFilter, the method accept:

@Override
public boolean accept(Path path) {
    if (engineContext == null) {
        this.engineContext = new HoodieLocalEngineContext(this.conf.get());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Checking acceptance for path " + path);
    }
    Path folder = null;
    try {
        if (fs == null) {
            fs = path.getFileSystem(conf.get());
        }
        // Assumes path is a file
        // get the immediate parent.
        folder = path.getParent();
        // Try to use the caches.
        if (nonHoodiePathCache.contains(folder.toString())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Accepting non-hoodie path from cache: " + path);
            }
            return true;
        }
        if (hoodiePathCache.containsKey(folder.toString())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n", path, hoodiePathCache.get(folder.toString()).contains(path)));
            }
            return hoodiePathCache.get(folder.toString()).contains(path);
        }
        // Skip all files that are descendants of .hoodie in its path.
        String filePath = path.toString();
        if (filePath.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/") || filePath.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Skipping Hoodie Metadata file  %s \n", filePath));
            }
            return false;
        }
        // Perform actual checking.
        Path baseDir;
        if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) {
            HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder);
            metadata.readFromFS();
            baseDir = HoodieHiveUtils.getNthParent(folder, metadata.getPartitionDepth());
        } else {
            baseDir = safeGetParentsParent(folder);
        }
        if (baseDir != null) {
            // Check whether baseDir in nonHoodiePathCache
            if (nonHoodiePathCache.contains(baseDir.toString())) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Accepting non-hoodie path from cache: " + path);
                }
                return true;
            }
            HoodieTableFileSystemView fsView = null;
            try {
                HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString());
                if (null == metaClient) {
                    metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir.toString()).setLoadActiveTimelineOnLoad(true).build();
                    metaClientCache.put(baseDir.toString(), metaClient);
                }
                fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf()));
                String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder);
                List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());
                // populate the cache
                if (!hoodiePathCache.containsKey(folder.toString())) {
                    hoodiePathCache.put(folder.toString(), new HashSet<>());
                }
                LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + ", caching " + latestFiles.size() + " files under " + folder);
                for (HoodieBaseFile lfile : latestFiles) {
                    hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath()));
                }
                // accept the path, if its among the latest files.
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("%s checked after cache population, accept => %s \n", path, hoodiePathCache.get(folder.toString()).contains(path)));
                }
                return hoodiePathCache.get(folder.toString()).contains(path);
            } catch (TableNotFoundException e) {
                // Non-hoodie path, accept it.
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("(1) Caching non-hoodie path under %s with basePath %s \n", folder.toString(), baseDir.toString()));
                }
                nonHoodiePathCache.add(folder.toString());
                nonHoodiePathCache.add(baseDir.toString());
                return true;
            } finally {
                if (fsView != null) {
                    fsView.close();
                }
            }
        } else {
            // the file sits fewer than 3 levels deep in the FS tree, so it cannot belong to a Hudi dataset
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folder.toString()));
            }
            nonHoodiePathCache.add(folder.toString());
            return true;
        }
    } catch (Exception e) {
        String msg = "Error checking path :" + path + ", under folder: " + folder;
        LOG.error(msg, e);
        throw new HoodieException(msg, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) TableNotFoundException(org.apache.hudi.exception.TableNotFoundException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)
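
Because HoodieROTablePathFilter implements Hadoop's PathFilter contract (the accept method above), it can be handed straight to FileSystem.listStatus to prune a listing down to each file group's latest base files. A minimal usage sketch, assuming a hypothetical table and partition path and the Configuration-taking constructor of this Hudi version:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.hadoop.HoodieROTablePathFilter;

public class ReadOptimizedListingSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical partition path inside a Hudi table.
        Path partition = new Path("/tmp/hoodie_table/2020/01/01");
        FileSystem fs = partition.getFileSystem(conf);
        // listStatus calls accept(path) for each child, so only the latest base files survive.
        FileStatus[] latestOnly = fs.listStatus(partition, new HoodieROTablePathFilter(conf));
        for (FileStatus status : latestOnly) {
            System.out.println(status.getPath());
        }
    }
}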

Aggregations

HoodieLocalEngineContext (org.apache.hudi.common.engine.HoodieLocalEngineContext): 20
Path (org.apache.hadoop.fs.Path): 15
Map (java.util.Map): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
Test (org.junit.jupiter.api.Test): 10
IOException (java.io.IOException): 9
List (java.util.List): 9
SerializableConfiguration (org.apache.hudi.common.config.SerializableConfiguration): 8
Arrays (java.util.Arrays): 7
Collections (java.util.Collections): 7
HoodieMetadataConfig (org.apache.hudi.common.config.HoodieMetadataConfig): 7
BeforeEach (org.junit.jupiter.api.BeforeEach): 7
ArrayList (java.util.ArrayList): 6
Collectors (java.util.stream.Collectors): 6
HoodieCommonTestHarness (org.apache.hudi.common.testutils.HoodieCommonTestHarness): 6
IntStream (java.util.stream.IntStream): 5
HoodieTestTable (org.apache.hudi.common.testutils.HoodieTestTable): 5
AfterEach (org.junit.jupiter.api.AfterEach): 5
Assertions (org.junit.jupiter.api.Assertions): 5
Configuration (org.apache.hadoop.conf.Configuration): 4