Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in project hudi by apache.
From the class MetadataCommand, method listFiles.
@CliCommand(value = "metadata list-files", help = "Print a list of all files in a partition from the metadata")
public String listFiles(@CliOption(key = { "partition" }, help = "Name of the partition to list files", mandatory = true) final String partition) throws IOException {
HoodieCLI.getTableMetaClient();
HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
HoodieBackedTableMetadata metaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
if (!metaReader.enabled()) {
return "[ERROR] Metadata Table not enabled/initialized\n\n";
}
HoodieTimer timer = new HoodieTimer().startTimer();
FileStatus[] statuses = metaReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
LOG.debug("Took " + timer.endTimer() + " ms");
final List<Comparable[]> rows = new ArrayList<>();
Arrays.stream(statuses).sorted((p1, p2) -> p2.getPath().getName().compareTo(p1.getPath().getName())).forEach(f -> {
Comparable[] row = new Comparable[1];
row[0] = f;
rows.add(row);
});
TableHeader header = new TableHeader().addTableHeaderField("file path");
return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
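For orientation, the same reader API can be driven outside the CLI. The sketch below is hypothetical and reuses only the calls already visible above (the HoodieBackedTableMetadata constructor, enabled(), getAllFilesInPartition()); the Hadoop configuration, table path, and partition path are placeholders, and imports are omitted as in the snippets on this page.
// Hypothetical standalone usage, not part of MetadataCommand; paths and conf are placeholders.
static void listPartitionFilesFromMetadata() throws IOException {
  Configuration hadoopConf = new Configuration();
  HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(hadoopConf);
  HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).build();
  HoodieBackedTableMetadata metadata =
      new HoodieBackedTableMetadata(engineContext, metadataConfig, "/path/to/table", "/tmp");
  if (!metadata.enabled()) {
    return;
  }
  // List every file the metadata table knows about for a single partition.
  FileStatus[] files = metadata.getAllFilesInPartition(new Path("/path/to/table/2021/01/01"));
  for (FileStatus file : files) {
    System.out.println(file.getPath().getName() + " (" + file.getLen() + " bytes)");
  }
}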
Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in project hudi by apache.
From the class MetadataCommand, method stats.
@CliCommand(value = "metadata stats", help = "Print stats about the metadata")
public String stats() throws IOException {
HoodieCLI.getTableMetaClient();
HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
Map<String, String> stats = metadata.stats();
final List<Comparable[]> rows = new ArrayList<>();
for (Map.Entry<String, String> entry : stats.entrySet()) {
Comparable[] row = new Comparable[2];
row[0] = entry.getKey();
row[1] = entry.getValue();
rows.add(row);
}
TableHeader header = new TableHeader().addTableHeaderField("stat key").addTableHeaderField("stat value");
return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
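As a usage note, the stats map can also be dumped directly. The following hypothetical sketch again reuses only the calls shown above (stats() returns a Map<String, String>); hadoopConf and the table path are placeholders.
// Hypothetical sketch, not part of MetadataCommand; prints each metadata stat as "key = value".
HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(
    new HoodieLocalEngineContext(hadoopConf),
    HoodieMetadataConfig.newBuilder().enable(true).build(),
    "/path/to/table", "/tmp");
metadata.stats().forEach((key, value) -> System.out.println(key + " = " + value));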
Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in project hudi by apache.
From the class MetadataCommand, method validateFiles.
@CliCommand(value = "metadata validate-files", help = "Validate all files in all partitions from the metadata")
public String validateFiles(@CliOption(key = { "verbose" }, help = "Print all file details", unspecifiedDefaultValue = "false") final boolean verbose) throws IOException {
HoodieCLI.getTableMetaClient();
HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
HoodieBackedTableMetadata metadataReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp");
if (!metadataReader.enabled()) {
return "[ERROR] Metadata Table not enabled/initialized\n\n";
}
HoodieMetadataConfig fsConfig = HoodieMetadataConfig.newBuilder().enable(false).build();
HoodieBackedTableMetadata fsMetaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), fsConfig, HoodieCLI.basePath, "/tmp");
HoodieTimer timer = new HoodieTimer().startTimer();
List<String> metadataPartitions = metadataReader.getAllPartitionPaths();
LOG.debug("Listing partitions Took " + timer.endTimer() + " ms");
List<String> fsPartitions = fsMetaReader.getAllPartitionPaths();
Collections.sort(fsPartitions);
Collections.sort(metadataPartitions);
Set<String> allPartitions = new HashSet<>();
allPartitions.addAll(fsPartitions);
allPartitions.addAll(metadataPartitions);
if (!fsPartitions.equals(metadataPartitions)) {
LOG.error("FS partition listing is not matching with metadata partition listing!");
LOG.error("All FS partitions: " + Arrays.toString(fsPartitions.toArray()));
LOG.error("All Metadata partitions: " + Arrays.toString(metadataPartitions.toArray()));
}
final List<Comparable[]> rows = new ArrayList<>();
for (String partition : allPartitions) {
Map<String, FileStatus> fileStatusMap = new HashMap<>();
Map<String, FileStatus> metadataFileStatusMap = new HashMap<>();
FileStatus[] metadataStatuses = metadataReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
Arrays.stream(metadataStatuses).forEach(entry -> metadataFileStatusMap.put(entry.getPath().getName(), entry));
FileStatus[] fsStatuses = fsMetaReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
Arrays.stream(fsStatuses).forEach(entry -> fileStatusMap.put(entry.getPath().getName(), entry));
Set<String> allFiles = new HashSet<>();
allFiles.addAll(fileStatusMap.keySet());
allFiles.addAll(metadataFileStatusMap.keySet());
for (String file : allFiles) {
Comparable[] row = new Comparable[6];
row[0] = partition;
FileStatus fsFileStatus = fileStatusMap.get(file);
FileStatus metaFileStatus = metadataFileStatusMap.get(file);
boolean doesFsFileExists = fsFileStatus != null;
boolean doesMetadataFileExists = metaFileStatus != null;
long fsFileLength = doesFsFileExists ? fsFileStatus.getLen() : 0;
long metadataFileLength = doesMetadataFileExists ? metaFileStatus.getLen() : 0;
row[1] = file;
row[2] = doesFsFileExists;
row[3] = doesMetadataFileExists;
row[4] = fsFileLength;
row[5] = metadataFileLength;
if (verbose) {
// if verbose print all files
rows.add(row);
} else if ((doesFsFileExists != doesMetadataFileExists) || (fsFileLength != metadataFileLength)) {
// if non verbose, print only non matching files
rows.add(row);
}
}
if (metadataStatuses.length != fsStatuses.length) {
LOG.error(" FS and metadata files count not matching for " + partition + ". FS files count " + fsStatuses.length + ", metadata base files count " + metadataStatuses.length);
}
for (Map.Entry<String, FileStatus> entry : fileStatusMap.entrySet()) {
if (!metadataFileStatusMap.containsKey(entry.getKey())) {
LOG.error("FS file not found in metadata " + entry.getKey());
} else {
if (entry.getValue().getLen() != metadataFileStatusMap.get(entry.getKey()).getLen()) {
LOG.error(" FS file size mismatch " + entry.getKey() + ", size equality " + (entry.getValue().getLen() == metadataFileStatusMap.get(entry.getKey()).getLen()) + ". FS size " + entry.getValue().getLen() + ", metadata size " + metadataFileStatusMap.get(entry.getKey()).getLen());
}
}
}
for (Map.Entry<String, FileStatus> entry : metadataFileStatusMap.entrySet()) {
if (!fileStatusMap.containsKey(entry.getKey())) {
LOG.error("Metadata file not found in FS " + entry.getKey());
} else {
if (entry.getValue().getLen() != fileStatusMap.get(entry.getKey()).getLen()) {
LOG.error(" Metadata file size mismatch " + entry.getKey() + ", size equality " + (entry.getValue().getLen() == fileStatusMap.get(entry.getKey()).getLen()) + ". Metadata size " + entry.getValue().getLen() + ", FS size " + metadataFileStatusMap.get(entry.getKey()).getLen());
        }
      }
    }
  }

  TableHeader header = new TableHeader()
      .addTableHeaderField("Partition")
      .addTableHeaderField("File Name")
      .addTableHeaderField("Is Present in FS")
      .addTableHeaderField("Is Present in Metadata")
      .addTableHeaderField("FS size")
      .addTableHeaderField("Metadata size");
  return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
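The per-file check inside the loop above boils down to a single predicate: a file is consistent only when it exists in both listings and the reported lengths agree, with a missing file treated as length 0. A hypothetical helper capturing that rule (not part of MetadataCommand):
// Hypothetical helper mirroring the per-file comparison performed above.
static boolean isConsistent(FileStatus fsStatus, FileStatus metadataStatus) {
  boolean inFs = fsStatus != null;
  boolean inMetadata = metadataStatus != null;
  long fsLen = inFs ? fsStatus.getLen() : 0L;
  long metadataLen = inMetadata ? metadataStatus.getLen() : 0L;
  return inFs == inMetadata && fsLen == metadataLen;
}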
Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in project hudi by apache.
From the class HoodieCopyOnWriteTableInputFormat, method listStatusForSnapshotMode.
@Nonnull
private List<FileStatus> listStatusForSnapshotMode(JobConf job, Map<String, HoodieTableMetaClient> tableMetaClientMap, List<Path> snapshotPaths) throws IOException {
  HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(job);
  List<FileStatus> targetFiles = new ArrayList<>();
  TypedProperties props = new TypedProperties(new Properties());

  Map<HoodieTableMetaClient, List<Path>> groupedPaths =
      HoodieInputFormatUtils.groupSnapshotPathsByMetaClient(tableMetaClientMap.values(), snapshotPaths);

  for (Map.Entry<HoodieTableMetaClient, List<Path>> entry : groupedPaths.entrySet()) {
    HoodieTableMetaClient tableMetaClient = entry.getKey();
    List<Path> partitionPaths = entry.getValue();

    // Hive job might specify a max commit instant up to which table's state
    // should be examined. We simply pass it as query's instant to the file-index
    Option<String> queryCommitInstant = HoodieHiveUtils.getMaxCommit(job, tableMetaClient.getTableConfig().getTableName());
    boolean shouldIncludePendingCommits = HoodieHiveUtils.shouldIncludePendingCommits(job, tableMetaClient.getTableConfig().getTableName());

    HiveHoodieTableFileIndex fileIndex = new HiveHoodieTableFileIndex(
        engineContext, tableMetaClient, props, HoodieTableQueryType.SNAPSHOT,
        partitionPaths, queryCommitInstant, shouldIncludePendingCommits);

    Map<String, List<FileSlice>> partitionedFileSlices = fileIndex.listFileSlices();
    Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient);

    targetFiles.addAll(partitionedFileSlices.values().stream()
        .flatMap(Collection::stream)
        .map(fileSlice -> createFileStatusUnchecked(fileSlice, fileIndex, virtualKeyInfoOpt))
        .collect(Collectors.toList()));
  }
  return targetFiles;
}
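For reference, the file index can be constructed the same way outside the input format. The sketch below is hypothetical and mirrors the constructor call above; the Hadoop configuration, base path, and partition path are placeholders, and Option.empty() stands in for "no max commit instant".
// Hypothetical standalone sketch (imports omitted): query the SNAPSHOT view through
// HiveHoodieTableFileIndex using a local engine context.
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(hadoopConf);
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(hadoopConf)
    .setBasePath("/path/to/table")
    .setLoadActiveTimelineOnLoad(true)
    .build();
HiveHoodieTableFileIndex fileIndex = new HiveHoodieTableFileIndex(
    engineContext, metaClient, new TypedProperties(new Properties()),
    HoodieTableQueryType.SNAPSHOT,
    Collections.singletonList(new Path("/path/to/table/2021/01/01")),
    Option.empty(), false);
// Print how many file slices the index sees per partition.
fileIndex.listFileSlices().forEach((partition, slices) ->
    System.out.println(partition + " -> " + slices.size() + " file slice(s)"));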
Use of org.apache.hudi.common.engine.HoodieLocalEngineContext in project hudi by apache.
From the class HoodieROTablePathFilter, method accept.
@Override
public boolean accept(Path path) {
  if (engineContext == null) {
    this.engineContext = new HoodieLocalEngineContext(this.conf.get());
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Checking acceptance for path " + path);
  }
  Path folder = null;
  try {
    if (fs == null) {
      fs = path.getFileSystem(conf.get());
    }

    // Assumes path is a file; get the immediate parent.
    folder = path.getParent();

    // Try to use the caches.
    if (nonHoodiePathCache.contains(folder.toString())) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Accepting non-hoodie path from cache: " + path);
      }
      return true;
    }

    if (hoodiePathCache.containsKey(folder.toString())) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("%s Hoodie path checked against cache, accept => %s \n", path, hoodiePathCache.get(folder.toString()).contains(path)));
      }
      return hoodiePathCache.get(folder.toString()).contains(path);
    }

    // Skip all files that are descendants of .hoodie in their path.
    String filePath = path.toString();
    if (filePath.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/") || filePath.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME)) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Skipping Hoodie Metadata file %s \n", filePath));
      }
      return false;
    }

    // Perform the actual check.
    Path baseDir;
    if (HoodiePartitionMetadata.hasPartitionMetadata(fs, folder)) {
      HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, folder);
      metadata.readFromFS();
      baseDir = HoodieHiveUtils.getNthParent(folder, metadata.getPartitionDepth());
    } else {
      baseDir = safeGetParentsParent(folder);
    }

    if (baseDir != null) {
      // Check whether baseDir is in nonHoodiePathCache.
      if (nonHoodiePathCache.contains(baseDir.toString())) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Accepting non-hoodie path from cache: " + path);
        }
        return true;
      }

      HoodieTableFileSystemView fsView = null;
      try {
        HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString());
        if (null == metaClient) {
          metaClient = HoodieTableMetaClient.builder()
              .setConf(fs.getConf())
              .setBasePath(baseDir.toString())
              .setLoadActiveTimelineOnLoad(true)
              .build();
          metaClientCache.put(baseDir.toString(), metaClient);
        }

        fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf()));
        String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder);
        List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());

        // Populate the cache.
        if (!hoodiePathCache.containsKey(folder.toString())) {
          hoodiePathCache.put(folder.toString(), new HashSet<>());
        }
        LOG.info("Based on hoodie metadata from base path: " + baseDir.toString() + ", caching " + latestFiles.size() + " files under " + folder);
        for (HoodieBaseFile lfile : latestFiles) {
          hoodiePathCache.get(folder.toString()).add(new Path(lfile.getPath()));
        }

        // Accept the path if it is among the latest files.
        if (LOG.isDebugEnabled()) {
          LOG.debug(String.format("%s checked after cache population, accept => %s \n", path, hoodiePathCache.get(folder.toString()).contains(path)));
        }
        return hoodiePathCache.get(folder.toString()).contains(path);
      } catch (TableNotFoundException e) {
        // Non-hoodie path, accept it.
        if (LOG.isDebugEnabled()) {
          LOG.debug(String.format("(1) Caching non-hoodie path under %s with basePath %s \n", folder.toString(), baseDir.toString()));
        }
        nonHoodiePathCache.add(folder.toString());
        nonHoodiePathCache.add(baseDir.toString());
        return true;
      } finally {
        if (fsView != null) {
          fsView.close();
        }
      }
    } else {
      // The file sits at less than 3 levels of depth in the FS tree, so it cannot be a hoodie dataset.
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("(2) Caching non-hoodie path under %s \n", folder.toString()));
      }
      nonHoodiePathCache.add(folder.toString());
      return true;
    }
  } catch (Exception e) {
    String msg = "Error checking path: " + path + ", under folder: " + folder;
    LOG.error(msg, e);
    throw new HoodieException(msg, e);
  }
}
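As a usage note, a PathFilter like this is typically handed to FileSystem listing calls so that only files belonging to the latest file slices survive the listing. The sketch below is hypothetical; the constructor taking a Hadoop Configuration is an assumption based on the conf field used above, and the paths are placeholders.
// Hypothetical usage sketch: list a partition through the filter so only accepted paths remain.
FileSystem fs = FileSystem.get(hadoopConf);
HoodieROTablePathFilter filter = new HoodieROTablePathFilter(hadoopConf);
FileStatus[] latestOnly = fs.listStatus(new Path("/path/to/table/2021/01/01"), filter);
for (FileStatus status : latestOnly) {
  System.out.println("Accepted: " + status.getPath());
}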