Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.
Class HoodieClientTestHarness, method validateMetadata.
/**
 * Validate the metadata table's contents to ensure they match what is on the file system.
 */
public void validateMetadata(HoodieTestTable testTable, List<String> inflightCommits, HoodieWriteConfig writeConfig, String metadataTableBasePath, boolean doFullValidation) throws IOException {
  HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
  assertNotNull(tableMetadata, "MetadataReader should have been initialized");
  if (!writeConfig.isMetadataTableEnabled()) {
    return;
  }
  if (!tableMetadata.getSyncedInstantTime().isPresent() || tableMetadata instanceof FileSystemBackedTableMetadata) {
    throw new IllegalStateException("Metadata should have synced some commits or tableMetadata should not be an instance of FileSystemBackedTableMetadata");
  }
  assertEquals(inflightCommits, testTable.inflightCommits());
  HoodieTimer timer = new HoodieTimer().startTimer();
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

  // Partitions should match
  List<java.nio.file.Path> fsPartitionPaths = testTable.getAllPartitionPaths();
  List<String> fsPartitions = new ArrayList<>();
  fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString()));
  if (fsPartitions.isEmpty()) {
    fsPartitions.add("");
  }
  List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
  Collections.sort(fsPartitions);
  Collections.sort(metadataPartitions);
  assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
  assertEquals(fsPartitions, metadataPartitions, "Partitions should match");

  // Files within each partition should match
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable table = HoodieSparkTable.create(writeConfig, engineContext, true);
  TableFileSystemView tableView = table.getHoodieView();
  List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
  Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
  assertEquals(fsPartitions.size(), partitionToFilesMap.size());
  fsPartitions.forEach(partition -> {
    try {
      validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition);
    } catch (IOException e) {
      fail("Exception should not be raised: " + e);
    }
  });
  if (doFullValidation) {
    runFullValidation(writeConfig, metadataTableBasePath, engineContext);
  }
  LOG.info("Validation time=" + timer.endTimer());
}
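The metadata(writeConfig, context) helper used at the top of validateMetadata is not shown in this listing. A minimal, hypothetical sketch of what it plausibly looks like, modeled on the HoodieTableMetadata.create call in the HoodieTimelineArchiver example further down; the actual harness method may differ:

// Hypothetical sketch of the metadata(...) helper referenced above; not the verbatim harness method.
protected HoodieTableMetadata metadata(HoodieWriteConfig clientConfig, HoodieEngineContext engineContext) {
  // Open a metadata reader against the table's base path, using the default spillable directory.
  return HoodieTableMetadata.create(engineContext, clientConfig.getMetadataConfig(), clientConfig.getBasePath(),
      FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue());
}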
Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.
Class HoodieClientTestHarness, method validateFilesPerPartition.
protected void validateFilesPerPartition(HoodieTestTable testTable, HoodieTableMetadata tableMetadata, TableFileSystemView tableView, Map<String, FileStatus[]> partitionToFilesMap, String partition) throws IOException {
  Path partitionPath;
  if (partition.equals("")) {
    // Should be the non-partitioned case
    partitionPath = new Path(basePath);
  } else {
    partitionPath = new Path(basePath, partition);
  }
  FileStatus[] fsStatuses = testTable.listAllFilesInPartition(partition);
  FileStatus[] metaStatuses = tableMetadata.getAllFilesInPartition(partitionPath);
  List<String> fsFileNames = Arrays.stream(fsStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
  List<String> metadataFilenames = Arrays.stream(metaStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
  Collections.sort(fsFileNames);
  Collections.sort(metadataFilenames);
  assertLinesMatch(fsFileNames, metadataFilenames);
  assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length);

  // Block sizes should be valid
  Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0));
  List<Long> fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList());
  List<Long> metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList());
  assertEquals(fsBlockSizes, metadataBlockSizes);
  assertEquals(fsFileNames.size(), metadataFilenames.size(), "Files within partition " + partition + " should match");
  assertEquals(fsFileNames, metadataFilenames, "Files within partition " + partition + " should match");

  // FileSystemView should expose the same data
  List<HoodieFileGroup> fileGroups = tableView.getAllFileGroups(partition).collect(Collectors.toList());
  fileGroups.addAll(tableView.getAllReplacedFileGroups(partition).collect(Collectors.toList()));
  fileGroups.forEach(g -> LogManager.getLogger(getClass()).info(g));
  fileGroups.forEach(g -> g.getAllBaseFiles().forEach(b -> LogManager.getLogger(getClass()).info(b)));
  fileGroups.forEach(g -> g.getAllFileSlices().forEach(s -> LogManager.getLogger(getClass()).info(s)));
  long numFiles = fileGroups.stream().mapToLong(g -> g.getAllBaseFiles().count() + g.getAllFileSlices().mapToLong(s -> s.getLogFiles().count()).sum()).sum();
  assertEquals(metadataFilenames.size(), numFiles);
}
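A condensed, standalone sketch of the same comparison for a single partition, outside the test harness. It assumes a Hadoop FileSystem handle (fs), an open HoodieTableMetadata reader (tableMetadata), and a hypothetical partition name, and is meant to live in a method that declares throws IOException; names here are illustrative only:

// Illustrative sketch: compare a direct file-system listing against the metadata-table listing for one partition.
// Depending on the table layout, auxiliary files such as .hoodie_partition_metadata may need to be filtered from the raw listing.
Path partitionPath = new Path(basePath, "2020/01/01"); // hypothetical partition
List<String> fromFs = Arrays.stream(fs.listStatus(partitionPath))
    .map(s -> s.getPath().getName()).sorted().collect(Collectors.toList());
List<String> fromMetadata = Arrays.stream(tableMetadata.getAllFilesInPartition(partitionPath))
    .map(s -> s.getPath().getName()).sorted().collect(Collectors.toList());
assertEquals(fromFs, fromMetadata, "Metadata-based listing should match the file-system listing");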
Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.
Class TestHoodieBackedMetadata, method testMetadataTableWithPendingCompaction.
/**
 * Tests metadata table behavior when a compaction on the metadata table is left pending (inflight),
 * optionally simulating a failed compaction that is retried by a later write.
 */
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMetadataTableWithPendingCompaction(boolean simulateFailedCompaction) throws Exception {
  HoodieTableType tableType = COPY_ON_WRITE;
  init(tableType, false);
  writeConfig = getWriteConfigBuilder(true, true, false)
      .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).enableFullScan(true).enableMetrics(false).withMaxNumDeltaCommitsBeforeCompaction(3).build())
      .build();
  initWriteConfigAndMetatableWriter(writeConfig, true);
  doWriteOperation(testTable, "0000001", INSERT);
  // Create an inflight compaction in the metadata table.
  // It is not easy to create an inflight instant in the metadata table directly, so let the compaction succeed and then delete the completed instant.
  // This new write is expected to trigger a metadata table compaction.
  String commitInstant = "0000002";
  doWriteOperation(testTable, commitInstant, INSERT);
  doWriteOperation(testTable, "0000003", INSERT);
  HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
  String metadataCompactionInstant = commitInstant + "001";
  assertTrue(tableMetadata.getLatestCompactionTime().isPresent());
  assertEquals(tableMetadata.getLatestCompactionTime().get(), metadataCompactionInstant);
  validateMetadata(testTable);

  // Fetch the compaction commit file and rename it. The completed compaction meta file carries serialized info that the table interprets
  // for future upserts, so rename it to a temp name here and rename it back to the original name later.
  java.nio.file.Path parentPath = Paths.get(metadataTableBasePath, HoodieTableMetaClient.METAFOLDER_NAME);
  java.nio.file.Path metaFilePath = parentPath.resolve(metadataCompactionInstant + HoodieTimeline.COMMIT_EXTENSION);
  java.nio.file.Path tempFilePath = FileCreateUtils.renameFileToTemp(metaFilePath, metadataCompactionInstant);
  metaClient.reloadActiveTimeline();
  testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);

  // This validation exercises the code path where a compaction is inflight in the metadata table: metadata-based file listing
  // should still match the non-metadata-based file listing.
  validateMetadata(testTable);
  if (simulateFailedCompaction) {
    // This should retry the compaction in the metadata table.
    doWriteOperation(testTable, "0000004", INSERT);
  } else {
    // Let the compaction succeed in the metadata table; validation should succeed.
    FileCreateUtils.renameTempToMetaFile(tempFilePath, metaFilePath);
  }
  validateMetadata(testTable);

  // Add a few more writes and validate.
  doWriteOperation(testTable, "0000005", INSERT);
  doWriteOperation(testTable, "0000006", UPSERT);
  validateMetadata(testTable);

  if (simulateFailedCompaction) {
    // Trigger another compaction failure.
    metadataCompactionInstant = "0000005001";
    tableMetadata = metadata(writeConfig, context);
    assertTrue(tableMetadata.getLatestCompactionTime().isPresent());
    assertEquals(tableMetadata.getLatestCompactionTime().get(), metadataCompactionInstant);
    // Rename the completed compaction commit meta file to a temp name, as above, so the compaction appears pending.
    parentPath = Paths.get(metadataTableBasePath, HoodieTableMetaClient.METAFOLDER_NAME);
    metaFilePath = parentPath.resolve(metadataCompactionInstant + HoodieTimeline.COMMIT_EXTENSION);
    tempFilePath = FileCreateUtils.renameFileToTemp(metaFilePath, metadataCompactionInstant);
    validateMetadata(testTable);
    // This should retry the failed compaction in the metadata table.
    doWriteOperation(testTable, "0000007", INSERT);
    validateMetadata(testTable);
    // Add a few more writes and validate.
    doWriteOperation(testTable, "0000008", INSERT);
    doWriteOperation(testTable, "0000009", UPSERT);
    validateMetadata(testTable);
  }
}
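The metadataTableBasePath field used above is not derived in this listing; it would typically be resolved from the data table's base path. A small hedged sketch, assuming the static getMetadataTableBasePath helper on HoodieTableMetadata is available in this version:

// Sketch: resolve the metadata table's base path from the data table's base path.
// Assumes HoodieTableMetadata.getMetadataTableBasePath exists; the resulting path lives under <basePath>/.hoodie.
String metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);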
Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.
Class HoodieTimelineArchiver, method getInstantsToArchive.
private Stream<HoodieInstant> getInstantsToArchive() {
  Stream<HoodieInstant> instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive());
  // For archiving and cleaning instants, we need to include intermediate state files if they exist
  HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
  Map<Pair<String, String>, List<HoodieInstant>> groupByTsAction = rawActiveTimeline.getInstants()
      .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(), HoodieInstant.getComparableAction(i.getAction()))));
  // If the metadata table is enabled, only archive instants that are older than its latest compaction.
  if (config.isMetadataTableEnabled()) {
    try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(), config.getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) {
      Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
      if (!latestCompactionTime.isPresent()) {
        LOG.info("Not archiving as there is no compaction yet on the metadata table");
        instants = Stream.empty();
      } else {
        LOG.info("Limiting archiving of instants to latest compaction on metadata table at " + latestCompactionTime.get());
        instants = instants.filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN, latestCompactionTime.get()));
      }
    } catch (Exception e) {
      throw new HoodieException("Error limiting instant archival based on metadata table", e);
    }
  }
  return instants.flatMap(hoodieInstant -> groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(), HoodieInstant.getComparableAction(hoodieInstant.getAction()))).stream());
}
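The guard above can be read in isolation: an instant is archivable only when a metadata-table compaction exists and the instant's timestamp is strictly older than it. A minimal sketch of that check as a standalone helper, assuming the same table and config fields are in scope; this is an illustration, not code from the archiver:

// Sketch: returns true only if the candidate instant is strictly older than the latest metadata-table compaction.
private boolean isOlderThanLatestMetadataCompaction(HoodieInstant candidate) {
  try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(), config.getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) {
    Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
    return latestCompactionTime.isPresent()
        && HoodieTimeline.compareTimestamps(candidate.getTimestamp(), HoodieTimeline.LESSER_THAN, latestCompactionTime.get());
  } catch (Exception e) {
    throw new HoodieException("Error reading latest compaction time from metadata table", e);
  }
}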
Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.
Class HoodieDataTableValidator, method doDataTableValidation.
public void doDataTableValidation() {
  boolean finalResult = true;
  metaClient.reloadActiveTimeline();
  String basePath = metaClient.getBasePath();
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  try {
    HoodieTableMetadata tableMetadata = new FileSystemBackedTableMetadata(engineContext, engineContext.getHadoopConf(), cfg.basePath, cfg.assumeDatePartitioning);
    List<Path> allDataFilePaths = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
    // Verify that no data files are present with a commit time earlier than the earliest commit in the active timeline.
    if (metaClient.getActiveTimeline().firstInstant().isPresent()) {
      String earliestInstant = metaClient.getActiveTimeline().firstInstant().get().getTimestamp();
      List<Path> danglingFilePaths = allDataFilePaths.stream().filter(path -> {
        String instantTime = FSUtils.getCommitTime(path.getName());
        return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN, earliestInstant);
      }).collect(Collectors.toList());
      if (!danglingFilePaths.isEmpty()) {
        LOG.error("Data table validation failed due to dangling files count " + danglingFilePaths.size() + ", found before active timeline");
        danglingFilePaths.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
        finalResult = false;
        if (!cfg.ignoreFailed) {
          throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFilePaths.size());
        }
      }

      // Verify that for every completed commit in the active timeline, there are no extra files found apart from what is present in
      // the commit metadata.
      Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(metaClient.getBasePath(), allDataFilePaths);
      HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
      List<HoodieInstant> hoodieInstants = activeTimeline.filterCompletedInstants().getInstants().collect(Collectors.toList());
      List<String> danglingFiles = engineContext.flatMap(hoodieInstants, instant -> {
        Option<Set<String>> filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline(activeTimeline, instant);
        List<String> baseAndLogFilesFromFs = instantToFilesMap.containsKey(instant.getTimestamp()) ? instantToFilesMap.get(instant.getTimestamp()) : Collections.emptyList();
        if (!baseAndLogFilesFromFs.isEmpty()) {
          Set<String> danglingInstantFiles = new HashSet<>(baseAndLogFilesFromFs);
          if (filesFromTimeline.isPresent()) {
            danglingInstantFiles.removeAll(filesFromTimeline.get());
          }
          return new ArrayList<>(danglingInstantFiles).stream();
        } else {
          return Stream.empty();
        }
      }, hoodieInstants.size());
      if (!danglingFiles.isEmpty()) {
        LOG.error("Data table validation failed due to extra files found for completed commits " + danglingFiles.size());
        danglingFiles.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
        finalResult = false;
        if (!cfg.ignoreFailed) {
          throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFiles.size());
        }
      }
    }
  } catch (Exception e) {
    LOG.error("Data table validation failed due to " + e.getMessage(), e);
    if (!cfg.ignoreFailed) {
      throw new HoodieValidationException("Data table validation failed due to " + e.getMessage(), e);
    }
  }
  if (finalResult) {
    LOG.info("Data table validation succeeded.");
  } else {
    LOG.warn("Data table validation failed.");
  }
}
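Note that the validator constructs a FileSystemBackedTableMetadata, which lists files directly from storage rather than from the metadata table, so the validation does not depend on the metadata table it might otherwise be checking against. For contrast, a hedged sketch of the metadata-table-backed construction, reusing the create call and config builder shown in the examples above; parameter choices here are illustrative:

// Sketch: a metadata-table-backed reader, as opposed to the file-system backed one used above.
HoodieTableMetadata metadataBacked = HoodieTableMetadata.create(engineContext,
    HoodieMetadataConfig.newBuilder().enable(true).build(), cfg.basePath,
    FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue());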