use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.
the class HoodieDataTableValidator method doDataTableValidation.
public void doDataTableValidation() {
  boolean finalResult = true;
  metaClient.reloadActiveTimeline();
  String basePath = metaClient.getBasePath();
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  try {
    HoodieTableMetadata tableMetadata = new FileSystemBackedTableMetadata(engineContext, engineContext.getHadoopConf(), cfg.basePath, cfg.assumeDatePartitioning);
    List<Path> allDataFilePaths = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
    // Verify that no data files are present with a commit time earlier than the earliest commit in the active timeline.
    if (metaClient.getActiveTimeline().firstInstant().isPresent()) {
      String earliestInstant = metaClient.getActiveTimeline().firstInstant().get().getTimestamp();
      List<Path> danglingFilePaths = allDataFilePaths.stream().filter(path -> {
        String instantTime = FSUtils.getCommitTime(path.getName());
        return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN, earliestInstant);
      }).collect(Collectors.toList());
      if (!danglingFilePaths.isEmpty()) {
        LOG.error("Data table validation failed due to dangling files count " + danglingFilePaths.size() + ", found before active timeline");
        danglingFilePaths.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
        finalResult = false;
        if (!cfg.ignoreFailed) {
          throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFilePaths.size());
        }
      }
      // Verify that for every completed commit in the active timeline, no extra files are found
      // apart from what is present in the commit metadata.
      Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(metaClient.getBasePath(), allDataFilePaths);
      HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
      List<HoodieInstant> hoodieInstants = activeTimeline.filterCompletedInstants().getInstants().collect(Collectors.toList());
      List<String> danglingFiles = engineContext.flatMap(hoodieInstants, instant -> {
        Option<Set<String>> filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline(activeTimeline, instant);
        List<String> baseAndLogFilesFromFs = instantToFilesMap.containsKey(instant.getTimestamp())
            ? instantToFilesMap.get(instant.getTimestamp()) : Collections.emptyList();
        if (!baseAndLogFilesFromFs.isEmpty()) {
          Set<String> danglingInstantFiles = new HashSet<>(baseAndLogFilesFromFs);
          if (filesFromTimeline.isPresent()) {
            danglingInstantFiles.removeAll(filesFromTimeline.get());
          }
          return new ArrayList<>(danglingInstantFiles).stream();
        } else {
          return Stream.empty();
        }
      }, hoodieInstants.size()).stream().collect(Collectors.toList());
      if (!danglingFiles.isEmpty()) {
        LOG.error("Data table validation failed due to extra files found for completed commits " + danglingFiles.size());
        danglingFiles.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
        finalResult = false;
        if (!cfg.ignoreFailed) {
          throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFiles.size());
        }
      }
    }
  } catch (Exception e) {
    LOG.error("Data table validation failed due to " + e.getMessage(), e);
    if (!cfg.ignoreFailed) {
      throw new HoodieValidationException("Data table validation failed due to " + e.getMessage(), e);
    }
  }

  if (finalResult) {
    LOG.info("Data table validation succeeded.");
  } else {
    LOG.warn("Data table validation failed.");
  }
}
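The validator above only throws HoodieValidationException when cfg.ignoreFailed is false; otherwise it just logs the failure. A minimal caller sketch of that contract is shown below; it is not from the Hudi codebase, and the wrapper class and the failJobOnValidationError flag are hypothetical names used purely for illustration.

import org.apache.hudi.exception.HoodieValidationException;

public class DataTableValidationRunner {

  private final HoodieDataTableValidator validator;  // assumed to be constructed elsewhere
  private final boolean failJobOnValidationError;    // hypothetical flag, not a Hudi config

  public DataTableValidationRunner(HoodieDataTableValidator validator, boolean failJobOnValidationError) {
    this.validator = validator;
    this.failJobOnValidationError = failJobOnValidationError;
  }

  public boolean run() {
    try {
      validator.doDataTableValidation();
      return true;
    } catch (HoodieValidationException e) {
      // Thrown only when cfg.ignoreFailed is false; surface or swallow it depending on the caller's policy.
      if (failJobOnValidationError) {
        throw e;
      }
      return false;
    }
  }
}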
use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testClusteringInvalidConfigForSqlQueryValidator.
@Test
public void testClusteringInvalidConfigForSqlQueryValidator() throws Exception {
  // setup clustering config.
  HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
      .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true).build();
  try {
    testInsertAndClustering(clusteringConfig, false, true, false, SqlQueryEqualityPreCommitValidator.class.getName(), "", "");
    fail("expected pre-commit clustering validation to fail because sql query is not configured");
  } catch (HoodieValidationException e) {
    // expected
  }
}
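This test drives SqlQueryEqualityPreCommitValidator with the two trailing "" arguments, leaving its SQL query unset, so pre-commit validation fails with HoodieValidationException. For contrast, a correctly configured run would supply at least one equality query through HoodiePreCommitValidatorConfig. The sketch below is an assumption-laden illustration: the builder method name withPrecommitValidatorEqualitySqlQueries (mirroring the single-result variant used later in this class) and the <TABLE_NAME> placeholder are assumed, and the query text is illustrative.

HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder()
    .withPreCommitValidator(SqlQueryEqualityPreCommitValidator.class.getName())
    // The same query is run on the dataset before and after the commit, and the two results must match.
    // Both the builder method name and the <TABLE_NAME> placeholder are assumptions here.
    .withPrecommitValidatorEqualitySqlQueries("select count(*) from <TABLE_NAME>")
    .build();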
use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testPreCommitValidationFailureOnInsert.
@Test
public void testPreCommitValidationFailureOnInsert() throws Exception {
  int numRecords = 200;
  HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder()
      .withPreCommitValidator(SqlQuerySingleResultPreCommitValidator.class.getName())
      .withPrecommitValidatorSingleResultSqlQueries(COUNT_SQL_QUERY_FOR_VALIDATION + "#" + 500).build();
  HoodieWriteConfig config = getConfigBuilder().withPreCommitValidatorConfig(validatorConfig).build();
  String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
  try (SparkRDDWriteClient client = getHoodieWriteClient(config)) {
    Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn =
        (writeClient, recordRDD, instantTime) -> writeClient.bulkInsert(recordRDD, instantTime, Option.empty());
    JavaRDD<WriteStatus> result = insertFirstBatch(config, client, newCommitTime, "000", numRecords, writeFn, false, false, numRecords);
    fail("Expected validation to fail because we only insert 200 rows. Validation is configured to expect 500 rows");
  } catch (HoodieInsertException e) {
    if (e.getCause() instanceof HoodieValidationException) {
      // expected because wrong value passed
    } else {
      throw e;
    }
  }
  assertFalse(testTable.commitExists(newCommitTime));
}
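Here the single-result query is configured as COUNT_SQL_QUERY_FOR_VALIDATION + "#" + 500, i.e. the query and its expected result are joined with a '#' separator. Since only 200 records are inserted, the count does not match 500; the HoodieValidationException is raised during the write and surfaces wrapped in a HoodieInsertException, which is why the test inspects getCause(). As a sketch, the same expectation could also be written with JUnit 5's assertThrows, reusing the helpers from the test above:

HoodieInsertException thrown = assertThrows(HoodieInsertException.class, () ->
    insertFirstBatch(config, client, newCommitTime, "000", numRecords, writeFn, false, false, numRecords));
assertTrue(thrown.getCause() instanceof HoodieValidationException);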
use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testClusteringWithFailingValidator.
@Test
public void testClusteringWithFailingValidator() throws Exception {
  // setup clustering config.
  HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
      .withClusteringSortColumns("_hoodie_record_key").withInlineClustering(true)
      .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
  try {
    testInsertAndClustering(clusteringConfig, true, true, false, FailingPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, "");
    fail("expected pre-commit clustering validation to fail");
  } catch (HoodieValidationException e) {
    // expected
  }
}
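FailingPreCommitValidator is a test-only validator that rejects every commit, so inline clustering fails with HoodieValidationException regardless of the data. A rough sketch of such a validator is below; the SparkPreCommitValidator base class, its constructor signature, and the validateRecordsBeforeAndAfter hook are assumptions based on how the pre-commit validators above are wired up, not a verbatim copy of the Hudi test class.

public class AlwaysFailingValidator<T extends HoodieRecordPayload, I, K, O> extends SparkPreCommitValidator<T, I, K, O> {

  public AlwaysFailingValidator(HoodieSparkTable<T> table, HoodieEngineContext engineContext, HoodieWriteConfig config) {
    super(table, engineContext, config);
  }

  @Override
  protected void validateRecordsBeforeAndAfter(Dataset<Row> before, Dataset<Row> after, Set<String> partitionsAffected) {
    // Unconditionally reject the commit so callers can exercise the HoodieValidationException path.
    throw new HoodieValidationException("Validation failed intentionally for testing");
  }
}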