
Example 1 with HoodieValidationException

Use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.

From class HoodieTableFactory, method setupHoodieKeyOptions.

/**
 * Sets up the hoodie key options (e.g. record key and partition key) from the table definition.
 */
private static void setupHoodieKeyOptions(Configuration conf, CatalogTable table) {
    List<String> pkColumns = table.getSchema().getPrimaryKey().map(UniqueConstraint::getColumns).orElse(Collections.emptyList());
    if (pkColumns.size() > 0) {
        // the PRIMARY KEY syntax always has higher priority than option FlinkOptions#RECORD_KEY_FIELD
        String recordKey = String.join(",", pkColumns);
        conf.setString(FlinkOptions.RECORD_KEY_FIELD, recordKey);
    }
    List<String> partitionKeys = table.getPartitionKeys();
    if (partitionKeys.size() > 0) {
        // the PARTITIONED BY syntax always has higher priority than option FlinkOptions#PARTITION_PATH_FIELD
        conf.setString(FlinkOptions.PARTITION_PATH_FIELD, String.join(",", partitionKeys));
    }
    // set index key for bucket index if not defined
    if (conf.getString(FlinkOptions.INDEX_TYPE).equals(HoodieIndex.IndexType.BUCKET.name())
            && conf.getString(FlinkOptions.INDEX_KEY_FIELD).isEmpty()) {
        conf.setString(FlinkOptions.INDEX_KEY_FIELD, conf.getString(FlinkOptions.RECORD_KEY_FIELD));
    }
    // tweak the key gen class if possible
    final String[] partitions = conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",");
    final String[] pks = conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",");
    if (partitions.length == 1) {
        final String partitionField = partitions[0];
        if (partitionField.isEmpty()) {
            conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, NonpartitionedAvroKeyGenerator.class.getName());
            LOG.info("Table option [{}] is reset to {} because this is a non-partitioned table", FlinkOptions.KEYGEN_CLASS_NAME.key(), NonpartitionedAvroKeyGenerator.class.getName());
            return;
        }
        DataType partitionFieldType = table.getSchema().getFieldDataType(partitionField)
            .orElseThrow(() -> new HoodieValidationException("Field " + partitionField + " does not exist"));
        if (pks.length <= 1 && DataTypeUtils.isDatetimeType(partitionFieldType)) {
            // timestamp based key gen only supports simple primary key
            setupTimestampKeygenOptions(conf, partitionFieldType);
            return;
        }
    }
    boolean complexHoodieKey = pks.length > 1 || partitions.length > 1;
    if (complexHoodieKey && FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.KEYGEN_CLASS_NAME)) {
        conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, ComplexAvroKeyGenerator.class.getName());
        LOG.info("Table option [{}] is reset to {} because record key or partition path has two or more fields", FlinkOptions.KEYGEN_CLASS_NAME.key(), ComplexAvroKeyGenerator.class.getName());
    }
}
Also used : HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) DataType(org.apache.flink.table.types.DataType) NonpartitionedAvroKeyGenerator(org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator) ComplexAvroKeyGenerator(org.apache.hudi.keygen.ComplexAvroKeyGenerator)
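
The branching above boils down to three key-generator outcomes: non-partitioned tables, composite keys or multi-field partition paths, and the simple single-key case. The following is a minimal standalone sketch of those selection rules as written in the factory code; chooseKeyGenerator() and its return values are illustrative helpers, not part of Hudi, and the real method additionally switches to a timestamp-based generator when the single partition field is a datetime type.

// Minimal standalone sketch of the key-generator selection rules above.
// chooseKeyGenerator() is a hypothetical helper, not part of Hudi; the class
// names mirror the ones the factory code picks.
public class KeyGenSelectionSketch {

    static String chooseKeyGenerator(String recordKeyField, String partitionPathField) {
        String[] pks = recordKeyField.split(",");
        String[] partitions = partitionPathField.split(",");
        if (partitions.length == 1 && partitions[0].isEmpty()) {
            // no PARTITIONED BY clause -> non-partitioned key generator
            return "org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator";
        }
        if (pks.length > 1 || partitions.length > 1) {
            // composite record key or multi-field partition path -> complex key generator
            return "org.apache.hudi.keygen.ComplexAvroKeyGenerator";
        }
        // single record key and single partition field -> keep the configured default
        // (or the timestamp-based generator when the partition field is a datetime type)
        return "<keep configured default>";
    }

    public static void main(String[] args) {
        System.out.println(chooseKeyGenerator("id", ""));        // non-partitioned
        System.out.println(chooseKeyGenerator("id,name", "dt")); // complex
        System.out.println(chooseKeyGenerator("id", "dt"));      // default
    }
}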

Example 2 with HoodieValidationException

Use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.

From class SparkValidatorUtils, method runValidators.

/**
 * Checks the configured pre-commit validators and runs them. Note that this only works for COW tables.
 *
 * Throws an error if there are validation failures.
 */
public static void runValidators(HoodieWriteConfig config, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata, HoodieEngineContext context, HoodieTable table, String instantTime) {
    if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
        LOG.info("no validators configured.");
    } else {
        if (!writeMetadata.getWriteStats().isPresent()) {
            writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collectAsList());
        }
        Set<String> partitionsModified = writeMetadata.getWriteStats().get().stream()
            .map(writeStats -> writeStats.getPartitionPath())
            .collect(Collectors.toSet());
        SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context));
        // Refresh the timeline to ensure the validators see any other operations done on the timeline (async operations such as clustering/compaction/rollback)
        table.getMetaClient().reloadActiveTimeline();
        Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table).cache();
        Dataset<Row> afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime).cache();
        Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(",")).map(validatorClass -> {
            return ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass, new Class<?>[] { HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class }, table, context, config));
        });
        boolean allSuccess = validators
            .map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime))
            .map(CompletableFuture::join)
            .reduce(true, Boolean::logicalAnd);
        if (allSuccess) {
            LOG.info("All validations succeeded");
        } else {
            LOG.error("At least one pre-commit validation failed");
            throw new HoodieValidationException("At least one pre-commit validation failed");
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) Dataset(org.apache.spark.sql.Dataset) CompletableFuture(java.util.concurrent.CompletableFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) BaseSparkCommitActionExecutor(org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SQLContext(org.apache.spark.sql.SQLContext) Set(java.util.Set) Row(org.apache.spark.sql.Row) Collectors(java.util.stream.Collectors) WriteStatus(org.apache.hudi.client.WriteStatus) SparkPreCommitValidator(org.apache.hudi.client.validator.SparkPreCommitValidator) List(java.util.List) Stream(java.util.stream.Stream) HoodieTablePreCommitFileSystemView(org.apache.hudi.common.table.view.HoodieTablePreCommitFileSystemView) JavaConverters(scala.collection.JavaConverters) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) LogManager(org.apache.log4j.LogManager)
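
The notable pattern in runValidators() is the fan-out: each configured validator is started asynchronously, the futures are joined, and the boolean results are AND-reduced so that a single failure surfaces as one exception. Below is a hedged, Spark-free sketch of that pattern; the Validator interface and ValidatorFanOutSketch class are hypothetical stand-ins for SparkPreCommitValidator and are not part of Hudi.

import java.util.List;
import java.util.concurrent.CompletableFuture;

// Standalone sketch of the fan-out/join pattern used in runValidators():
// run every validator asynchronously, join the futures, AND-reduce the results,
// and raise a single exception if any validator failed.
public class ValidatorFanOutSketch {

    interface Validator {
        boolean validate();
    }

    static void runAll(List<Validator> validators) {
        boolean allSuccess = validators.stream()
                .map(v -> CompletableFuture.supplyAsync(() -> {
                    try {
                        return v.validate();
                    } catch (RuntimeException e) {
                        return false; // a throwing validator counts as a failure
                    }
                }))
                .map(CompletableFuture::join)
                .reduce(true, Boolean::logicalAnd);
        if (!allSuccess) {
            throw new RuntimeException("At least one pre-commit validation failed");
        }
    }

    public static void main(String[] args) {
        runAll(List.of(() -> true, () -> true)); // passes
        // runAll(List.of(() -> true, () -> false)); // would throw
    }
}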

Example 3 with HoodieValidationException

Use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.

From class TestHoodieClientOnCopyOnWriteStorage, method testClusteringInvalidConfigForSqlQuerySingleResultValidatorFailure.

@Test
public void testClusteringInvalidConfigForSqlQuerySingleResultValidatorFailure() throws Exception {
    // setup clustering config.
    HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
        .withClusteringMaxNumGroups(10)
        .withClusteringTargetPartitions(0)
        .withInlineClusteringNumCommits(1)
        .withInlineClustering(true)
        .build();
    try {
        testInsertAndClustering(clusteringConfig, false, true, false, SqlQuerySingleResultPreCommitValidator.class.getName(), "", COUNT_SQL_QUERY_FOR_VALIDATION + "#802");
        fail("expected pre-commit clustering validation to fail because of count mismatch. expect 400 rows, not 802");
    } catch (HoodieValidationException e) {
    // expected
    }
}
Also used : HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) SqlQuerySingleResultPreCommitValidator(org.apache.hudi.client.validator.SqlQuerySingleResultPreCommitValidator) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
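
The trailing "#802" in the test above follows the convention the tests use with SqlQuerySingleResultPreCommitValidator: the configured string carries both the SQL query and the expected single-value result, separated by '#'. The sketch below shows how such a string could be split; splitQueryAndExpected() is illustrative only, the real parsing lives inside the validator, and the exact rule (e.g. splitting on the last '#') is an assumption.

// Hedged sketch of the "<sql query>#<expected result>" convention used when
// configuring SqlQuerySingleResultPreCommitValidator in the tests above.
public class QueryExpectationSketch {

    static String[] splitQueryAndExpected(String configured) {
        int idx = configured.lastIndexOf('#');
        if (idx < 0) {
            throw new IllegalArgumentException("Expected '<query>#<result>' but got: " + configured);
        }
        return new String[] { configured.substring(0, idx), configured.substring(idx + 1) };
    }

    public static void main(String[] args) {
        String[] parts = splitQueryAndExpected("select count(*) from <TABLE_NAME>#802");
        System.out.println("query    = " + parts[0]); // select count(*) from <TABLE_NAME>
        System.out.println("expected = " + parts[1]); // 802
    }
}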

Example 4 with HoodieValidationException

Use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.

From class TestHoodieClientOnCopyOnWriteStorage, method testPreCommitValidationWithMultipleInflights.

@Test
public void testPreCommitValidationWithMultipleInflights() throws Exception {
    int numRecords = 200;
    HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder()
        .withPreCommitValidator(SqlQuerySingleResultPreCommitValidator.class.getName())
        .withPrecommitValidatorSingleResultSqlQueries(COUNT_SQL_QUERY_FOR_VALIDATION + "#" + 500)
        .build();
    HoodieWriteConfig config = getConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER).build())
        .withPreCommitValidatorConfig(validatorConfig)
        .build();
    String instant1 = HoodieActiveTimeline.createNewInstantTime();
    try {
        insertWithConfig(config, numRecords, instant1);
        fail("Expected validation to fail because we only insert 200 rows. Validation is configured to expect 500 rows");
    } catch (HoodieInsertException e) {
        if (e.getCause() instanceof HoodieValidationException) {
        // expected because wrong value passed
        } else {
            throw e;
        }
    }
    assertFalse(testTable.commitExists(instant1));
    assertTrue(testTable.inflightCommitExists(instant1));
    numRecords = 300;
    validatorConfig = HoodiePreCommitValidatorConfig.newBuilder()
        .withPreCommitValidator(SqlQuerySingleResultPreCommitValidator.class.getName())
        .withPrecommitValidatorSingleResultSqlQueries(COUNT_SQL_QUERY_FOR_VALIDATION + "#" + numRecords)
        .build();
    config = getConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER).build())
        .withPreCommitValidatorConfig(validatorConfig)
        .build();
    String instant2 = HoodieActiveTimeline.createNewInstantTime();
    // expect pre-commit validators to succeed. Note that validator is expected to exclude inflight instant1
    insertWithConfig(config, numRecords, instant2);
    assertTrue(testTable.inflightCommitExists(instant1));
    assertTrue(testTable.commitExists(instant2));
}
Also used : HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) SqlQuerySingleResultPreCommitValidator(org.apache.hudi.client.validator.SqlQuerySingleResultPreCommitValidator) HoodiePreCommitValidatorConfig(org.apache.hudi.config.HoodiePreCommitValidatorConfig) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieInsertException(org.apache.hudi.exception.HoodieInsertException) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
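
As the catch block shows, a pre-commit validation failure does not surface directly: it arrives wrapped in HoodieInsertException, and the test inspects getCause(). A small generic helper for that unwrapping pattern is sketched below; findCause() and CauseChainSketch are hypothetical, not part of Hudi, and the example uses plain JDK exceptions so it runs without the Hudi dependency.

import java.util.Optional;

// Generic cause-chain walker for the pattern above: a validation failure is
// wrapped in a write-path exception, so callers search the cause chain for the
// root validation error.
public class CauseChainSketch {

    static <T extends Throwable> Optional<T> findCause(Throwable t, Class<T> type) {
        for (Throwable cur = t; cur != null; cur = cur.getCause()) {
            if (type.isInstance(cur)) {
                return Optional.of(type.cast(cur));
            }
        }
        return Optional.empty();
    }

    public static void main(String[] args) {
        Exception wrapped = new RuntimeException("insert failed",
                new IllegalStateException("validation failed"));
        System.out.println(findCause(wrapped, IllegalStateException.class).isPresent()); // true
    }
}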

Example 5 with HoodieValidationException

Use of org.apache.hudi.exception.HoodieValidationException in project hudi by apache.

From class HoodieMetadataTableValidator, method doMetadataTableValidation.

public void doMetadataTableValidation() {
    boolean finalResult = true;
    metaClient.reloadActiveTimeline();
    String basePath = metaClient.getBasePath();
    Set<String> baseFilesForCleaning = Collections.emptySet();
    if (cfg.skipDataFilesForCleaning) {
        HoodieTimeline inflightCleaningTimeline = metaClient.getActiveTimeline().getCleanerTimeline().filterInflights();
        baseFilesForCleaning = inflightCleaningTimeline.getInstants().flatMap(instant -> {
            try {
                // convert inflight instant to requested and get clean plan
                instant = new HoodieInstant(HoodieInstant.State.REQUESTED, instant.getAction(), instant.getTimestamp());
                HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(metaClient, instant);
                return cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().flatMap(cleanerFileInfoList -> {
                    return cleanerFileInfoList.stream().map(fileInfo -> {
                        return new Path(fileInfo.getFilePath()).getName();
                    });
                });
            } catch (IOException e) {
                throw new HoodieIOException("Error reading cleaner metadata for " + instant);
            }
        // only take care of base files here.
        }).filter(path -> {
            String fileExtension = FSUtils.getFileExtension(path);
            return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(fileExtension);
        }).collect(Collectors.toSet());
    }
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    List<String> allPartitions = validatePartitions(engineContext, basePath);
    HoodieMetadataValidationContext metadataTableBasedContext = new HoodieMetadataValidationContext(engineContext, cfg, metaClient, true);
    HoodieMetadataValidationContext fsBasedContext = new HoodieMetadataValidationContext(engineContext, cfg, metaClient, false);
    Set<String> finalBaseFilesForCleaning = baseFilesForCleaning;
    List<Boolean> result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> {
        try {
            validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning);
            LOG.info("Metadata table validation succeeded for " + partitionPath);
            return true;
        } catch (HoodieValidationException e) {
            LOG.error("Metadata table validation failed for " + partitionPath + " due to HoodieValidationException", e);
            if (!cfg.ignoreFailed) {
                throw e;
            }
            return false;
        }
    }).collectAsList();
    for (Boolean res : result) {
        finalResult &= res;
    }
    if (finalResult) {
        LOG.info("Metadata table validation succeeded.");
    } else {
        LOG.warn("Metadata table validation failed.");
    }
}
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) ByteBuffer(java.nio.ByteBuffer) BloomFilterData(org.apache.hudi.utilities.util.BloomFilterData) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) BaseFile(org.apache.hudi.common.model.BaseFile) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) Set(java.util.Set) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) FileSystemViewManager(org.apache.hudi.common.table.view.FileSystemViewManager) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Parameter(com.beust.jcommander.Parameter) FileSlice(org.apache.hudi.common.model.FileSlice) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) CompletableFuture(java.util.concurrent.CompletableFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) ArrayList(java.util.ArrayList) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieAsyncService(org.apache.hudi.async.HoodieAsyncService) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) ExecutorService(java.util.concurrent.ExecutorService) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) Log(jline.internal.Log) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) JCommander(com.beust.jcommander.JCommander) SparkConf(org.apache.spark.SparkConf) IOException(java.io.IOException) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
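
The heart of doMetadataTableValidation() is the per-partition loop: validate each partition, rethrow on the first HoodieValidationException unless cfg.ignoreFailed is set, and AND-fold the per-partition booleans into the final result. The sketch below reproduces that control flow without Spark (sequentially rather than via engineContext.parallelize); PartitionValidationSketch and its Predicate-based validator are illustrative stand-ins for validateFilesInPartition(), not part of Hudi.

import java.util.List;
import java.util.function.Predicate;

// Standalone sketch of the per-partition validation loop above: validate each
// partition, propagate the first failure unless failures are ignored, and fold
// the per-partition results into one flag.
public class PartitionValidationSketch {

    static boolean validateAll(List<String> partitions, Predicate<String> validator, boolean ignoreFailed) {
        boolean finalResult = true;
        for (String partition : partitions) {
            try {
                finalResult &= validator.test(partition);
            } catch (RuntimeException e) {
                if (!ignoreFailed) {
                    throw e; // propagate the first failure, like the Spark job does
                }
                finalResult = false;
            }
        }
        return finalResult;
    }

    public static void main(String[] args) {
        List<String> partitions = List.of("2022/01/01", "2022/01/02");
        System.out.println(validateAll(partitions, p -> true, false)); // true
        System.out.println(validateAll(partitions, p -> false, true)); // false, no exception
    }
}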

Aggregations

HoodieValidationException (org.apache.hudi.exception.HoodieValidationException): 9
Test (org.junit.jupiter.api.Test): 5
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 5
List (java.util.List): 4
Set (java.util.Set): 4
Collectors (java.util.stream.Collectors): 4
HoodieClusteringConfig (org.apache.hudi.config.HoodieClusteringConfig): 4
LogManager (org.apache.log4j.LogManager): 4
Logger (org.apache.log4j.Logger): 4
ArrayList (java.util.ArrayList): 3
Collections (java.util.Collections): 3
CompletableFuture (java.util.concurrent.CompletableFuture): 3
ExecutorService (java.util.concurrent.ExecutorService): 3
Executors (java.util.concurrent.Executors): 3
Stream (java.util.stream.Stream): 3
Path (org.apache.hadoop.fs.Path): 3
HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext): 3
TypedProperties (org.apache.hudi.common.config.TypedProperties): 3
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 3
FSUtils (org.apache.hudi.common.fs.FSUtils): 3