
Example 1 with ExecutionContext

Use of org.apache.hudi.integ.testsuite.dag.ExecutionContext in the hudi project by apache.

From the class ValidateAsyncOperations, the execute method:

@Override
public void execute(ExecutionContext executionContext, int curItrCount) throws Exception {
    if (config.getIterationCountToExecute() == curItrCount) {
        try {
            log.warn("Executing ValidateHoodieAsyncOperations node {} with target base path {} ", this.getName(), executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
            String basePath = executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath;
            int maxCommitsRetained = executionContext.getHoodieTestSuiteWriter().getWriteConfig().getCleanerCommitsRetained() + 1;
            FileSystem fs = FSUtils.getFs(basePath, executionContext.getHoodieTestSuiteWriter().getConfiguration());
            HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath).setConf(executionContext.getJsc().hadoopConfiguration()).build();
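            // Look up the most recent clean action on the active timeline, if one exists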
            Option<HoodieInstant> latestCleanInstant = metaClient.getActiveTimeline().filter(instant -> instant.getAction().equals(HoodieTimeline.CLEAN_ACTION)).lastInstant();
            if (latestCleanInstant.isPresent()) {
                log.warn("Latest clean commit " + latestCleanInstant.get());
                HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, latestCleanInstant.get());
                String earliestCommitToRetain = cleanMetadata.getEarliestCommitToRetain();
                log.warn("Earliest commit to retain : " + earliestCommitToRetain);
                long unCleanedInstants = metaClient.getActiveTimeline().filterCompletedInstants().filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, earliestCommitToRetain)).getInstants().count();
                ValidationUtils.checkArgument(unCleanedInstants >= (maxCommitsRetained + 1), "Total uncleaned instants " + unCleanedInstants + " mismatched with max commits retained " + (maxCommitsRetained + 1));
            }
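            // Optionally verify that clean and archival actions actually left files behind under .hoodie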
            if (config.validateArchival() || config.validateClean()) {
                final Pattern ARCHIVE_FILE_PATTERN = Pattern.compile("\\.commits_\\.archive\\..*");
                final Pattern CLEAN_FILE_PATTERN = Pattern.compile(".*\\.clean\\..*");
                String metadataPath = executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/.hoodie";
                FileStatus[] metaFileStatuses = fs.listStatus(new Path(metadataPath));
                boolean cleanFound = false;
                for (FileStatus fileStatus : metaFileStatuses) {
                    Matcher cleanFileMatcher = CLEAN_FILE_PATTERN.matcher(fileStatus.getPath().getName());
                    if (cleanFileMatcher.matches()) {
                        cleanFound = true;
                        break;
                    }
                }
                String archivalPath = executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/.hoodie/archived";
                metaFileStatuses = fs.listStatus(new Path(archivalPath));
                boolean archFound = false;
                for (FileStatus fileStatus : metaFileStatuses) {
                    Matcher archFileMatcher = ARCHIVE_FILE_PATTERN.matcher(fileStatus.getPath().getName());
                    if (archFileMatcher.matches()) {
                        archFound = true;
                    }
                }
                if (config.validateArchival() && !archFound) {
                    throw new AssertionError("Archival NotFound in " + metadataPath);
                }
                if (config.validateClean() && !cleanFound) {
                    throw new AssertionError("Clean commits NotFound in " + metadataPath);
                }
            }
        } catch (Exception e) {
            log.warn("Exception thrown in ValidateHoodieAsyncOperations Node :: " + e.getCause() + ", msg :: " + e.getMessage());
            throw e;
        }
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) ExecutionContext(org.apache.hudi.integ.testsuite.dag.ExecutionContext) Arrays(java.util.Arrays) Logger(org.slf4j.Logger) LoggerFactory(org.slf4j.LoggerFactory) FileSystem(org.apache.hadoop.fs.FileSystem) FileStatus(org.apache.hadoop.fs.FileStatus) Path(org.apache.hadoop.fs.Path) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) Map(java.util.Map) List(java.util.List) Collectors(java.util.stream.Collectors) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Matcher(java.util.regex.Matcher) Pattern(java.util.regex.Pattern) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Config(org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)
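
The checks above reduce to scanning the table's .hoodie metadata directory (and its archived subfolder) for file names that match the clean and archive patterns. A minimal standalone sketch of that file-level check, run against a hypothetical table base path rather than through the test-suite DAG, could look like this (the class name, argument handling, and the plain Hadoop Configuration are assumptions, not part of the Hudi code):

import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class AsyncOperationFileCheck {

    // Same patterns as ValidateAsyncOperations above
    private static final Pattern ARCHIVE_FILE_PATTERN = Pattern.compile("\\.commits_\\.archive\\..*");
    private static final Pattern CLEAN_FILE_PATTERN = Pattern.compile(".*\\.clean\\..*");

    public static void main(String[] args) throws Exception {
        // Hypothetical table base path, e.g. file:///tmp/hudi_table
        String basePath = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = new Path(basePath).getFileSystem(conf);

        boolean cleanFound = anyMatch(fs.listStatus(new Path(basePath + "/.hoodie")), CLEAN_FILE_PATTERN);
        boolean archFound = anyMatch(fs.listStatus(new Path(basePath + "/.hoodie/archived")), ARCHIVE_FILE_PATTERN);

        System.out.println("clean action files found: " + cleanFound);
        System.out.println("archived commit files found: " + archFound);
    }

    private static boolean anyMatch(FileStatus[] statuses, Pattern pattern) {
        for (FileStatus status : statuses) {
            if (pattern.matcher(status.getPath().getName()).matches()) {
                return true;
            }
        }
        return false;
    }
}

The two loops stand in for the node's cleanFound / archFound checks; the real node additionally cross-checks the number of uncleaned instants on the active timeline through HoodieTableMetaClient and CleanerUtils.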

Example 2 with ExecutionContext

Use of org.apache.hudi.integ.testsuite.dag.ExecutionContext in the hudi project by apache.

From the class BaseValidateDatasetNode, the getInputDf method:

private Dataset<Row> getInputDf(ExecutionContext context, SparkSession session, String inputPath) {
    String recordKeyField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.RECORDKEY_FIELD().key());
    String partitionPathField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.PARTITIONPATH_FIELD().key());
    // todo: fix hard coded fields from configs.
    // read input and resolve insert, updates, etc.
    Dataset<Row> inputDf = session.read().format("avro").load(inputPath);
    ExpressionEncoder encoder = getEncoder(inputDf.schema());
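    // Key rows by (partition path + record key), keep the row with the larger ordering field per key, then drop deleted rows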
    return inputDf.groupByKey((MapFunction<Row, String>) value -> value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField), Encoders.STRING()).reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
        int ts1 = v1.getAs(SchemaUtils.SOURCE_ORDERING_FIELD);
        int ts2 = v2.getAs(SchemaUtils.SOURCE_ORDERING_FIELD);
        if (ts1 > ts2) {
            return v1;
        } else {
            return v2;
        }
    }).map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, encoder).filter("_hoodie_is_deleted != true");
}
Also used : Attribute(org.apache.spark.sql.catalyst.expressions.Attribute) StructType(org.apache.spark.sql.types.StructType) ExecutionContext(org.apache.hudi.integ.testsuite.dag.ExecutionContext) JavaConversions(scala.collection.JavaConversions) Logger(org.slf4j.Logger) Dataset(org.apache.spark.sql.Dataset) FileSystem(org.apache.hadoop.fs.FileSystem) RowEncoder(org.apache.spark.sql.catalyst.encoders.RowEncoder) Row(org.apache.spark.sql.Row) FileStatus(org.apache.hadoop.fs.FileStatus) Tuple2(scala.Tuple2) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) ReduceFunction(org.apache.spark.api.java.function.ReduceFunction) Collectors(java.util.stream.Collectors) Encoders(org.apache.spark.sql.Encoders) ExpressionEncoder(org.apache.spark.sql.catalyst.encoders.ExpressionEncoder) List(java.util.List) SchemaUtils(org.apache.hudi.integ.testsuite.schema.SchemaUtils) SimpleAnalyzer$(org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$) DeltaConfig(org.apache.hudi.integ.testsuite.configuration.DeltaConfig) JavaConverters(scala.collection.JavaConverters) Path(org.apache.hadoop.fs.Path) MapFunction(org.apache.spark.api.java.function.MapFunction) SparkSession(org.apache.spark.sql.SparkSession)
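
The heart of getInputDf is a "latest record wins" de-duplication: rows are keyed by partition path plus record key, and within each key only the row with the larger source ordering field survives. A minimal standalone sketch of the same Spark pattern follows; the column names partition_path, _row_key, and ts are hypothetical stand-ins for the configured fields, spark-avro is assumed to be on the classpath, and RowEncoder.apply is assumed to be available in the Spark version in use:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;

import scala.Tuple2;

public class LatestRecordDedup {

    public static Dataset<Row> latestByKey(SparkSession spark, String inputPath) {
        Dataset<Row> inputDf = spark.read().format("avro").load(inputPath);
        Encoder<Row> encoder = RowEncoder.apply(inputDf.schema());
        return inputDf
            // key each row by partition path + record key
            .groupByKey((MapFunction<Row, String>) row ->
                row.getAs("partition_path") + "+" + row.getAs("_row_key"), Encoders.STRING())
            // within each key, keep the row with the larger ordering field
            .reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
                int ts1 = v1.getAs("ts");
                int ts2 = v2.getAs("ts");
                return ts1 >= ts2 ? v1 : v2;
            })
            // drop the grouping key, keep the surviving row
            .map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, encoder);
    }
}

The actual node resolves the key fields from DataSourceWriteOptions, builds its encoder through a custom getEncoder helper, and finally filters out rows where _hoodie_is_deleted is true.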

Aggregations

List (java.util.List) 2
Collectors (java.util.stream.Collectors) 2
FileStatus (org.apache.hadoop.fs.FileStatus) 2
FileSystem (org.apache.hadoop.fs.FileSystem) 2
Path (org.apache.hadoop.fs.Path) 2
ExecutionContext (org.apache.hudi.integ.testsuite.dag.ExecutionContext) 2
Logger (org.slf4j.Logger) 2
Arrays (java.util.Arrays) 1
HashMap (java.util.HashMap) 1
Map (java.util.Map) 1
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 1
Matcher (java.util.regex.Matcher) 1
Pattern (java.util.regex.Pattern) 1
DataSourceWriteOptions (org.apache.hudi.DataSourceWriteOptions) 1
HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata) 1
FSUtils (org.apache.hudi.common.fs.FSUtils) 1
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 1
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 1
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 1
CleanerUtils (org.apache.hudi.common.util.CleanerUtils) 1