
Example 96 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From class TableSchemaResolver, the method getTableParquetSchemaFromDataFile.

/**
 * Gets the schema for a hoodie table. Depending on the table type, reads the schema from any file written in the
 * latest commit. We assume that the schema has not changed within a single atomic write.
 *
 * @return Parquet schema for this table
 */
private MessageType getTableParquetSchemaFromDataFile() {
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = activeTimeline.getLastCommitMetadataWithValidData();
    try {
        switch(metaClient.getTableType()) {
            case COPY_ON_WRITE:
                // For a COW table, any data file written must currently be in Parquet or ORC format.
                if (instantAndCommitMetadata.isPresent()) {
                    HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
                    String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
                    return readSchemaFromBaseFile(filePath);
                } else {
                    throw new IllegalArgumentException("Could not find any data file written for commit, " + "so could not get schema for table " + metaClient.getBasePath());
                }
            case MERGE_ON_READ:
                // Determine the file format from the file name, then extract the schema from it.
                if (instantAndCommitMetadata.isPresent()) {
                    HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
                    String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
                    if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
                        // this is a log file
                        return readSchemaFromLogFile(new Path(filePath));
                    } else {
                        return readSchemaFromBaseFile(filePath);
                    }
                } else {
                    throw new IllegalArgumentException("Could not find any data file written for commit, " + "so could not get schema for table " + metaClient.getBasePath());
                }
            default:
                LOG.error("Unknown table type " + metaClient.getTableType());
                throw new InvalidTableException(metaClient.getBasePath());
        }
    } catch (IOException e) {
        throw new HoodieException("Failed to read data schema", e);
    }
}
Also used: HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) InvalidTableException(org.apache.hudi.exception.InvalidTableException) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) Pair(org.apache.hudi.common.util.collection.Pair)
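
Since getTableParquetSchemaFromDataFile() is private, callers reach it indirectly through the resolver's public accessors. A minimal sketch of that entry point, assuming a Hudi table already exists at the hypothetical base path /tmp/hoodie/sample-table:

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;

public class SchemaFromDataFileExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical base path; point this at a real Hudi table.
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
                .setConf(new Configuration())
                .setBasePath("/tmp/hoodie/sample-table")
                .build();
        TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
        // getTableAvroSchema() falls back to reading a data file when the commit
        // metadata carries no schema, which is where the private method above runs.
        Schema schema = resolver.getTableAvroSchema();
        System.out.println(schema.toString(true));
    }
}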

Example 97 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From class TableSchemaResolver, the method getTableSchemaFromCommitMetadata.

/**
 * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the instant.
 *
 * @return Avro schema for this table
 */
private Option<Schema> getTableSchemaFromCommitMetadata(HoodieInstant instant, boolean includeMetadataFields) {
    try {
        HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
        byte[] data = timeline.getInstantDetails(instant).get();
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
        String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
        if (StringUtils.isNullOrEmpty(existingSchemaStr)) {
            return Option.empty();
        }
        Schema schema = new Schema.Parser().parse(existingSchemaStr);
        if (includeMetadataFields) {
            schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField);
        }
        return Option.of(schema);
    } catch (Exception e) {
        throw new HoodieException("Failed to read schema from commit metadata", e);
    }
}
Also used: HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Schema(org.apache.avro.Schema) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) InvalidTableException(org.apache.hudi.exception.InvalidTableException)
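
The same lookup can be driven from outside the class. A minimal sketch, under the assumption that the latest completed commit recorded its writer schema under HoodieCommitMetadata.SCHEMA_KEY (the helper name latestCommitSchema is made up):

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;

public class CommitMetadataSchemaExample {
    // Hypothetical helper: the schema recorded in the latest completed commit, if any.
    static Option<Schema> latestCommitSchema(HoodieTableMetaClient metaClient) throws IOException {
        HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
        Option<HoodieInstant> last = timeline.lastInstant();
        if (!last.isPresent()) {
            return Option.empty();
        }
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(
                timeline.getInstantDetails(last.get()).get(), HoodieCommitMetadata.class);
        String schemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
        return StringUtils.isNullOrEmpty(schemaStr)
                ? Option.empty()
                : Option.of(new Schema.Parser().parse(schemaStr));
    }
}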

Example 98 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From class TimelineUtils, the method getMetadataValue.

private static Option<String> getMetadataValue(HoodieTableMetaClient metaClient, String extraMetadataKey, HoodieInstant instant) {
    try {
        LOG.info("Reading checkpoint info for: " + instant + " key: " + extraMetadataKey);
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(metaClient.getCommitsTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class);
        return Option.ofNullable(commitMetadata.getExtraMetadata().get(extraMetadataKey));
    } catch (IOException e) {
        throw new HoodieIOException("Unable to parse instant metadata " + instant, e);
    }
}
Also used: HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieIOException(org.apache.hudi.exception.HoodieIOException) IOException(java.io.IOException)
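
getMetadataValue is private, but the same pattern works inline, for example to pull one extra-metadata value off the latest completed commit. A minimal sketch (the helper name and the choice of targeting the latest instant are assumptions, not part of the snippet above):

import java.io.IOException;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;

public class ExtraMetadataLookupExample {
    // Hypothetical helper: read one extra-metadata value from the latest completed commit.
    static Option<String> fromLatestCommit(HoodieTableMetaClient metaClient, String key) throws IOException {
        HoodieTimeline timeline = metaClient.getCommitsTimeline().filterCompletedInstants();
        Option<HoodieInstant> last = timeline.lastInstant();
        if (!last.isPresent()) {
            return Option.empty();
        }
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
                timeline.getInstantDetails(last.get()).get(), HoodieCommitMetadata.class);
        // Missing keys come back as Option.empty(), same as in getMetadataValue above.
        return Option.ofNullable(commitMetadata.getExtraMetadata().get(key));
    }
}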

Example 99 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From class HoodieTestSuiteJob, the method getSchemaVersionFromCommit.

int getSchemaVersionFromCommit(int nthCommit) throws Exception {
    int version = 0;
    try {
        HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitsTimeline();
        // Pick up the schema version from the nth commit from the last (the most recent insert/upsert will be rolled back).
        HoodieInstant prevInstant = timeline.nthFromLastInstant(nthCommit).get();
        HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(prevInstant).get(), HoodieCommitMetadata.class);
        Map<String, String> extraMetadata = commit.getExtraMetadata();
        String avroSchemaStr = extraMetadata.get(HoodieCommitMetadata.SCHEMA_KEY);
        Schema avroSchema = new Schema.Parser().parse(avroSchemaStr);
        version = Integer.parseInt(avroSchema.getObjectProp("schemaVersion").toString());
        // DAG will generate & ingest data for 2 versions (n-th version being validated, n-1).
        log.info(String.format("Last used schemaVersion from latest commit file was %d. Optimizing the DAG.", version));
    } catch (Exception e) {
        // Failed to open the commit to read the schema version;
        // continue executing the DAG without any changes.
        log.info("Last used schemaVersion could not be validated from commit file. Skipping SaferSchema Optimization.");
    }
    return version;
}
Also used: HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Schema(org.apache.avro.Schema) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException)
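
The version lookup above only works if the writer stamped a custom schemaVersion property on the Avro schema it committed. A small sketch of how such a property round-trips (the record and field names are invented):

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class SchemaVersionPropExample {
    public static void main(String[] args) {
        // Invented record/field names; only the "schemaVersion" prop matters here.
        Schema schema = SchemaBuilder.record("trip").fields()
                .requiredString("rider")
                .endRecord();
        schema.addProp("schemaVersion", 3);
        // Mirrors the read side in getSchemaVersionFromCommit above.
        int version = Integer.parseInt(schema.getObjectProp("schemaVersion").toString());
        System.out.println("schemaVersion = " + version); // prints 3
    }
}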

Example 100 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From class HoodieTestSuiteWriter, the method commitCompaction.

public void commitCompaction(JavaRDD<WriteStatus> records, JavaRDD<DeltaWriteStats> generatedDataStats, Option<String> instantTime) throws IOException {
    if (!cfg.useDeltaStreamer) {
        Map<String, String> extraMetadata = new HashMap<>();
        /**
         * Store the checkpoint in the commit metadata just like
         * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)}.
         */
        extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
        if (generatedDataStats != null && generatedDataStats.count() > 1) {
            // Just store the path where this batch of data was generated.
            extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0));
        }
        HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext());
        HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(table, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema());
        writeClient.commitCompaction(instantTime.get(), metadata, Option.of(extraMetadata));
    }
}
Also used: Arrays(java.util.Arrays) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) HashSet(java.util.HashSet) DagNode(org.apache.hudi.integ.testsuite.dag.nodes.DagNode) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTestSuiteConfig(org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) HoodiePayloadConfig(org.apache.hudi.config.HoodiePayloadConfig) CompactHelpers(org.apache.hudi.table.action.compact.CompactHelpers) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) Serializable(java.io.Serializable) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) RollbackNode(org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) CleanNode(org.apache.hudi.integ.testsuite.dag.nodes.CleanNode) ScheduleCompactNode(org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) RDD(org.apache.spark.rdd.RDD) Pair(org.apache.hudi.common.util.collection.Pair)
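
For context, the commitCompaction(...) call above is the tail end of a schedule/compact/commit cycle. A minimal sketch of that cycle, assuming an already-configured SparkRDDWriteClient on a MERGE_ON_READ table with auto-commit disabled (the extra-metadata key and value are placeholders):

import java.util.Collections;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.spark.api.java.JavaRDD;

public class CompactAndCommitExample {
    static void compactOnce(SparkRDDWriteClient<?> writeClient) {
        // Schedule a compaction; an empty Option means no eligible file groups.
        Option<String> instant = writeClient.scheduleCompaction(Option.empty());
        if (!instant.isPresent()) {
            return;
        }
        // Run the compaction, then commit it together with custom extra metadata,
        // mirroring the commitCompaction(...) call in the snippet above.
        HoodieWriteMetadata<JavaRDD<WriteStatus>> result = writeClient.compact(instant.get());
        writeClient.commitCompaction(instant.get(), result.getCommitMetadata().get(),
                Option.of(Collections.singletonMap("placeholder.key", "placeholder.value")));
    }
}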

Aggregations

HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 139
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 64
ArrayList (java.util.ArrayList): 54
HashMap (java.util.HashMap): 49
List (java.util.List): 48
HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat): 44
IOException (java.io.IOException): 42
Test (org.junit.jupiter.api.Test): 41
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 40
Map (java.util.Map): 38
Path (org.apache.hadoop.fs.Path): 36
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 34
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 34
File (java.io.File): 26
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26
Option (org.apache.hudi.common.util.Option): 25
Schema (org.apache.avro.Schema): 22
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 21
Collectors (java.util.stream.Collectors): 20
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20