Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TableSchemaResolver, method getTableParquetSchemaFromDataFile.
/**
 * Gets the schema for a hoodie table. Depending on the type of table, reads from any file written in the latest
 * commit. We will assume that the schema has not changed within a single atomic write.
 *
 * @return Parquet schema for this table
 */
private MessageType getTableParquetSchemaFromDataFile() {
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
      activeTimeline.getLastCommitMetadataWithValidData();
  try {
    switch (metaClient.getTableType()) {
      case COPY_ON_WRITE:
        // For a COW table, the data files written must currently be in Parquet or ORC format.
        if (instantAndCommitMetadata.isPresent()) {
          HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
          String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream()
              .findAny().get();
          return readSchemaFromBaseFile(filePath);
        } else {
          throw new IllegalArgumentException("Could not find any data file written for commit, "
              + "so could not get schema for table " + metaClient.getBasePath());
        }
      case MERGE_ON_READ:
        // Determine the file format based on the file name, and then extract the schema from it.
        if (instantAndCommitMetadata.isPresent()) {
          HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
          String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream()
              .findAny().get();
          if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
            // This is a log file.
            return readSchemaFromLogFile(new Path(filePath));
          } else {
            return readSchemaFromBaseFile(filePath);
          }
        } else {
          throw new IllegalArgumentException("Could not find any data file written for commit, "
              + "so could not get schema for table " + metaClient.getBasePath());
        }
      default:
        LOG.error("Unknown table type " + metaClient.getTableType());
        throw new InvalidTableException(metaClient.getBasePath());
    }
  } catch (IOException e) {
    throw new HoodieException("Failed to read data schema", e);
  }
}
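For orientation, here is a minimal sketch of how calling code typically reaches this private logic, assuming the public getTableAvroSchema() entry point on TableSchemaResolver; the Hadoop configuration and base path below are placeholders, not values from the snippet above.

// Sketch: resolve the latest table schema for an existing table (illustrative base path).
Schema resolveLatestTableSchema(org.apache.hadoop.conf.Configuration hadoopConf) throws Exception {
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(hadoopConf)
      .setBasePath("/tmp/hoodie/example_table") // hypothetical table base path
      .build();
  TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
  // Resolves the latest writer schema, falling back to reading a data file as in the method above.
  return resolver.getTableAvroSchema();
}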
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TableSchemaResolver, method getTableSchemaFromCommitMetadata.
/**
 * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the instant.
 *
 * @return Avro schema for this table
 */
private Option<Schema> getTableSchemaFromCommitMetadata(HoodieInstant instant, boolean includeMetadataFields) {
  try {
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    byte[] data = timeline.getInstantDetails(instant).get();
    HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
    String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
    if (StringUtils.isNullOrEmpty(existingSchemaStr)) {
      return Option.empty();
    }
    Schema schema = new Schema.Parser().parse(existingSchemaStr);
    if (includeMetadataFields) {
      schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField);
    }
    return Option.of(schema);
  } catch (Exception e) {
    throw new HoodieException("Failed to read schema from commit metadata", e);
  }
}
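When includeMetadataFields is set, the schema read from commit metadata is widened with Hudi's meta columns. A rough illustration of that step, where dataSchema stands in for any Avro record schema you already hold:

// Sketch: HoodieAvroUtils.addMetadataFields prepends Hudi's meta columns to a data schema.
// Passing false for the operation field mirrors tables without the _hoodie_operation column.
Schema withMetaFields = HoodieAvroUtils.addMetadataFields(dataSchema, false);
// The result starts with _hoodie_commit_time, _hoodie_commit_seqno, _hoodie_record_key,
// _hoodie_partition_path and _hoodie_file_name, followed by the original data fields.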
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TimelineUtils, method getMetadataValue.
private static Option<String> getMetadataValue(HoodieTableMetaClient metaClient, String extraMetadataKey, HoodieInstant instant) {
  try {
    LOG.info("reading checkpoint info for:" + instant + " key: " + extraMetadataKey);
    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
        metaClient.getCommitsTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class);
    return Option.ofNullable(commitMetadata.getExtraMetadata().get(extraMetadataKey));
  } catch (IOException e) {
    throw new HoodieIOException("Unable to parse instant metadata " + instant, e);
  }
}
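Callers normally do not invoke this private helper directly; a public wrapper on TimelineUtils resolves a key against the latest completed commit. A hedged usage sketch, assuming the getExtraMetadataFromLatest wrapper and an initialized metaClient; the checkpoint key literal is illustrative:

// Sketch: look up an extra-metadata key on the latest completed commit via the public TimelineUtils API.
Option<String> checkpoint =
    TimelineUtils.getExtraMetadataFromLatest(metaClient, "deltastreamer.checkpoint.key"); // illustrative key
checkpoint.ifPresent(cp -> LOG.info("Latest checkpoint: " + cp));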
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class HoodieTestSuiteJob, method getSchemaVersionFromCommit.
int getSchemaVersionFromCommit(int nthCommit) throws Exception {
  int version = 0;
  try {
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitsTimeline();
    // Pick up the schema version from the nth commit from the end (the most recent insert/upsert will be rolled back).
    HoodieInstant prevInstant = timeline.nthFromLastInstant(nthCommit).get();
    HoodieCommitMetadata commit =
        HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(prevInstant).get(), HoodieCommitMetadata.class);
    Map<String, String> extraMetadata = commit.getExtraMetadata();
    String avroSchemaStr = extraMetadata.get(HoodieCommitMetadata.SCHEMA_KEY);
    Schema avroSchema = new Schema.Parser().parse(avroSchemaStr);
    version = Integer.parseInt(avroSchema.getObjectProp("schemaVersion").toString());
    // The DAG will generate and ingest data for 2 versions (the n-th version being validated, and n-1).
    log.info(String.format("Last used schemaVersion from latest commit file was %d. Optimizing the DAG.", version));
  } catch (Exception e) {
    // Failed to open the commit to read the schema version; continue executing the DAG without any changes.
    log.info("Last used schemaVersion could not be validated from commit file. Skipping SaferSchema Optimization.");
  }
  return version;
}
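This lookup relies on the writer having attached a custom "schemaVersion" property to the Avro schema stored under HoodieCommitMetadata.SCHEMA_KEY. A self-contained sketch of that round trip, using a made-up record schema:

// Sketch: an Avro schema carrying the custom "schemaVersion" object property this method reads back.
String schemaJson = "{\"type\":\"record\",\"name\":\"ExampleRecord\",\"schemaVersion\":2,"
    + "\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}";
Schema avroSchema = new Schema.Parser().parse(schemaJson);
int version = Integer.parseInt(avroSchema.getObjectProp("schemaVersion").toString()); // -> 2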
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class HoodieTestSuiteWriter, method commitCompaction.
public void commitCompaction(JavaRDD<WriteStatus> records, JavaRDD<DeltaWriteStats> generatedDataStats, Option<String> instantTime) throws IOException {
  if (!cfg.useDeltaStreamer) {
    Map<String, String> extraMetadata = new HashMap<>();
    /**
     * Store the checkpoint in the commit metadata just like
     * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)}
     */
    extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
    if (generatedDataStats != null && generatedDataStats.count() > 1) {
      // Just store the path where this batch of data was generated.
      extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0));
    }
    HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext());
    HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(
        table, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema());
    writeClient.commitCompaction(instantTime.get(), metadata, Option.of(extraMetadata));
  }
}
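For completeness, a hedged sketch of reading that extra metadata back off the completed instant; it assumes an initialized metaClient and at least one completed commit, reuses the same key constant referenced above, and reduces checked-exception handling to a throws clause.

// Sketch: read back the checkpoint recorded by commitCompaction above from the latest completed commit.
String readCheckpointFromLatestCommit(HoodieTableMetaClient metaClient) throws IOException {
  HoodieTimeline commits = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  HoodieInstant latest = commits.lastInstant().get(); // assumes at least one completed commit
  HoodieCommitMetadata metadata =
      HoodieCommitMetadata.fromBytes(commits.getInstantDetails(latest).get(), HoodieCommitMetadata.class);
  return metadata.getExtraMetadata().get(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY);
}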