use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
the class HoodieCompactor method compact.
/**
* Execute compaction operations and report back status.
*/
public HoodieData<WriteStatus> compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan,
                                       HoodieTable table, HoodieWriteConfig config, String compactionInstantTime,
                                       HoodieCompactionHandler compactionHandler) {
  if (compactionPlan == null || (compactionPlan.getOperations() == null) || (compactionPlan.getOperations().isEmpty())) {
    return context.emptyHoodieData();
  }
  HoodieActiveTimeline timeline = table.getActiveTimeline();
  HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
  // Mark instant as compaction inflight
  timeline.transitionCompactionRequestedToInflight(instant);
  table.getMetaClient().reloadActiveTimeline();
  HoodieTableMetaClient metaClient = table.getMetaClient();
  TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
  // Use the table schema as the reader schema, since the schema in the write config
  // may not be the same as the table schema.
  try {
    Schema readerSchema = schemaResolver.getTableAvroSchema(false);
    config.setSchema(readerSchema.toString());
  } catch (Exception e) {
    // If there is no commit in the table, just ignore the exception.
  }
  // Compacting is very similar to applying updates to existing files.
  List<CompactionOperation> operations = compactionPlan.getOperations().stream()
      .map(CompactionOperation::convertFromAvroRecordInstance)
      .collect(toList());
  LOG.info("Compactor compacting " + operations + " files");
  context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices");
  TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier();
  return context.parallelize(operations)
      .map(operation -> compact(compactionHandler, metaClient, config, operation, compactionInstantTime, taskContextSupplier))
      .flatMap(List::iterator);
}
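The tail of this method fans each CompactionOperation out to a parallel worker and flattens the per-operation List<WriteStatus> results into a single HoodieData. As a minimal, engine-agnostic sketch of that map-then-flatten shape in plain Java streams (all names below are invented for illustration; HoodieData mirrors these semantics over the engine's distributed collection):
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class MapFlattenShape {
  public static void main(String[] args) {
    List<String> operations = Arrays.asList("fileSlice-1", "fileSlice-2");
    // Each operation yields a list of per-file write results; flattening them mirrors
    // context.parallelize(operations).map(...).flatMap(List::iterator) above.
    List<String> statuses = operations.stream()
        .map(op -> Arrays.asList(op + ":status-a", op + ":status-b"))
        .flatMap(List::stream)
        .collect(Collectors.toList());
    System.out.println(statuses); // [fileSlice-1:status-a, fileSlice-1:status-b, fileSlice-2:status-a, fileSlice-2:status-b]
  }
}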
use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
the class TableSchemaResolver method readSchemaFromLastCompaction.
/**
* Read the schema from a data file written by the last completed compaction commit.
* @throws Exception if no compaction commit is found or its metadata cannot be read
*/
public MessageType readSchemaFromLastCompaction(Option<HoodieInstant> lastCompactionCommitOpt) throws Exception {
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  HoodieInstant lastCompactionCommit = lastCompactionCommitOpt.orElseThrow(() -> new Exception(
      "Could not read schema from last compaction, no compaction commits found on path " + metaClient));
  // Read the schema from one of the files written by the compaction.
  HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata.fromBytes(
      activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
  String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
      .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
          + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
  return readSchemaFromBaseFile(filePath);
}
}
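A hypothetical call site for this method, assuming a merge-on-read table whose completed compactions show up as commit actions on the active timeline. The builder calls reflect a common Hudi release and may differ in others; the base path is invented:
// Sketch under the stated assumptions.
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(new Configuration())          // org.apache.hadoop.conf.Configuration
    .setBasePath("/warehouse/hudi_table")  // hypothetical table path
    .build();
TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
Option<HoodieInstant> lastCompaction = metaClient.getActiveTimeline()
    .getCommitTimeline().filterCompletedInstants().lastInstant();
MessageType parquetSchema = resolver.readSchemaFromLastCompaction(lastCompaction);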
use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
the class TableSchemaResolver method getTableParquetSchemaFromDataFile.
/**
* Gets the schema for a hoodie table. Depending on the table type, the schema is read from a file written in the
* latest commit. We assume that the schema has not changed within a single atomic write.
*
* @return Parquet schema for this table
*/
private MessageType getTableParquetSchemaFromDataFile() {
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = activeTimeline.getLastCommitMetadataWithValidData();
  try {
    switch (metaClient.getTableType()) {
      case COPY_ON_WRITE:
        // For COW tables, any data file written must currently be in Parquet or ORC format.
        if (instantAndCommitMetadata.isPresent()) {
          HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
          String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
          return readSchemaFromBaseFile(filePath);
        } else {
          throw new IllegalArgumentException("Could not find any data file written for commit, "
              + "so could not get schema for table " + metaClient.getBasePath());
        }
      case MERGE_ON_READ:
        // Determine the file format from the file name, then extract the schema from it.
        if (instantAndCommitMetadata.isPresent()) {
          HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
          String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
          if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
            // This is a log file.
            return readSchemaFromLogFile(new Path(filePath));
          } else {
            return readSchemaFromBaseFile(filePath);
          }
        } else {
          throw new IllegalArgumentException("Could not find any data file written for commit, "
              + "so could not get schema for table " + metaClient.getBasePath());
        }
      default:
        LOG.error("Unknown table type " + metaClient.getTableType());
        throw new InvalidTableException(metaClient.getBasePath());
    }
  } catch (IOException e) {
    throw new HoodieException("Failed to read data schema", e);
  }
}
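The MERGE_ON_READ branch picks a reader purely from the file name: log files hold Avro-encoded blocks while base files are Parquet or ORC. A self-contained sketch of that dispatch; the sample path is invented, and ".log" is the value HoodieFileFormat.HOODIE_LOG.getFileExtension() returns:
public class SchemaSourceDispatch {
  public static void main(String[] args) {
    String logExtension = ".log"; // HoodieFileFormat.HOODIE_LOG.getFileExtension()
    String filePath = "/warehouse/hudi_table/2024/01/.f1e2d3_20240101120000.log.1_0-1-0";
    // The reader is chosen from the file name alone, as in the branch above.
    String source = filePath.contains(logExtension) ? "log file" : "base file";
    System.out.println("read schema from " + source); // read schema from log file
  }
}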
use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
the class HoodieTestSuiteJob method getSchemaVersionFromCommit.
int getSchemaVersionFromCommit(int nthCommit) throws Exception {
  int version = 0;
  try {
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitsTimeline();
    // Pick up the schema version from the nth commit from the last (the most recent insert/upsert will be rolled back).
    HoodieInstant prevInstant = timeline.nthFromLastInstant(nthCommit).get();
    HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(prevInstant).get(), HoodieCommitMetadata.class);
    Map<String, String> extraMetadata = commit.getExtraMetadata();
    String avroSchemaStr = extraMetadata.get(HoodieCommitMetadata.SCHEMA_KEY);
    Schema avroSchema = new Schema.Parser().parse(avroSchemaStr);
    version = Integer.parseInt(avroSchema.getObjectProp("schemaVersion").toString());
    // The DAG will generate and ingest data for 2 versions (the n-th version being validated, and n-1).
    log.info(String.format("Last used schemaVersion from latest commit file was %d. Optimizing the DAG.", version));
  } catch (Exception e) {
    // Failed to open the commit to read the schema version;
    // continue executing the DAG without any changes.
    log.info("Last used schemaVersion could not be validated from commit file. Skipping SaferSchema Optimization.");
  }
  return version;
}
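The version comes from a top-level "schemaVersion" property carried on the committed Avro schema; Avro preserves unknown top-level JSON properties and exposes them via getObjectProp. A self-contained sketch of that extraction, with an invented schema JSON:
import org.apache.avro.Schema;

public class SchemaVersionProp {
  public static void main(String[] args) {
    String avroSchemaStr = "{\"type\":\"record\",\"name\":\"Rec\",\"schemaVersion\":2,"
        + "\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}";
    Schema avroSchema = new Schema.Parser().parse(avroSchemaStr);
    // getObjectProp returns the raw JSON value; toString + parseInt matches the method above.
    int version = Integer.parseInt(avroSchema.getObjectProp("schemaVersion").toString());
    System.out.println(version); // 2
  }
}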
use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
the class BaseHoodieWriteClient method commit.
protected void commit(HoodieTable table, String commitActionType, String instantTime, HoodieCommitMetadata metadata,
                      List<HoodieWriteStat> stats) throws IOException {
  LOG.info("Committing " + instantTime + " action " + commitActionType);
  HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
  // Finalize write
  finalizeWrite(table, instantTime, stats);
  // Update the metadata table
  writeTableMetadata(table, instantTime, commitActionType, metadata);
  activeTimeline.saveAsComplete(new HoodieInstant(true, commitActionType, instantTime),
      Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
}
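saveAsComplete transitions the inflight instant to its completed state on the timeline, persisting the serialized commit metadata as the instant's details. A hedged sketch of that last step; activeTimeline and metadata are the objects from the method above, and the instant time is invented:
// Sketch only: activeTimeline and metadata come from the method above.
String commitActionType = HoodieTimeline.COMMIT_ACTION; // "commit"
String instantTime = "20240101120000";                  // hypothetical instant time
// The boolean flag marks the instant as inflight; completing a commit action writes
// <instantTime>.commit under the table's .hoodie/ directory with the metadata bytes.
HoodieInstant inflight = new HoodieInstant(true, commitActionType, instantTime);
activeTimeline.saveAsComplete(inflight, Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));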