Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
The class HoodieClusteringJob, method getSchemaFromLatestInstant.
private String getSchemaFromLatestInstant() throws Exception {
  TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
  if (metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 0) {
    throw new HoodieException("Cannot run clustering without any completed commits");
  }
  Schema schema = schemaResolver.getTableAvroSchema(false);
  return schema.toString();
}
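For context, the metaClient field used above is built from the table's base path. Below is a minimal standalone sketch of the same schema-resolution pattern; the basePath value is hypothetical, and the HoodieTableMetaClient.builder() calls are assumed to match recent Hudi releases.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;

public class SchemaFromLatestInstantExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical table location; replace with the real base path.
    String basePath = "file:///tmp/hudi_trips_cow";

    // Build a meta client for the table (builder API assumed from recent Hudi releases).
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(new Configuration())
        .setBasePath(basePath)
        .build();

    // Same pattern as getSchemaFromLatestInstant(): resolve the table's Avro schema
    // without the Hudi metadata fields (the `false` argument).
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    Schema schema = schemaResolver.getTableAvroSchema(false);
    System.out.println(schema.toString(true));
  }
}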
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
The class HoodieCompactor, method getSchemaFromLatestInstant.
private String getSchemaFromLatestInstant() throws Exception {
  TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient);
  if (metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 0) {
    throw new HoodieException("Cannot run compaction without any completed commits");
  }
  Schema schema = schemaUtil.getTableAvroSchema(false);
  return schema.toString();
}
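One typical downstream use of the returned schema string is to seed a write config so that compaction runs against the latest table schema. The sketch below shows that hand-off under the assumption that HoodieWriteConfig.newBuilder() with withPath/withSchema behaves as in recent Hudi releases; basePath and metaClient are supplied by the caller.

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.config.HoodieWriteConfig;

public class CompactorWriteConfigExample {

  // Builds a write config whose schema matches the latest completed commit.
  // basePath and metaClient are assumed to be supplied by the caller.
  static HoodieWriteConfig buildWriteConfig(String basePath, HoodieTableMetaClient metaClient) throws Exception {
    String schemaStr = new TableSchemaResolver(metaClient)
        .getTableAvroSchema(false)
        .toString();
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(schemaStr)
        .build();
  }
}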
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
The class BootstrapOperator, method loadRecords.
/**
 * Loads all the indices of the given partition path into the backup state.
 *
 * @param partitionPath The partition path
 */
@SuppressWarnings("unchecked")
protected void loadRecords(String partitionPath) throws Exception {
  long start = System.currentTimeMillis();

  final int parallelism = getRuntimeContext().getNumberOfParallelSubtasks();
  final int maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks();
  final int taskID = getRuntimeContext().getIndexOfThisSubtask();

  HoodieTimeline commitsTimeline = this.hoodieTable.getMetaClient().getCommitsTimeline();
  if (!StringUtils.isNullOrEmpty(lastInstantTime)) {
    commitsTimeline = commitsTimeline.findInstantsAfter(lastInstantTime);
  }
  Option<HoodieInstant> latestCommitTime = commitsTimeline.filterCompletedInstants().lastInstant();

  if (latestCommitTime.isPresent()) {
    BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat());
    Schema schema = new TableSchemaResolver(this.hoodieTable.getMetaClient()).getTableAvroSchema();

    List<FileSlice> fileSlices = this.hoodieTable.getSliceView()
        .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp(), true)
        .collect(toList());

    for (FileSlice fileSlice : fileSlices) {
      if (!shouldLoadFile(fileSlice.getFileId(), maxParallelism, parallelism, taskID)) {
        continue;
      }
      LOG.info("Load records from {}.", fileSlice);

      // load parquet records
      fileSlice.getBaseFile().ifPresent(baseFile -> {
        // filter out corrupted files
        if (!isValidFile(baseFile.getFileStatus())) {
          return;
        }
        try (ClosableIterator<HoodieKey> iterator = fileUtils.getHoodieKeyIterator(this.hadoopConf, new Path(baseFile.getPath()))) {
          iterator.forEachRemaining(hoodieKey -> {
            output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice))));
          });
        }
      });

      // load avro log records
      List<String> logPaths = fileSlice.getLogFiles()
          .filter(logFile -> isValidFile(logFile.getFileStatus()))
          .map(logFile -> logFile.getPath().toString())
          .collect(toList());
      HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(logPaths, schema, latestCommitTime.get().getTimestamp(), writeConfig, hadoopConf);

      try {
        for (String recordKey : scanner.getRecords().keySet()) {
          output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(new HoodieKey(recordKey, partitionPath), fileSlice))));
        }
      } catch (Exception e) {
        throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e);
      } finally {
        scanner.close();
      }
    }
  }

  long cost = System.currentTimeMillis() - start;
  LOG.info("Task [{}:{}] finished loading the index under partition {} and sending it downstream, time cost: {} milliseconds.",
      this.getClass().getSimpleName(), taskID, partitionPath, cost);
}
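The shouldLoadFile(fileId, maxParallelism, parallelism, taskID) guard is what lets each parallel subtask load only its own share of the file slices, so the index is bootstrapped exactly once across the operator instances. The real helper is defined elsewhere in BootstrapOperator and is not shown here; the following is only an illustrative stand-in that conveys the intent with a plain hash-modulo assignment.

// Illustrative sketch only, not the Hudi implementation: assign each file id to
// exactly one subtask so that every file slice is loaded by a single parallel instance.
private static boolean shouldLoadFile(String fileId, int maxParallelism, int parallelism, int taskID) {
  // Deterministic, non-negative hash of the file id, mapped onto the subtask range.
  // The real helper also receives maxParallelism; this sketch ignores it.
  int bucket = (fileId.hashCode() & Integer.MAX_VALUE) % parallelism;
  return bucket == taskID;
}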
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
The class CompactionUtil, method setAvroSchema.
/**
 * Sets up the Avro schema string in the {@code HoodieWriteConfig}
 * by reading it from the hoodie table metadata.
 *
 * @param writeConfig The HoodieWriteConfig
 * @param metaClient  The hoodie table meta client
 */
public static void setAvroSchema(HoodieWriteConfig writeConfig, HoodieTableMetaClient metaClient) throws Exception {
  TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient);
  Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchema(false);
  writeConfig.setSchema(tableAvroSchema.toString());
}
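A typical call site builds the meta client from the write config's base path and then delegates to this method. The sketch below assumes the HoodieTableMetaClient.builder() API of recent Hudi releases and the org.apache.hudi.util.CompactionUtil package of the hudi-flink module.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.util.CompactionUtil; // package assumed (hudi-flink module)

public class SetAvroSchemaExample {

  // Before scheduling a compaction, make sure the write config carries the
  // table's current Avro schema (without metadata fields).
  static void prepare(HoodieWriteConfig writeConfig, Configuration hadoopConf) throws Exception {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(hadoopConf)
        .setBasePath(writeConfig.getBasePath())
        .build();
    CompactionUtil.setAvroSchema(writeConfig, metaClient);
  }
}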
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
The class CompactionUtil, method inferChangelogMode.
/**
 * Infers the changelog mode based on the data file schema (including metadata fields).
 *
 * <p>This could be simplified once the changelog mode is persisted as a table config.
 *
 * @param conf       The Flink configuration
 * @param metaClient The hoodie table meta client
 */
public static void inferChangelogMode(Configuration conf, HoodieTableMetaClient metaClient) throws Exception {
  TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient);
  Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchemaFromDataFile();
  if (tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null) {
    conf.setBoolean(FlinkOptions.CHANGELOG_ENABLED, true);
  }
}
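After this call, downstream Flink operators can read the inferred flag straight back from the configuration. A short sketch of that round trip follows; the org.apache.hudi.configuration.FlinkOptions and org.apache.hudi.util.CompactionUtil package names are assumptions based on recent Hudi releases.

import org.apache.flink.configuration.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.configuration.FlinkOptions;  // package assumed
import org.apache.hudi.util.CompactionUtil;         // package assumed (hudi-flink module)

public class InferChangelogModeExample {

  // Infers the changelog mode from the data file schema, then reads the flag back.
  static boolean resolveChangelogMode(Configuration conf, HoodieTableMetaClient metaClient) throws Exception {
    CompactionUtil.inferChangelogMode(conf, metaClient);
    // True when the data files carry the operation metadata field, i.e. the table
    // was written with changelog mode enabled.
    return conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED);
  }
}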