Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
Class UtilHelpers, method createLatestSchemaProvider.
/**
 * Creates the latest schema provider for the target schema.
 *
 * @param structType Spark data type of the incoming batch.
 * @param jssc instance of {@link JavaSparkContext}.
 * @param fs instance of {@link FileSystem}.
 * @param basePath base path of the table.
 * @return the schema provider whose target schema refers to the latest schema (either the incoming schema or the table schema).
 */
public static SchemaProvider createLatestSchemaProvider(StructType structType, JavaSparkContext jssc,
                                                        FileSystem fs, String basePath) {
  SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(structType);
  Schema writeSchema = rowSchemaProvider.getTargetSchema();
  Schema latestTableSchema = writeSchema;
  try {
    if (FSUtils.isTableExists(basePath, fs)) {
      HoodieTableMetaClient tableMetaClient = HoodieTableMetaClient.builder()
          .setConf(jssc.sc().hadoopConfiguration())
          .setBasePath(basePath)
          .build();
      TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(tableMetaClient);
      latestTableSchema = tableSchemaResolver.getLatestSchema(writeSchema, true,
          (Function1<Schema, Schema>) v1 -> AvroConversionUtils.convertStructTypeToAvroSchema(
              AvroConversionUtils.convertAvroSchemaToStructType(v1),
              RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME,
              RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE));
    }
  } catch (IOException e) {
    LOG.warn("Could not fetch table schema. Falling back to writer schema");
  }
  final Schema finalLatestTableSchema = latestTableSchema;
  return new SchemaProvider(new TypedProperties()) {
    @Override
    public Schema getSourceSchema() {
      return rowSchemaProvider.getSourceSchema();
    }

    @Override
    public Schema getTargetSchema() {
      return finalLatestTableSchema;
    }
  };
}
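For orientation, a minimal sketch of how this helper might be called from a Spark job. The session setup, base path, and incoming field names below are illustrative assumptions, not part of the Hudi API shown above.

// Illustrative usage sketch: the base path and field names are hypothetical.
SparkSession spark = SparkSession.builder().appName("schema-provider-demo").master("local[*]").getOrCreate();
JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
String basePath = "/tmp/hudi_table";
StructType incoming = new StructType()
    .add("uuid", DataTypes.StringType)
    .add("ts", DataTypes.LongType);
FileSystem fs = FSUtils.getFs(basePath, jssc.hadoopConfiguration());
SchemaProvider provider = UtilHelpers.createLatestSchemaProvider(incoming, jssc, fs, basePath);
// Target schema is the table's latest schema if the table exists, otherwise the incoming row schema.
Schema targetSchema = provider.getTargetSchema();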
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
Class TableCommand, method fetchTableSchema.
/**
 * Fetches the table schema in Avro format.
 */
@CliCommand(value = "fetch table schema", help = "Fetches latest table schema")
public String fetchTableSchema(
    @CliOption(key = { "outputFilePath" }, mandatory = false, help = "File path to write schema")
    final String outputFilePath) throws Exception {
  HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
  TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(client);
  Schema schema = tableSchemaResolver.getTableAvroSchema();
  if (outputFilePath != null) {
    LOG.info("Latest table schema : " + schema.toString(true));
    writeToFile(outputFilePath, schema.toString(true));
    return String.format("Latest table schema written to %s", outputFilePath);
  } else {
    return String.format("Latest table schema %s", schema.toString(true));
  }
}
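Outside the CLI, the same lookup can be done directly against a meta client. A minimal sketch, assuming an existing table at a hypothetical base path:

// Assumed base path; the Hadoop Configuration uses default filesystem settings.
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(new Configuration())
    .setBasePath("/tmp/hudi_table")
    .build();
Schema latestSchema = new TableSchemaResolver(metaClient).getTableAvroSchema();
System.out.println(latestSchema.toString(true)); // pretty-printed Avro JSON, as the CLI command prints it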
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
Class HoodieSparkCopyOnWriteTable, method updateColumnsStatsIndex.
private void updateColumnsStatsIndex(@Nonnull HoodieEngineContext context,
                                     @Nonnull List<HoodieWriteStat> updatedFilesStats,
                                     @Nonnull String instantTime) throws Exception {
  String sortColsList = config.getClusteringSortColumns();
  String basePath = metaClient.getBasePath();
  String indexPath = metaClient.getColumnStatsIndexPath();
  List<String> touchedFiles = updatedFilesStats.stream()
      .map(s -> new Path(basePath, s.getPath()).toString())
      .collect(Collectors.toList());
  if (touchedFiles.isEmpty() || StringUtils.isNullOrEmpty(sortColsList) || StringUtils.isNullOrEmpty(indexPath)) {
    return;
  }
  LOG.info(String.format("Updating column-statistics index table (%s)", indexPath));
  List<String> sortCols = Arrays.stream(sortColsList.split(","))
      .map(String::trim)
      .collect(Collectors.toList());
  HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) context;
  // Fetch table schema to appropriately construct col-stats index schema
  Schema tableWriteSchema = HoodieAvroUtils.createHoodieWriteSchema(
      new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields());
  List<String> completedCommits = metaClient.getCommitsTimeline().filterCompletedInstants().getInstants()
      .map(HoodieInstant::getTimestamp)
      .collect(Collectors.toList());
  ColumnStatsIndexHelper.updateColumnStatsIndexFor(
      sparkEngineContext.getSqlContext().sparkSession(),
      AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema),
      touchedFiles, sortCols, indexPath, instantTime, completedCommits);
  LOG.info(String.format("Successfully updated column-statistics index at instant (%s)", instantTime));
}
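Isolated from the clustering logic, the schema-resolution step above boils down to the following sketch; the meta client is assumed to be already built for the table.

// Resolve the table schema without Hudi metadata fields, re-wrap it as a Hudi write schema,
// then convert to a Spark StructType for the column-stats index helper.
Schema withoutMetaFields = new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields();
Schema tableWriteSchema = HoodieAvroUtils.createHoodieWriteSchema(withoutMetaFields);
StructType indexedSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema);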
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
Class TestStreamReadOperator, method createReader.
private OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> createReader() throws Exception {
  final String basePath = tempFile.getAbsolutePath();
  final org.apache.hadoop.conf.Configuration hadoopConf = StreamerUtil.getHadoopConf();
  final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(hadoopConf).setBasePath(basePath).build();
  final List<String> partitionKeys = Collections.singletonList("partition");

  // This input format is used to open the emitted splits.
  TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
  final Schema tableAvroSchema;
  try {
    tableAvroSchema = schemaResolver.getTableAvroSchema();
  } catch (Exception e) {
    throw new HoodieException("Get table avro schema error", e);
  }
  final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
  final RowType rowType = (RowType) rowDataType.getLogicalType();
  final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState(
      rowType,
      TestConfigurations.ROW_TYPE,
      tableAvroSchema.toString(),
      AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE).toString(),
      Collections.emptyList(),
      new String[0]);
  MergeOnReadInputFormat inputFormat = MergeOnReadInputFormat.builder()
      .config(conf)
      .tableState(hoodieTableState)
      .fieldTypes(rowDataType.getChildren())
      .defaultPartName("default")
      .limit(1000L)
      .emitDelete(true)
      .build();
  OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory = StreamReadOperator.factory(inputFormat);
  OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness =
      new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0);
  harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime);
  return harness;
}
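In a test, the returned harness would typically be set up and opened before splits are fed in; the split construction is omitted in this sketch because it depends on the table's file layout.

OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness = createReader();
harness.setup();
harness.open();
// Feed splits with harness.processElement(new StreamRecord<>(split)) and
// assert on the emitted rows via harness.extractOutputValues().
harness.close();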
Use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.
Class HoodieCopyOnWriteTableInputFormat, method getHoodieVirtualKeyInfo.
protected static Option<HoodieVirtualKeyInfo> getHoodieVirtualKeyInfo(HoodieTableMetaClient metaClient) {
  HoodieTableConfig tableConfig = metaClient.getTableConfig();
  if (tableConfig.populateMetaFields()) {
    return Option.empty();
  }
  TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient);
  try {
    Schema schema = tableSchemaResolver.getTableAvroSchema();
    return Option.of(new HoodieVirtualKeyInfo(
        tableConfig.getRecordKeyFieldProp(),
        tableConfig.getPartitionFieldProp(),
        schema.getField(tableConfig.getRecordKeyFieldProp()).pos(),
        schema.getField(tableConfig.getPartitionFieldProp()).pos()));
  } catch (Exception exception) {
    throw new HoodieException("Fetching table schema failed with exception ", exception);
  }
}
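A brief sketch of consuming the result; it only checks presence, since an empty Option means the table already populates the standard meta fields.

Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getHoodieVirtualKeyInfo(metaClient);
// Empty: key columns can be read from _hoodie_record_key / _hoodie_partition_path directly.
// Present: record key and partition path must be derived from the configured data columns
// at the field positions captured in HoodieVirtualKeyInfo.
boolean usesVirtualKeys = virtualKeyInfoOpt.isPresent();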