
Example 11 with TableSchemaResolver

use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.

The class UtilHelpers, method createLatestSchemaProvider.

/**
 * Creates the latest schema provider for the target schema.
 *
 * @param structType Spark data type of the incoming batch.
 * @param jssc       instance of {@link JavaSparkContext}.
 * @param fs         instance of {@link FileSystem}.
 * @param basePath   base path of the table.
 * @return the schema provider whose target schema is the latest schema (either the incoming schema or the table schema).
 */
public static SchemaProvider createLatestSchemaProvider(StructType structType, JavaSparkContext jssc, FileSystem fs, String basePath) {
    SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(structType);
    Schema writeSchema = rowSchemaProvider.getTargetSchema();
    Schema latestTableSchema = writeSchema;
    try {
        if (FSUtils.isTableExists(basePath, fs)) {
            HoodieTableMetaClient tableMetaClient = HoodieTableMetaClient.builder().setConf(jssc.sc().hadoopConfiguration()).setBasePath(basePath).build();
            TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(tableMetaClient);
            latestTableSchema = tableSchemaResolver.getLatestSchema(writeSchema, true,
                (Function1<Schema, Schema>) v1 -> AvroConversionUtils.convertStructTypeToAvroSchema(
                    AvroConversionUtils.convertAvroSchemaToStructType(v1),
                    RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME,
                    RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE));
        }
    } catch (IOException e) {
        LOG.warn("Could not fetch table schema. Falling back to writer schema");
    }
    final Schema finalLatestTableSchema = latestTableSchema;
    return new SchemaProvider(new TypedProperties()) {

        @Override
        public Schema getSourceSchema() {
            return rowSchemaProvider.getSourceSchema();
        }

        @Override
        public Schema getTargetSchema() {
            return finalLatestTableSchema;
        }
    };
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Schema(org.apache.avro.Schema) Function1(org.apache.hudi.common.util.Functions.Function1) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) RowBasedSchemaProvider(org.apache.hudi.utilities.schema.RowBasedSchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) RowBasedSchemaProvider(org.apache.hudi.utilities.schema.RowBasedSchemaProvider) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) TypedProperties(org.apache.hudi.common.config.TypedProperties)
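For orientation, here is a minimal sketch of how such a provider might be obtained for an incoming Spark batch; the variables inputDf, jssc, fs and basePath are hypothetical and assumed to be available in the caller's writer context:

// Hypothetical caller-side usage (not from the Hudi codebase): resolve the target schema
// for an incoming batch, falling back to the incoming schema when the table does not exist yet.
StructType incomingStructType = inputDf.schema();
SchemaProvider schemaProvider =
        UtilHelpers.createLatestSchemaProvider(incomingStructType, jssc, fs, basePath);
Schema targetSchema = schemaProvider.getTargetSchema(); // latest table schema, or the incoming schema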

Example 12 with TableSchemaResolver

use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.

The class TableCommand, method fetchTableSchema.

/**
 * Fetches the table schema in Avro format.
 */
@CliCommand(value = "fetch table schema", help = "Fetches latest table schema")
public String fetchTableSchema(@CliOption(key = { "outputFilePath" }, mandatory = false, help = "File path to write schema") final String outputFilePath) throws Exception {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(client);
    Schema schema = tableSchemaResolver.getTableAvroSchema();
    if (outputFilePath != null) {
        LOG.info("Latest table schema : " + schema.toString(true));
        writeToFile(outputFilePath, schema.toString(true));
        return String.format("Latest table schema written to %s", outputFilePath);
    } else {
        return String.format("Latest table schema %s", schema.toString(true));
    }
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) CliCommand(org.springframework.shell.core.annotation.CliCommand)
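Used from an interactive hudi-cli session, this might look roughly as follows; the table path and output file are illustrative, and the exact prompt text varies by Hudi version:

hudi-> connect --path /tmp/hoodie/sample_table
hudi:sample_table-> fetch table schema --outputFilePath /tmp/sample_table_schema.avsc
Latest table schema written to /tmp/sample_table_schema.avsc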

Example 13 with TableSchemaResolver

use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.

The class HoodieSparkCopyOnWriteTable, method updateColumnsStatsIndex.

private void updateColumnsStatsIndex(@Nonnull HoodieEngineContext context, @Nonnull List<HoodieWriteStat> updatedFilesStats, @Nonnull String instantTime) throws Exception {
    String sortColsList = config.getClusteringSortColumns();
    String basePath = metaClient.getBasePath();
    String indexPath = metaClient.getColumnStatsIndexPath();
    List<String> touchedFiles = updatedFilesStats.stream().map(s -> new Path(basePath, s.getPath()).toString()).collect(Collectors.toList());
    if (touchedFiles.isEmpty() || StringUtils.isNullOrEmpty(sortColsList) || StringUtils.isNullOrEmpty(indexPath)) {
        return;
    }
    LOG.info(String.format("Updating column-statistics index table (%s)", indexPath));
    List<String> sortCols = Arrays.stream(sortColsList.split(",")).map(String::trim).collect(Collectors.toList());
    HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) context;
    // Fetch table schema to appropriately construct col-stats index schema
    Schema tableWriteSchema = HoodieAvroUtils.createHoodieWriteSchema(new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields());
    List<String> completedCommits = metaClient.getCommitsTimeline().filterCompletedInstants().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    ColumnStatsIndexHelper.updateColumnStatsIndexFor(sparkEngineContext.getSqlContext().sparkSession(), AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema), touchedFiles, sortCols, indexPath, instantTime, completedCommits);
    LOG.info(String.format("Successfully updated column-statistics index at instant (%s)", instantTime));
}
Also used : SparkDeletePartitionCommitActionExecutor(org.apache.hudi.table.action.commit.SparkDeletePartitionCommitActionExecutor) Arrays(java.util.Arrays) SparkExecuteClusteringCommitActionExecutor(org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor) HoodieRestorePlan(org.apache.hudi.avro.model.HoodieRestorePlan) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) SavepointActionExecutor(org.apache.hudi.table.action.savepoint.SavepointActionExecutor) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) HoodieSavepointMetadata(org.apache.hudi.avro.model.HoodieSavepointMetadata) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieSortedMergeHandle(org.apache.hudi.io.HoodieSortedMergeHandle) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) ColumnStatsIndexHelper(org.apache.hudi.index.columnstats.ColumnStatsIndexHelper) RestorePlanActionExecutor(org.apache.hudi.table.action.rollback.RestorePlanActionExecutor) Schema(org.apache.avro.Schema) SparkUpsertPreppedCommitActionExecutor(org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) HoodieCreateHandle(org.apache.hudi.io.HoodieCreateHandle) CleanActionExecutor(org.apache.hudi.table.action.clean.CleanActionExecutor) HoodieRollbackPlan(org.apache.hudi.avro.model.HoodieRollbackPlan) Collectors(java.util.stream.Collectors) BaseRollbackPlanActionExecutor(org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) SparkBulkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) CopyOnWriteRollbackActionExecutor(org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor) HoodieBootstrapWriteMetadata(org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HoodieMergeHelper(org.apache.hudi.table.action.commit.HoodieMergeHelper) SparkBulkInsertPreppedCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor) AvroConversionUtils(org.apache.hudi.AvroConversionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) StringUtils(org.apache.hudi.common.util.StringUtils) SparkInsertPreppedCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertPreppedCommitActionExecutor) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) CleanPlanActionExecutor(org.apache.hudi.table.action.clean.CleanPlanActionExecutor) SparkUpsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor) 
SparkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertCommitActionExecutor) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ClusteringPlanActionExecutor(org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) SparkDeleteCommitActionExecutor(org.apache.hudi.table.action.commit.SparkDeleteCommitActionExecutor) IOException(java.io.IOException) SparkBootstrapCommitActionExecutor(org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor) SparkInsertOverwriteCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor) CopyOnWriteRestoreActionExecutor(org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) SparkInsertOverwriteTableCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertOverwriteTableCommitActionExecutor) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver)
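A condensed sketch of the schema derivation used above, assuming a metaClient for an existing table; createHoodieWriteSchema is applied here exactly as in the method to produce the full write schema before converting it to a Spark StructType:

// Condensed from updateColumnsStatsIndex above: derive the Spark-facing write schema
// that the column-statistics index is built against.
Schema dataSchema = new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields();
Schema tableWriteSchema = HoodieAvroUtils.createHoodieWriteSchema(dataSchema);
StructType indexedSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema);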

Example 14 with TableSchemaResolver

use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.

The class TestStreamReadOperator, method createReader.

private OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> createReader() throws Exception {
    final String basePath = tempFile.getAbsolutePath();
    final org.apache.hadoop.conf.Configuration hadoopConf = StreamerUtil.getHadoopConf();
    final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
    final List<String> partitionKeys = Collections.singletonList("partition");
    // This input format is used to open the emitted splits.
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    final Schema tableAvroSchema;
    try {
        tableAvroSchema = schemaResolver.getTableAvroSchema();
    } catch (Exception e) {
        throw new HoodieException("Get table avro schema error", e);
    }
    final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
    final RowType rowType = (RowType) rowDataType.getLogicalType();
    final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState(rowType, TestConfigurations.ROW_TYPE, tableAvroSchema.toString(), AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE).toString(), Collections.emptyList(), new String[0]);
    MergeOnReadInputFormat inputFormat = MergeOnReadInputFormat.builder().config(conf).tableState(hoodieTableState).fieldTypes(rowDataType.getChildren()).defaultPartName("default").limit(1000L).emitDelete(true).build();
    OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory = StreamReadOperator.factory(inputFormat);
    OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness = new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0);
    harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime);
    return harness;
}
Also used : Schema(org.apache.avro.Schema) RowType(org.apache.flink.table.types.logical.RowType) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieException(org.apache.hudi.exception.HoodieException) MergeOnReadTableState(org.apache.hudi.table.format.mor.MergeOnReadTableState) OneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness) HoodieException(org.apache.hudi.exception.HoodieException) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) MergeOnReadInputSplit(org.apache.hudi.table.format.mor.MergeOnReadInputSplit) RowData(org.apache.flink.table.data.RowData) MergeOnReadInputFormat(org.apache.hudi.table.format.mor.MergeOnReadInputFormat) DataType(org.apache.flink.table.types.DataType)

Example 15 with TableSchemaResolver

use of org.apache.hudi.common.table.TableSchemaResolver in project hudi by apache.

The class HoodieCopyOnWriteTableInputFormat, method getHoodieVirtualKeyInfo.

protected static Option<HoodieVirtualKeyInfo> getHoodieVirtualKeyInfo(HoodieTableMetaClient metaClient) {
    HoodieTableConfig tableConfig = metaClient.getTableConfig();
    if (tableConfig.populateMetaFields()) {
        return Option.empty();
    }
    TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient);
    try {
        Schema schema = tableSchemaResolver.getTableAvroSchema();
        return Option.of(new HoodieVirtualKeyInfo(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp(), schema.getField(tableConfig.getRecordKeyFieldProp()).pos(), schema.getField(tableConfig.getPartitionFieldProp()).pos()));
    } catch (Exception exception) {
        throw new HoodieException("Fetching table schema failed with exception ", exception);
    }
}
Also used : HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieException(org.apache.hudi.exception.HoodieException) HoodieVirtualKeyInfo(org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)
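The positions resolved here make it possible to read the key fields from Avro records by index rather than by name; a hypothetical helper illustrating that (readKeyFields is not part of Hudi):

// Hypothetical helper, not part of Hudi: read the record key and partition path from an
// Avro record using the field positions computed in getHoodieVirtualKeyInfo above.
static String[] readKeyFields(org.apache.avro.generic.GenericRecord record, int recordKeyPos, int partitionPathPos) {
    return new String[] {
        String.valueOf(record.get(recordKeyPos)),
        String.valueOf(record.get(partitionPathPos))
    };
}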

Aggregations

TableSchemaResolver (org.apache.hudi.common.table.TableSchemaResolver)15 Schema (org.apache.avro.Schema)14 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)6 HoodieException (org.apache.hudi.exception.HoodieException)6 IOException (java.io.IOException)5 Path (org.apache.hadoop.fs.Path)4 HoodieIOException (org.apache.hudi.exception.HoodieIOException)4 Iterator (java.util.Iterator)3 List (java.util.List)3 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)3 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)3 Option (org.apache.hudi.common.util.Option)3 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)3 Collections (java.util.Collections)2 Collectors.toList (java.util.stream.Collectors.toList)2 HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils)2 HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan)2 WriteStatus (org.apache.hudi.client.WriteStatus)2 TypedProperties (org.apache.hudi.common.config.TypedProperties)2 HoodieData (org.apache.hudi.common.data.HoodieData)2
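Taken together, the examples follow one pattern: build a HoodieTableMetaClient for the table's base path, wrap it in a TableSchemaResolver, and read the latest Avro schema. A minimal self-contained sketch of that pattern, where the Hadoop configuration and base path are placeholders supplied by the caller:

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;

public class TableSchemaResolverSketch {

    /** Resolves the latest committed Avro schema of a Hudi table at the given base path. */
    public static Schema resolveLatestSchema(Configuration hadoopConf, String basePath) throws Exception {
        // Open the table's metadata (the .hoodie directory) under basePath.
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
                .setConf(hadoopConf)
                .setBasePath(basePath)
                .build();
        // Resolve the schema from the latest commit on the table's timeline.
        TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
        return resolver.getTableAvroSchema();
    }
}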