Example 6 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class TestInputBatch, the method getSchemaProviderShouldReturnGivenSchemaProvider:

@Test
public void getSchemaProviderShouldReturnGivenSchemaProvider() {
    SchemaProvider schemaProvider = new RowBasedSchemaProvider(null);
    final InputBatch<String> inputBatch = new InputBatch<>(Option.of("foo"), null, schemaProvider);
    assertSame(schemaProvider, inputBatch.getSchemaProvider());
}
Also used : SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) RowBasedSchemaProvider(org.apache.hudi.utilities.schema.RowBasedSchemaProvider) Test(org.junit.jupiter.api.Test)
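
The test above only checks reference identity, so it passes null for the Row schema. Below is a minimal standalone sketch, assuming a hypothetical two-field Row schema and a made-up checkpoint string, of how a RowBasedSchemaProvider built from a real StructType might be carried by an InputBatch. It is an illustration, not code from the Hudi sources.

import org.apache.avro.Schema;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.sources.InputBatch;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class RowBasedSchemaProviderSketch {
    public static void main(String[] args) {
        // Hypothetical incoming Row schema with two fields.
        StructType rowSchema = new StructType()
            .add("id", DataTypes.StringType)
            .add("ts", DataTypes.LongType);

        SchemaProvider schemaProvider = new RowBasedSchemaProvider(rowSchema);
        // The provider converts the Row schema to an Avro schema on demand.
        Schema targetSchema = schemaProvider.getTargetSchema();

        // The InputBatch simply carries the batch, the checkpoint, and the provider.
        InputBatch<String> batch = new InputBatch<>(Option.of("payload"), "checkpoint-0", schemaProvider);
        System.out.println(batch.getSchemaProvider() == schemaProvider); // true
        System.out.println(targetSchema.toString(true));
    }
}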

Example 7 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class TestInputBatch, the method getSchemaProviderShouldReturnNullSchemaProvider:

@Test
public void getSchemaProviderShouldReturnNullSchemaProvider() {
    final InputBatch<String> inputBatch = new InputBatch<>(Option.empty(), null, null);
    SchemaProvider schemaProvider = inputBatch.getSchemaProvider();
    assertTrue(schemaProvider instanceof InputBatch.NullSchemaProvider);
}
Also used : SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) RowBasedSchemaProvider(org.apache.hudi.utilities.schema.RowBasedSchemaProvider) Test(org.junit.jupiter.api.Test)
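
The fallback verified here means callers never receive null from getSchemaProvider(). A minimal sketch, relying only on the behavior the test above demonstrates (the NullSchemaProvider substitution), with an empty batch and no checkpoint:

import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.sources.InputBatch;

public class NullSchemaProviderSketch {
    public static void main(String[] args) {
        // No batch, no checkpoint, no schema provider supplied.
        InputBatch<String> emptyBatch = new InputBatch<>(Option.empty(), null, null);

        // getSchemaProvider() substitutes InputBatch.NullSchemaProvider rather than
        // returning null, so this reference is always safe to pass around.
        SchemaProvider provider = emptyBatch.getSchemaProvider();
        System.out.println(provider.getClass().getName());
    }
}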

Example 8 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class UtilHelpers, the method createLatestSchemaProvider:

/**
 * Create the latest schema provider for the target schema.
 *
 * @param structType spark data type of incoming batch.
 * @param jssc       instance of {@link JavaSparkContext}.
 * @param fs         instance of {@link FileSystem}.
 * @param basePath   base path of the table.
 * @return the schema provider whose target schema refers to the latest schema (either the incoming schema or the table schema).
 */
public static SchemaProvider createLatestSchemaProvider(StructType structType, JavaSparkContext jssc, FileSystem fs, String basePath) {
    SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(structType);
    Schema writeSchema = rowSchemaProvider.getTargetSchema();
    Schema latestTableSchema = writeSchema;
    try {
        if (FSUtils.isTableExists(basePath, fs)) {
            HoodieTableMetaClient tableMetaClient = HoodieTableMetaClient.builder().setConf(jssc.sc().hadoopConfiguration()).setBasePath(basePath).build();
            TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(tableMetaClient);
            latestTableSchema = tableSchemaResolver.getLatestSchema(writeSchema, true, (Function1<Schema, Schema>) v1 -> AvroConversionUtils.convertStructTypeToAvroSchema(AvroConversionUtils.convertAvroSchemaToStructType(v1), RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME, RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE));
        }
    } catch (IOException e) {
        LOG.warn("Could not fetch table schema. Falling back to writer schema");
    }
    final Schema finalLatestTableSchema = latestTableSchema;
    return new SchemaProvider(new TypedProperties()) {

        @Override
        public Schema getSourceSchema() {
            return rowSchemaProvider.getSourceSchema();
        }

        @Override
        public Schema getTargetSchema() {
            return finalLatestTableSchema;
        }
    };
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Schema(org.apache.avro.Schema) Function1(org.apache.hudi.common.util.Functions.Function1) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) RowBasedSchemaProvider(org.apache.hudi.utilities.schema.RowBasedSchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) TypedProperties(org.apache.hudi.common.config.TypedProperties)
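
A minimal usage sketch of the helper above, assuming a local Spark context, a hypothetical table path under /tmp, and a made-up two-field incoming schema; it is illustrative only, not taken from the Hudi sources.

import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class LatestSchemaProviderSketch {
    public static void main(String[] args) {
        JavaSparkContext jssc = new JavaSparkContext("local[2]", "latest-schema-sketch");
        String basePath = "/tmp/hoodie/sample_table"; // hypothetical table location

        // Hypothetical schema of the incoming batch.
        StructType incoming = new StructType()
            .add("uuid", DataTypes.StringType)
            .add("ts", DataTypes.LongType);

        FileSystem fs = FSUtils.getFs(basePath, jssc.hadoopConfiguration());

        // If the table already exists, the target schema is resolved against the
        // latest table schema; otherwise the incoming schema is used as-is.
        SchemaProvider provider = UtilHelpers.createLatestSchemaProvider(incoming, jssc, fs, basePath);
        Schema targetSchema = provider.getTargetSchema();
        System.out.println(targetSchema.toString(true));

        jssc.stop();
    }
}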

Example 9 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class DeltaSync, the method fetchFromSource:

private Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchFromSource(Option<String> resumeCheckpointStr) {
    final Option<JavaRDD<GenericRecord>> avroRDDOptional;
    final String checkpointStr;
    SchemaProvider schemaProvider;
    if (transformer.isPresent()) {
        // Transformation is needed. Fetch New rows in Row Format, apply transformation and then convert them
        // to generic records for writing
        InputBatch<Dataset<Row>> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat(resumeCheckpointStr, cfg.sourceLimit);
        Option<Dataset<Row>> transformed = dataAndCheckpoint.getBatch().map(data -> transformer.get().apply(jssc, sparkSession, data, props));
        checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
        boolean reconcileSchema = props.getBoolean(DataSourceWriteOptions.RECONCILE_SCHEMA().key());
        if (this.userProvidedSchemaProvider != null && this.userProvidedSchemaProvider.getTargetSchema() != null) {
            // If the target schema is specified through Avro schema,
            // pass in the schema for the Row-to-Avro conversion
            // to avoid nullability mismatch between Avro schema and Row schema
            avroRDDOptional = transformed.map(t -> HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema, Option.of(this.userProvidedSchemaProvider.getTargetSchema())).toJavaRDD());
            schemaProvider = this.userProvidedSchemaProvider;
        } else {
            // Use Transformed Row's schema if not overridden. If target schema is not specified
            // default to RowBasedSchemaProvider
            schemaProvider = transformed.map(r -> {
                // determine the targetSchemaProvider. use latestTableSchema if reconcileSchema is enabled.
                SchemaProvider targetSchemaProvider = null;
                if (reconcileSchema) {
                    targetSchemaProvider = UtilHelpers.createLatestSchemaProvider(r.schema(), jssc, fs, cfg.targetBasePath);
                } else {
                    targetSchemaProvider = UtilHelpers.createRowBasedSchemaProvider(r.schema(), props, jssc);
                }
                return (SchemaProvider) new DelegatingSchemaProvider(props, jssc, dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider);
            }).orElse(dataAndCheckpoint.getSchemaProvider());
            avroRDDOptional = transformed.map(t -> HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema, Option.ofNullable(schemaProvider.getTargetSchema())).toJavaRDD());
        }
    } else {
        // Pull the data from the source & prepare the write
        InputBatch<JavaRDD<GenericRecord>> dataAndCheckpoint = formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit);
        avroRDDOptional = dataAndCheckpoint.getBatch();
        checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
        schemaProvider = dataAndCheckpoint.getSchemaProvider();
    }
    if (!cfg.allowCommitOnNoCheckpointChange && Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) {
        LOG.info("No new data, source checkpoint has not changed. Nothing to commit. Old checkpoint=(" + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")");
        String commitActionType = CommitUtils.getCommitActionType(cfg.operation, HoodieTableType.valueOf(cfg.tableType));
        hoodieMetrics.updateMetricsForEmptyData(commitActionType);
        return null;
    }
    jssc.setJobGroup(this.getClass().getSimpleName(), "Checking if input is empty");
    if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) {
        LOG.info("No new data, perform empty commit.");
        return Pair.of(schemaProvider, Pair.of(checkpointStr, jssc.emptyRDD()));
    }
    boolean shouldCombine = cfg.filterDupes || cfg.operation.equals(WriteOperationType.UPSERT);
    JavaRDD<GenericRecord> avroRDD = avroRDDOptional.get();
    JavaRDD<HoodieRecord> records = avroRDD.map(gr -> {
        HoodieRecordPayload payload = shouldCombine ? DataSourceUtils.createPayload(cfg.payloadClassName, gr, (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false, props.getBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())))) : DataSourceUtils.createPayload(cfg.payloadClassName, gr);
        return new HoodieAvroRecord<>(keyGenerator.getKey(gr), payload);
    });
    return Pair.of(schemaProvider, Pair.of(checkpointStr, records));
}
Also used : ARCHIVELOG_FOLDER(org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) INLINE_COMPACT(org.apache.hudi.config.HoodieCompactionConfig.INLINE_COMPACT) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) KafkaOffsetGen(org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen) EmbeddedTimelineServerHelper(org.apache.hudi.client.embedded.EmbeddedTimelineServerHelper) COMBINE_BEFORE_UPSERT(org.apache.hudi.config.HoodieWriteConfig.COMBINE_BEFORE_UPSERT) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) HOODIE_RECORD_NAMESPACE(org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE) HiveSyncTool(org.apache.hudi.hive.HiveSyncTool) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieSparkUtils(org.apache.hudi.HoodieSparkUtils) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Configuration(org.apache.hadoop.conf.Configuration) Transformer(org.apache.hudi.utilities.transform.Transformer) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieDeltaStreamerException(org.apache.hudi.utilities.exception.HoodieDeltaStreamerException) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) HOODIE_RECORD_STRUCT_NAME(org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) HoodieMetrics(org.apache.hudi.metrics.HoodieMetrics) Schema(org.apache.avro.Schema) HoodiePayloadConfig(org.apache.hudi.config.HoodiePayloadConfig) UtilHelpers(org.apache.hudi.utilities.UtilHelpers) KeyGeneratorOptions(org.apache.hudi.keygen.constant.KeyGeneratorOptions) Set(java.util.Set) HoodieWriteCommitPulsarCallbackConfig(org.apache.hudi.utilities.callback.pulsar.HoodieWriteCommitPulsarCallbackConfig) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) InputBatch(org.apache.hudi.utilities.sources.InputBatch) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException) Objects(java.util.Objects) List(java.util.List) EmbeddedTimelineService(org.apache.hudi.client.embedded.EmbeddedTimelineService) INLINE_CLUSTERING(org.apache.hudi.config.HoodieClusteringConfig.INLINE_CLUSTERING) Timer(com.codahale.metrics.Timer) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) CHECKPOINT_RESET_KEY(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_RESET_KEY) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) Dataset(org.apache.spark.sql.Dataset) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) AbstractSyncTool(org.apache.hudi.sync.common.AbstractSyncTool) CHECKPOINT_KEY(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) DataSourceUtils(org.apache.hudi.DataSourceUtils) CommitUtils(org.apache.hudi.common.util.CommitUtils) Function(java.util.function.Function) SchemaSet(org.apache.hudi.utilities.schema.SchemaSet) ArrayList(java.util.ArrayList) 
HashSet(java.util.HashSet) AUTO_COMMIT_ENABLE(org.apache.hudi.config.HoodieWriteConfig.AUTO_COMMIT_ENABLE) StringUtils(org.apache.hudi.common.util.StringUtils) KeyGenerator(org.apache.hudi.keygen.KeyGenerator) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Config(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Config) JavaConversions(scala.collection.JavaConversions) GenericRecord(org.apache.avro.generic.GenericRecord) Properties(java.util.Properties) ASYNC_CLUSTERING_ENABLE(org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieWriteCommitPulsarCallback(org.apache.hudi.utilities.callback.pulsar.HoodieWriteCommitPulsarCallback) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) HoodieWriteCommitKafkaCallbackConfig(org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallbackConfig) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieWriteCommitKafkaCallback(org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallback) COMBINE_BEFORE_INSERT(org.apache.hudi.config.HoodieWriteConfig.COMBINE_BEFORE_INSERT)
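
The schema-selection branch in fetchFromSource can be hard to follow inline. Below is a minimal standalone sketch that distills it, using only calls that appear in the source above (UtilHelpers.createLatestSchemaProvider and UtilHelpers.createRowBasedSchemaProvider); the class and method names here (TargetSchemaSelectionSketch, chooseTargetSchemaProvider) are hypothetical helpers for illustration, not part of DeltaSync.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.types.StructType;

public class TargetSchemaSelectionSketch {

    static SchemaProvider chooseTargetSchemaProvider(
            SchemaProvider userProvided,
            StructType transformedRowSchema,
            boolean reconcileSchema,
            TypedProperties props,
            JavaSparkContext jssc,
            FileSystem fs,
            String targetBasePath) {
        // 1. A user-supplied target schema always wins.
        if (userProvided != null && userProvided.getTargetSchema() != null) {
            return userProvided;
        }
        // 2. With schema reconciliation enabled, reconcile against the latest table schema.
        if (reconcileSchema) {
            return UtilHelpers.createLatestSchemaProvider(transformedRowSchema, jssc, fs, targetBasePath);
        }
        // 3. Otherwise derive the target schema from the transformed Row schema.
        return UtilHelpers.createRowBasedSchemaProvider(transformedRowSchema, props, jssc);
    }
}

In DeltaSync itself the result of the second and third branches is further wrapped in a DelegatingSchemaProvider so that the source schema still comes from the fetched InputBatch.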

Aggregations

SchemaProvider (org.apache.hudi.utilities.schema.SchemaProvider) 9
Schema (org.apache.avro.Schema) 6
DelegatingSchemaProvider (org.apache.hudi.utilities.schema.DelegatingSchemaProvider) 4
Test (org.junit.jupiter.api.Test) 4
IOException (java.io.IOException) 3
TypedProperties (org.apache.hudi.common.config.TypedProperties) 3
Option (org.apache.hudi.common.util.Option) 3
RowBasedSchemaProvider (org.apache.hudi.utilities.schema.RowBasedSchemaProvider) 3
Timer (com.codahale.metrics.Timer) 2
ArrayList (java.util.ArrayList) 2
GenericRecord (org.apache.avro.generic.GenericRecord) 2
Configuration (org.apache.hadoop.conf.Configuration) 2
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 2
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 2
Pair (org.apache.hudi.common.util.collection.Pair) 2
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 2
UtilHelpers (org.apache.hudi.utilities.UtilHelpers) 2
InputBatch (org.apache.hudi.utilities.sources.InputBatch) 2
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 2
Dataset (org.apache.spark.sql.Dataset) 2