
Example 1 with DelegatingSchemaProvider

Use of org.apache.hudi.utilities.schema.DelegatingSchemaProvider in project hudi by apache.

From the class DeltaSync, method fetchFromSource:

private Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchFromSource(Option<String> resumeCheckpointStr) {
    final Option<JavaRDD<GenericRecord>> avroRDDOptional;
    final String checkpointStr;
    SchemaProvider schemaProvider;
    if (transformer.isPresent()) {
        // Transformation is needed. Fetch new rows in Row format, apply the transformation,
        // and then convert them to generic records for writing.
        InputBatch<Dataset<Row>> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat(resumeCheckpointStr, cfg.sourceLimit);
        Option<Dataset<Row>> transformed = dataAndCheckpoint.getBatch().map(data -> transformer.get().apply(jssc, sparkSession, data, props));
        checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
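        // Whether to reconcile the incoming batch schema against the latest table schema
        // instead of taking the batch schema as-is.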
        boolean reconcileSchema = props.getBoolean(DataSourceWriteOptions.RECONCILE_SCHEMA().key());
        if (this.userProvidedSchemaProvider != null && this.userProvidedSchemaProvider.getTargetSchema() != null) {
            // If the target schema is specified through Avro schema,
            // pass in the schema for the Row-to-Avro conversion
            // to avoid nullability mismatch between Avro schema and Row schema
            avroRDDOptional = transformed.map(t -> HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema, Option.of(this.userProvidedSchemaProvider.getTargetSchema())).toJavaRDD());
            schemaProvider = this.userProvidedSchemaProvider;
        } else {
            // No user-provided target schema: derive it from the transformed Rows,
            // defaulting to RowBasedSchemaProvider.
            schemaProvider = transformed.map(r -> {
                // Determine the target schema provider: use the latest table schema if reconcileSchema is enabled.
                SchemaProvider targetSchemaProvider = null;
                if (reconcileSchema) {
                    targetSchemaProvider = UtilHelpers.createLatestSchemaProvider(r.schema(), jssc, fs, cfg.targetBasePath);
                } else {
                    targetSchemaProvider = UtilHelpers.createRowBasedSchemaProvider(r.schema(), props, jssc);
                }
                return (SchemaProvider) new DelegatingSchemaProvider(props, jssc, dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider);
            }).orElse(dataAndCheckpoint.getSchemaProvider());
            avroRDDOptional = transformed.map(t -> HoodieSparkUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE, reconcileSchema, Option.ofNullable(schemaProvider.getTargetSchema())).toJavaRDD());
        }
    } else {
        // Pull the data from the source & prepare the write
        InputBatch<JavaRDD<GenericRecord>> dataAndCheckpoint = formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit);
        avroRDDOptional = dataAndCheckpoint.getBatch();
        checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
        schemaProvider = dataAndCheckpoint.getSchemaProvider();
    }
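    // Bail out early when the source checkpoint has not moved and empty commits are disallowed.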
    if (!cfg.allowCommitOnNoCheckpointChange && Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) {
        LOG.info("No new data, source checkpoint has not changed. Nothing to commit. Old checkpoint=(" + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")");
        String commitActionType = CommitUtils.getCommitActionType(cfg.operation, HoodieTableType.valueOf(cfg.tableType));
        hoodieMetrics.updateMetricsForEmptyData(commitActionType);
        return null;
    }
    jssc.setJobGroup(this.getClass().getSimpleName(), "Checking if input is empty");
    if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) {
        LOG.info("No new data, perform empty commit.");
        return Pair.of(schemaProvider, Pair.of(checkpointStr, jssc.emptyRDD()));
    }
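    // Payloads carry the precombine (ordering) field only when records may be combined:
    // either duplicates are being filtered or the operation is an upsert.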
    boolean shouldCombine = cfg.filterDupes || cfg.operation.equals(WriteOperationType.UPSERT);
    JavaRDD<GenericRecord> avroRDD = avroRDDOptional.get();
    JavaRDD<HoodieRecord> records = avroRDD.map(gr -> {
        HoodieRecordPayload payload = shouldCombine
            ? DataSourceUtils.createPayload(cfg.payloadClassName, gr,
                (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false,
                    props.getBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
                        Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()))))
            : DataSourceUtils.createPayload(cfg.payloadClassName, gr);
        return new HoodieAvroRecord<>(keyGenerator.getKey(gr), payload);
    });
    return Pair.of(schemaProvider, Pair.of(checkpointStr, records));
}
Also used: Dataset (org.apache.spark.sql.Dataset), DelegatingSchemaProvider (org.apache.hudi.utilities.schema.DelegatingSchemaProvider), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload), JavaRDD (org.apache.spark.api.java.JavaRDD), HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord), SchemaProvider (org.apache.hudi.utilities.schema.SchemaProvider), GenericRecord (org.apache.avro.generic.GenericRecord)
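
For readers unfamiliar with the class this example centers on: DelegatingSchemaProvider simply pairs two providers, answering source-schema queries from one and target-schema queries from the other. The sketch below is a minimal, self-contained approximation of that behavior, not Hudi's actual implementation; the SimpleSchemaProvider type and all names in it are illustrative assumptions.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

// Illustrative stand-in for Hudi's SchemaProvider contract (an assumption, not the real API).
abstract class SimpleSchemaProvider {
    public abstract Schema getSourceSchema();

    // By default the target schema mirrors the source schema.
    public Schema getTargetSchema() {
        return getSourceSchema();
    }
}

// Sketch of the delegation used in fetchFromSource: source-schema calls go to the
// provider that described the incoming batch, target-schema calls to the provider
// derived from the transformed Rows (or from the latest table schema when reconciling).
class DelegatingSchemaProviderSketch extends SimpleSchemaProvider {
    private final SimpleSchemaProvider sourceProvider;
    private final SimpleSchemaProvider targetProvider;

    DelegatingSchemaProviderSketch(SimpleSchemaProvider sourceProvider, SimpleSchemaProvider targetProvider) {
        this.sourceProvider = sourceProvider;
        this.targetProvider = targetProvider;
    }

    @Override
    public Schema getSourceSchema() {
        return sourceProvider.getSourceSchema();
    }

    @Override
    public Schema getTargetSchema() {
        return targetProvider.getTargetSchema();
    }
}

public class DelegationDemo {
    public static void main(String[] args) {
        Schema sourceSchema = SchemaBuilder.record("source").fields()
            .requiredString("id").endRecord();
        Schema targetSchema = SchemaBuilder.record("target").fields()
            .requiredString("id").optionalLong("ts").endRecord();

        SimpleSchemaProvider source = new SimpleSchemaProvider() {
            @Override
            public Schema getSourceSchema() {
                return sourceSchema;
            }
        };
        SimpleSchemaProvider target = new SimpleSchemaProvider() {
            @Override
            public Schema getSourceSchema() {
                return targetSchema;
            }
        };

        SimpleSchemaProvider delegating = new DelegatingSchemaProviderSketch(source, target);
        // Prints the "source" record schema, then the wider "target" record schema.
        System.out.println(delegating.getSourceSchema());
        System.out.println(delegating.getTargetSchema());
    }
}

This mirrors the else-branch above, where the transformed Rows supply the target schema while the input batch's provider continues to answer for the source schema.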
