Search in sources :

Example 1 with KafkaUtils

Use of org.apache.spark.streaming.kafka010.KafkaUtils in the Apache Hudi project.

The toDataset method of the class DebeziumSource.

/**
 * Reads the given Kafka offset ranges and turns the fetched records into a Spark dataset.
 *
 * <p>Each record value is first turned into an Avro {@link GenericRecord}: through
 * {@code AvroConvertor.fromJson} when the configured deserializer is {@link StringDeserializer},
 * otherwise by casting the value directly. The resulting data frame is handed to
 * {@code processDataset} and then run through the date, nullable and array column conversions.
 *
 * @param offsetRanges Kafka offset ranges to read
 * @param offsetGen    KafkaOffsetGen that supplies the Kafka parameters
 * @param schemaStr    Avro schema string used to build the data frame and to convert date columns
 * @return Spark dataset
 */
private Dataset<Row> toDataset(OffsetRange[] offsetRanges, KafkaOffsetGen offsetGen, String schemaStr) {
    AvroConvertor avroConvertor = new AvroConvertor(schemaStr);
    boolean valuesAreStrings = deserializerClassName.equals(StringDeserializer.class.getName());

    Dataset<Row> rawRecords;
    if (!valuesAreStrings) {
        // Values are cast straight to GenericRecord.
        rawRecords = AvroConversionUtils.createDataFrame(
            KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
                .map(obj -> (GenericRecord) obj.value())
                .rdd(),
            schemaStr,
            sparkSession);
    } else {
        // Values arrive as strings and go through the AvroConvertor first.
        rawRecords = AvroConversionUtils.createDataFrame(
            KafkaUtils.<String, String>createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
                .map(obj -> avroConvertor.fromJson(obj.value()))
                .rdd(),
            schemaStr,
            sparkSession);
    }

    // Flatten debezium payload, specific to each DB type (postgres/ mysql/ etc..)
    Dataset<Row> flattened = processDataset(rawRecords);

    // Some required transformations to ensure debezium data types are converted to spark supported types.
    Schema avroSchema = new Schema.Parser().parse(schemaStr);
    Dataset<Row> withDatesConverted = convertDateColumns(flattened, avroSchema);
    Dataset<Row> withNullableColumns = convertColumnToNullable(sparkSession, withDatesConverted);
    return convertArrayColumnsToString(withNullableColumns);
}
Also used : Arrays(java.util.Arrays) Dataset(org.apache.spark.sql.Dataset) SchemaRegistryProvider(org.apache.hudi.utilities.schema.SchemaRegistryProvider) HoodieException(org.apache.hudi.exception.HoodieException) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) KafkaOffsetGen(org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen) AvroConversionUtils(org.apache.hudi.AvroConversionUtils) Option(org.apache.hudi.common.util.Option) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) RowSource(org.apache.hudi.utilities.sources.RowSource) Logger(org.apache.log4j.Logger) LocationStrategies(org.apache.spark.streaming.kafka010.LocationStrategies) AvroConvertor(org.apache.hudi.utilities.sources.helpers.AvroConvertor) StringDeserializer(org.apache.kafka.common.serialization.StringDeserializer) OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) CheckpointUtils(org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils) Type(org.apache.avro.Schema.Type) SparkSession(org.apache.spark.sql.SparkSession) KafkaUtils(org.apache.spark.streaming.kafka010.KafkaUtils) DataTypes(org.apache.spark.sql.types.DataTypes) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) Field(org.apache.avro.Schema.Field) TypedProperties(org.apache.hudi.common.config.TypedProperties) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) Collectors(java.util.stream.Collectors) List(java.util.List) LogManager(org.apache.log4j.LogManager) org.apache.spark.sql.functions(org.apache.spark.sql.functions) Pair(org.apache.hudi.common.util.collection.Pair) AvroConvertor(org.apache.hudi.utilities.sources.helpers.AvroConvertor) 
StringDeserializer(org.apache.kafka.common.serialization.StringDeserializer) Row(org.apache.spark.sql.Row) GenericRecord(org.apache.avro.generic.GenericRecord)

Aggregations

IOException (java.io.IOException)1 Arrays (java.util.Arrays)1 List (java.util.List)1 Collectors (java.util.stream.Collectors)1 Schema (org.apache.avro.Schema)1 Field (org.apache.avro.Schema.Field)1 Type (org.apache.avro.Schema.Type)1 GenericRecord (org.apache.avro.generic.GenericRecord)1 AvroConversionUtils (org.apache.hudi.AvroConversionUtils)1 DataSourceWriteOptions (org.apache.hudi.DataSourceWriteOptions)1 TypedProperties (org.apache.hudi.common.config.TypedProperties)1 Option (org.apache.hudi.common.util.Option)1 Pair (org.apache.hudi.common.util.collection.Pair)1 HoodieException (org.apache.hudi.exception.HoodieException)1 HoodieDeltaStreamerMetrics (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics)1 SchemaProvider (org.apache.hudi.utilities.schema.SchemaProvider)1 SchemaRegistryProvider (org.apache.hudi.utilities.schema.SchemaRegistryProvider)1 RowSource (org.apache.hudi.utilities.sources.RowSource)1 AvroConvertor (org.apache.hudi.utilities.sources.helpers.AvroConvertor)1 KafkaOffsetGen (org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen)1