Usage of org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen in the Apache Hudi project.
Shown below: the toDataset method of the DebeziumSource class.
/**
 * Reads the given Kafka offset ranges and turns the consumed records into a Spark dataset.
 *
 * <p>When the configured deserializer class is {@link StringDeserializer}, each record value is
 * converted from JSON through an {@code AvroConvertor}; otherwise each value is cast directly to
 * {@link GenericRecord}. The resulting rows are passed through {@code processDataset} and then
 * through the date, nullability and array-column conversions before being returned.
 *
 * @param offsetRanges Kafka offset ranges to read
 * @param offsetGen    KafkaOffsetGen that supplies the Kafka parameters
 * @param schemaStr    Avro schema, as a string, used to build the dataset; it is also parsed and
 *                     handed to the date-column conversion
 * @return Spark dataset
 */
private Dataset<Row> toDataset(OffsetRange[] offsetRanges, KafkaOffsetGen offsetGen, String schemaStr) {
  AvroConvertor avroConvertor = new AvroConvertor(schemaStr);
  boolean valuesAreJsonStrings = deserializerClassName.equals(StringDeserializer.class.getName());

  Dataset<Row> rawRows;
  if (!valuesAreJsonStrings) {
    // Record values are cast straight to GenericRecord.
    rawRows = AvroConversionUtils.createDataFrame(
        KafkaUtils.createRDD(
                sparkContext,
                offsetGen.getKafkaParams(),
                offsetRanges,
                LocationStrategies.PreferConsistent())
            .map(consumerRecord -> (GenericRecord) consumerRecord.value())
            .rdd(),
        schemaStr,
        sparkSession);
  } else {
    // Record values are JSON strings; convert each one with the AvroConvertor.
    rawRows = AvroConversionUtils.createDataFrame(
        KafkaUtils.<String, String>createRDD(
                sparkContext,
                offsetGen.getKafkaParams(),
                offsetRanges,
                LocationStrategies.PreferConsistent())
            .map(consumerRecord -> avroConvertor.fromJson(consumerRecord.value()))
            .rdd(),
        schemaStr,
        sparkSession);
  }

  // Flatten the debezium payload; the flattening is specific to each DB type (postgres / mysql / etc.).
  Dataset<Row> flattenedRows = processDataset(rawRows);

  // Transformations required so that debezium data types end up as spark supported types.
  Schema parsedSchema = new Schema.Parser().parse(schemaStr);
  Dataset<Row> dateConvertedRows = convertDateColumns(flattenedRows, parsedSchema);
  Dataset<Row> nullableRows = convertColumnToNullable(sparkSession, dateConvertedRows);
  return convertArrayColumnsToString(nullableRows);
}
Aggregations