
Example 11 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project lambda-arch by apssouza22.

From the class TrafficOffsetCommitCallback, method commitOffset.

/**
 * Commit the acknowledgement (offsets) back to Kafka after processing has completed.
 *
 * @param directKafkaStream the direct Kafka input stream whose offsets are committed
 */
private void commitOffset(JavaInputDStream<ConsumerRecord<String, IoTData>> directKafkaStream) {
    directKafkaStream.foreachRDD((JavaRDD<ConsumerRecord<String, IoTData>> trafficRdd) -> {
        if (!trafficRdd.isEmpty()) {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) trafficRdd.rdd()).offsetRanges();
            CanCommitOffsets canCommitOffsets = (CanCommitOffsets) directKafkaStream.inputDStream();
            canCommitOffsets.commitAsync(offsetRanges, new TrafficOffsetCommitCallback());
        }
    });
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) CanCommitOffsets(org.apache.spark.streaming.kafka010.CanCommitOffsets) IoTData(com.apssouza.iot.dto.IoTData) HasOffsetRanges(org.apache.spark.streaming.kafka010.HasOffsetRanges) JavaRDD(org.apache.spark.api.java.JavaRDD)
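
The TrafficOffsetCommitCallback passed to commitAsync above is not shown in this listing. As a rough sketch, such a class implements Kafka's OffsetCommitCallback and reports whether the asynchronous commit succeeded; the class body and log messages below are assumptions inferred from the call site, not the project's actual code.

import java.util.Map;

import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.clients.consumer.OffsetCommitCallback;
import org.apache.kafka.common.TopicPartition;

// Hypothetical sketch of the callback handed to commitAsync(); the real lambda-arch class may differ.
final class TrafficOffsetCommitCallback implements OffsetCommitCallback {

    @Override
    public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
        if (exception != null) {
            // Commit failed; the same offsets will simply be committed again after a later batch.
            System.err.println("Offset commit failed: " + exception.getMessage());
        } else {
            // Log the committed offset of every partition for traceability.
            offsets.forEach((partition, offsetAndMetadata) ->
                    System.out.println("Committed " + partition + " -> " + offsetAndMetadata.offset()));
        }
    }
}

A failed commit only means those records may be reprocessed after a restart, so downstream processing should be idempotent.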

Example 12 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project lambda-arch by apssouza22.

From the class TrafficOffsetCommitCallback, method commitOffset.

/**
 * Commit the acknowledgement (offsets) back to Kafka after processing has completed.
 * This is our fault-tolerance implementation.
 *
 * @param directKafkaStream the direct Kafka input stream whose offsets are committed
 */
private void commitOffset(JavaInputDStream<ConsumerRecord<String, IoTData>> directKafkaStream) {
    directKafkaStream.foreachRDD((JavaRDD<ConsumerRecord<String, IoTData>> trafficRdd) -> {
        if (!trafficRdd.isEmpty()) {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) trafficRdd.rdd()).offsetRanges();
            CanCommitOffsets canCommitOffsets = (CanCommitOffsets) directKafkaStream.inputDStream();
            canCommitOffsets.commitAsync(offsetRanges, new TrafficOffsetCommitCallback());
        }
    });
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) CanCommitOffsets(org.apache.spark.streaming.kafka010.CanCommitOffsets) IoTData(com.apssouza.iot.common.dto.IoTData) HasOffsetRanges(org.apache.spark.streaming.kafka010.HasOffsetRanges) JavaRDD(org.apache.spark.api.java.JavaRDD)

Example 13 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project beijingThirdPeriod by weidongcao.

From the class SparkStreamingKafkaDemo, method main.

public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setAppName("demo").setMaster("local[4]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(8));
    Map<String, Object> kafkaParams = new HashMap<>();
    // Kafka broker address (bootstrap servers)
    kafkaParams.put("bootstrap.servers", "cm02.spark.com:9092");
    // Deserializer for record keys (strings, UTF-8 encoded by default)
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    // Deserializer for record values (strings, UTF-8 encoded by default)
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    // Consumer group id; any value works for this demo
    kafkaParams.put("group.id", "jis");
    // Where to start reading when no committed offset exists: "latest" or "earliest"
    // (the "largest"/"smallest" values used by older APIs do not work here)
    kafkaParams.put("auto.offset.reset", "latest");
    // If true, the consumer periodically auto-commits each partition's offsets; disabled here
    kafkaParams.put("enable.auto.commit", false);
    // Topics to subscribe to
    Collection<String> topics = Arrays.asList("topicA", "topicB");
    // Offset ranges for the bounded batch read via createRDD below
    OffsetRange[] offsetRanges = { OffsetRange.create("test", 0, 0, 3), OffsetRange.create("test", 1, 0, 3) };
    final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
    JavaRDD<ConsumerRecord<String, String>> rdd = KafkaUtils.createRDD(streamingContext.sparkContext(), kafkaParams, offsetRanges, LocationStrategies.PreferConsistent());
    stream.foreachRDD((VoidFunction<JavaRDD<ConsumerRecord<String, String>>>) rdd1 -> {
        final OffsetRange[] offsetRanges1 = ((HasOffsetRanges) (rdd1.rdd())).offsetRanges();
        rdd1.foreachPartition((VoidFunction<Iterator<ConsumerRecord<String, String>>>) consumerRecordIterator -> {
            OffsetRange o = offsetRanges1[TaskContext.get().partitionId()];
            System.out.println(o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset());
        });
    });
// stream.mapToPair((PairFunction<ConsumerRecord<String, String>, String, String>) record -> new Tuple2<>(record.key(), record.value()));
    // Start the streaming job and block until it is stopped
    streamingContext.start();
    streamingContext.awaitTermination();
}
Also used : org.apache.spark.streaming.kafka010(org.apache.spark.streaming.kafka010) java.util(java.util) StringDeserializer(org.apache.kafka.common.serialization.StringDeserializer) ConsumerRecord(org.apache.kafka.clients.consumer.ConsumerRecord) TaskContext(org.apache.spark.TaskContext) JavaInputDStream(org.apache.spark.streaming.api.java.JavaInputDStream) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) SparkConf(org.apache.spark.SparkConf) Durations(org.apache.spark.streaming.Durations) VoidFunction(org.apache.spark.api.java.function.VoidFunction) JavaRDD(org.apache.spark.api.java.JavaRDD)
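
The demo above constructs a bounded-batch rdd from the hard-coded offsetRanges but never consumes it. Below is a minimal, self-contained sketch of how such a bounded read is typically used; the broker address, topic name "test", group id, and offsets are assumptions carried over from the demo, and the snippet only prints the records it fetches.

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;

public class BoundedKafkaBatchRead {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[4]", "bounded-batch-read");

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "cm02.spark.com:9092"); // assumed broker, as in the demo
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "jis");                           // assumed group id, as in the demo

        // Read exactly offsets [0, 3) from partitions 0 and 1 of topic "test" (assumed, as in the demo)
        OffsetRange[] offsetRanges = {
                OffsetRange.create("test", 0, 0, 3),
                OffsetRange.create("test", 1, 0, 3)
        };

        JavaRDD<ConsumerRecord<String, String>> batch = KafkaUtils.<String, String>createRDD(
                sc, kafkaParams, offsetRanges, LocationStrategies.PreferConsistent());

        // Consume the bounded batch: here we just print each record's key and value
        batch.foreach(record -> System.out.println(record.key() + " -> " + record.value()));

        sc.stop();
    }
}

Unlike the streaming case, createRDD reads a fixed, replayable slice of the topic, so no offset committing is involved.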

Example 14 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.

From the class TestCheckpointUtils, method testStringToOffsets.

@Test
public void testStringToOffsets() {
    OffsetRange[] ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] { 0, 1 }, new long[] { 200000, 250000 }), makeOffsetMap(new int[] { 0, 1 }, new long[] { 300000, 350000 }), 1000000L);
    String checkpointStr = CheckpointUtils.offsetsToStr(ranges);
    Map<TopicPartition, Long> offsetMap = CheckpointUtils.strToOffsets(checkpointStr);
    assertEquals(2, offsetMap.size());
    Set<TopicPartition> topicPartitions = new HashSet<>(2);
    TopicPartition partition0 = new TopicPartition(TEST_TOPIC_NAME, 0);
    TopicPartition partition1 = new TopicPartition(TEST_TOPIC_NAME, 1);
    topicPartitions.add(partition0);
    topicPartitions.add(partition1);
    assertEquals(topicPartitions, offsetMap.keySet());
    assertEquals(300000, offsetMap.get(partition0).longValue());
    assertEquals(350000, offsetMap.get(partition1).longValue());
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) TopicPartition(org.apache.kafka.common.TopicPartition) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)
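
For context, the test round-trips an OffsetRange[] through hudi's CheckpointUtils: offsetsToStr serializes the ranges into a checkpoint string, and strToOffsets turns that string back into a map of TopicPartition to offset. A standalone sketch of such a round trip is shown below; the "topic,partition:offset,..." encoding and the topic name are assumptions for illustration, not necessarily hudi's exact format.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.kafka.common.TopicPartition;
import org.apache.spark.streaming.kafka010.OffsetRange;

// Illustrative helpers only; hudi's CheckpointUtils may use a different encoding.
public final class OffsetCheckpointSketch {

    // Encode ranges as "topic,partition:untilOffset,partition:untilOffset,..."
    static String offsetsToStr(OffsetRange[] ranges) {
        String topic = ranges[0].topic();
        String parts = Arrays.stream(ranges)
                .map(r -> r.partition() + ":" + r.untilOffset())
                .collect(Collectors.joining(","));
        return topic + "," + parts;
    }

    // Decode the string back into a map of TopicPartition -> offset
    static Map<TopicPartition, Long> strToOffsets(String checkpoint) {
        String[] tokens = checkpoint.split(",");
        String topic = tokens[0];
        Map<TopicPartition, Long> offsets = new HashMap<>();
        for (int i = 1; i < tokens.length; i++) {
            String[] pair = tokens[i].split(":");
            offsets.put(new TopicPartition(topic, Integer.parseInt(pair[0])), Long.parseLong(pair[1]));
        }
        return offsets;
    }

    public static void main(String[] args) {
        OffsetRange[] ranges = {
                OffsetRange.create("impressions", 0, 200000, 300000),  // hypothetical topic and offsets
                OffsetRange.create("impressions", 1, 250000, 350000)
        };
        String checkpoint = offsetsToStr(ranges);
        System.out.println(checkpoint);               // impressions,0:300000,1:350000
        System.out.println(strToOffsets(checkpoint)); // map of partition -> until-offset (iteration order may vary)
    }
}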

Example 15 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.

From the class DebeziumSource, method toDataset.

/**
 * Converts a range of Kafka topic offsets into a Spark dataset.
 *
 * @param offsetRanges Offset ranges to read
 * @param offsetGen    KafkaOffsetGen supplying the Kafka consumer parameters
 * @param schemaStr    Avro schema string used to convert and post-process the records
 * @return Spark dataset
 */
private Dataset<Row> toDataset(OffsetRange[] offsetRanges, KafkaOffsetGen offsetGen, String schemaStr) {
    AvroConvertor convertor = new AvroConvertor(schemaStr);
    Dataset<Row> kafkaData;
    if (deserializerClassName.equals(StringDeserializer.class.getName())) {
        kafkaData = AvroConversionUtils.createDataFrame(KafkaUtils.<String, String>createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent()).map(obj -> convertor.fromJson(obj.value())).rdd(), schemaStr, sparkSession);
    } else {
        kafkaData = AvroConversionUtils.createDataFrame(KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent()).map(obj -> (GenericRecord) obj.value()).rdd(), schemaStr, sparkSession);
    }
    // Flatten the Debezium payload; this step is specific to each database type (Postgres, MySQL, etc.)
    Dataset<Row> debeziumDataset = processDataset(kafkaData);
    // Apply the transformations required to convert Debezium data types into Spark-supported types.
    return convertArrayColumnsToString(convertColumnToNullable(sparkSession, convertDateColumns(debeziumDataset, new Schema.Parser().parse(schemaStr))));
}
Also used : Arrays(java.util.Arrays) Dataset(org.apache.spark.sql.Dataset) SchemaRegistryProvider(org.apache.hudi.utilities.schema.SchemaRegistryProvider) HoodieException(org.apache.hudi.exception.HoodieException) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) KafkaOffsetGen(org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen) AvroConversionUtils(org.apache.hudi.AvroConversionUtils) Option(org.apache.hudi.common.util.Option) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) RowSource(org.apache.hudi.utilities.sources.RowSource) Logger(org.apache.log4j.Logger) LocationStrategies(org.apache.spark.streaming.kafka010.LocationStrategies) AvroConvertor(org.apache.hudi.utilities.sources.helpers.AvroConvertor) StringDeserializer(org.apache.kafka.common.serialization.StringDeserializer) OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) CheckpointUtils(org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils) Type(org.apache.avro.Schema.Type) SparkSession(org.apache.spark.sql.SparkSession) KafkaUtils(org.apache.spark.streaming.kafka010.KafkaUtils) DataTypes(org.apache.spark.sql.types.DataTypes) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) Field(org.apache.avro.Schema.Field) TypedProperties(org.apache.hudi.common.config.TypedProperties) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) Collectors(java.util.stream.Collectors) List(java.util.List) LogManager(org.apache.log4j.LogManager) org.apache.spark.sql.functions(org.apache.spark.sql.functions) Pair(org.apache.hudi.common.util.collection.Pair)
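
In the StringDeserializer branch above, hudi routes the raw JSON records through its AvroConvertor and AvroConversionUtils. As a much simpler illustration of the same general move, turning a bounded Kafka read into a Dataset<Row>, the sketch below uses only stock Spark APIs and lets Spark infer the schema from the JSON payload; the broker, topic, group id, and offsets are placeholder assumptions.

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;

public class KafkaJsonToDataset {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[2]").appName("kafka-json-to-dataset").getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");   // placeholder broker
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "debezium-demo");              // placeholder group id

        // Placeholder offset range: partition 0 of topic "debezium.events", offsets [0, 100)
        OffsetRange[] offsetRanges = { OffsetRange.create("debezium.events", 0, 0, 100) };

        // Pull the records as JSON strings
        JavaRDD<String> json = KafkaUtils.<String, String>createRDD(
                        jsc, kafkaParams, offsetRanges, LocationStrategies.PreferConsistent())
                .map(record -> record.value());

        // Let Spark infer the schema from the JSON payload and build a Dataset<Row>
        Dataset<Row> rows = spark.read().json(spark.createDataset(json.rdd(), Encoders.STRING()));
        rows.printSchema();
        rows.show(10, false);

        spark.stop();
    }
}

Schema inference is convenient for a sketch; DebeziumSource instead applies the registered Avro schema so that the resulting columns have stable, declared types.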

Aggregations

OffsetRange (org.apache.spark.streaming.kafka010.OffsetRange) 14
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator) 6
Test (org.junit.jupiter.api.Test) 6
HoodieException (org.apache.hudi.exception.HoodieException) 3
JavaRDD (org.apache.spark.api.java.JavaRDD) 3
IOException (java.io.IOException) 2
Arrays (java.util.Arrays) 2
HashSet (java.util.HashSet) 2
List (java.util.List) 2
Collectors (java.util.stream.Collectors) 2
GenericRecord (org.apache.avro.generic.GenericRecord) 2
TypedProperties (org.apache.hudi.common.config.TypedProperties) 2
Option (org.apache.hudi.common.util.Option) 2
HoodieDeltaStreamerMetrics (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) 2
HoodieSourceTimeoutException (org.apache.hudi.utilities.exception.HoodieSourceTimeoutException) 2
TopicPartition (org.apache.kafka.common.TopicPartition) 2
StringDeserializer (org.apache.kafka.common.serialization.StringDeserializer) 2
CanCommitOffsets (org.apache.spark.streaming.kafka010.CanCommitOffsets) 2
HasOffsetRanges (org.apache.spark.streaming.kafka010.HasOffsetRanges) 2
IoTData (com.apssouza.iot.common.dto.IoTData) 1