use of org.apache.spark.streaming.kafka010.OffsetRange in project lambda-arch by apssouza22.
the class TrafficOffsetCommitCallback method commitOffset.
/**
 * Commit the ack to Kafka after processing has completed.
 * This is our fault-tolerance implementation.
 *
 * @param directKafkaStream
 */
private void commitOffset(JavaInputDStream<ConsumerRecord<String, IoTData>> directKafkaStream) {
    directKafkaStream.foreachRDD((JavaRDD<ConsumerRecord<String, IoTData>> trafficRdd) -> {
        if (!trafficRdd.isEmpty()) {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) trafficRdd.rdd()).offsetRanges();
            CanCommitOffsets canCommitOffsets = (CanCommitOffsets) directKafkaStream.inputDStream();
            canCommitOffsets.commitAsync(offsetRanges, new TrafficOffsetCommitCallback());
        }
    });
}
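The TrafficOffsetCommitCallback passed to commitAsync is not shown in this excerpt. A minimal sketch of what such a callback could look like, assuming it only logs the commit outcome (the actual class in lambda-arch may differ):
import java.util.Map;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.clients.consumer.OffsetCommitCallback;
import org.apache.kafka.common.TopicPartition;

// Hypothetical sketch; the real TrafficOffsetCommitCallback in lambda-arch may differ.
final class TrafficOffsetCommitCallback implements OffsetCommitCallback {

    @Override
    public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
        if (exception != null) {
            // The async commit failed; the same offsets are simply committed again with the next batch.
            System.err.println("Kafka offset commit failed: " + exception.getMessage());
        } else {
            offsets.forEach((partition, metadata) ->
                    System.out.println("Committed " + partition + " up to offset " + metadata.offset()));
        }
    }
}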
use of org.apache.spark.streaming.kafka010.OffsetRange in project beijingThirdPeriod by weidongcao.
the class SparkStreamingKafkaDemo method main.
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("demo").setMaster("local[4]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(8));
    Map<String, Object> kafkaParams = new HashMap<>();
    // Address and port the Kafka brokers listen on
    kafkaParams.put("bootstrap.servers", "cm02.spark.com:9092");
    // Deserializer for record keys (by default keys are treated as UTF-8 encoded strings)
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    // Deserializer for record values (by default values are treated as UTF-8 encoded strings)
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    // Consumer group ID; any value will do
    kafkaParams.put("group.id", "jis");
    // Start reading from "latest" (newest) or the earliest offset; the older client values "largest"/"smallest" are not accepted here
    kafkaParams.put("auto.offset.reset", "latest");
    // If true, the consumer commits each partition's offset automatically at a regular interval; disabled here so offsets can be committed manually
    kafkaParams.put("enable.auto.commit", false);
    // Topics to subscribe to
    Collection<String> topics = Arrays.asList("topicA", "topicB");
    // Explicit offset ranges for the batch createRDD call below
    OffsetRange[] offsetRanges = { OffsetRange.create("test", 0, 0, 3), OffsetRange.create("test", 1, 0, 3) };
    final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
    JavaRDD<ConsumerRecord<String, String>> rdd = KafkaUtils.createRDD(streamingContext.sparkContext(), kafkaParams, offsetRanges, LocationStrategies.PreferConsistent());
    stream.foreachRDD((VoidFunction<JavaRDD<ConsumerRecord<String, String>>>) rdd1 -> {
        final OffsetRange[] offsetRanges1 = ((HasOffsetRanges) (rdd1.rdd())).offsetRanges();
        rdd1.foreachPartition((VoidFunction<Iterator<ConsumerRecord<String, String>>>) consumerRecordIterator -> {
            OffsetRange o = offsetRanges1[TaskContext.get().partitionId()];
            System.out.println(o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset());
        });
    });
    // stream.mapToPair((PairFunction<ConsumerRecord<String, String>, String, String>) record -> new Tuple2<>(record.key(), record.value()));
}
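As written, the demo wires up the direct stream but never starts the streaming context. A minimal sketch of the usual closing steps (not part of the original snippet; main would also need to declare throws InterruptedException):
// Start the streaming computation and block until it is stopped or fails.
streamingContext.start();
streamingContext.awaitTermination();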
use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.
the class TestCheckpointUtils method testStringToOffsets.
@Test
public void testStringToOffsets() {
    OffsetRange[] ranges = CheckpointUtils.computeOffsetRanges(
        makeOffsetMap(new int[] { 0, 1 }, new long[] { 200000, 250000 }),
        makeOffsetMap(new int[] { 0, 1 }, new long[] { 300000, 350000 }),
        1000000L);
    String checkpointStr = CheckpointUtils.offsetsToStr(ranges);
    Map<TopicPartition, Long> offsetMap = CheckpointUtils.strToOffsets(checkpointStr);
    assertEquals(2, offsetMap.size());
    Set<TopicPartition> topicPartitions = new HashSet<>(2);
    TopicPartition partition0 = new TopicPartition(TEST_TOPIC_NAME, 0);
    TopicPartition partition1 = new TopicPartition(TEST_TOPIC_NAME, 1);
    topicPartitions.add(partition0);
    topicPartitions.add(partition1);
    assertEquals(topicPartitions, offsetMap.keySet());
    assertEquals(300000, offsetMap.get(partition0));
    assertEquals(350000, offsetMap.get(partition1));
}
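The makeOffsetMap helper referenced above is not shown in this excerpt. A plausible sketch, assuming it simply pairs each partition of TEST_TOPIC_NAME with the corresponding offset (the real helper in the Hudi tests may differ):
import java.util.HashMap;
import java.util.Map;
import org.apache.kafka.common.TopicPartition;

// Hypothetical reconstruction; the actual helper in TestCheckpointUtils may differ.
private static Map<TopicPartition, Long> makeOffsetMap(int[] partitions, long[] offsets) {
    Map<TopicPartition, Long> offsetMap = new HashMap<>();
    for (int i = 0; i < partitions.length; i++) {
        offsetMap.put(new TopicPartition(TEST_TOPIC_NAME, partitions[i]), offsets[i]);
    }
    return offsetMap;
}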
use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.
the class DebeziumSource method toDataset.
/**
 * Converts a set of Kafka topic offset ranges into a Spark dataset.
 *
 * @param offsetRanges Offset ranges to read
 * @param offsetGen KafkaOffsetGen providing the Kafka consumer params
 * @param schemaStr Avro schema string for the payload
 * @return Spark dataset
 */
private Dataset<Row> toDataset(OffsetRange[] offsetRanges, KafkaOffsetGen offsetGen, String schemaStr) {
    AvroConvertor convertor = new AvroConvertor(schemaStr);
    Dataset<Row> kafkaData;
    if (deserializerClassName.equals(StringDeserializer.class.getName())) {
        kafkaData = AvroConversionUtils.createDataFrame(
            KafkaUtils.<String, String>createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
                .map(obj -> convertor.fromJson(obj.value()))
                .rdd(),
            schemaStr, sparkSession);
    } else {
        kafkaData = AvroConversionUtils.createDataFrame(
            KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
                .map(obj -> (GenericRecord) obj.value())
                .rdd(),
            schemaStr, sparkSession);
    }
    // Flatten the Debezium payload, specific to each DB type (Postgres / MySQL / etc.)
    Dataset<Row> debeziumDataset = processDataset(kafkaData);
    // Apply the transformations required to convert Debezium data types to Spark-supported types.
    return convertArrayColumnsToString(convertColumnToNullable(sparkSession, convertDateColumns(debeziumDataset, new Schema.Parser().parse(schemaStr))));
}
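Which branch toDataset takes is driven by the configured Kafka value deserializer. A minimal configuration sketch, assuming Confluent's Avro deserializer for the GenericRecord branch (the concrete deserializer a given deployment uses may differ):
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "localhost:9092"); // placeholder broker address
kafkaParams.put("key.deserializer", StringDeserializer.class.getName());
// String branch: Debezium JSON payloads, converted to Avro via convertor.fromJson(...)
kafkaParams.put("value.deserializer", StringDeserializer.class.getName());
// GenericRecord branch (assumption): values already deserialized to Avro, e.g. with
// Confluent's io.confluent.kafka.serializers.KafkaAvroDeserializer and a schema registry.
// kafkaParams.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
// kafkaParams.put("schema.registry.url", "http://localhost:8081"); // placeholder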