Example 1 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.

The class KafkaOffsetGen, method getNextOffsetRanges:

public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit, HoodieDeltaStreamerMetrics metrics) {
    // Obtain current metadata for the topic
    Map<TopicPartition, Long> fromOffsets;
    Map<TopicPartition, Long> toOffsets;
    try (KafkaConsumer consumer = new KafkaConsumer(kafkaParams)) {
        if (!checkTopicExists(consumer)) {
            throw new HoodieException("Kafka topic:" + topicName + " does not exist");
        }
        List<PartitionInfo> partitionInfoList = consumer.partitionsFor(topicName);
        Set<TopicPartition> topicPartitions = partitionInfoList.stream().map(x -> new TopicPartition(x.topic(), x.partition())).collect(Collectors.toSet());
        if (Config.KAFKA_CHECKPOINT_TYPE_TIMESTAMP.equals(kafkaCheckpointType) && isValidTimestampCheckpointType(lastCheckpointStr)) {
            lastCheckpointStr = getOffsetsByTimestamp(consumer, partitionInfoList, topicPartitions, topicName, Long.parseLong(lastCheckpointStr.get()));
        }
        // Determine the offset ranges to read from
        if (lastCheckpointStr.isPresent() && !lastCheckpointStr.get().isEmpty() && checkTopicCheckpoint(lastCheckpointStr)) {
            fromOffsets = fetchValidOffsets(consumer, lastCheckpointStr, topicPartitions);
            metrics.updateDeltaStreamerKafkaDelayCountMetrics(delayOffsetCalculation(lastCheckpointStr, topicPartitions, consumer));
        } else {
            switch(autoResetValue) {
                case EARLIEST:
                    fromOffsets = consumer.beginningOffsets(topicPartitions);
                    break;
                case LATEST:
                    fromOffsets = consumer.endOffsets(topicPartitions);
                    break;
                case GROUP:
                    fromOffsets = getGroupOffsets(consumer, topicPartitions);
                    break;
                default:
                    throw new HoodieNotSupportedException("Auto reset value must be one of 'earliest', 'latest' or 'group'");
            }
        }
        // Obtain the latest offsets.
        toOffsets = consumer.endOffsets(topicPartitions);
    }
    // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
    long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.key(), Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.defaultValue());
    long numEvents;
    if (sourceLimit == Long.MAX_VALUE) {
        numEvents = maxEventsToReadFromKafka;
        LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka);
    } else {
        numEvents = sourceLimit;
    }
    if (numEvents < toOffsets.size()) {
        throw new HoodieException("sourceLimit should not be less than the number of kafka partitions");
    }
    return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
}
Also used : Arrays(java.util.Arrays) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) DataSourceUtils(org.apache.hudi.DataSourceUtils) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) Function(java.util.function.Function) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) Map(java.util.Map) AvroKafkaSource(org.apache.hudi.utilities.sources.AvroKafkaSource) HoodieDeltaStreamerException(org.apache.hudi.utilities.exception.HoodieDeltaStreamerException) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) TopicPartition(org.apache.kafka.common.TopicPartition) TimeoutException(org.apache.kafka.common.errors.TimeoutException) TypedProperties(org.apache.hudi.common.config.TypedProperties) Set(java.util.Set) ConsumerConfig(org.apache.kafka.clients.consumer.ConsumerConfig) PartitionInfo(org.apache.kafka.common.PartitionInfo) OffsetAndTimestamp(org.apache.kafka.clients.consumer.OffsetAndTimestamp) Collectors(java.util.stream.Collectors) List(java.util.List) ConfigProperty(org.apache.hudi.common.config.ConfigProperty) OffsetAndMetadata(org.apache.kafka.clients.consumer.OffsetAndMetadata) CommitFailedException(org.apache.kafka.clients.consumer.CommitFailedException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) KafkaConsumer(org.apache.kafka.clients.consumer.KafkaConsumer)
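
CheckpointUtils.computeOffsetRanges itself is not shown in this excerpt. As a rough illustration of the idea, and not Hudi's exact allocation logic, an even per-partition split of the event budget could look like the sketch below; the method name and the even-split policy are illustrative assumptions:

import java.util.Map;
import org.apache.kafka.common.TopicPartition;
import org.apache.spark.streaming.kafka010.OffsetRange;

// Simplified sketch: split the event budget evenly across partitions and
// clamp each range to the offsets actually available. Hudi's real
// implementation additionally redistributes unused quota between partitions.
public static OffsetRange[] computeOffsetRangesEvenSplit(Map<TopicPartition, Long> fromOffsets, Map<TopicPartition, Long> toOffsets, long numEvents) {
    long perPartition = Math.max(1, numEvents / toOffsets.size());
    return fromOffsets.entrySet().stream().map(e -> {
        TopicPartition tp = e.getKey();
        long from = e.getValue();
        long until = Math.min(toOffsets.get(tp), from + perPartition);
        return OffsetRange.create(tp.topic(), tp.partition(), from, until);
    }).toArray(OffsetRange[]::new);
}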

Example 2 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.

The class AvroKafkaSource, method fetchNewData:

@Override
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
    try {
        OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics);
        long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
        LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
        if (totalNewMsgs <= 0) {
            return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges));
        }
        JavaRDD<GenericRecord> newDataRDD = toRDD(offsetRanges);
        return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
    } catch (org.apache.kafka.common.errors.TimeoutException e) {
        throw new HoodieSourceTimeoutException("Kafka source timed out: " + e.getMessage());
    }
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException) GenericRecord(org.apache.avro.generic.GenericRecord)
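
The toRDD call above is not included in this excerpt. A minimal sketch of materializing an OffsetRange[] into a batch RDD with the Kafka 0.10 integration, assuming the configured value deserializer already yields GenericRecord (illustrative, not Hudi's exact implementation):

import java.util.Map;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;

// Sketch: KafkaUtils.createRDD reads exactly the requested ranges as a batch
// (no extra polling), which is what makes the resulting checkpoint exact.
// Here we keep only the deserialized record values.
static JavaRDD<GenericRecord> toRDD(JavaSparkContext jsc, Map<String, Object> kafkaParams, OffsetRange[] offsetRanges) {
    return KafkaUtils.<String, GenericRecord>createRDD(jsc, kafkaParams, offsetRanges, LocationStrategies.PreferConsistent())
            .map(ConsumerRecord::value);
}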

Example 3 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.

The class JsonKafkaSource, method fetchNewData:

@Override
protected InputBatch<JavaRDD<String>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
    try {
        OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics);
        long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
        LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
        if (totalNewMsgs <= 0) {
            return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges));
        }
        JavaRDD<String> newDataRDD = toRDD(offsetRanges);
        return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
    } catch (org.apache.kafka.common.errors.TimeoutException e) {
        throw new HoodieSourceTimeoutException("Kafka source timed out: " + e.getMessage());
    }
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException)
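
CheckpointUtils.totalNewMessages, used in both Kafka sources above, boils down to summing the per-range counts. A sketch of that computation (not the verbatim Hudi source):

import java.util.Arrays;
import org.apache.spark.streaming.kafka010.OffsetRange;

// The number of new messages in a batch is the sum of
// (untilOffset - fromOffset) across all ranges; OffsetRange.count()
// returns exactly that difference for a single range.
static long totalNewMessages(OffsetRange[] ranges) {
    return Arrays.stream(ranges).mapToLong(OffsetRange::count).sum();
}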

Example 4 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.

The class DebeziumSource, method fetchNextBatch:

@Override
protected Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {
    String overrideCheckpointStr = props.getString(OVERRIDE_CHECKPOINT_STRING, "");
    OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCkptStr, sourceLimit, metrics);
    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
    LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
    if (totalNewMsgs == 0) {
        // If there are no new messages, return an empty dataframe with no schema, since the schema
        // from the registry can only be considered up to date if a change event has occurred.
        return Pair.of(Option.of(sparkSession.emptyDataFrame()), overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
    } else {
        try {
            String schemaStr = schemaRegistryProvider.fetchSchemaFromRegistry(props.getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP));
            Dataset<Row> dataset = toDataset(offsetRanges, offsetGen, schemaStr);
            LOG.info(String.format("Spark schema of Kafka Payload for topic %s:\n%s", offsetGen.getTopicName(), dataset.schema().treeString()));
            LOG.info(String.format("New checkpoint string: %s", CheckpointUtils.offsetsToStr(offsetRanges)));
            return Pair.of(Option.of(dataset), overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
        } catch (IOException exc) {
            LOG.error("Fatal error reading and parsing incoming debezium event", exc);
            throw new HoodieException("Fatal error reading and parsing incoming debezium event", exc);
        }
    }
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) HoodieException(org.apache.hudi.exception.HoodieException) Row(org.apache.spark.sql.Row) IOException(java.io.IOException)
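
The checkpoint string produced by CheckpointUtils.offsetsToStr records how far each partition was read, so the next batch can resume from there. A sketch of the encoding, assuming a compact "topic,partition:untilOffset" layout:

import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.spark.streaming.kafka010.OffsetRange;

// Sketch of the assumed checkpoint layout, e.g. "impressions,0:200,1:250".
// Only untilOffset is stored per partition because the next batch starts
// where this one ended. Assumes at least one range.
static String offsetsToStr(OffsetRange[] ranges) {
    return ranges[0].topic() + "," + Arrays.stream(ranges)
            .map(r -> r.partition() + ":" + r.untilOffset())
            .collect(Collectors.joining(","));
}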

Example 5 with OffsetRange

Use of org.apache.spark.streaming.kafka010.OffsetRange in project hudi by apache.

The class TestKafkaOffsetGen, method testGetNextOffsetRangesFromTimestampCheckpointType:

@Test
public void testGetNextOffsetRangesFromTimestampCheckpointType() {
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    testUtils.createTopic(TEST_TOPIC_NAME, 1);
    testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
    KafkaOffsetGen kafkaOffsetGen = new KafkaOffsetGen(getConsumerConfigs("latest", "timestamp"));
    OffsetRange[] nextOffsetRanges = kafkaOffsetGen.getNextOffsetRanges(Option.of(String.valueOf(System.currentTimeMillis() - 100000)), 500, metrics);
    assertEquals(1, nextOffsetRanges.length);
    assertEquals(0, nextOffsetRanges[0].fromOffset());
    assertEquals(500, nextOffsetRanges[0].untilOffset());
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
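
The timestamp checkpoint type exercised by this test resolves a millisecond timestamp to concrete per-partition offsets. Kafka's standard mechanism for that is KafkaConsumer.offsetsForTimes; a minimal sketch (illustrative, not Hudi's exact getOffsetsByTimestamp):

import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;

// Sketch: ask Kafka for the earliest offset whose record timestamp is at or
// after the checkpoint timestamp; offsetsForTimes returns null for partitions
// with no such record, in which case we fall back to the end offset.
static Map<TopicPartition, Long> offsetsForTimestamp(KafkaConsumer<?, ?> consumer, Set<TopicPartition> partitions, long timestampMs) {
    Map<TopicPartition, Long> query = partitions.stream().collect(Collectors.toMap(tp -> tp, tp -> timestampMs));
    Map<TopicPartition, Long> endOffsets = consumer.endOffsets(partitions);
    return consumer.offsetsForTimes(query).entrySet().stream()
            .collect(Collectors.toMap(Map.Entry::getKey,
                    e -> e.getValue() != null ? e.getValue().offset() : endOffsets.get(e.getKey())));
}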

Aggregations

OffsetRange (org.apache.spark.streaming.kafka010.OffsetRange): 14 uses
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 6 uses
Test (org.junit.jupiter.api.Test): 6 uses
HoodieException (org.apache.hudi.exception.HoodieException): 3 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 3 uses
IOException (java.io.IOException): 2 uses
Arrays (java.util.Arrays): 2 uses
HashSet (java.util.HashSet): 2 uses
List (java.util.List): 2 uses
Collectors (java.util.stream.Collectors): 2 uses
GenericRecord (org.apache.avro.generic.GenericRecord): 2 uses
TypedProperties (org.apache.hudi.common.config.TypedProperties): 2 uses
Option (org.apache.hudi.common.util.Option): 2 uses
HoodieDeltaStreamerMetrics (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics): 2 uses
HoodieSourceTimeoutException (org.apache.hudi.utilities.exception.HoodieSourceTimeoutException): 2 uses
TopicPartition (org.apache.kafka.common.TopicPartition): 2 uses
StringDeserializer (org.apache.kafka.common.serialization.StringDeserializer): 2 uses
CanCommitOffsets (org.apache.spark.streaming.kafka010.CanCommitOffsets): 2 uses
HasOffsetRanges (org.apache.spark.streaming.kafka010.HasOffsetRanges): 2 uses
IoTData (com.apssouza.iot.common.dto.IoTData): 1 use