
Example 1 with HoodieDeltaStreamerMetrics

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics in project hudi by apache.

From the class KafkaOffsetGen, method getNextOffsetRanges:

public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit, HoodieDeltaStreamerMetrics metrics) {
    // Obtain current metadata for the topic
    Map<TopicPartition, Long> fromOffsets;
    Map<TopicPartition, Long> toOffsets;
    try (KafkaConsumer consumer = new KafkaConsumer(kafkaParams)) {
        if (!checkTopicExists(consumer)) {
            throw new HoodieException("Kafka topic:" + topicName + " does not exist");
        }
        List<PartitionInfo> partitionInfoList = consumer.partitionsFor(topicName);
        Set<TopicPartition> topicPartitions = partitionInfoList.stream().map(x -> new TopicPartition(x.topic(), x.partition())).collect(Collectors.toSet());
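        // When checkpoints are timestamp-based and the stored value parses as a valid timestamp,
        // resolve it into per-partition offsets so it can be handled like a normal offset checkpoint below.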
        if (Config.KAFKA_CHECKPOINT_TYPE_TIMESTAMP.equals(kafkaCheckpointType) && isValidTimestampCheckpointType(lastCheckpointStr)) {
            lastCheckpointStr = getOffsetsByTimestamp(consumer, partitionInfoList, topicPartitions, topicName, Long.parseLong(lastCheckpointStr.get()));
        }
        // Determine the offset ranges to read from
        if (lastCheckpointStr.isPresent() && !lastCheckpointStr.get().isEmpty() && checkTopicCheckpoint(lastCheckpointStr)) {
            fromOffsets = fetchValidOffsets(consumer, lastCheckpointStr, topicPartitions);
            metrics.updateDeltaStreamerKafkaDelayCountMetrics(delayOffsetCalculation(lastCheckpointStr, topicPartitions, consumer));
        } else {
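            // No usable checkpoint: starting positions come from the configured auto reset strategy.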
            switch(autoResetValue) {
                case EARLIEST:
                    fromOffsets = consumer.beginningOffsets(topicPartitions);
                    break;
                case LATEST:
                    fromOffsets = consumer.endOffsets(topicPartitions);
                    break;
                case GROUP:
                    fromOffsets = getGroupOffsets(consumer, topicPartitions);
                    break;
                default:
                    throw new HoodieNotSupportedException("Auto reset value must be one of 'earliest' or 'latest' or 'group' ");
            }
        }
        // Obtain the latest offsets.
        toOffsets = consumer.endOffsets(topicPartitions);
    }
    // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
    long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.key(), Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.defaultValue());
    long numEvents;
    if (sourceLimit == Long.MAX_VALUE) {
        numEvents = maxEventsToReadFromKafka;
        LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka);
    } else {
        numEvents = sourceLimit;
    }
    if (numEvents < toOffsets.size()) {
        throw new HoodieException("sourceLimit should not be less than the number of kafka partitions");
    }
    return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
}
Also used:

import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
import org.apache.hudi.utilities.exception.HoodieDeltaStreamerException;
import org.apache.hudi.utilities.sources.AvroKafkaSource;
import org.apache.kafka.clients.consumer.CommitFailedException;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.TimeoutException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.streaming.kafka010.OffsetRange;
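
For context, the snippet below sketches how getNextOffsetRanges might be driven directly. It is a minimal illustration rather than the way DeltaStreamer wires it up: the property keys, the example topic and checkpoint values, and the pre-built metrics instance are assumptions (in HoodieDeltaStreamer the Kafka source and the delta sync service supply these).

// Minimal sketch, assuming a TypedProperties with the keys below and an existing
// HoodieDeltaStreamerMetrics instance named `metrics` from the surrounding context.
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.source.kafka.topic", "impressions"); // topic name (key assumed)
props.setProperty("bootstrap.servers", "localhost:9092");                    // forwarded to the KafkaConsumer
props.setProperty("auto.offset.reset", "earliest");                          // used only when no checkpoint exists
props.setProperty("group.id", "hudi-deltastreamer");

KafkaOffsetGen offsetGen = new KafkaOffsetGen(props);

// First run: no checkpoint yet, so ranges start at the 'earliest' offsets; cap the batch at 5M events.
OffsetRange[] firstBatch = offsetGen.getNextOffsetRanges(Option.empty(), 5_000_000L, metrics);

// Later run: resume from a checkpoint string of the form "topic,partition:offset,..."
// persisted by the previous commit (example values are illustrative).
OffsetRange[] nextBatch =
    offsetGen.getNextOffsetRanges(Option.of("impressions,0:1000,1:2000"), 5_000_000L, metrics);

The resulting ranges are what the Kafka source turns into the next batch to ingest; the end offsets of those ranges become the checkpoint carried into the following round.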
