Example 1 with KafkaPartition

Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.

In class Kafka08ConsumerClient, method getTopics:

@Override
public List<KafkaTopic> getTopics() {
    List<TopicMetadata> topicMetadataList = getFilteredMetadataList();
    List<KafkaTopic> filteredTopics = Lists.newArrayList();
    for (TopicMetadata topicMetadata : topicMetadataList) {
        List<KafkaPartition> partitions = getPartitionsForTopic(topicMetadata);
        filteredTopics.add(new KafkaTopic(topicMetadata.topic(), partitions));
    }
    return filteredTopics;
}
Also used: KafkaTopic (org.apache.gobblin.source.extractor.extract.kafka.KafkaTopic), KafkaPartition (org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition), TopicMetadata (kafka.javaapi.TopicMetadata)
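
A possible shape for the getPartitionsForTopic helper referenced above (not shown on this page). This is a hedged sketch only: it assumes Gobblin's KafkaPartition.Builder and the Kafka 0.8 javaapi accessors (TopicMetadata.partitionsMetadata(), PartitionMetadata.partitionId(), leader()); the project's actual implementation may differ.

// Hypothetical sketch: convert Kafka 0.8 partition metadata into Gobblin KafkaPartition objects.
private List<KafkaPartition> getPartitionsForTopic(TopicMetadata topicMetadata) {
    List<KafkaPartition> partitions = Lists.newArrayList();
    for (PartitionMetadata partitionMetadata : topicMetadata.partitionsMetadata()) {
        partitions.add(new KafkaPartition.Builder()
            .withId(partitionMetadata.partitionId())
            .withTopicName(topicMetadata.topic())
            .withLeaderId(partitionMetadata.leader().id())
            .withLeaderHostAndPort(partitionMetadata.leader().host(), partitionMetadata.leader().port())
            .build());
    }
    return partitions;
}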

Example 2 with KafkaPartition

Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.

In class KafkaAvgRecordSizeBasedWorkUnitSizeEstimator, method readPreAvgRecordSizes:

private void readPreAvgRecordSizes(SourceState state) {
    this.estAvgSizes.clear();
    for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
        List<KafkaPartition> partitions = KafkaUtils.getPartitions(workUnitState);
        for (KafkaPartition partition : partitions) {
            if (KafkaUtils.containsPartitionAvgRecordSize(workUnitState, partition)) {
                long previousAvgSize = KafkaUtils.getPartitionAvgRecordSize(workUnitState, partition);
                this.estAvgSizes.put(partition, previousAvgSize);
            }
        }
    }
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), KafkaPartition (org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition)
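
Once populated, estAvgSizes is presumably consulted per partition when sizing work units. A minimal, hypothetical lookup with a fallback for partitions that have no history from the previous run, assuming estAvgSizes is a Map<KafkaPartition, Long>:

// Hypothetical helper: return the previously observed average record size for a partition,
// falling back to a caller-supplied default when the partition was not pulled before.
private long getEstimatedAvgSize(KafkaPartition partition, long defaultAvgSize) {
    Long previousAvgSize = this.estAvgSizes.get(partition);
    return previousAvgSize != null ? previousAvgSize : defaultAvgSize;
}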

Example 3 with KafkaPartition

Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.

In class SimpleKafkaSpecConsumer, method initializeHighWatermarks:

private void initializeHighWatermarks() {
    try {
        int i = 0;
        for (KafkaPartition kafkaPartition : _partitions) {
            long latestOffset = _kafkaConsumer.getLatestOffset(kafkaPartition);
            _highWatermark.set(i, latestOffset);
            i++;
        }
    } catch (KafkaOffsetRetrievalFailureException e) {
        throw new RuntimeException(e);
    }
}
Also used: KafkaPartition (org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition), KafkaOffsetRetrievalFailureException (org.apache.gobblin.source.extractor.extract.kafka.KafkaOffsetRetrievalFailureException)
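
With the high watermarks filled in, the backlog per partition is simply the gap between the high and low watermark. A small illustrative helper, assuming the same get(int) accessors used in the snippets on this page; this is an assumption for illustration, not a method from the project:

// Hypothetical helper: offsets still to be consumed for the i-th partition.
private long unconsumedOffsets(int i) {
    return _highWatermark.get(i) - _lowWatermark.get(i);
}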

Example 4 with KafkaPartition

Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.

In class SimpleKafkaSpecConsumer, method initializeLowWatermarks:

private void initializeLowWatermarks() {
    try {
        int i = 0;
        for (KafkaPartition kafkaPartition : _partitions) {
            if (isFirstRun) {
                long earliestOffset = _kafkaConsumer.getEarliestOffset(kafkaPartition);
                _lowWatermark.set(i, earliestOffset);
            } else {
                _lowWatermark.set(i, _highWatermark.get(i));
            }
            i++;
        }
        isFirstRun = false;
    } catch (KafkaOffsetRetrievalFailureException e) {
        throw new RuntimeException(e);
    }
}
Also used: KafkaPartition (org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition), KafkaOffsetRetrievalFailureException (org.apache.gobblin.source.extractor.extract.kafka.KafkaOffsetRetrievalFailureException)
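
Together, the two initializers above define a per-partition offset range. One way such a range is packaged in Gobblin is a WatermarkInterval (listed under Aggregations below). A hedged sketch, assuming _lowWatermark and _highWatermark are Watermark implementations (for example MultiLongWatermark) and that WatermarkInterval is constructed from a low and an expected high watermark:

// Hypothetical sketch: wrap the initialized low/high watermarks into a WatermarkInterval.
private WatermarkInterval currentWatermarkInterval() {
    return new WatermarkInterval(_lowWatermark, _highWatermark);
}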

Example 5 with KafkaPartition

Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.

In class KafkaAvgRecordTimeBasedWorkUnitSizeEstimator, method readPrevAvgRecordMillis:

/**
 * Get avg time to pull a record in the previous run for all topics, each of which is the geometric mean
 * of the avg time to pull a record of all partitions of the topic.
 *
 * If a topic was not pulled in the previous run (e.g., it's a new topic), it will use the geometric mean
 * of avg record time of topics that were pulled in the previous run.
 *
 * If no topic was pulled in the previous run, 1.0 will be used for all topics.
 */
private void readPrevAvgRecordMillis(SourceState state) {
    Map<String, List<Double>> prevAvgMillis = Maps.newHashMap();
    for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
        List<KafkaPartition> partitions = KafkaUtils.getPartitions(workUnitState);
        for (KafkaPartition partition : partitions) {
            if (KafkaUtils.containsPartitionAvgRecordMillis(workUnitState, partition)) {
                double prevAvgMillisForPartition = KafkaUtils.getPartitionAvgRecordMillis(workUnitState, partition);
                if (prevAvgMillis.containsKey(partition.getTopicName())) {
                    prevAvgMillis.get(partition.getTopicName()).add(prevAvgMillisForPartition);
                } else {
                    prevAvgMillis.put(partition.getTopicName(), Lists.newArrayList(prevAvgMillisForPartition));
                }
            }
        }
    }
    this.estAvgMillis.clear();
    if (prevAvgMillis.isEmpty()) {
        this.avgEstAvgMillis = 1.0;
    } else {
        List<Double> allEstAvgMillis = Lists.newArrayList();
        for (Map.Entry<String, List<Double>> entry : prevAvgMillis.entrySet()) {
            String topic = entry.getKey();
            List<Double> prevAvgMillisForPartitions = entry.getValue();
            // If a topic has k partitions, and in the previous run, each partition recorded its avg time to pull
            // a record, then use the geometric mean of these k numbers as the estimated avg time to pull
            // a record in this run.
            double estAvgMillisForTopic = geometricMean(prevAvgMillisForPartitions);
            this.estAvgMillis.put(topic, estAvgMillisForTopic);
            LOG.info(String.format("Estimated avg time to pull a record for topic %s is %f milliseconds", topic, estAvgMillisForTopic));
            allEstAvgMillis.add(estAvgMillisForTopic);
        }
        // If a topic was not pulled in the previous run, use this.avgEstAvgMillis as the estimated avg time
        // to pull a record in this run, which is the geometric mean of all topics whose avg times to pull
        // a record in the previous run are known.
        this.avgEstAvgMillis = geometricMean(allEstAvgMillis);
    }
    LOG.info("For all topics not pulled in the previous run, estimated avg time to pull a record is " + this.avgEstAvgMillis + " milliseconds");
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), KafkaPartition (org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition), List (java.util.List), Map (java.util.Map)
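
The geometricMean helper used above is not shown on this page. A minimal sketch of one common way to compute it, averaging in log space to avoid overflow when multiplying many values; the project's actual implementation may differ:

// Hypothetical sketch: geometric mean via the arithmetic mean of logarithms.
// Callers above only pass non-empty lists of positive values.
private double geometricMean(List<Double> values) {
    double logSum = 0.0;
    for (double value : values) {
        logSum += Math.log(value);
    }
    return Math.exp(logSum / values.size());
}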

Aggregations

KafkaPartition (org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition): 6
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 2
KafkaOffsetRetrievalFailureException (org.apache.gobblin.source.extractor.extract.kafka.KafkaOffsetRetrievalFailureException): 2
List (java.util.List): 1
Map (java.util.Map): 1
TopicMetadata (kafka.javaapi.TopicMetadata): 1
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 1
KafkaTopic (org.apache.gobblin.source.extractor.extract.kafka.KafkaTopic): 1
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 1
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 1