Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.
From class Kafka08ConsumerClient, method getTopics().
@Override
public List<KafkaTopic> getTopics() {
  List<TopicMetadata> topicMetadataList = getFilteredMetadataList();
  List<KafkaTopic> filteredTopics = Lists.newArrayList();
  for (TopicMetadata topicMetadata : topicMetadataList) {
    List<KafkaPartition> partitions = getPartitionsForTopic(topicMetadata);
    filteredTopics.add(new KafkaTopic(topicMetadata.topic(), partitions));
  }
  return filteredTopics;
}
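A hedged caller-side sketch of how the returned list might be consumed; the accessors KafkaTopic#getName(), KafkaTopic#getPartitions() and KafkaPartition#getId(), as well as the consumerClient and LOG names, are assumptions for illustration rather than part of the snippet above.

// Sketch only: enumerate the discovered topics and their partitions.
for (KafkaTopic topic : consumerClient.getTopics()) {
  for (KafkaPartition partition : topic.getPartitions()) {
    LOG.info(String.format("Found partition %d of topic %s", partition.getId(), topic.getName()));
  }
}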
Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.
From class KafkaAvgRecordSizeBasedWorkUnitSizeEstimator, method readPreAvgRecordSizes().
private void readPreAvgRecordSizes(SourceState state) {
  this.estAvgSizes.clear();
  for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
    List<KafkaPartition> partitions = KafkaUtils.getPartitions(workUnitState);
    for (KafkaPartition partition : partitions) {
      if (KafkaUtils.containsPartitionAvgRecordSize(workUnitState, partition)) {
        long previousAvgSize = KafkaUtils.getPartitionAvgRecordSize(workUnitState, partition);
        this.estAvgSizes.put(partition, previousAvgSize);
      }
    }
  }
}
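A minimal sketch of how the populated estAvgSizes map could feed a size estimate for the next run; estimatePartitionSize, numRecords and DEFAULT_AVG_RECORD_SIZE are hypothetical names and are not part of the estimator shown above.

// Hypothetical helper: expected bytes for a partition = expected record count * remembered avg record size,
// falling back to a default when the partition was not seen in the previous run.
private double estimatePartitionSize(KafkaPartition partition, long numRecords) {
  long avgRecordSize = this.estAvgSizes.containsKey(partition)
      ? this.estAvgSizes.get(partition)
      : DEFAULT_AVG_RECORD_SIZE;
  return (double) numRecords * avgRecordSize;
}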
Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.
From class SimpleKafkaSpecConsumer, method initializeHighWatermarks().
private void initializeHighWatermarks() {
  try {
    int i = 0;
    for (KafkaPartition kafkaPartition : _partitions) {
      long latestOffset = _kafkaConsumer.getLatestOffset(kafkaPartition);
      _highWatermark.set(i, latestOffset);
      i++;
    }
  } catch (KafkaOffsetRetrievalFailureException e) {
    throw new RuntimeException(e);
  }
}
Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.
From class SimpleKafkaSpecConsumer, method initializeLowWatermarks().
private void initializeLowWatermarks() {
  try {
    int i = 0;
    for (KafkaPartition kafkaPartition : _partitions) {
      if (isFirstRun) {
        long earliestOffset = _kafkaConsumer.getEarliestOffset(kafkaPartition);
        _lowWatermark.set(i, earliestOffset);
      } else {
        _lowWatermark.set(i, _highWatermark.get(i));
      }
      i++;
    }
    isFirstRun = false;
  } catch (KafkaOffsetRetrievalFailureException e) {
    throw new RuntimeException(e);
  }
}
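With both watermark lists initialized per partition index, the backlog for a partition is simply the gap between the two; a minimal sketch follows (the remainingRecords helper is hypothetical, assuming only the indexed get(int) accessor used above).

// Hypothetical helper: records still to be consumed for the partition at the given index.
private long remainingRecords(int partitionIndex) {
  return _highWatermark.get(partitionIndex) - _lowWatermark.get(partitionIndex);
}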
Use of org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition in project incubator-gobblin by apache.
From class KafkaAvgRecordTimeBasedWorkUnitSizeEstimator, method readPrevAvgRecordMillis().
/**
 * Get avg time to pull a record in the previous run for all topics, each of which is the geometric mean
 * of the avg time to pull a record of all partitions of the topic.
 *
 * If a topic was not pulled in the previous run (e.g., it's a new topic), it will use the geometric mean
 * of avg record time of topics that were pulled in the previous run.
 *
 * If no topic was pulled in the previous run, 1.0 will be used for all topics.
 */
private void readPrevAvgRecordMillis(SourceState state) {
  Map<String, List<Double>> prevAvgMillis = Maps.newHashMap();
  for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
    List<KafkaPartition> partitions = KafkaUtils.getPartitions(workUnitState);
    for (KafkaPartition partition : partitions) {
      if (KafkaUtils.containsPartitionAvgRecordMillis(workUnitState, partition)) {
        double prevAvgMillisForPartition = KafkaUtils.getPartitionAvgRecordMillis(workUnitState, partition);
        if (prevAvgMillis.containsKey(partition.getTopicName())) {
          prevAvgMillis.get(partition.getTopicName()).add(prevAvgMillisForPartition);
        } else {
          prevAvgMillis.put(partition.getTopicName(), Lists.newArrayList(prevAvgMillisForPartition));
        }
      }
    }
  }
  this.estAvgMillis.clear();
  if (prevAvgMillis.isEmpty()) {
    this.avgEstAvgMillis = 1.0;
  } else {
    List<Double> allEstAvgMillis = Lists.newArrayList();
    for (Map.Entry<String, List<Double>> entry : prevAvgMillis.entrySet()) {
      String topic = entry.getKey();
      List<Double> prevAvgMillisForPartitions = entry.getValue();
      // If a topic has k partitions, and in the previous run each partition recorded its avg time to pull
      // a record, then use the geometric mean of these k numbers as the estimated avg time to pull
      // a record in this run.
      double estAvgMillisForTopic = geometricMean(prevAvgMillisForPartitions);
      this.estAvgMillis.put(topic, estAvgMillisForTopic);
      LOG.info(String.format("Estimated avg time to pull a record for topic %s is %f milliseconds", topic,
          estAvgMillisForTopic));
      allEstAvgMillis.add(estAvgMillisForTopic);
    }
    // If a topic was not pulled in the previous run, use this.avgEstAvgMillis as the estimated avg time
    // to pull a record in this run, which is the geometric mean of all topics whose avg times to pull
    // a record in the previous run are known.
    this.avgEstAvgMillis = geometricMean(allEstAvgMillis);
  }
  LOG.info("For all topics not pulled in the previous run, estimated avg time to pull a record is "
      + this.avgEstAvgMillis + " milliseconds");
}
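The estimator above delegates to a geometricMean helper; a minimal sketch of such a helper, computed in log space for numerical stability, is shown below (the body is an assumption, not the project's implementation).

// Geometric mean of positive values: exp of the arithmetic mean of the logs.
// Assumes a non-empty list of positive values, which both call sites above guarantee.
private static double geometricMean(List<Double> values) {
  double logSum = 0.0;
  for (double value : values) {
    logSum += Math.log(value);
  }
  return Math.exp(logSum / values.size());
}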