Use of org.wikidata.query.rdf.tool.change.events.RevisionCreateEvent in project wikidata-query-rdf by wikimedia: the fetch method of the KafkaPoller class.
/**
 * Fetch changes from Kafka.
 * @param lastNextStartTime where the last fetch ended up.
 * @return Set of changes.
 * @throws RetryableException if polling Kafka fails in a way that can be retried
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch fetch(Instant lastNextStartTime) throws RetryableException {
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    ConsumerRecords<String, ChangeEvent> records;
    Instant nextInstant = Instant.EPOCH;
    AtomicLongMap<String> topicCounts = AtomicLongMap.create();
    Map<TopicPartition, OffsetAndMetadata> batchOffsets = new HashMap<>();
    while (true) {
        commitPendindOffsets();
        try (Context timerContext = pollingTimer.time()) {
            // TODO: make timeout configurable? Wait for a bit so we catch bursts of messages?
            records = consumer.poll(1000);
        } catch (InterruptException | WakeupException e) {
            throw new RetryableException("Error fetching recent changes", e);
        }
        int count = records.count();
        log.debug("Fetched {} records from Kafka", count);
        changesCounter.inc(count);
        if (count == 0) {
            // If we got nothing from Kafka, get out of the loop and return what we have
            break;
        }
        boolean foundSomething = false;
        for (ConsumerRecord<String, ChangeEvent> record : records) {
            ChangeEvent event = record.value();
            String topic = record.topic();
            batchOffsets.put(new TopicPartition(record.topic(), record.partition()), new OffsetAndMetadata(record.offset()));
            log.trace("Got event t:{} o:{}", record.topic(), record.offset());
            if (!event.domain().equals(uris.getHost())) {
                // wrong domain, ignore
                continue;
            }
            // check namespace
            if (!uris.isEntityNamespace(event.namespace())) {
                continue;
            }
            if (!(event instanceof RevisionCreateEvent)) {
                log.info("Got non revision create event class:{}, title:{}, domain:{}, revision:{}",
                        event.getClass().getSimpleName(), event.title(), event.domain(), event.revision());
            }
            // Now we have an event that we want to process
            foundSomething = true;
            topicCounts.getAndIncrement(record.topic());
            // Timestamps across topics are very chaotic, jumping back and forth, so only the
            // reporting topic is used to advance the next start time.
            if (topic.endsWith(reportingTopic)) {
                nextInstant = Utils.max(nextInstant, Instant.ofEpochMilli(record.timestamp()));
            }
            // Using offset here as RC id since we do not have a real RC id (this not being an RC poller) but
            // the offset serves the same function in Kafka and is also useful for debugging.
            Change change = makeChange(event, record.offset());
            Change dupe = changesByTitle.put(change.entityId(), change);
            // If we already saw a change for this entity, keep the one with the higher revision,
            // treating NO_REVISION (e.g. a delete) as newest. This is not a big deal since deletes
            // are relatively rare.
            if (dupe != null && change.revision() > Change.NO_REVISION
                    && (dupe.revision() > change.revision() || dupe.revision() == Change.NO_REVISION)) {
                // need to remove and re-add so that the iteration order will be correct
                changesByTitle.remove(change.entityId());
                changesByTitle.put(change.entityId(), dupe);
            }
        }
        log.debug("{} records left after filtering", changesByTitle.size());
        if (changesByTitle.size() >= batchSize) {
            // We have enough for the batch
            break;
        }
        if (changesByTitle.size() > 0 && !foundSomething) {
            // This poll produced nothing useful, so return what we have rather than wait for more
            log.info("Did not find anything useful in this batch, returning existing data");
            break;
        }
        // TODO: if we already have something and we've spent more than X seconds in the loop,
        // we probably should return without waiting for more
    }
    // If we didn't get anything useful from the reporting topic, keep the old value
    if (nextInstant.equals(Instant.EPOCH)) {
        nextInstant = lastNextStartTime;
    }
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    log.info("Found {} changes", changes.size());
    if (log.isDebugEnabled()) {
        topicCounts.asMap().forEach((k, v) -> log.debug("Topic {}: {} records", k, v));
    }
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextInstant);
    // Rewind the recorded start position by one second to be sure we cover the whole last second
    return new Batch(changes, advanced, nextInstant.minusSeconds(1).toString(), nextInstant, batchOffsets);
}
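The deduplication near the end of the record loop keeps one change per entity: the newest revision wins, and a change without a revision (such as a delete) is treated as newest. The following is a minimal, self-contained sketch of that rule; SimpleChange, NO_REVISION and the add helper are hypothetical stand-ins, not the project's Change or makeChange API.

import java.util.LinkedHashMap;
import java.util.Map;

public class DedupSketch {
    static final long NO_REVISION = -1;

    // Hypothetical stand-in for the project's Change: an entity id plus a revision number.
    record SimpleChange(String entityId, long revision) { }

    static void add(Map<String, SimpleChange> byId, SimpleChange change) {
        SimpleChange previous = byId.put(change.entityId(), change);
        // Restore the previously stored change if it is newer, or if it is a delete (NO_REVISION)
        if (previous != null && change.revision() > NO_REVISION
                && (previous.revision() > change.revision() || previous.revision() == NO_REVISION)) {
            byId.remove(change.entityId());
            byId.put(change.entityId(), previous);
        }
    }

    public static void main(String[] args) {
        Map<String, SimpleChange> byId = new LinkedHashMap<>();
        add(byId, new SimpleChange("Q42", 10));
        add(byId, new SimpleChange("Q42", 12));          // newer revision replaces the older one
        add(byId, new SimpleChange("Q64", NO_REVISION)); // delete event
        add(byId, new SimpleChange("Q64", 7));           // older than the delete, so the delete stays
        byId.values().forEach(c -> System.out.println(c.entityId() + " -> " + c.revision()));
        // Prints Q42 -> 12 and then Q64 -> -1
    }
}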
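The returned Batch also records how far the poller advanced and a start position rewound by one second. A small, hypothetical illustration (not project code) of how those two values fall out of java.time, assuming made-up timestamps:

import java.time.Instant;
import java.time.temporal.ChronoUnit;

public class AdvanceSketch {
    public static void main(String[] args) {
        Instant lastNextStartTime = Instant.parse("2024-01-01T00:00:00Z");
        Instant nextInstant = Instant.parse("2024-01-01T00:00:42.500Z");
        // Milliseconds the poller advanced during this batch
        long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextInstant);
        // Start position rewound by one second so the whole last second is covered on restart
        String leftOff = nextInstant.minusSeconds(1).toString();
        System.out.println(advanced); // 42500
        System.out.println(leftOff);  // 2024-01-01T00:00:41.500Z
    }
}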