Example 1 with RevisionCreateEvent

Use of org.wikidata.query.rdf.tool.change.events.RevisionCreateEvent in project wikidata-query-rdf by wikimedia.

From class KafkaPoller, method fetch:

/**
 * Fetch changes from Kafka.
 * @param lastNextStartTime where last fetch ended up.
 * @return Set of changes.
 * @throws RetryableException on retryable errors while polling Kafka.
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch fetch(Instant lastNextStartTime) throws RetryableException {
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    ConsumerRecords<String, ChangeEvent> records;
    Instant nextInstant = Instant.EPOCH;
    AtomicLongMap<String> topicCounts = AtomicLongMap.create();
    Map<TopicPartition, OffsetAndMetadata> batchOffsets = new HashMap<>();
    while (true) {
        commitPendingOffsets();
        try (Context timerContext = pollingTimer.time()) {
            // TODO: make timeout configurable? Wait for a bit so we catch bursts of messages?
            records = consumer.poll(1000);
        } catch (InterruptException | WakeupException e) {
            throw new RetryableException("Error fetching recent changes", e);
        }
        int count = records.count();
        log.debug("Fetched {} records from Kafka", count);
        changesCounter.inc(count);
        if (count == 0) {
            // If we got nothing from Kafka, get out of the loop and return what we have
            break;
        }
        boolean foundSomething = false;
        for (ConsumerRecord<String, ChangeEvent> record : records) {
            ChangeEvent event = record.value();
            String topic = record.topic();
            batchOffsets.put(new TopicPartition(record.topic(), record.partition()), new OffsetAndMetadata(record.offset()));
            log.trace("Got event t:{} o:{}", record.topic(), record.offset());
            if (!event.domain().equals(uris.getHost())) {
                // wrong domain, ignore
                continue;
            }
            // check namespace
            if (!uris.isEntityNamespace(event.namespace())) {
                continue;
            }
            if (!(event instanceof RevisionCreateEvent)) {
                log.info("Got non revision create event class:{}, domain:{}, t:{}, revision:{}",
                        event.getClass().getSimpleName(), event.domain(), event.title(), event.revision());
            }
            // Now we have an event that we want to process
            foundSomething = true;
            topicCounts.getAndIncrement(record.topic());
            // Only the reporting topic advances the next start time; mixing timestamps
            // from different topics would make the clock very chaotic, jumping back and forth.
            if (topic.endsWith(reportingTopic)) {
                nextInstant = Utils.max(nextInstant, Instant.ofEpochMilli(record.timestamp()));
            }
            // Using offset here as RC id since we do not have real RC id (this not being RC poller) but
            // the offset serves the same function in Kafka and is also useful for debugging.
            Change change = makeChange(event, record.offset());
            Change dupe = changesByTitle.put(change.entityId(), change);
            // If two events share an entity id, keep the one with the higher revision;
            // a delete (NO_REVISION) beats any revision. This is not a big deal since
            // deletes are relatively rare.
            if (dupe != null && change.revision() > Change.NO_REVISION && (dupe.revision() > change.revision() || dupe.revision() == Change.NO_REVISION)) {
                // Remove before re-inserting so that iteration order stays correct
                changesByTitle.remove(change.entityId());
                changesByTitle.put(change.entityId(), dupe);
            }
        }
        log.debug("{} records left after filtering", changesByTitle.size());
        if (changesByTitle.size() >= batchSize) {
            // We have enough for the batch
            break;
        }
        if (changesByTitle.size() > 0 && !foundSomething) {
            log.info("Did not find anything useful in this batch, returning existing data");
            // We have data and the last poll found nothing new, so return it rather than wait for more.
            break;
        }
        // TODO: if we already have something and we've spent more than X seconds in the loop,
        // we probably should return without waiting for more
    }
    // If we didn't get anything useful in the reporting topic, keep the old value
    if (nextInstant.equals(Instant.EPOCH)) {
        nextInstant = lastNextStartTime;
    }
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    log.info("Found {} changes", changes.size());
    if (log.isDebugEnabled()) {
        topicCounts.asMap().forEach((k, v) -> log.debug("Topic {}: {} records", k, v));
    }
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextInstant);
    // Report the left-off time one second back so we are sure we got the whole second
    return new Batch(changes, advanced, nextInstant.minusSeconds(1).toString(), nextInstant, batchOffsets);
}
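
Each iteration of the loop above starts by calling commitPendingOffsets(), whose body is not part of this example. The sketch below shows what such a step can look like with the standard Kafka consumer API; the class and field names (OffsetCommitter, pendingOffsets) are illustrative stand-ins, not the project's actual code.

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;

// Hypothetical sketch: commit the offsets of the previous batch once it has been
// handed off, so a crash re-fetches records instead of skipping them.
class OffsetCommitter {
    private final Consumer<String, ?> consumer;
    private final Map<TopicPartition, OffsetAndMetadata> pendingOffsets = new HashMap<>();

    OffsetCommitter(Consumer<String, ?> consumer) {
        this.consumer = consumer;
    }

    // Record the last processed offset for a partition. By Kafka convention the
    // committed offset is the position of the *next* record to read, hence the +1.
    void recordOffset(TopicPartition partition, long lastProcessedOffset) {
        pendingOffsets.put(partition, new OffsetAndMetadata(lastProcessedOffset + 1));
    }

    void commitPendingOffsets() {
        if (pendingOffsets.isEmpty()) {
            return;
        }
        consumer.commitSync(pendingOffsets); // blocks until the broker acknowledges
        pendingOffsets.clear();
    }
}

Committing at the top of the next iteration, rather than immediately after poll(), appears deliberate: offsets are persisted only once the previous batch has been handed off downstream.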
Also used:

Context (com.codahale.metrics.Timer.Context)
HashMap (java.util.HashMap)
LinkedHashMap (java.util.LinkedHashMap)
Instant (java.time.Instant)
InterruptException (org.apache.kafka.common.errors.InterruptException)
WakeupException (org.apache.kafka.common.errors.WakeupException)
RetryableException (org.wikidata.query.rdf.tool.exception.RetryableException)
ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent)
TopicPartition (org.apache.kafka.common.TopicPartition)
OffsetAndMetadata (org.apache.kafka.clients.consumer.OffsetAndMetadata)
RevisionCreateEvent (org.wikidata.query.rdf.tool.change.events.RevisionCreateEvent)
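
Back in fetch(), the duplicate-resolution branch is compact enough to misread: when two events share an entity id, the higher revision wins, except that a delete (NO_REVISION) beats any revision. The following self-contained sketch pulls that rule out on its own; the Change record here is a hypothetical stand-in for the project's class, reduced to the two fields the rule needs.

import java.util.LinkedHashMap;
import java.util.Map;

// Standalone illustration of the duplicate-resolution rule in fetch().
final class DedupSketch {
    static final long NO_REVISION = -1;

    record Change(String entityId, long revision) { }

    static void putChange(Map<String, Change> changesByTitle, Change change) {
        Change dupe = changesByTitle.put(change.entityId(), change);
        if (dupe != null && change.revision() > NO_REVISION
                && (dupe.revision() > change.revision() || dupe.revision() == NO_REVISION)) {
            // The earlier event wins; remove before re-inserting so the survivor
            // takes the newest position in the LinkedHashMap.
            changesByTitle.remove(change.entityId());
            changesByTitle.put(change.entityId(), dupe);
        }
    }

    public static void main(String[] args) {
        Map<String, Change> byTitle = new LinkedHashMap<>();
        putChange(byTitle, new Change("Q42", 100));
        putChange(byTitle, new Change("Q42", 99));          // older revision loses
        putChange(byTitle, new Change("Q42", NO_REVISION)); // delete beats any revision
        System.out.println(byTitle); // {Q42=Change[entityId=Q42, revision=-1]}
    }
}

The remove-then-put shuffle matters because LinkedHashMap keeps a key's original position on a plain put(); removing first makes the surviving change occupy the slot of the newest record, which is what the iteration-order comment in the method is about.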

Aggregations

Context (com.codahale.metrics.Timer.Context): 1
Instant (java.time.Instant): 1
HashMap (java.util.HashMap): 1
LinkedHashMap (java.util.LinkedHashMap): 1
OffsetAndMetadata (org.apache.kafka.clients.consumer.OffsetAndMetadata): 1
TopicPartition (org.apache.kafka.common.TopicPartition): 1
InterruptException (org.apache.kafka.common.errors.InterruptException): 1
WakeupException (org.apache.kafka.common.errors.WakeupException): 1
ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent): 1
RevisionCreateEvent (org.wikidata.query.rdf.tool.change.events.RevisionCreateEvent): 1
RetryableException (org.wikidata.query.rdf.tool.exception.RetryableException): 1