Example 1 with ChangeEvent

Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.

The class KafkaPoller, method fetch.

/**
 * Fetch changes from Kafka.
 * @param lastNextStartTime where the last fetch ended up.
 * @return Set of changes.
 * @throws RetryableException if polling Kafka fails and the fetch should be retried.
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch fetch(Instant lastNextStartTime) throws RetryableException {
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    ConsumerRecords<String, ChangeEvent> records;
    Instant nextInstant = Instant.EPOCH;
    AtomicLongMap<String> topicCounts = AtomicLongMap.create();
    Map<TopicPartition, OffsetAndMetadata> batchOffsets = new HashMap<>();
    while (true) {
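        // Commit any offsets left pending from batches that have already been processed.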
        commitPendindOffsets();
        try (Context timerContext = pollingTimer.time()) {
            // TODO: make timeout configurable? Wait for a bit so we catch bursts of messages?
            records = consumer.poll(1000);
        } catch (InterruptException | WakeupException e) {
            throw new RetryableException("Error fetching recent changes", e);
        }
        int count = records.count();
        log.debug("Fetched {} records from Kafka", count);
        changesCounter.inc(count);
        if (count == 0) {
            // If we got nothing from Kafka, get out of the loop and return what we have
            break;
        }
        boolean foundSomething = false;
        for (ConsumerRecord<String, ChangeEvent> record : records) {
            ChangeEvent event = record.value();
            String topic = record.topic();
            batchOffsets.put(new TopicPartition(topic, record.partition()), new OffsetAndMetadata(record.offset()));
            log.trace("Got event t:{} o:{}", topic, record.offset());
            if (!event.domain().equals(uris.getHost())) {
                // wrong domain, ignore
                continue;
            }
            // check namespace
            if (!uris.isEntityNamespace(event.namespace())) {
                continue;
            }
            if (!(event instanceof RevisionCreateEvent)) {
                log.info("Got non revision create event class:{}, domain:{}, t:{}, revision:{}",
                        event.getClass().getSimpleName(), event.domain(), event.title(), event.revision());
            }
            // Now we have event that we want to process
            foundSomething = true;
            topicCounts.getAndIncrement(topic);
            // Timestamps across topics can be very chaotic, jumping back and forth,
            // so track the maximum seen on the reporting topic as the next start time.
            if (topic.endsWith(reportingTopic)) {
                nextInstant = Utils.max(nextInstant, Instant.ofEpochMilli(record.timestamp()));
            }
            // Using offset here as RC id since we do not have real RC id (this not being RC poller) but
            // the offset serves the same function in Kafka and is also useful for debugging.
            Change change = makeChange(event, record.offset());
            Change dupe = changesByTitle.put(change.entityId(), change);
            // If there is a duplicate for this title, keep the newer revision; a dupe
            // with NO_REVISION is probably a delete and must win so it still gets
            // processed. Deletes are relatively rare, so this seldom triggers.
            if (dupe != null && change.revision() > Change.NO_REVISION
                    && (dupe.revision() > change.revision() || dupe.revision() == Change.NO_REVISION)) {
                // Remove and re-insert so the insertion order stays correct.
                changesByTitle.remove(change.entityId());
                changesByTitle.put(change.entityId(), dupe);
            }
        }
        log.debug("{} records left after filtering", changesByTitle.size());
        if (changesByTitle.size() >= batchSize) {
            // We have enough for the batch
            break;
        }
        if (changesByTitle.size() > 0 && !foundSomething) {
            // We already have changes and this poll brought nothing useful,
            // so return what we have rather than waiting for more.
            log.info("Did not find anything useful in this batch, returning existing data");
            break;
        }
        // TODO: if we already have something and we've spent more than X seconds
        // in the loop, we probably should return without waiting for more
    }
    // If we didn't get anything useful in the reporting topic, keep the old value
    if (nextInstant.equals(Instant.EPOCH)) {
        nextInstant = lastNextStartTime;
    }
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    log.info("Found {} changes", changes.size());
    if (log.isDebugEnabled()) {
        topicCounts.asMap().forEach((k, v) -> log.debug("Topic {}: {} records", k, v));
    }
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextInstant);
    // back off one second to be sure we cover the whole second
    return new Batch(changes, advanced, nextInstant.minusSeconds(1).toString(), nextInstant, batchOffsets);
}
Also used: Context (com.codahale.metrics.Timer.Context), HashMap (java.util.HashMap), LinkedHashMap (java.util.LinkedHashMap), Instant (java.time.Instant), InterruptException (org.apache.kafka.common.errors.InterruptException), WakeupException (org.apache.kafka.common.errors.WakeupException), RetryableException (org.wikidata.query.rdf.tool.exception.RetryableException), ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent), TopicPartition (org.apache.kafka.common.TopicPartition), OffsetAndMetadata (org.apache.kafka.clients.consumer.OffsetAndMetadata), RevisionCreateEvent (org.wikidata.query.rdf.tool.change.events.RevisionCreateEvent)
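
The subtlest part of fetch is the duplicate-resolution rule around changesByTitle. Below is a minimal, self-contained sketch of just that rule; Rev is a hypothetical stand-in for Change, and NO_REVISION mirrors Change.NO_REVISION.

import java.util.LinkedHashMap;
import java.util.Map;

// Self-contained sketch of the duplicate-resolution rule in fetch() above.
// "Rev" is a hypothetical stand-in for Change; NO_REVISION marks a delete.
public class DedupSketch {
    static final long NO_REVISION = -1;

    static final class Rev {
        final String entityId;
        final long revision;
        Rev(String entityId, long revision) {
            this.entityId = entityId;
            this.revision = revision;
        }
        @Override
        public String toString() {
            return entityId + "@" + revision;
        }
    }

    static void put(Map<String, Rev> byTitle, Rev change) {
        Rev dupe = byTitle.put(change.entityId, change);
        // Keep the newer revision; a NO_REVISION dupe is probably a delete
        // and must win so it still gets processed.
        if (dupe != null && change.revision > NO_REVISION
                && (dupe.revision > change.revision || dupe.revision == NO_REVISION)) {
            // Remove and re-insert so LinkedHashMap order reflects the kept event.
            byTitle.remove(change.entityId);
            byTitle.put(change.entityId, dupe);
        }
    }

    public static void main(String[] args) {
        Map<String, Rev> byTitle = new LinkedHashMap<>();
        put(byTitle, new Rev("Q1", 5));
        put(byTitle, new Rev("Q1", 3));           // older revision loses
        put(byTitle, new Rev("Q2", NO_REVISION)); // delete
        put(byTitle, new Rev("Q2", 7));           // delete still wins
        System.out.println(byTitle.values());     // prints [Q1@5, Q2@-1]
    }
}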

Example 2 with ChangeEvent

Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.

The class KafkaPollerUnitTest, method multiPolls2.

@Test
public void multiPolls2() throws RetryableException {
    KafkaPoller poller = makePoller();
    ConsumerRecords<String, ChangeEvent> rs1 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(20), 1, 5, "Q123"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    ConsumerRecords<String, ChangeEvent> rs2 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 4, 1, "Q234"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    ConsumerRecords<String, ChangeEvent> rs3 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 4, 2, "Q234"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 10, "Q123"), "othertopic", Duration.ofMillis(31)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 5, 21, "Q245"), "topictest", Duration.ofMillis(40)));
    when(consumer.poll(anyLong())).thenReturn(rs1, rs2, rs3, EMPTY_CHANGES);
    Batch batch = poller.firstBatch();
    // If all three had good events, all three should be in the batch
    assertThat(batch.changes())
            .hasSize(3)
            .anyMatch(titleRevision("Q123", 10))
            .anyMatch(titleRevision("Q234", 2))
            .anyMatch(titleRevision("Q245", 21));
}
Also used: ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent), Batch (org.wikidata.query.rdf.tool.change.KafkaPoller.Batch), Test (org.junit.Test)
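
The title and titleRevision matchers are test helpers whose bodies are not shown on this page. A plausible sketch, assuming Change exposes the entityId() and revision() accessors used in Example 1:

import java.util.function.Predicate;

import org.wikidata.query.rdf.tool.change.Change;

// Hedged sketch of the test matchers; the real helpers live in
// KafkaPollerUnitTest and may differ in detail.
final class ChangeMatchers {
    static Predicate<Change> title(String entityId) {
        return change -> change.entityId().equals(entityId);
    }

    static Predicate<Change> titleRevision(String entityId, long revision) {
        return title(entityId).and(change -> change.revision() == revision);
    }
}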

Example 3 with ChangeEvent

Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.

The class KafkaPollerUnitTest, method batchSize.

@Test
public void batchSize() throws RetryableException {
    KafkaPoller poller = makePoller();
    ConsumerRecords<String, ChangeEvent> rs1 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(20), 1, 5, "Q1"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(20), 4, 5, "Q2"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(20), 5, 5, "Q3"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(20), 6, 5, "Q4"), "topictest", Duration.ofMillis(20)));
    ConsumerRecords<String, ChangeEvent> rs2 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 5, 10, "Q3"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 20, "Q1"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 100, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 4, 20, "Q2"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 20, "Q1"), "othertopic", Duration.ofMillis(21)));
    ConsumerRecords<String, ChangeEvent> rs3 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 5, 100, "Q3"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 200, "Q1"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 100, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 7, 200, "Q5"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 8, 200, "Q6"), "othertopic", Duration.ofMillis(21)));
    ConsumerRecords<String, ChangeEvent> rs4 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 9, 2, "Q7"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    when(consumer.poll(anyLong())).thenReturn(rs1, rs2, rs3, rs4, EMPTY_CHANGES);
    Batch batch = poller.firstBatch();
    // The batch should stop as soon as it reaches size 5; since the size check runs
    // after each whole poll, it ends up with 6 changes and Q7 from rs4 is left out
    assertThat(batch.changes())
            .hasSize(6)
            .anyMatch(titleRevision("Q1", 200))
            .anyMatch(titleRevision("Q2", 20))
            .anyMatch(titleRevision("Q3", 100))
            .anyMatch(titleRevision("Q4", 5))
            .anyMatch(titleRevision("Q5", 200))
            .anyMatch(titleRevision("Q6", 200))
            .noneMatch(title("Q7"));
}
Also used: ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent), Batch (org.wikidata.query.rdf.tool.change.KafkaPoller.Batch), Test (org.junit.Test)
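
Note that the assertion expects six changes against a batch size of five: fetch() checks the size only after folding in a whole poll, so a batch can overshoot. A toy sketch of that loop shape, with hypothetical poll contents:

import java.util.ArrayList;
import java.util.List;

// Toy sketch of the batch-size cutoff: the size check happens per poll,
// not per record, so the batch can grow past batchSize (here 5 -> 6).
public class BatchCutoffSketch {
    public static void main(String[] args) {
        int batchSize = 5;
        int[][] polls = {{1, 2, 3}, {4}, {5, 6}, {7}}; // record ids per poll (hypothetical)
        List<Integer> batch = new ArrayList<>();
        for (int[] poll : polls) {
            for (int record : poll) {
                batch.add(record);
            }
            if (batch.size() >= batchSize) {
                break; // checked only after the whole poll is consumed
            }
        }
        System.out.println(batch); // prints [1, 2, 3, 4, 5, 6]; 7 is never taken
    }
}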

Example 4 with ChangeEvent

Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.

The class KafkaPollerUnitTest, method filterOtherChanges.

@Test
public void filterOtherChanges() throws RetryableException {
    ConsumerRecords<String, ChangeEvent> rs = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(20), 1, 5, "Q123"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    Batch batch = getBatchFromRecords(rs);
    // Only Q123 should survive: Q666 is in a non-entity namespace and Q6666 comes from the wrong domain
    assertThat(batch.changes()).hasSize(1).anyMatch(title("Q123"));
}
Also used: ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent), Batch (org.wikidata.query.rdf.tool.change.KafkaPoller.Batch), Test (org.junit.Test)
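
The filtering this test exercises corresponds to the two guards at the top of fetch()'s record loop in Example 1. A minimal sketch of the same predicate; the local Uris interface is an assumption that captures only the two calls made there:

import org.wikidata.query.rdf.tool.change.events.ChangeEvent;

// Sketch of the per-record guards from fetch(): drop events from other
// domains and from non-entity namespaces. "Uris" here mirrors the
// collaborator used in Example 1; only these two methods are assumed.
final class EventFilter {
    interface Uris {
        String getHost();
        boolean isEntityNamespace(long namespace);
    }

    private final Uris uris;

    EventFilter(Uris uris) {
        this.uris = uris;
    }

    boolean accept(ChangeEvent event) {
        return event.domain().equals(uris.getHost())
                && uris.isEntityNamespace(event.namespace());
    }
}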

Example 5 with ChangeEvent

Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.

The class KafkaPollerUnitTest, method writeOffsets.

@Test
public void writeOffsets() throws RetryableException {
    // Scenario where all offsets are loaded from both storage and timestamp
    Collection<String> topics = ImmutableList.of("topictest", "othertopic", "thirdtopic");
    KafkaOffsetsRepository offsetsRepository = mock(KafkaOffsetsRepository.class);
    Map<TopicPartition, List<ConsumerRecord<String, ChangeEvent>>> records = new HashMap<>();
    records.put(new TopicPartition("topictest", 0), Arrays.asList(new ConsumerRecord<>("topictest", 0, 2L, "1", newChange("Q1")), new ConsumerRecord<>("topictest", 0, 2L, "4", newChange("Q4"))));
    records.put(new TopicPartition("othertopic", 0), Arrays.asList(new ConsumerRecord<>("othertopic", 0, 2L, "2", newChange("Q2")), new ConsumerRecord<>("othertopic", 0, 3L, "5", newChange("Q5"))));
    records.put(new TopicPartition("thirdtopic", 0), singletonList(new ConsumerRecord<>("thirdtopic", 0, 2L, "3", newChange("Q3"))));
    createTopicPartitions(1);
    when(offsetsRepository.load(any())).thenReturn(ImmutableMap.of());
    when(consumer.poll(anyLong())).thenReturn(new ConsumerRecords<>(records));
    ArgumentCaptor<Map<TopicPartition, OffsetAndMetadata>> storeCaptor = ArgumentCaptor.forClass((Class) Map.class);
    ArgumentCaptor<Map<TopicPartition, OffsetAndMetadata>> kafkaAsyncStoreCaptor = ArgumentCaptor.forClass((Class) Map.class);
    ArgumentCaptor<Map<TopicPartition, OffsetAndMetadata>> kafkaSyncStoreCaptor = ArgumentCaptor.forClass((Class) Map.class);
    doNothing().when(offsetsRepository).store(storeCaptor.capture());
    doNothing().when(consumer).commitAsync(kafkaAsyncStoreCaptor.capture(), any());
    doNothing().when(consumer).commitSync(kafkaSyncStoreCaptor.capture());
    KafkaPoller poller = new KafkaPoller(consumer, uris, START_TIME, BATCH_SIZE, topics, offsetsRepository, true, new MetricRegistry());
    Batch batch = poller.firstBatch();
    poller.done(batch);
    // Should be one update query
    verify(offsetsRepository, times(1)).store(any());
    assertThat(storeCaptor.getValue())
            .containsEntry(new TopicPartition("topictest", 0), new OffsetAndMetadata(2L))
            .containsEntry(new TopicPartition("othertopic", 0), new OffsetAndMetadata(3L))
            .containsEntry(new TopicPartition("thirdtopic", 0), new OffsetAndMetadata(2L));
    poller.nextBatch(batch);
    assertThat(kafkaAsyncStoreCaptor.getValue())
            .containsEntry(new TopicPartition("topictest", 0), new OffsetAndMetadata(2L))
            .containsEntry(new TopicPartition("othertopic", 0), new OffsetAndMetadata(3L))
            .containsEntry(new TopicPartition("thirdtopic", 0), new OffsetAndMetadata(2L));
    poller.done(batch);
    // Verify that the last offsets are committed synchronously when closing
    poller.close();
    assertThat(kafkaSyncStoreCaptor.getValue())
            .containsEntry(new TopicPartition("topictest", 0), new OffsetAndMetadata(2L))
            .containsEntry(new TopicPartition("othertopic", 0), new OffsetAndMetadata(3L))
            .containsEntry(new TopicPartition("thirdtopic", 0), new OffsetAndMetadata(2L));
}
Also used: HashMap (java.util.HashMap), MetricRegistry (com.codahale.metrics.MetricRegistry), ConsumerRecord (org.apache.kafka.clients.consumer.ConsumerRecord), ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent), Batch (org.wikidata.query.rdf.tool.change.KafkaPoller.Batch), TopicPartition (org.apache.kafka.common.TopicPartition), OffsetAndMetadata (org.apache.kafka.clients.consumer.OffsetAndMetadata), Collections.singletonList (java.util.Collections.singletonList), List (java.util.List), ArrayList (java.util.ArrayList), ImmutableList (com.google.common.collect.ImmutableList), Collectors.toList (java.util.stream.Collectors.toList), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), Test (org.junit.Test)
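
The commit sequence verified here, asynchronous commits between batches and one synchronous commit on close, is the standard Kafka consumer discipline. A standalone sketch of that pattern with a plain KafkaConsumer, independent of the poller classes:

import java.time.Duration;
import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;

// Sketch of the commit discipline the test verifies: commitAsync on the
// hot path, one blocking commitSync on shutdown so no progress is lost.
final class CommitLoop {
    static void run(KafkaConsumer<String, String> consumer) {
        Map<TopicPartition, OffsetAndMetadata> offsets = new HashMap<>();
        try {
            while (!Thread.currentThread().isInterrupted()) {
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1));
                records.forEach(r -> offsets.put(
                        new TopicPartition(r.topic(), r.partition()),
                        // offset + 1 = the next record to read, per Kafka convention
                        new OffsetAndMetadata(r.offset() + 1)));
                // Cheap, non-blocking commit between batches.
                consumer.commitAsync(offsets, null);
            }
        } finally {
            // Blocking commit on the way out mirrors poller.close() above.
            consumer.commitSync(offsets);
            consumer.close();
        }
    }
}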

Aggregations

ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent): 12
Test (org.junit.Test): 9
Batch (org.wikidata.query.rdf.tool.change.KafkaPoller.Batch): 9
MetricRegistry (com.codahale.metrics.MetricRegistry): 2
HashMap (java.util.HashMap): 2
OffsetAndMetadata (org.apache.kafka.clients.consumer.OffsetAndMetadata): 2
TopicPartition (org.apache.kafka.common.TopicPartition): 2
Context (com.codahale.metrics.Timer.Context): 1
ImmutableList (com.google.common.collect.ImmutableList): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
URI (java.net.URI): 1
URISyntaxException (java.net.URISyntaxException): 1
Instant (java.time.Instant): 1
ArrayList (java.util.ArrayList): 1
Collections.singletonList (java.util.Collections.singletonList): 1
LinkedHashMap (java.util.LinkedHashMap): 1
List (java.util.List): 1
Map (java.util.Map): 1
Collectors.toList (java.util.stream.Collectors.toList): 1
ConsumerRecord (org.apache.kafka.clients.consumer.ConsumerRecord): 1