Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.
The class KafkaPoller, method fetch.
/**
 * Fetch changes from Kafka.
 * @param lastNextStartTime the timestamp where the last fetch ended up.
 * @return the batch of fetched changes.
 * @throws RetryableException if polling Kafka fails in a way that can be retried.
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch fetch(Instant lastNextStartTime) throws RetryableException {
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    ConsumerRecords<String, ChangeEvent> records;
    Instant nextInstant = Instant.EPOCH;
    AtomicLongMap<String> topicCounts = AtomicLongMap.create();
    Map<TopicPartition, OffsetAndMetadata> batchOffsets = new HashMap<>();
    while (true) {
        commitPendindOffsets();
        try (Context timerContext = pollingTimer.time()) {
            // TODO: make timeout configurable? Wait for a bit so we catch bursts of messages?
            records = consumer.poll(1000);
        } catch (InterruptException | WakeupException e) {
            throw new RetryableException("Error fetching recent changes", e);
        }
        int count = records.count();
        log.debug("Fetched {} records from Kafka", count);
        changesCounter.inc(count);
        if (count == 0) {
            // If we got nothing from Kafka, get out of the loop and return what we have
            break;
        }
        boolean foundSomething = false;
        for (ConsumerRecord<String, ChangeEvent> record : records) {
            ChangeEvent event = record.value();
            String topic = record.topic();
            batchOffsets.put(new TopicPartition(record.topic(), record.partition()), new OffsetAndMetadata(record.offset()));
            log.trace("Got event t:{} o:{}", record.topic(), record.offset());
            if (!event.domain().equals(uris.getHost())) {
                // wrong domain, ignore
                continue;
            }
            // check namespace
            if (!uris.isEntityNamespace(event.namespace())) {
                continue;
            }
            if (!(event instanceof RevisionCreateEvent)) {
                log.info("Got non revision create event class:{}, domain:{}, t:{}, revision:{}",
                        event.getClass().getSimpleName(), event.domain(), event.title(), event.revision());
            }
            // Now we have an event that we want to process
            foundSomething = true;
            topicCounts.getAndIncrement(record.topic());
            // Remember the latest timestamp seen on the reporting topic; it becomes the start time of the next fetch
            if (topic.endsWith(reportingTopic)) {
                nextInstant = Utils.max(nextInstant, Instant.ofEpochMilli(record.timestamp()));
            }
            // Using the offset here as RC id since we do not have a real RC id (this not being the RC poller), but
            // the offset serves the same function in Kafka and is also useful for debugging.
            Change change = makeChange(event, record.offset());
            Change dupe = changesByTitle.put(change.entityId(), change);
            // If we already saw a change for this entity, keep the one with the higher revision;
            // a change without a revision (e.g. a delete) wins over any revision. Getting this
            // occasionally wrong is not a big deal since deletes are relatively rare.
            if (dupe != null && change.revision() > Change.NO_REVISION && (dupe.revision() > change.revision() || dupe.revision() == Change.NO_REVISION)) {
                // need to remove and re-add so that the map order stays correct
                changesByTitle.remove(change.entityId());
                changesByTitle.put(change.entityId(), dupe);
            }
        }
        log.debug("{} records left after filtering", changesByTitle.size());
        if (changesByTitle.size() >= batchSize) {
            // We have enough for the batch
            break;
        }
        if (changesByTitle.size() > 0 && !foundSomething) {
            log.info("Did not find anything useful in this batch, returning existing data");
            // Return what we have instead of waiting for more
            break;
        }
        // TODO: if we already have something and we've spent more than X seconds in the loop,
        // we probably should return without waiting for more
    }
    // If we didn't get anything useful in the reporting topic, keep the old value
    if (nextInstant.equals(Instant.EPOCH)) {
        nextInstant = lastNextStartTime;
    }
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    log.info("Found {} changes", changes.size());
    if (log.isDebugEnabled()) {
        topicCounts.asMap().forEach((k, v) -> log.debug("Topic {}: {} records", k, v));
    }
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextInstant);
    // Back off by one second to be sure we got the whole last second
    return new Batch(changes, advanced, nextInstant.minusSeconds(1).toString(), nextInstant, batchOffsets);
}
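The duplicate-handling branch near the end of the loop above is the subtlest part of fetch. The following stand-alone sketch isolates just that rule; the Change record and NO_REVISION constant here are simplified stand-ins for the project's own types, not the real ones. The rule: when two events arrive for the same entity, the higher revision wins, and a delete, which carries no revision, wins over any revision.

// Minimal sketch (stand-in types, not project code) of the duplicate-resolution rule used in fetch().
import java.util.LinkedHashMap;
import java.util.Map;

public class DedupSketch {

    static final long NO_REVISION = -1;

    // Hypothetical stand-in for the poller's Change type.
    record Change(String entityId, long revision) { }

    static void putChange(Map<String, Change> changesByTitle, Change change) {
        Change dupe = changesByTitle.put(change.entityId(), change);
        if (dupe != null && change.revision() > NO_REVISION
                && (dupe.revision() > change.revision() || dupe.revision() == NO_REVISION)) {
            // Keep the duplicate instead, re-inserting it so the map position reflects the latest event.
            changesByTitle.remove(change.entityId());
            changesByTitle.put(change.entityId(), dupe);
        }
    }

    public static void main(String[] args) {
        Map<String, Change> changes = new LinkedHashMap<>();
        putChange(changes, new Change("Q123", 5));
        putChange(changes, new Change("Q123", 10));          // newer revision wins
        putChange(changes, new Change("Q123", NO_REVISION)); // a delete wins over any revision
        System.out.println(changes.get("Q123").revision());  // prints -1
    }
}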
Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.
The class KafkaPollerUnitTest, method multiPolls2.
@Test
public void multiPolls2() throws RetryableException {
    KafkaPoller poller = makePoller();
    ConsumerRecords<String, ChangeEvent> rs1 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(20), 1, 5, "Q123"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    ConsumerRecords<String, ChangeEvent> rs2 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 4, 1, "Q234"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    ConsumerRecords<String, ChangeEvent> rs3 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 4, 2, "Q234"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 10, "Q123"), "othertopic", Duration.ofMillis(31)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 5, 21, "Q245"), "topictest", Duration.ofMillis(40)));
    when(consumer.poll(anyLong())).thenReturn(rs1, rs2, rs3, EMPTY_CHANGES);
    Batch batch = poller.firstBatch();
    // All three polls contained usable events, so the batch should contain the latest revision of each of the three titles
    assertThat(batch.changes()).hasSize(3)
            .anyMatch(titleRevision("Q123", 10))
            .anyMatch(titleRevision("Q234", 2))
            .anyMatch(titleRevision("Q245", 21));
}
Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.
The class KafkaPollerUnitTest, method batchSize.
@Test
public void batchSize() throws RetryableException {
    KafkaPoller poller = makePoller();
    ConsumerRecords<String, ChangeEvent> rs1 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(20), 1, 5, "Q1"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(20), 4, 5, "Q2"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(20), 5, 5, "Q3"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(20), 6, 5, "Q4"), "topictest", Duration.ofMillis(20)));
    ConsumerRecords<String, ChangeEvent> rs2 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 5, 10, "Q3"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 20, "Q1"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 100, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 4, 20, "Q2"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 20, "Q1"), "othertopic", Duration.ofMillis(21)));
    ConsumerRecords<String, ChangeEvent> rs3 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 5, 100, "Q3"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 1, 200, "Q1"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 100, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 7, 200, "Q5"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 8, 200, "Q6"), "othertopic", Duration.ofMillis(21)));
    ConsumerRecords<String, ChangeEvent> rs4 = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(30), 9, 2, "Q7"), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    when(consumer.poll(anyLong())).thenReturn(rs1, rs2, rs3, rs4, EMPTY_CHANGES);
    Batch batch = poller.firstBatch();
    // Polling stops once the batch size (5) is reached, but only after the current poll is fully consumed,
    // so the third poll pushes the batch to 6 changes and rs4 (with Q7) is never fetched
    assertThat(batch.changes()).hasSize(6)
            .anyMatch(titleRevision("Q1", 200))
            .anyMatch(titleRevision("Q2", 20))
            .anyMatch(titleRevision("Q3", 100))
            .anyMatch(titleRevision("Q4", 5))
            .anyMatch(titleRevision("Q5", 200))
            .anyMatch(titleRevision("Q6", 200))
            .noneMatch(title("Q7"));
}
Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.
The class KafkaPollerUnitTest, method filterOtherChanges.
@Test
public void filterOtherChanges() throws RetryableException {
    ConsumerRecords<String, ChangeEvent> rs = makeRecords(
            makeRecord(makeRCEvent(Duration.ofMillis(20), 1, 5, "Q123"), "topictest", Duration.ofMillis(20)),
            makeRecord(makeRCEvent(Duration.ofMillis(30), 2, 2, "Q666", 1, DOMAIN), "othertopic", Duration.ofMillis(21)),
            makeRecord(makeRCEvent(Duration.ofMillis(25), 3, 10, "Q6666", 0, "acme.wrong"), "topictest", Duration.ofMillis(20)));
    Batch batch = getBatchFromRecords(rs);
    // Only the Q123 event matches the configured domain and entity namespace, so it is the only change left
    assertThat(batch.changes()).hasSize(1).anyMatch(title("Q123"));
}
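The three records above exercise the same domain and namespace filter that fetch applies before building changes. The snippet below is a hypothetical reduction of that filter to a single predicate; the host value "acme.example" and the namespace set are placeholders, not the test's actual DOMAIN constant or Uris configuration.

// Hypothetical sketch of the domain + namespace filter applied in KafkaPoller.fetch();
// all names and values here are placeholders, not project configuration.
import java.util.Set;

final class EventFilterSketch {

    private final String host;                 // stands in for uris.getHost()
    private final Set<Long> entityNamespaces;  // stands in for uris.isEntityNamespace()

    EventFilterSketch(String host, Set<Long> entityNamespaces) {
        this.host = host;
        this.entityNamespaces = entityNamespaces;
    }

    boolean accept(String domain, long namespace) {
        // An event is kept only if it comes from the configured wiki and targets an entity namespace.
        return host.equals(domain) && entityNamespaces.contains(namespace);
    }

    public static void main(String[] args) {
        EventFilterSketch filter = new EventFilterSketch("acme.example", Set.of(0L));
        System.out.println(filter.accept("acme.example", 0)); // true: kept, like the Q123 event
        System.out.println(filter.accept("acme.example", 1)); // false: wrong namespace, like the Q666 event
        System.out.println(filter.accept("acme.wrong", 0));   // false: wrong domain, like the Q6666 event
    }
}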
Use of org.wikidata.query.rdf.tool.change.events.ChangeEvent in project wikidata-query-rdf by wikimedia.
The class KafkaPollerUnitTest, method writeOffsets.
@Test
public void writeOffsets() throws RetryableException {
    // Check that offsets captured while building a batch are stored and committed as the poller advances and closes
    Collection<String> topics = ImmutableList.of("topictest", "othertopic", "thirdtopic");
    KafkaOffsetsRepository offsetsRepository = mock(KafkaOffsetsRepository.class);
    Map<TopicPartition, List<ConsumerRecord<String, ChangeEvent>>> records = new HashMap<>();
    records.put(new TopicPartition("topictest", 0), Arrays.asList(
            new ConsumerRecord<>("topictest", 0, 2L, "1", newChange("Q1")),
            new ConsumerRecord<>("topictest", 0, 2L, "4", newChange("Q4"))));
    records.put(new TopicPartition("othertopic", 0), Arrays.asList(
            new ConsumerRecord<>("othertopic", 0, 2L, "2", newChange("Q2")),
            new ConsumerRecord<>("othertopic", 0, 3L, "5", newChange("Q5"))));
    records.put(new TopicPartition("thirdtopic", 0), singletonList(
            new ConsumerRecord<>("thirdtopic", 0, 2L, "3", newChange("Q3"))));
    createTopicPartitions(1);
    when(offsetsRepository.load(any())).thenReturn(ImmutableMap.of());
    when(consumer.poll(anyLong())).thenReturn(new ConsumerRecords<>(records));
    ArgumentCaptor<Map<TopicPartition, OffsetAndMetadata>> storeCaptor = ArgumentCaptor.forClass((Class) Map.class);
    ArgumentCaptor<Map<TopicPartition, OffsetAndMetadata>> kafkaAsyncStoreCaptor = ArgumentCaptor.forClass((Class) Map.class);
    ArgumentCaptor<Map<TopicPartition, OffsetAndMetadata>> kafkaSyncStoreCaptor = ArgumentCaptor.forClass((Class) Map.class);
    doNothing().when(offsetsRepository).store(storeCaptor.capture());
    doNothing().when(consumer).commitAsync(kafkaAsyncStoreCaptor.capture(), any());
    doNothing().when(consumer).commitSync(kafkaSyncStoreCaptor.capture());
    KafkaPoller poller = new KafkaPoller(consumer, uris, START_TIME, BATCH_SIZE, topics, offsetsRepository, true, new MetricRegistry());
    Batch batch = poller.firstBatch();
    poller.done(batch);
    // The offsets repository should be asked to store the offsets exactly once
    verify(offsetsRepository, times(1)).store(any());
    assertThat(storeCaptor.getValue())
            .containsEntry(new TopicPartition("topictest", 0), new OffsetAndMetadata(2L))
            .containsEntry(new TopicPartition("othertopic", 0), new OffsetAndMetadata(3L))
            .containsEntry(new TopicPartition("thirdtopic", 0), new OffsetAndMetadata(2L));
    poller.nextBatch(batch);
    // Offsets for the finished batch are committed to Kafka asynchronously when the next batch starts
    assertThat(kafkaAsyncStoreCaptor.getValue())
            .containsEntry(new TopicPartition("topictest", 0), new OffsetAndMetadata(2L))
            .containsEntry(new TopicPartition("othertopic", 0), new OffsetAndMetadata(3L))
            .containsEntry(new TopicPartition("thirdtopic", 0), new OffsetAndMetadata(2L));
    poller.done(batch);
    // Verify that the last offsets are committed synchronously when closing
    poller.close();
    assertThat(kafkaSyncStoreCaptor.getValue())
            .containsEntry(new TopicPartition("topictest", 0), new OffsetAndMetadata(2L))
            .containsEntry(new TopicPartition("othertopic", 0), new OffsetAndMetadata(3L))
            .containsEntry(new TopicPartition("thirdtopic", 0), new OffsetAndMetadata(2L));
}
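writeOffsets pins down when offsets reach Kafka: they are stored in the offsets repository when a batch is done, committed to Kafka asynchronously while polling continues, and committed synchronously on close so the last position is not lost. Below is a minimal sketch of that commit pattern against the plain kafka-clients consumer API; it is an illustration under those assumptions, not the poller's actual implementation.

// Minimal sketch (assumptions only, not project code) of the async-then-sync offset commit pattern.
import java.util.Map;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;

final class OffsetCommitSketch implements AutoCloseable {

    private final Consumer<String, ?> consumer;
    private volatile Map<TopicPartition, OffsetAndMetadata> pending = Map.of();

    OffsetCommitSketch(Consumer<String, ?> consumer) {
        this.consumer = consumer;
    }

    // Called once a batch has been fully applied downstream.
    void batchDone(Map<TopicPartition, OffsetAndMetadata> batchOffsets) {
        pending = batchOffsets;
        // Fire-and-forget: a failed commit is only logged, the next commit simply supersedes it.
        consumer.commitAsync(batchOffsets, (offsets, exception) -> {
            if (exception != null) {
                System.err.println("Async offset commit failed: " + exception);
            }
        });
    }

    @Override
    public void close() {
        // Last chance to persist progress, so block until the broker acknowledges.
        consumer.commitSync(pending);
        consumer.close();
    }
}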