use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.
the class PatchAccumulator method accumulateReconciliation.
private void accumulateReconciliation(List<MutationEventData> sequence) {
    checkPositionIndex(0, sequence.size(), "Received empty sequence");
    MutationEventData head = sequence.get(0);
    Optional<MutationEventData> inconsistentBlock = sequence.stream().filter(m -> {
        if (!head.getEntity().equals(m.getEntity())) {
            return true;
        } else if (!m.getMeta().requestId().equals(head.getMeta().requestId())) {
            return true;
        } else {
            return !head.getOperation().equals(m.getOperation());
        }
    }).findFirst();
    if (inconsistentBlock.isPresent()) {
        throw new IllegalArgumentException("Inconsistent sequence of events: " + inconsistentBlock.get() + " does not belong to " + head);
    }
    List<Statement> allStmts = sequence.stream()
            .map(DiffEventData.class::cast)
            .map(DiffEventData::getRdfAddedData)
            .flatMap(c -> deserChunk(c).stream())
            .collect(toList());
    reconciliations.put(head.getEntity(), allStmts);
    // Drop any patch data accumulated for this entity: since we are reconciling it, all of that will be reset anyway.
    removeDataFromEntity(head.getEntity());
    totalAccumulated += allStmts.size();
}
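The consistency check above can be read on its own: every event in a reconciliation sequence must carry the head's entity, request id and operation, otherwise the whole sequence is rejected. A minimal sketch of that check extracted as a standalone helper; the class and method names here are hypothetical and not part of the project:

    import java.util.List;
    import java.util.Optional;

    import org.wikidata.query.rdf.updater.MutationEventData;

    final class SequenceConsistency {
        private SequenceConsistency() {}

        // Mirrors the filter in accumulateReconciliation: every event must share
        // the head's entity, request id and operation, otherwise the sequence is
        // rejected as inconsistent.
        static void ensureConsistent(List<MutationEventData> sequence) {
            if (sequence.isEmpty()) {
                throw new IllegalArgumentException("Received empty sequence");
            }
            MutationEventData head = sequence.get(0);
            Optional<MutationEventData> inconsistent = sequence.stream()
                    .filter(m -> !head.getEntity().equals(m.getEntity())
                            || !head.getMeta().requestId().equals(m.getMeta().requestId())
                            || !head.getOperation().equals(m.getOperation()))
                    .findFirst();
            if (inconsistent.isPresent()) {
                throw new IllegalArgumentException("Inconsistent sequence of events: "
                        + inconsistent.get() + " does not belong to " + head);
            }
        }
    }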
use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.
the class KafkaStreamConsumerUnitTest method test_commit_offsets.
@Test
public void test_commit_offsets() {
    TopicPartition topicPartition = new TopicPartition("topic", 0);
    MutationEventData firstEvent = genEvent("Q1", 0, uris("uri:1"), uris(), uris(), uris(), Instant.EPOCH).get(0);
    MutationEventData secondEvent = genEvent("Q1", 1, uris("uri:2"), uris(), uris(), uris(), Instant.EPOCH).get(0);
    MutationEventData thirdEvent = genEvent("Q1", 2, uris("uri:3"), uris(), uris(), uris(), Instant.EPOCH).get(0);
    Map<TopicPartition, OffsetAndMetadata> firstOffsets = Collections.singletonMap(topicPartition, new OffsetAndMetadata(1));
    Map<TopicPartition, OffsetAndMetadata> secondOffsets = Collections.singletonMap(topicPartition, new OffsetAndMetadata(2));
    Map<TopicPartition, OffsetAndMetadata> thirdOffsets = Collections.singletonMap(topicPartition, new OffsetAndMetadata(3));
    // we want real instances as we use AtomicReference
    when(consumer.poll(any())).thenReturn(
            new ConsumerRecords<>(singletonMap(topicPartition, singletonList(new ConsumerRecord<>(TESTED_STREAM, 0, 1, null, firstEvent)))),
            new ConsumerRecords<>(singletonMap(topicPartition, singletonList(new ConsumerRecord<>(TESTED_STREAM, 0, 2, null, secondEvent)))),
            new ConsumerRecords<>(singletonMap(topicPartition, singletonList(new ConsumerRecord<>(TESTED_STREAM, 0, 3, null, thirdEvent)))),
            new ConsumerRecords<>(emptyMap()));
    ArgumentCaptor<OffsetCommitCallback> callback = ArgumentCaptor.forClass(OffsetCommitCallback.class);
    KafkaStreamConsumer streamConsumer = new KafkaStreamConsumer(consumer, topicPartition, chunkDeser, 1,
            KafkaStreamConsumerMetricsListener.forRegistry(new MetricRegistry()), m -> true);
    StreamConsumer.Batch b = streamConsumer.poll(Duration.ofMillis(10));
    streamConsumer.acknowledge();
    verify(consumer, times(1)).commitAsync(eq(firstOffsets), callback.capture());
    streamConsumer.poll(Duration.ofMillis(10));
    // fail the first commit and verify that we retry
    callback.getValue().onComplete(firstOffsets, new Exception("simulated failure"));
    verify(consumer, times(2)).commitAsync(eq(firstOffsets), callback.capture());
    streamConsumer.acknowledge();
    // fail the first commit a second time after we are ready to commit the second batch
    // and verify that we do not retry
    callback.getValue().onComplete(firstOffsets, new Exception("simulated failure"));
    verify(consumer, times(2)).commitAsync(eq(firstOffsets), callback.capture());
    // also verify that we send commitAsync for the second batch
    verify(consumer, times(1)).commitAsync(eq(secondOffsets), callback.capture());
    // fail the second commit and verify that we retry
    callback.getValue().onComplete(secondOffsets, new Exception("Simulated failure"));
    verify(consumer, times(2)).commitAsync(eq(secondOffsets), callback.capture());
    // the retry succeeded
    callback.getValue().onComplete(secondOffsets, null);
    streamConsumer.poll(Duration.ofMillis(10));
    streamConsumer.acknowledge();
    verify(consumer, times(1)).commitAsync(eq(thirdOffsets), callback.capture());
    streamConsumer.close();
    // verify that we commit synchronously since we have not yet received the ack of our async commit
    verify(consumer, times(1)).commitSync(eq(thirdOffsets));
}
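The assertions above describe a retry policy without spelling it out: a failed async commit is retried only while it is still the most recent set of offsets handed to commitAsync; once a newer batch has been committed, the stale failure is ignored because the newer commit supersedes it. Below is a minimal sketch of that policy using plain Kafka consumer APIs. It is not the actual KafkaStreamConsumer implementation, and the class and method names are hypothetical:

    import java.util.Map;
    import java.util.concurrent.atomic.AtomicReference;

    import org.apache.kafka.clients.consumer.Consumer;
    import org.apache.kafka.clients.consumer.OffsetAndMetadata;
    import org.apache.kafka.common.TopicPartition;

    final class RetryLatestCommitOnly {
        // offsets of the most recent commit we have requested
        private final AtomicReference<Map<TopicPartition, OffsetAndMetadata>> lastRequested = new AtomicReference<>();

        void commit(Consumer<?, ?> consumer, Map<TopicPartition, OffsetAndMetadata> offsets) {
            lastRequested.set(offsets);
            consumer.commitAsync(offsets, (committed, exception) -> {
                if (exception != null && committed.equals(lastRequested.get())) {
                    // still the newest offsets we asked to commit: retry
                    commit(consumer, committed);
                }
                // otherwise: success, or a stale failure already superseded by a newer commit
            });
        }
    }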
use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.
the class UpdatePatchAccumulatorUnitTest method test_leak_data_from_accumulator.
@Test
public void test_leak_data_from_accumulator() {
    MutationEventDataGenerator eventGenerator = new MutationEventDataGenerator(serializer, RDFFormat.TURTLE.getDefaultMIMEType(), 300);
    PatchAccumulator accumulator = new PatchAccumulator(deserializer);
    List<MutationEventData> events = eventGenerator.diffEvent(metaGenerator("Q1"), "Q1", 1, Instant.EPOCH,
            singletonList(stmt("uri:added-Q1")),
            singletonList(stmt("uri:deleted-Q1")),
            asList(stmt("uri:linked-shared"), stmt("uri:")),
            singletonList(stmt("uri:unlinked-shared")));
    events.forEach(accumulator::accumulate);
    ConsumerPatch expectedPatch = accumulator.asPatch();
    List<MutationEventData> events2 = eventGenerator.diffEvent(metaGenerator("Q2"), "Q2", 1, Instant.EPOCH,
            asList(stmt("uri:added-Q2"), stmt("uri:added-Q1")),
            singletonList(stmt("uri:deleted-Q1")),
            asList(stmt("uri:linked-shared"), stmt("uri:")),
            singletonList(stmt("uri:unlinked-shared")));
    // the second, conflicting sequence must be rejected ...
    assertThatThrownBy(() -> events2.forEach(accumulator::accumulate)).isInstanceOf(IllegalArgumentException.class);
    // ... and must not leak any of its data into the already accumulated patch
    ConsumerPatch secondPatch = accumulator.asPatch();
    assertThat(secondPatch).isEqualTo(expectedPatch);
}
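For contrast, the happy path this test builds on is simply: generate a diff event for one entity, feed its chunks to the accumulator, and read back the patch. A sketch of that path as a fragment of the same test class; the helpers serializer, deserializer, metaGenerator and stmt, plus the static imports, are assumed to be in scope exactly as above:

    // Sketch only: single-entity diff accumulated into a ConsumerPatch.
    ConsumerPatch accumulateSingleDiff() {
        MutationEventDataGenerator eventGenerator =
                new MutationEventDataGenerator(serializer, RDFFormat.TURTLE.getDefaultMIMEType(), 300);
        PatchAccumulator accumulator = new PatchAccumulator(deserializer);
        List<MutationEventData> events = eventGenerator.diffEvent(metaGenerator("Q1"), "Q1", 1, Instant.EPOCH,
                singletonList(stmt("uri:added-Q1")),         // added triples
                singletonList(stmt("uri:deleted-Q1")),       // deleted triples
                singletonList(stmt("uri:linked-shared")),    // linked shared triples
                singletonList(stmt("uri:unlinked-shared"))); // unlinked shared triples
        events.forEach(accumulator::accumulate);
        return accumulator.asPatch();
    }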
use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.
the class KafkaStreamConsumer method build.
public static KafkaStreamConsumer build(String brokers, String topic, int partition, String consumerId, int maxBatchLength,
                                        RDFChunkDeserializer deser,
                                        @Nullable BiConsumer<Consumer<String, MutationEventData>, TopicPartition> offsetReset,
                                        KafkaStreamConsumerMetricsListener metrics, int bufferedInputMessages,
                                        Predicate<MutationEventData> filter) {
    Map<String, Object> props = new HashMap<>();
    props.put("bootstrap.servers", brokers);
    props.put("group.id", consumerId);
    props.put("max.poll.interval.ms", "600000");
    props.put("enable.auto.commit", "false");
    props.put("isolation.level", "read_committed");
    props.put("max.poll.records", bufferedInputMessages);
    if (offsetReset == null) {
        props.put("auto.offset.reset", "earliest");
    } else {
        props.put("auto.offset.reset", "none");
    }
    // enough room for 10 very large messages (120k each)
    props.put("max.partition.fetch.bytes", 10 * 120 * 1024);
    KafkaConsumer<String, MutationEventData> consumer = new KafkaConsumer<>(props, new StringDeserializer(),
            new JsonDeserializer<>(singletonMap(topic, MutationEventData.class)));
    TopicPartition topicPartition = new TopicPartition(topic, partition);
    consumer.assign(singleton(topicPartition));
    try {
        // Fetching the position will fail if no offsets have been committed yet for this consumerId.
        // This pattern only works because we know that we have a single consumer per blazegraph host.
        // If it were a group of consumers, as is usually the case, this strategy would make no sense.
        consumer.position(topicPartition);
    } catch (InvalidOffsetException ioe) {
        if (offsetReset == null) {
            throw new IllegalStateException("Failed to find earliest offsets for [" + topicPartition + "]", ioe);
        }
        offsetReset.accept(consumer, topicPartition);
    }
    return new KafkaStreamConsumer(consumer, topicPartition, deser, maxBatchLength, metrics, filter);
}
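A sketch of wiring this factory up. The broker, topic and consumer id values are placeholders, chunkDeser is assumed to be an existing RDFChunkDeserializer, imports for the project's own classes are omitted, and passing null for offsetReset selects the "earliest" reset policy as shown in build():

    import java.time.Duration;

    import com.codahale.metrics.MetricRegistry;

    KafkaStreamConsumer streamConsumer = KafkaStreamConsumer.build(
            "localhost:9092",          // brokers (placeholder)
            "mutation-stream",         // topic (placeholder)
            0,                         // partition
            "wdqs-host-1",             // consumerId (placeholder)
            250,                       // maxBatchLength
            chunkDeser,                // RDFChunkDeserializer, assumed to exist
            null,                      // offsetReset: null -> auto.offset.reset=earliest
            KafkaStreamConsumerMetricsListener.forRegistry(new MetricRegistry()),
            1000,                      // bufferedInputMessages (max.poll.records)
            m -> true);                // accept every MutationEventData

    // poll/acknowledge/close as exercised in the unit test above
    StreamConsumer.Batch batch = streamConsumer.poll(Duration.ofMillis(100));
    // apply the batch downstream, then acknowledge so its offsets can be committed
    streamConsumer.acknowledge();
    streamConsumer.close();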
use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.
the class PatchAccumulator method accumulateDiff.
private void accumulateDiff(List<MutationEventData> sequence) {
    MutationEventData head = sequence.get(0);
    List<Statement> added = new ArrayList<>();
    List<Statement> removed = new ArrayList<>();
    List<Statement> linkedShared = new ArrayList<>();
    List<Statement> unlinkedShared = new ArrayList<>();
    // Merge the RDF chunks of every event in the sequence, rejecting any chunk whose
    // type or request id does not match the head's.
    for (MutationEventData data : sequence) {
        if (!head.getClass().equals(data.getClass())) {
            throw new IllegalArgumentException("Inconsistent chunks provided, head class " + head.getClass() + " does not match " + data.getClass());
        }
        if (!head.getMeta().requestId().equals(data.getMeta().requestId())) {
            throw new IllegalArgumentException("Inconsistent chunks provided, head requestId " + head.getMeta().requestId() + " does not match " + data.getMeta().requestId());
        }
        DiffEventData diff = (DiffEventData) data;
        if (diff.getRdfAddedData() != null) {
            added.addAll(deserChunk(diff.getRdfAddedData()));
        }
        if (diff.getRdfDeletedData() != null) {
            removed.addAll(deserChunk(diff.getRdfDeletedData()));
        }
        if (diff.getRdfLinkedSharedData() != null) {
            linkedShared.addAll(deserChunk(diff.getRdfLinkedSharedData()));
        }
        if (diff.getRdfUnlinkedSharedData() != null) {
            unlinkedShared.addAll(deserChunk(diff.getRdfUnlinkedSharedData()));
        }
    }
    // Reclassify site-link statements, then merge the result into the accumulator.
    Patch patch = SiteLinksReclassification.reclassify(new Patch(added, linkedShared, removed, unlinkedShared));
    accumulate(head.getEntity(), patch.getAdded(), patch.getRemoved(), patch.getLinkedSharedElements(), patch.getUnlinkedSharedElements());
}
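The four if-blocks repeat a single null-safe pattern: a chunk may be absent from a DiffEventData, in which case nothing is appended. A sketch of that pattern as a generic helper; the helper class is hypothetical, the chunk type is left generic so as not to assume its exact class name, deser stands in for the accumulator's deserChunk call, and the Statement import assumes the org.openrdf.model.Statement type used elsewhere in the project:

    import java.util.Collection;
    import java.util.List;
    import java.util.function.Function;

    import org.openrdf.model.Statement;

    final class ChunkAppender {
        private ChunkAppender() {}

        // Append the deserialized statements of a chunk to the target list,
        // skipping chunks that are absent (null) on a given DiffEventData.
        static <C> void appendChunk(List<Statement> target, C chunk, Function<C, Collection<Statement>> deser) {
            if (chunk != null) {
                target.addAll(deser.apply(chunk));
            }
        }
    }

With such a helper, each branch above would read along the lines of appendChunk(added, diff.getRdfAddedData(), this::deserChunk), and likewise for the other three chunk kinds, assuming deserChunk's signature fits that functional shape.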