Example 1 with MutationEventData

Use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.

From class PatchAccumulator, method accumulateReconciliation:

private void accumulateReconciliation(List<MutationEventData> sequence) {
    // checkPositionIndex(0, size) accepts size == 0, so reject empty sequences explicitly
    checkArgument(!sequence.isEmpty(), "Received empty sequence");
    MutationEventData head = sequence.get(0);
    Optional<MutationEventData> inconsistentBlock = sequence.stream().filter(m -> {
        if (!head.getEntity().equals(m.getEntity())) {
            return true;
        } else if (!m.getMeta().requestId().equals(head.getMeta().requestId())) {
            return true;
        } else
            return !head.getOperation().equals(m.getOperation());
    }).findFirst();
    if (inconsistentBlock.isPresent()) {
        throw new IllegalArgumentException("Inconsistent sequence of events: " + inconsistentBlock.get() + " does not belong to " + head);
    }
    List<Statement> allStmts = sequence.stream()
            .map(DiffEventData.class::cast)
            .map(DiffEventData::getRdfAddedData)
            .flatMap(c -> deserChunk(c).stream())
            .collect(toList());
    reconciliations.put(head.getEntity(), allStmts);
    // Drop patch data for this entity: since we are reconciling it, all of that will be reset anyway
    removeDataFromEntity(head.getEntity());
    totalAccumulated += allStmts.size();
}
Also used: Statement (org.openrdf.model.Statement), Collections.unmodifiableList (java.util.Collections.unmodifiableList), Getter (lombok.Getter), DiffEventData (org.wikidata.query.rdf.updater.DiffEventData), HashMap (java.util.HashMap), Collections.singletonList (java.util.Collections.singletonList), ArrayList (java.util.ArrayList), HashSet (java.util.HashSet), RDFDataChunk (org.wikidata.query.rdf.updater.RDFDataChunk), Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument), Collectors.toMap (java.util.stream.Collectors.toMap), RECONCILE_OPERATION (org.wikidata.query.rdf.updater.MutationEventData.RECONCILE_OPERATION), MutationEventData (org.wikidata.query.rdf.updater.MutationEventData), Map (java.util.Map), ConsumerPatch (org.wikidata.query.rdf.tool.rdf.ConsumerPatch), DIFF_OPERATION (org.wikidata.query.rdf.updater.MutationEventData.DIFF_OPERATION), Preconditions.checkPositionIndex (com.google.common.base.Preconditions.checkPositionIndex), Collection (java.util.Collection), Set (java.util.Set), SiteLinksReclassification (org.wikidata.query.rdf.tool.rdf.SiteLinksReclassification), Patch (org.wikidata.query.rdf.tool.rdf.Patch), Sets (com.google.common.collect.Sets), RDFChunkDeserializer (org.wikidata.query.rdf.updater.RDFChunkDeserializer), Collectors.toList (java.util.stream.Collectors.toList), List (java.util.List), IMPORT_OPERATION (org.wikidata.query.rdf.updater.MutationEventData.IMPORT_OPERATION), Function.identity (java.util.function.Function.identity), Collections.unmodifiableMap (java.util.Collections.unmodifiableMap), Optional (java.util.Optional), DELETE_OPERATION (org.wikidata.query.rdf.updater.MutationEventData.DELETE_OPERATION), NotThreadSafe (javax.annotation.concurrent.NotThreadSafe)
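
The stream filter above encodes a three-part consistency contract: every chunk must carry the same entity, request id, and operation as the head of the sequence. Read positively, the same predicate looks like this (the helper name sameBlock is ours, not part of the class; the getters are the ones used above):

private static boolean sameBlock(MutationEventData head, MutationEventData chunk) {
    // a chunk is consistent with the head when all three identifiers match
    return head.getEntity().equals(chunk.getEntity())
            && head.getMeta().requestId().equals(chunk.getMeta().requestId())
            && head.getOperation().equals(chunk.getOperation());
}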

Example 2 with MutationEventData

Use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.

From class KafkaStreamConsumerUnitTest, method test_commit_offsets:

@Test
public void test_commit_offsets() {
    TopicPartition topicPartition = new TopicPartition("topic", 0);
    MutationEventData firstEvent = genEvent("Q1", 0, uris("uri:1"), uris(), uris(), uris(), Instant.EPOCH).get(0);
    MutationEventData secondEvent = genEvent("Q1", 1, uris("uri:2"), uris(), uris(), uris(), Instant.EPOCH).get(0);
    MutationEventData thirdEvent = genEvent("Q1", 2, uris("uri:3"), uris(), uris(), uris(), Instant.EPOCH).get(0);
    Map<TopicPartition, OffsetAndMetadata> firstOffsets = Collections.singletonMap(topicPartition, new OffsetAndMetadata(1));
    Map<TopicPartition, OffsetAndMetadata> secondOffsets = Collections.singletonMap(topicPartition, new OffsetAndMetadata(2));
    Map<TopicPartition, OffsetAndMetadata> thirdOffsets = Collections.singletonMap(topicPartition, new OffsetAndMetadata(3));
    // we want real instances because they are passed through an AtomicReference
    when(consumer.poll(any())).thenReturn(
            new ConsumerRecords<>(singletonMap(topicPartition, singletonList(new ConsumerRecord<>(TESTED_STREAM, 0, 1, null, firstEvent)))),
            new ConsumerRecords<>(singletonMap(topicPartition, singletonList(new ConsumerRecord<>(TESTED_STREAM, 0, 2, null, secondEvent)))),
            new ConsumerRecords<>(singletonMap(topicPartition, singletonList(new ConsumerRecord<>(TESTED_STREAM, 0, 3, null, thirdEvent)))),
            new ConsumerRecords<>(emptyMap()));
    ArgumentCaptor<OffsetCommitCallback> callback = ArgumentCaptor.forClass(OffsetCommitCallback.class);
    KafkaStreamConsumer streamConsumer = new KafkaStreamConsumer(consumer, topicPartition, chunkDeser, 1, KafkaStreamConsumerMetricsListener.forRegistry(new MetricRegistry()), m -> true);
    StreamConsumer.Batch b = streamConsumer.poll(Duration.ofMillis(10));
    streamConsumer.acknowledge();
    verify(consumer, times(1)).commitAsync(eq(firstOffsets), callback.capture());
    streamConsumer.poll(Duration.ofMillis(10));
    // fail the first commit and verify that we retry
    callback.getValue().onComplete(firstOffsets, new Exception("simulated failure"));
    verify(consumer, times(2)).commitAsync(eq(firstOffsets), callback.capture());
    streamConsumer.acknowledge();
    // fail the first commit a second time after we are ready to commit the second batch
    // and verify that we do not retry
    callback.getValue().onComplete(firstOffsets, new Exception("simulated failure"));
    verify(consumer, times(2)).commitAsync(eq(firstOffsets), callback.capture());
    // also verify that we send commitAsync for the second batch
    verify(consumer, times(1)).commitAsync(eq(secondOffsets), callback.capture());
    // fail the second commit and verify that we retry
    callback.getValue().onComplete(secondOffsets, new Exception("Simulated failure"));
    verify(consumer, times(2)).commitAsync(eq(secondOffsets), callback.capture());
    // the retry succeeded
    callback.getValue().onComplete(secondOffsets, null);
    streamConsumer.poll(Duration.ofMillis(10));
    streamConsumer.acknowledge();
    verify(consumer, times(1)).commitAsync(eq(thirdOffsets), callback.capture());
    streamConsumer.close();
    // verify that we commit synchronously since we have not yet received the ack of our async commit
    verify(consumer, times(1)).commitSync(eq(thirdOffsets));
}
Also used: MetricRegistry (com.codahale.metrics.MetricRegistry), MutationEventData (org.wikidata.query.rdf.updater.MutationEventData), ConsumerRecord (org.apache.kafka.clients.consumer.ConsumerRecord), TopicPartition (org.apache.kafka.common.TopicPartition), OffsetAndMetadata (org.apache.kafka.clients.consumer.OffsetAndMetadata), OffsetCommitCallback (org.apache.kafka.clients.consumer.OffsetCommitCallback), Test (org.junit.Test)
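
The behaviour pinned down here is a retry-once policy for asynchronous commits: a failed commitAsync is re-issued only while its offsets are still the most recently acknowledged ones; once a newer batch has been acknowledged, the stale commit is dropped. A minimal sketch of that callback shape, assuming a hypothetical lastAcked field holding the latest acknowledged offsets (KafkaStreamConsumer's real bookkeeping may differ):

// Sketch only: retry a failed async commit unless a newer batch superseded it.
// `lastAcked` is a hypothetical field tracking the most recently acknowledged offsets.
private void commitWithRetry(Map<TopicPartition, OffsetAndMetadata> offsets) {
    consumer.commitAsync(offsets, (committed, exception) -> {
        if (exception != null && committed.equals(lastAcked)) {
            // still the newest acknowledged offsets: retry once, without further retries
            consumer.commitAsync(committed, null);
        }
    });
}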

Example 3 with MutationEventData

Use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.

From class UpdatePatchAccumulatorUnitTest, method test_leak_data_from_accumulator:

@Test
public void test_leak_data_from_accumulator() {
    MutationEventDataGenerator eventGenerator = new MutationEventDataGenerator(serializer, RDFFormat.TURTLE.getDefaultMIMEType(), 300);
    PatchAccumulator accumulator = new PatchAccumulator(deserializer);
    List<MutationEventData> events = eventGenerator.diffEvent(metaGenerator("Q1"), "Q1", 1, Instant.EPOCH,
            singletonList(stmt("uri:added-Q1")),
            singletonList(stmt("uri:deleted-Q1")),
            asList(stmt("uri:linked-shared"), stmt("uri:")),
            singletonList(stmt("uri:unlinked-shared")));
    events.forEach(accumulator::accumulate);
    ConsumerPatch expectedPatch = accumulator.asPatch();
    List<MutationEventData> events2 = eventGenerator.diffEvent(metaGenerator("Q2"), "Q2", 1, Instant.EPOCH,
            asList(stmt("uri:added-Q2"), stmt("uri:added-Q1")),
            singletonList(stmt("uri:deleted-Q1")),
            asList(stmt("uri:linked-shared"), stmt("uri:")),
            singletonList(stmt("uri:unlinked-shared")));
    assertThatThrownBy(() -> events2.forEach(accumulator::accumulate)).isInstanceOf(IllegalArgumentException.class);
    ConsumerPatch secondPatch = accumulator.asPatch();
    assertThat(secondPatch).isEqualTo(expectedPatch);
}
Also used: MutationEventDataGenerator (org.wikidata.query.rdf.updater.MutationEventDataGenerator), MutationEventData (org.wikidata.query.rdf.updater.MutationEventData), ConsumerPatch (org.wikidata.query.rdf.tool.rdf.ConsumerPatch), Test (org.junit.Test)

Example 4 with MutationEventData

Use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.

From class KafkaStreamConsumer, method build:

public static KafkaStreamConsumer build(String brokers, String topic, int partition, String consumerId,
        int maxBatchLength, RDFChunkDeserializer deser,
        @Nullable BiConsumer<Consumer<String, MutationEventData>, TopicPartition> offsetReset,
        KafkaStreamConsumerMetricsListener metrics, int bufferedInputMessages,
        Predicate<MutationEventData> filter) {
    Map<String, Object> props = new HashMap<>();
    props.put("bootstrap.servers", brokers);
    props.put("group.id", consumerId);
    props.put("max.poll.interval.ms", "600000");
    props.put("enable.auto.commit", "false");
    props.put("isolation.level", "read_committed");
    props.put("max.poll.records", bufferedInputMessages);
    if (offsetReset == null) {
        props.put("auto.offset.reset", "earliest");
    } else {
        props.put("auto.offset.reset", "none");
    }
    // allow room for 10 very large messages (120 KiB each)
    props.put("max.partition.fetch.bytes", 10 * 120 * 1024);
    KafkaConsumer<String, MutationEventData> consumer = new KafkaConsumer<>(props, new StringDeserializer(), new JsonDeserializer<>(singletonMap(topic, MutationEventData.class)));
    TopicPartition topicPartition = new TopicPartition(topic, partition);
    consumer.assign(singleton(topicPartition));
    try {
        // Fetching position will fail if no offsets are positioned yet for this consumerId.
        // This pattern only works because we know that we have a single consumer per blazegraph host.
        // If it were a group of consumers, as is usually the case, this strategy would make no sense.
        consumer.position(topicPartition);
    } catch (InvalidOffsetException ioe) {
        if (offsetReset == null) {
            throw new IllegalStateException("Failed to find earliest offsets for [" + topicPartition + "]", ioe);
        }
        offsetReset.accept(consumer, topicPartition);
    }
    return new KafkaStreamConsumer(consumer, topicPartition, deser, maxBatchLength, metrics, filter);
}
Also used: HashMap (java.util.HashMap), StringDeserializer (org.apache.kafka.common.serialization.StringDeserializer), KafkaConsumer (org.apache.kafka.clients.consumer.KafkaConsumer), InvalidOffsetException (org.apache.kafka.clients.consumer.InvalidOffsetException), MutationEventData (org.wikidata.query.rdf.updater.MutationEventData), TopicPartition (org.apache.kafka.common.TopicPartition)
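
Note the split in build: with no offsetReset callback the consumer falls back to auto.offset.reset=earliest, otherwise auto.offset.reset is none and positioning a consumer that has no committed offsets becomes the caller's job. A minimal sketch of such a callback, using an illustrative rewind-to-beginning policy (seekToBeginning is the standard KafkaConsumer call; the policy choice is the assumption here):

// Illustrative reset policy: start from the earliest retained offset when
// no committed offsets exist yet for this consumerId.
BiConsumer<Consumer<String, MutationEventData>, TopicPartition> offsetReset =
        (consumer, partition) -> consumer.seekToBeginning(Collections.singleton(partition));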

Example 5 with MutationEventData

Use of org.wikidata.query.rdf.updater.MutationEventData in project wikidata-query-rdf by wikimedia.

From class PatchAccumulator, method accumulateDiff:

private void accumulateDiff(List<MutationEventData> sequence) {
    MutationEventData head = sequence.get(0);
    List<Statement> added = new ArrayList<>();
    List<Statement> removed = new ArrayList<>();
    List<Statement> linkedShared = new ArrayList<>();
    List<Statement> unlinkedShared = new ArrayList<>();
    for (MutationEventData data : sequence) {
        if (!head.getClass().equals(data.getClass())) {
            throw new IllegalArgumentException("Inconsistent chunks provided, head class " + head.getClass() + " does not match " + data.getClass());
        }
        if (!head.getMeta().requestId().equals(data.getMeta().requestId())) {
            throw new IllegalArgumentException("Inconsistent chunks provided, head requestId " + head.getMeta().requestId() + " does not match " + data.getMeta().requestId());
        }
        DiffEventData diff = (DiffEventData) data;
        if (diff.getRdfAddedData() != null) {
            added.addAll(deserChunk(diff.getRdfAddedData()));
        }
        if (diff.getRdfDeletedData() != null) {
            removed.addAll(deserChunk(diff.getRdfDeletedData()));
        }
        if (diff.getRdfLinkedSharedData() != null) {
            linkedShared.addAll(deserChunk(diff.getRdfLinkedSharedData()));
        }
        if (diff.getRdfUnlinkedSharedData() != null) {
            unlinkedShared.addAll(deserChunk(diff.getRdfUnlinkedSharedData()));
        }
    }
    Patch patch = SiteLinksReclassification.reclassify(new Patch(added, linkedShared, removed, unlinkedShared));
    accumulate(head.getEntity(), patch.getAdded(), patch.getRemoved(), patch.getLinkedSharedElements(), patch.getUnlinkedSharedElements());
}
Also used: Statement (org.openrdf.model.Statement), ArrayList (java.util.ArrayList), DiffEventData (org.wikidata.query.rdf.updater.DiffEventData), ConsumerPatch (org.wikidata.query.rdf.tool.rdf.ConsumerPatch), Patch (org.wikidata.query.rdf.tool.rdf.Patch), MutationEventData (org.wikidata.query.rdf.updater.MutationEventData)
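
Since accumulateDiff concatenates the chunks of each category before reclassifying them, a diff that was split across several chunks yields the same patch as an equivalent single-chunk diff. A sketch of that round trip using the generator API from Example 3 (the statement URIs and the emptyList() arguments for shared data are illustrative; the 300-byte chunk size forces splitting):

MutationEventDataGenerator eventGenerator =
        new MutationEventDataGenerator(serializer, RDFFormat.TURTLE.getDefaultMIMEType(), 300);
// an oversized diff arrives as several DiffEventData chunks
List<MutationEventData> chunks = eventGenerator.diffEvent(metaGenerator("Q1"), "Q1", 1, Instant.EPOCH,
        asList(stmt("uri:added-1"), stmt("uri:added-2")),
        singletonList(stmt("uri:deleted-1")),
        emptyList(), emptyList());
PatchAccumulator accumulator = new PatchAccumulator(deserializer);
chunks.forEach(accumulator::accumulate);
// the accumulator stitches the chunks back into a single patch
ConsumerPatch patch = accumulator.asPatch();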

Aggregations

MutationEventData (org.wikidata.query.rdf.updater.MutationEventData): 10 usages
Test (org.junit.Test): 6 usages
TopicPartition (org.apache.kafka.common.TopicPartition): 5 usages
DiffEventData (org.wikidata.query.rdf.updater.DiffEventData): 5 usages
MetricRegistry (com.codahale.metrics.MetricRegistry): 4 usages
ConsumerRecord (org.apache.kafka.clients.consumer.ConsumerRecord): 4 usages
ArrayList (java.util.ArrayList): 3 usages
Statement (org.openrdf.model.Statement): 3 usages
ConsumerPatch (org.wikidata.query.rdf.tool.rdf.ConsumerPatch): 3 usages
Duration (java.time.Duration): 2 usages
Instant (java.time.Instant): 2 usages
Collection (java.util.Collection): 2 usages
Collections.singletonList (java.util.Collections.singletonList): 2 usages
HashMap (java.util.HashMap): 2 usages
HashSet (java.util.HashSet): 2 usages
List (java.util.List): 2 usages
Map (java.util.Map): 2 usages
Set (java.util.Set): 2 usages
Collectors.toList (java.util.stream.Collectors.toList): 2 usages
KafkaConsumer (org.apache.kafka.clients.consumer.KafkaConsumer): 2 usages