Use of org.wikidata.query.rdf.tool.rdf.client.RdfClient in project wikidata-query-rdf by wikimedia.
The class StreamingUpdate, method build():
static StreamingUpdaterConsumer build(StreamingUpdateOptions options, MetricRegistry metrics) {
    RDFChunkDeserializer deser = new RDFChunkDeserializer(new RDFParserSuppliers(RDFParserRegistry.getInstance()));
    KafkaStreamConsumer consumer = KafkaStreamConsumer.build(
            options.brokers(), options.topic(), options.partition(), options.consumerGroup(),
            options.batchSize(), deser, parseInitialOffset(options),
            KafkaStreamConsumerMetricsListener.forRegistry(metrics),
            options.bufferedInputMessages(),
            buildFilter(StreamingUpdateOptions.entityFilterPattern(options)));
    HttpClient httpClient = buildHttpClient(getHttpProxyHost(), getHttpProxyPort());
    Retryer<ContentResponse> retryer = buildHttpClientRetryer();
    Duration rdfClientTimeout = RdfRepositoryUpdater.getRdfClientTimeout();
    RdfClient rdfClient = new RdfClient(httpClient, StreamingUpdateOptions.sparqlUri(options), retryer, rdfClientTimeout);
    UrisScheme uris = UrisSchemeFactory.getURISystem();
    return new StreamingUpdaterConsumer(consumer, new RdfRepositoryUpdater(rdfClient, uris), metrics,
            options.inconsistenciesWarningThreshold());
}
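RdfClient itself is a thin wrapper around a Jetty HttpClient, a SPARQL endpoint URI, a retryer and a request timeout, so the construction above can be reproduced in isolation. A minimal sketch, assuming the same static helpers (buildHttpClient, getHttpProxyHost, getHttpProxyPort, buildHttpClientRetryer) are statically imported, with placeholder values for the endpoint and timeout:

static void clearStore() throws Exception {
    // Build the client the same way build() does above; buildHttpClient is assumed
    // to return a ready-to-use Jetty HttpClient.
    HttpClient httpClient = buildHttpClient(getHttpProxyHost(), getHttpProxyPort());
    Retryer<ContentResponse> retryer = buildHttpClientRetryer();
    RdfClient rdfClient = new RdfClient(
            httpClient,
            URI.create("http://localhost:9999/bigdata/namespace/wdq/sparql"), // placeholder endpoint
            retryer,
            Duration.ofSeconds(10)); // placeholder timeout
    try {
        rdfClient.update("CLEAR ALL"); // update(String) issues a SPARQL Update, as in the tests below
    } finally {
        httpClient.stop();
    }
}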
Use of org.wikidata.query.rdf.tool.rdf.client.RdfClient in project wikidata-query-rdf by wikimedia.
The class RdfKafkaRepositoryIntegrationTest, method readWriteOffsets():
@Test
public void readWriteOffsets() throws Exception {
    Uris uris = new Uris(new URI("https://acme.test"), singleton(0L), "/api.php", "/entitydata");
    Instant startTime = Instant.ofEpochMilli(BEGIN_DATE);
    HttpClient httpClient = buildHttpClient(getHttpProxyHost(), getHttpProxyPort());
    RdfClient rdfClient = new RdfClient(httpClient, url("/namespace/wdq/sparql"),
            buildHttpClientRetryer(), Duration.of(-1, SECONDS));
    try {
        rdfClient.update("CLEAR ALL");
        KafkaOffsetsRepository kafkaOffsetsRepository = new RdfKafkaOffsetsRepository(uris.builder().build(), rdfClient);
        Map<TopicPartition, OffsetAndMetadata> offsets = new HashMap<>();
        offsets.put(new TopicPartition("topictest", 0), new OffsetAndMetadata(1L));
        offsets.put(new TopicPartition("othertopic", 0), new OffsetAndMetadata(2L));
        kafkaOffsetsRepository.store(offsets);
        Map<TopicPartition, OffsetAndTimestamp> offsetsAndTimestamps = kafkaOffsetsRepository.load(startTime);
        assertThat(offsetsAndTimestamps.get(new TopicPartition("topictest", 0)).offset()).isEqualTo(1L);
        assertThat(offsetsAndTimestamps.get(new TopicPartition("othertopic", 0)).offset()).isEqualTo(2L);
        offsets = new HashMap<>();
        offsets.put(new TopicPartition("topictest", 0), new OffsetAndMetadata(3L));
        offsets.put(new TopicPartition("othertopic", 0), new OffsetAndMetadata(4L));
        kafkaOffsetsRepository.store(offsets);
        offsetsAndTimestamps = kafkaOffsetsRepository.load(startTime);
        assertThat(offsetsAndTimestamps.get(new TopicPartition("topictest", 0)).offset()).isEqualTo(3L);
        assertThat(offsetsAndTimestamps.get(new TopicPartition("othertopic", 0)).offset()).isEqualTo(4L);
    } finally {
        rdfClient.update("CLEAR ALL");
        httpClient.stop();
    }
}
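The test only checks that offsets round-trip through the RDF store; it never touches a Kafka consumer. For illustration only, a hypothetical helper (not project code, and not how the updater wires this up) showing how offsets loaded this way could be applied to a plain KafkaConsumer; the key/value types are placeholders:

static void seekToStoredOffsets(KafkaConsumer<String, String> consumer,
                                KafkaOffsetsRepository repository, Instant startTime) {
    // Load the offsets previously written by store() and seek the consumer to them.
    Map<TopicPartition, OffsetAndTimestamp> loaded = repository.load(startTime);
    consumer.assign(loaded.keySet());
    for (Map.Entry<TopicPartition, OffsetAndTimestamp> entry : loaded.entrySet()) {
        consumer.seek(entry.getKey(), entry.getValue().offset());
    }
}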
Use of org.wikidata.query.rdf.tool.rdf.client.RdfClient in project wikidata-query-rdf by wikimedia.
The class Update, method initialize():
private static Updater<? extends Change.Batch> initialize(String[] args, Closer closer) throws URISyntaxException {
    try {
        UpdateOptions options = handleOptions(UpdateOptions.class, args);
        MetricRegistry metricRegistry = createMetricRegistry(closer, options.metricDomain());
        StreamDumper wikibaseStreamDumper = createStreamDumper(dumpDirPath(options));
        WikibaseRepository wikibaseRepository = new WikibaseRepository(
                UpdateOptions.uris(options), options.constraints(), metricRegistry,
                wikibaseStreamDumper, UpdateOptions.revisionDuration(options),
                RDFParserSuppliers.defaultRdfParser());
        closer.register(wikibaseRepository);
        UrisScheme wikibaseUris = WikibaseOptions.wikibaseUris(options);
        URI root = wikibaseRepository.getUris().builder().build();
        URI sparqlUri = UpdateOptions.sparqlUri(options);
        HttpClient httpClient = buildHttpClient(getHttpProxyHost(), getHttpProxyPort());
        closer.register(wrapHttpClient(httpClient));
        Retryer<ContentResponse> retryer = buildHttpClientRetryer();
        Duration rdfClientTimeout = getRdfClientTimeout();
        RdfClient rdfClient = new RdfClient(httpClient, sparqlUri, retryer, rdfClientTimeout);
        RdfRepository rdfRepository = new RdfRepository(wikibaseUris, rdfClient, MAX_FORM_CONTENT_SIZE);
        Instant startTime = getStartTime(startInstant(options), rdfRepository, options.init());
        Change.Source<? extends Change.Batch> changeSource = buildChangeSource(
                options, startTime, wikibaseRepository, rdfClient, root, metricRegistry);
        Munger munger = mungerFromOptions(options);
        ExecutorService updaterExecutorService = createUpdaterExecutorService(options.threadCount());
        Updater<? extends Change.Batch> updater = createUpdater(
                wikibaseRepository, wikibaseUris, rdfRepository, changeSource, munger,
                updaterExecutorService, options.importAsync(), options.pollDelay(), options.verify(),
                metricRegistry);
        closer.register(updater);
        return updater;
    } catch (Exception e) {
        log.error("Error during initialization.", e);
        throw e;
    }
}
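Guava's Closer accepts Closeable instances, which is why the Jetty HttpClient backing the RdfClient is registered through wrapHttpClient rather than directly. A sketch of what such an adapter could look like, assuming it simply stops the client on close (the project's actual wrapHttpClient may differ):

// Hypothetical adapter: expose a Jetty HttpClient as a Closeable so it can be
// registered with a Guava Closer. HttpClient.stop() throws a checked Exception,
// so it is rewrapped as an IOException here.
static Closeable wrapHttpClient(HttpClient httpClient) {
    return () -> {
        try {
            httpClient.stop();
        } catch (Exception e) {
            throw new IOException("Failed to stop HttpClient", e);
        }
    };
}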
Use of org.wikidata.query.rdf.tool.rdf.client.RdfClient in project wikidata-query-rdf by wikimedia.
The class RdfRepositoryUnitTest, method batchUpdate():
@Test
public void batchUpdate() {
    RdfClient mockClient = mock(RdfClient.class);
    // 1.5M size means ~4k statements or 250K statement size max
    long maxPostSize = 1572864L;
    CollectedUpdateMetrics collectedUpdateMetrics = new CollectedUpdateMetrics();
    collectedUpdateMetrics.setMutationCount(1);
    collectedUpdateMetrics.merge(MultiSyncStep.INSERT_NEW_DATA, UpdateMetrics.builder().build());
    when(mockClient.update(any(String.class), any(UpdateMetricsResponseHandler.class))).thenReturn(collectedUpdateMetrics);
    RdfRepository repo = new RdfRepository(uris, mockClient, maxPostSize);
    // 6000 statements - should go over the limit
    Change change1 = new Change("Q1", 1, Instant.EPOCH, 1);
    StatementBuilder sb = new StatementBuilder("Q1");
    for (int i = 0; i < 6000; i++) {
        sb.withPredicateObject(RDFS.LABEL, new LiteralImpl("some item " + i));
    }
    change1.setStatements(sb.build());
    // One statement with 300K data - should go over the limit
    Change change2 = new Change("Q2", 1, Instant.EPOCH, 1);
    List<Statement> statements2 = new StatementBuilder("Q2")
            .withPredicateObject(RDFS.LABEL, new LiteralImpl(randomizer.randomAsciiOfLength(300 * 1024)))
            .build();
    change2.setStatements(statements2);
    // Just one statement - this will be separated anyway
    Change change3 = new Change("Q3", 1, Instant.EPOCH, 1);
    List<Statement> statements3 = new StatementBuilder("Q3")
            .withPredicateObject(RDFS.LABEL, new LiteralImpl("third item"))
            .build();
    change3.setStatements(statements3);
    List<Change> changes = ImmutableList.of(change1, change2, change3);
    int count = repo.syncFromChanges(changes, false).getMutationCount();
    assertThat(count).isEqualTo(3);
    // We should get 3 calls to update
    verify(mockClient, times(3)).update(any(), any());
}
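Because the client is a Mockito mock, the test can also inspect what was actually sent. An optional extension (not in the original test) that captures the SPARQL text of each of the three batched update calls with an ArgumentCaptor:

// Capture the SPARQL passed to the mocked RdfClient to inspect how the three
// changes were split into batches.
ArgumentCaptor<String> sparqlCaptor = ArgumentCaptor.forClass(String.class);
verify(mockClient, times(3)).update(sparqlCaptor.capture(), any(UpdateMetricsResponseHandler.class));
List<String> batchedUpdates = sparqlCaptor.getAllValues();
assertThat(batchedUpdates).hasSize(3);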