Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.
The class WikibaseRepository, method fetchRdfForEntity.
/**
 * Fetch the RDF for some entity.
 *
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
public Collection<Statement> fetchRdfForEntity(String entityId) throws RetryableException {
    // TODO handle ?flavor=dump or whatever parameters we need
    URI uri = uris.rdf(entityId);
    long start = System.currentTimeMillis();
    log.debug("Fetching rdf from {}", uri);
    RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
    StatementCollector collector = new StatementCollector();
    parser.setRDFHandler(new NormalizingRdfHandler(collector));
    HttpGet request = new HttpGet(uri);
    request.setConfig(configWithTimeout);
    try {
        try (CloseableHttpResponse response = client.execute(request)) {
            if (response.getStatusLine().getStatusCode() == 404) {
                // A delete/nonexistent page
                return Collections.emptyList();
            }
            if (response.getStatusLine().getStatusCode() >= 300) {
                throw new ContainedException("Unexpected status code fetching RDF for " + uri + ": "
                        + response.getStatusLine().getStatusCode());
            }
            parser.parse(new InputStreamReader(response.getEntity().getContent(), Charsets.UTF_8), uri.toString());
        }
    } catch (UnknownHostException | SocketException | SSLHandshakeException e) {
        // We want to bail on this, since it happens to be sticky for some reason
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RetryableException("Error fetching RDF for " + uri, e);
    } catch (RDFParseException | RDFHandlerException e) {
        throw new ContainedException("RDF parsing error for " + uri, e);
    }
    log.debug("Done in {} ms", System.currentTimeMillis() - start);
    return collector.getStatements();
}
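Since fetchRdfForEntity reports transient failures as RetryableException, a caller normally wraps it in a retry loop. The following is a minimal, hypothetical sketch of such a caller, placed next to the repository code so it shares its imports; the repo reference, the attempt count, and the backoff are assumptions for illustration, not part of the class above.

// Hypothetical helper, not part of WikibaseRepository: retry a few times on
// RetryableException before giving up. maxAttempts and the backoff are illustrative.
Collection<Statement> fetchWithRetry(WikibaseRepository repo, String entityId) throws InterruptedException {
    int maxAttempts = 3;
    for (int attempt = 1; ; attempt++) {
        try {
            return repo.fetchRdfForEntity(entityId);
        } catch (RetryableException e) {
            if (attempt == maxAttempts) {
                throw new RuntimeException("Giving up fetching RDF for " + entityId, e);
            }
            Thread.sleep(1000L * attempt); // simple linear backoff before the next attempt
        }
    }
}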
Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.
The class KafkaPollerUnitTest, method topicSubscribe.
@Test
public void topicSubscribe() throws RetryableException {
    Instant startTime = Instant.ofEpochMilli(BEGIN_DATE);
    Collection<String> topics = ImmutableList.of("topictest", "othertopic");
    ImmutableList<PartitionInfo> twoParts = ImmutableList.of(makePartitionInfo(0), makePartitionInfo(1));
    ArgumentCaptor<String> partitionArgs = ArgumentCaptor.forClass(String.class);
    when(consumer.partitionsFor(partitionArgs.capture())).thenReturn(twoParts);
    ArgumentCaptor<Collection<TopicPartition>> assignArgs = ArgumentCaptor.forClass((Class) Collection.class);
    doNothing().when(consumer).assign(assignArgs.capture());
    when(consumer.offsetsForTimes(any())).thenAnswer(i -> {
        Map<TopicPartition, Long> map = i.getArgumentAt(0, Map.class);
        // Check that timestamps are OK
        map.forEach((k, v) -> assertThat(v, equalTo(BEGIN_DATE)));
        Map<TopicPartition, OffsetAndTimestamp> out = Maps.newHashMapWithExpectedSize(map.size());
        // Make offset 1000 for the first partition and nothing for the second
        map.forEach((k, v) -> out.put(k, k.partition() == 0 ? new OffsetAndTimestamp(1000, v) : null));
        // Using forEach here because collect() can't handle nulls
        return out;
    });
    ArgumentCaptor<TopicPartition> seekArgs = ArgumentCaptor.forClass(TopicPartition.class);
    doNothing().when(consumer).seek(seekArgs.capture(), eq(1000L));
    ArgumentCaptor<Collection<TopicPartition>> seekBeginningArgs = ArgumentCaptor.forClass((Class) Collection.class);
    doNothing().when(consumer).seekToEnd(seekBeginningArgs.capture());
    when(consumer.poll(anyLong())).thenReturn(EMPTY_CHANGES);
    KafkaPoller poller = new KafkaPoller(consumer, uris, startTime, BATCH_SIZE, topics);
    Batch batch = poller.firstBatch();
    // We get partitions for both topics
    verify(consumer, times(2)).partitionsFor(any());
    assertThat(partitionArgs.getAllValues(), contains("topictest", "othertopic"));
    // We assign to 4 partitions - 2 topics x 2 partitions
    verify(consumer, times(1)).assign(any());
    assertThat(assignArgs.getValue(), hasSize(4));
    // Calling seek on both topics, partition 0
    verify(consumer, times(2)).seek(any(), anyLong());
    assertThat(seekArgs.getAllValues().stream().map(p -> p.topic()).toArray(String[]::new), arrayContainingInAnyOrder("topictest", "othertopic"));
    Collection<String> sTopics = seekArgs.getAllValues().stream().map(tp -> tp.topic()).collect(Collectors.toList());
    assertThat(sTopics, hasSize(2));
    assertThat(sTopics, containsInAnyOrder("topictest", "othertopic"));
    Collection<Integer> sPartitions = seekArgs.getAllValues().stream().map(tp -> tp.partition()).distinct().collect(Collectors.toList());
    assertThat(sPartitions, hasSize(1));
    assertThat(sPartitions, contains(0));
    // Calling seekToEnd on both topics, partition 1
    verify(consumer, times(2)).seekToEnd(any());
    Collection<String> sbTopics = seekBeginningArgs.getAllValues().stream().flatMap(c -> c.stream()).map(tp -> tp.topic()).collect(Collectors.toList());
    assertThat(sbTopics, hasSize(2));
    assertThat(sbTopics, contains("topictest", "othertopic"));
    Collection<Integer> sbPartitions = seekBeginningArgs.getAllValues().stream().flatMap(c -> c.stream()).map(tp -> tp.partition()).distinct().collect(Collectors.toList());
    assertThat(sbPartitions, hasSize(1));
    assertThat(sbPartitions, contains(1));
    verify(consumer, times(1)).offsetsForTimes(any());
}
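The behaviour verified above is, roughly: fetch the partitions of every configured topic, assign all of them, resolve offsets for the start timestamp with offsetsForTimes, then seek() the partitions that returned an offset and seekToEnd() the ones that returned nothing. The sketch below is a simplified, hypothetical rendering of that flow against the standard Kafka consumer API; it is not the actual KafkaPoller code, and the method name and parameters are assumptions.

import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;

// Hypothetical sketch of the subscription logic the test above exercises.
// consumer, topics and firstStartTime stand in for KafkaPoller's real state.
void subscribe(Consumer<String, ?> consumer, Collection<String> topics, Instant firstStartTime) {
    List<TopicPartition> partitions = new ArrayList<>();
    for (String topic : topics) {
        for (PartitionInfo info : consumer.partitionsFor(topic)) {
            partitions.add(new TopicPartition(topic, info.partition()));
        }
    }
    consumer.assign(partitions);
    // Ask Kafka which offset corresponds to the start timestamp on each partition.
    Map<TopicPartition, Long> request = new HashMap<>();
    for (TopicPartition tp : partitions) {
        request.put(tp, firstStartTime.toEpochMilli());
    }
    Map<TopicPartition, OffsetAndTimestamp> offsets = consumer.offsetsForTimes(request);
    for (TopicPartition tp : partitions) {
        OffsetAndTimestamp found = offsets.get(tp);
        if (found != null) {
            consumer.seek(tp, found.offset()); // data exists at or after the timestamp: resume there
        } else {
            consumer.seekToEnd(Collections.singletonList(tp)); // nothing there yet: start from the end
        }
    }
}

This lines up with the test's expectations: partition 0 of each topic gets an offset back and is seek()ed to it, while partition 1 gets null and is seekToEnd()ed.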
Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.
The class Updater, method run.
@Override
public void run() {
    B batch = null;
    do {
        try {
            batch = changeSource.firstBatch();
        } catch (RetryableException e) {
            log.warn("Retryable error fetching first batch. Retrying.", e);
        }
    } while (batch == null);
    log.debug("{} changes in batch", batch.changes().size());
    Instant oldDate = null;
    while (!currentThread().isInterrupted()) {
        try {
            handleChanges(batch.changes());
            Instant leftOffDate = batch.leftOffDate();
            if (leftOffDate != null) {
                /*
                 * Back off one second because the resolution on our poll isn't
                 * great and it's not a big deal to recheck if we have some
                 * updates.
                 */
                leftOffDate = leftOffDate.minusSeconds(1);
                // Do not update the repo with the same date
                if (oldDate == null || !oldDate.equals(leftOffDate)) {
                    syncDate(leftOffDate);
                    oldDate = leftOffDate;
                }
            }
            // TODO wrap all retry-able exceptions in a special exception
            batchAdvanced.mark(batch.advanced());
            log.info("Polled up to {} at {} updates per second and {} {} per second", batch.leftOffHuman(),
                    meterReport(updateMeter), meterReport(batchAdvanced), batch.advancedUnits());
            if (batch.last()) {
                return;
            }
            batch = nextBatch(batch);
        } catch (InterruptedException e) {
            currentThread().interrupt();
        } catch (ExecutionException e) {
            log.error("Syncing encountered a fatal exception", e);
            break;
        }
    }
}
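firstBatch is retried inline here whenever the change source throws RetryableException; the nextBatch(batch) call presumably relies on similar handling elsewhere (the TODO hints at consolidating it). Below is a hypothetical helper in the same spirit, assuming the change source exposes a nextBatch(lastBatch) counterpart to firstBatch(); the name, the sleep, and the exact signature are illustrative, not the project's actual nextBatch.

// Hypothetical helper, not Updater's real nextBatch: keep asking the change
// source for the next batch until it stops throwing RetryableException.
private B nextBatchWithRetry(B lastBatch) throws InterruptedException {
    while (true) {
        try {
            return changeSource.nextBatch(lastBatch);
        } catch (RetryableException e) {
            log.warn("Retryable error fetching next batch. Retrying.", e);
            Thread.sleep(500); // brief pause so a persistently failing source does not spin
        }
    }
}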
Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.
The class WikibaseRepository, method fetchRecentChanges.
/**
 * Fetch recent changes, starting from nextStartTime or continuing from
 * lastContinue depending on whether lastContinue is set; continuing is the
 * way MediaWiki expects queries to be resumed. See RecentChangesPoller for
 * how to poll these, or just use it.
 *
 * @param nextStartTime if lastContinue is null then this is the start time
 *            of the query
 * @param lastContinue continuation object from the last batch, or null
 * @param batchSize the number of recent changes to fetch
 * @return result of the query
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
public RecentChangeResponse fetchRecentChanges(Instant nextStartTime, Continue lastContinue, int batchSize) throws RetryableException {
    URI uri = uris.recentChanges(nextStartTime, lastContinue, batchSize);
    log.debug("Polling for changes from {}", uri);
    HttpGet request = new HttpGet(uri);
    request.setConfig(configWithTimeout);
    try {
        return checkApi(getJson(request, RecentChangeResponse.class));
    } catch (UnknownHostException | SocketException e) {
        // We want to bail on this, since it happens to be sticky for some reason
        throw new RuntimeException(e);
    } catch (JsonParseException | JsonMappingException e) {
        // An invalid response will probably not fix itself with a retry, so let's bail
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RetryableException("Error fetching recent changes", e);
    }
}
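As the javadoc describes, the first request passes a start time and a null continuation, and each follow-up request passes the Continue object returned with the previous batch. A hypothetical usage sketch follows, with retry on RetryableException left to the caller; the repo reference and the getContinue() accessor are assumptions for illustration, and java.time imports are taken as given.

// Hypothetical usage: initial poll from a timestamp, then one continuation request.
void pollOnce(WikibaseRepository repo) throws RetryableException {
    Instant start = Instant.now().minus(Duration.ofMinutes(30));
    RecentChangeResponse first = repo.fetchRecentChanges(start, null, 100);
    Continue cont = first.getContinue(); // continuation handed back by MediaWiki, assumed accessor
    if (cont != null) {
        // Passing the Continue object picks up exactly where the previous batch stopped.
        repo.fetchRecentChanges(start, cont, 100);
    }
}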