Example 1 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.

In the class Updater, method handleChanges:

/**
 * Handle the changes in a batch.
 *
 * @throws InterruptedException if the process is interrupted while waiting
 *             on changes to sync
 * @throws ExecutionException if there is an error syncing any of the
 *             changes
 */
protected void handleChanges(Iterable<Change> changes) throws InterruptedException, ExecutionException {
    Set<Change> trueChanges = getRevisionUpdates(changes);
    long start = System.currentTimeMillis();
    List<Future<Change>> futureChanges = new ArrayList<>();
    for (Change change : trueChanges) {
        futureChanges.add(executor.submit(() -> {
            while (true) {
                try {
                    handleChange(change);
                    return change;
                } catch (RetryableException e) {
                    log.warn("Retryable error syncing.  Retrying.", e);
                } catch (ContainedException e) {
                    log.warn("Contained error syncing.  Giving up on {}", change.entityId(), e);
                    throw e;
                }
            }
        }));
    }
    List<Change> processedChanges = new ArrayList<>();
    for (Future<Change> f : futureChanges) {
        try {
            processedChanges.add(f.get());
        } catch (ExecutionException ignore) {
            // failure has already been logged
        }
    }
    log.debug("Preparing update data took {} ms, have {} changes", System.currentTimeMillis() - start, processedChanges.size());
    rdfRepository.syncFromChanges(processedChanges, verify);
    updateMeter.mark(processedChanges.size());
}
Also used : RetryableException(org.wikidata.query.rdf.tool.exception.RetryableException) ArrayList(java.util.ArrayList) Future(java.util.concurrent.Future) Change(org.wikidata.query.rdf.tool.change.Change) ContainedException(org.wikidata.query.rdf.tool.exception.ContainedException) ExecutionException(java.util.concurrent.ExecutionException)
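
The while (true) loop above retries each change until it either succeeds or fails with a non-retryable ContainedException. As a minimal sketch of the same pattern factored into a reusable helper (not part of the project; the name retryIndefinitely is an invention, and the sketch assumes imports of java.util.concurrent.Callable and org.wikidata.query.rdf.tool.exception.RetryableException):

// Hypothetical helper illustrating the retry loop used in handleChanges:
// keep calling the task until it succeeds or throws something non-retryable.
static <T> T retryIndefinitely(Callable<T> task) throws Exception {
    while (true) {
        try {
            return task.call();
        } catch (RetryableException e) {
            // Transient failure: fall through and call the task again. A real helper
            // would log the exception and probably sleep briefly before retrying.
        }
    }
}

With such a helper, the submitted task above could be written as executor.submit(() -> retryIndefinitely(() -> { handleChange(change); return change; })); anything other than RetryableException, including ContainedException, still propagates and fails the future, matching the original behavior.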

Example 2 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.

In the class KafkaPoller, method fetch:

/**
 * Fetch changes from Kafka.
 *
 * @param lastNextStartTime the instant at which the previous fetch left off
 * @return a batch of changes collected from Kafka
 * @throws RetryableException if polling Kafka fails with a transient error and
 *             should be retried
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch fetch(Instant lastNextStartTime) throws RetryableException {
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    ConsumerRecords<String, ChangeEvent> records;
    Map<String, Instant> timesByTopic = newHashMapWithExpectedSize(topics.size());
    while (true) {
        try {
            // TODO: make timeout configurable? Wait for a bit so we catch bursts of messages?
            records = consumer.poll(1000);
        } catch (InterruptException | WakeupException e) {
            throw new RetryableException("Error fetching recent changes", e);
        }
        int count = records.count();
        log.info("Fetched {} records from Kafka", count);
        if (count == 0) {
            // If we got nothing from Kafka, get out of the loop and return what we have
            break;
        }
        boolean foundSomething = false;
        for (ConsumerRecord<String, ChangeEvent> record : records) {
            ChangeEvent event = record.value();
            String topic = record.topic();
            log.debug("Got event t:{} o:{}", record.topic(), record.offset());
            if (!event.domain().equals(uris.getHost())) {
                // wrong domain, ignore
                continue;
            }
            // check namespace
            if (!uris.isEntityNamespace(event.namespace())) {
                continue;
            }
            if (event.isRedundant()) {
                // This is a redundant event, we can skip it.
                continue;
            }
            // Now we have event that we want to process
            foundSomething = true;
            // Keep max time per topic
            timesByTopic.put(topic, Utils.max(Instant.ofEpochMilli(record.timestamp()), timesByTopic.getOrDefault(topic, null)));
            // Using offset here as RC id since we do not have real RC id (this not being RC poller) but
            // the offset serves the same function in Kafka and is also useful for debugging.
            Change change = new Change(event.title(), event.revision(), event.timestamp(), record.offset());
            Change dupe = changesByTitle.put(change.entityId(), change);
            // If we already have a change for this title, keep the newer revision;
            // a delete (NO_REVISION) always wins over an edit. This is not a big deal
            // since deletes are relatively rare.
            if (dupe != null && change.revision() > Change.NO_REVISION && (dupe.revision() > change.revision() || dupe.revision() == Change.NO_REVISION)) {
                // need to remove so that order will be correct
                changesByTitle.remove(change.entityId());
                changesByTitle.put(change.entityId(), dupe);
            }
        }
        if (changesByTitle.size() >= batchSize) {
            // We have enough for the batch
            break;
        }
        if (changesByTitle.size() > 0 && !foundSomething) {
            // We already have some changes and this poll produced nothing new for us,
            // so return what we have rather than wait for more.
            log.info("Did not find anything useful in this batch, returning existing data");
            break;
        }
        // TODO: if we already have something and we've spent more than X seconds in the loop,
        // we probably should return without waiting for more
    }
    // Here we are using min, not max, since some topics may be lagging behind
    // and we should catch them up if we are restarted.
    // Note that this means we could get repeated items from more advanced topics,
    // but that's ok, since changes are checked by revid anyway.
    // This is important only on updater restart, otherwise Kafka offsets should
    // take care of tracking things.
    Instant nextInstant = timesByTopic.values().stream().min(Instant::compareTo).orElse(lastNextStartTime);
    // FIXME: Note that due to batching nature of Kafka this timestamp could actually jump
    // back and forth. Not sure what to do about it.
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    log.info("Found {} changes", changes.size());
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextInstant);
    // be sure we got the whole second
    return new Batch(changes, advanced, nextInstant.minusSeconds(1).toString(), nextInstant);
}
Also used : Instant(java.time.Instant) InterruptException(org.apache.kafka.common.errors.InterruptException) WakeupException(org.apache.kafka.common.errors.WakeupException) LinkedHashMap(java.util.LinkedHashMap) RetryableException(org.wikidata.query.rdf.tool.exception.RetryableException) ChangeEvent(org.wikidata.query.rdf.tool.change.events.ChangeEvent) PropertiesChangeEvent(org.wikidata.query.rdf.tool.change.events.PropertiesChangeEvent)
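
Since fetch is private and declares RetryableException, the surrounding poller decides how and when to retry. A hedged sketch of one possible wrapper inside KafkaPoller follows; the method name fetchWithRetries, the attempt limit, and the linear backoff are assumptions for illustration, not the project's actual code:

// Illustrative only: repeatedly call fetch(...) and back off briefly when Kafka
// signals a transient problem via RetryableException.
private Batch fetchWithRetries(Instant lastNextStartTime, int maxAttempts) throws RetryableException, InterruptedException {
    // Assumes maxAttempts >= 1; otherwise there is nothing to rethrow below.
    RetryableException lastFailure = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return fetch(lastNextStartTime);
        } catch (RetryableException e) {
            lastFailure = e;
            // Simple linear backoff between attempts; a production poller might use
            // exponential backoff with jitter instead.
            Thread.sleep(1000L * attempt);
        }
    }
    throw lastFailure;
}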

Example 3 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.

In the class WikibaseRepository, method delete:

/**
 * Delete entity from repository.
 * @param entityId id of the entity to delete
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
public void delete(String entityId) throws RetryableException {
    URI uri = uris.delete(entityId);
    log.debug("Deleting entity {} using {}", entityId, uri);
    try {
        DeleteResponse result = checkApi(getJson(postWithToken(uri), DeleteResponse.class));
        log.debug("Deleted: {}", result);
    } catch (IOException e) {
        throw new RetryableException("Error deleting page", e);
    }
}
Also used : RetryableException(org.wikidata.query.rdf.tool.exception.RetryableException) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) URI(java.net.URI)
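
delete converts the low-level IOException into a RetryableException, leaving the retry policy to the caller. One hedged possibility, illustrative only (the retryQueue parameter and the method name are invented; the sketch assumes java.util.Queue is imported), is to requeue the id for a later attempt instead of looping in place:

// Illustrative caller: requeue the entity id when the delete fails transiently.
void deleteOrRequeue(WikibaseRepository repository, Queue<String> retryQueue, String entityId) {
    try {
        repository.delete(entityId);
    } catch (RetryableException e) {
        // The delete failed in a way that is safe to retry; put the id back on a
        // work queue and try again later instead of blocking here.
        retryQueue.add(entityId);
    }
}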

Example 4 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.

In the class WikibaseRepository, method setLabel:

/**
 * Edits or creates a page by setting a label. Used for testing.
 *
 * @param entityId id of the entity; if null, a new entity is created
 * @param type type of entity to create or edit
 * @param label label of the page to create
 * @param language language of the label to add
 * @return the entityId
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
@SuppressWarnings("unchecked")
public String setLabel(String entityId, String type, String label, String language) throws RetryableException {
    String datatype = type.equals("property") ? "string" : null;
    EditRequest data = new EditRequest(datatype, ImmutableMap.of(language, new Label(language, label)));
    try {
        URI uri = uris.edit(entityId, type, mapper.writeValueAsString(data));
        log.debug("Editing entity using {}", uri);
        EditResponse result = checkApi(getJson(postWithToken(uri), EditResponse.class));
        return result.getEntity().getId();
    } catch (IOException e) {
        throw new RetryableException("Error adding page", e);
    }
}
Also used : RetryableException(org.wikidata.query.rdf.tool.exception.RetryableException) Label(org.wikidata.query.rdf.tool.wikibase.EditRequest.Label) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) URI(java.net.URI)
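
Because setLabel is documented as a test helper, a hedged usage sketch follows; the method name, label text, language code, and the "item" type string are illustrative assumptions. Passing null as the entityId creates a new entity, and the type argument decides whether an item or a property (with a string datatype) is created:

// Illustrative test setup around the setLabel helper shown above.
static void createTestEntities(WikibaseRepository repository) throws RetryableException {
    // Passing a null id creates a brand new entity.
    String itemId = repository.setLabel(null, "item", "integration test item", "en");
    // For type "property", setLabel supplies the "string" datatype automatically.
    repository.setLabel(null, "property", "integration test property", "en");
    // Passing an existing id edits that entity instead of creating a new one.
    repository.setLabel(itemId, "item", "integration test item (renamed)", "en");
}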

Example 5 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in project wikidata-query-rdf by wikimedia.

In the class WikibaseRepository, method firstEntityIdForLabelStartingWith:

/**
 * Get the first id with the provided label in the provided language.
 *
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
public String firstEntityIdForLabelStartingWith(String label, String language, String type) throws RetryableException {
    URI uri = uris.searchForLabel(label, language, type);
    log.debug("Searching for entity using {}", uri);
    try {
        SearchResponse result = checkApi(getJson(new HttpGet(uri), SearchResponse.class));
        List<SearchResult> resultList = result.getSearch();
        if (resultList.isEmpty()) {
            return null;
        }
        return resultList.get(0).getId();
    } catch (IOException e) {
        throw new RetryableException("Error searching for page", e);
    }
}
Also used : RetryableException(org.wikidata.query.rdf.tool.exception.RetryableException) HttpGet(org.apache.http.client.methods.HttpGet) SearchResult(org.wikidata.query.rdf.tool.wikibase.SearchResponse.SearchResult) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) URI(java.net.URI)
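
Combining the last two examples, a hedged cleanup sketch for tests: look up the entity that was created by its label and delete it. The method name, the "item" type string, and the "en" language code are illustrative assumptions; the signatures of firstEntityIdForLabelStartingWith and delete are as shown above.

// Illustrative cleanup: resolve the test entity by label, then delete it.
static void deleteEntityByLabel(WikibaseRepository repository, String labelPrefix) throws RetryableException {
    String entityId = repository.firstEntityIdForLabelStartingWith(labelPrefix, "en", "item");
    if (entityId == null) {
        // No entity with that label prefix exists, so there is nothing to clean up.
        return;
    }
    repository.delete(entityId);
}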

Aggregations

RetryableException (org.wikidata.query.rdf.tool.exception.RetryableException): 9 usages
IOException (java.io.IOException): 5 usages
InterruptedIOException (java.io.InterruptedIOException): 5 usages
URI (java.net.URI): 5 usages
Instant (java.time.Instant): 3 usages
HttpGet (org.apache.http.client.methods.HttpGet): 3 usages
SocketException (java.net.SocketException): 2 usages
UnknownHostException (java.net.UnknownHostException): 2 usages
ExecutionException (java.util.concurrent.ExecutionException): 2 usages
ChangeEvent (org.wikidata.query.rdf.tool.change.events.ChangeEvent): 2 usages
ContainedException (org.wikidata.query.rdf.tool.exception.ContainedException): 2 usages
JsonParseException (com.fasterxml.jackson.core.JsonParseException): 1 usage
JsonMappingException (com.fasterxml.jackson.databind.JsonMappingException): 1 usage
ImmutableList (com.google.common.collect.ImmutableList): 1 usage
Maps (com.google.common.collect.Maps): 1 usage
InputStreamReader (java.io.InputStreamReader): 1 usage
ArrayList (java.util.ArrayList): 1 usage
Arrays (java.util.Arrays): 1 usage
Collection (java.util.Collection): 1 usage
Collections (java.util.Collections): 1 usage