Example 6 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in the wikidata-query-rdf project by Wikimedia.

From the class WikibaseRepository, the method fetchRdfForEntity:

/**
 * Fetch the RDF for some entity.
 *
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
public Collection<Statement> fetchRdfForEntity(String entityId) throws RetryableException {
    // TODO handle ?flavor=dump or whatever parameters we need
    URI uri = uris.rdf(entityId);
    long start = System.currentTimeMillis();
    log.debug("Fetching rdf from {}", uri);
    RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
    StatementCollector collector = new StatementCollector();
    parser.setRDFHandler(new NormalizingRdfHandler(collector));
    HttpGet request = new HttpGet(uri);
    request.setConfig(configWithTimeout);
    try {
        try (CloseableHttpResponse response = client.execute(request)) {
            if (response.getStatusLine().getStatusCode() == 404) {
                // A deleted or nonexistent page
                return Collections.emptyList();
            }
            if (response.getStatusLine().getStatusCode() >= 300) {
                throw new ContainedException("Unexpected status code fetching RDF for " + uri + ": " + response.getStatusLine().getStatusCode());
            }
            parser.parse(new InputStreamReader(response.getEntity().getContent(), Charsets.UTF_8), uri.toString());
        }
    } catch (UnknownHostException | SocketException | SSLHandshakeException e) {
        // We want to bail on this, since it happens to be sticky for some reason
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RetryableException("Error fetching RDF for " + uri, e);
    } catch (RDFParseException | RDFHandlerException e) {
        throw new ContainedException("RDF parsing error for " + uri, e);
    }
    log.debug("Done in {} ms", System.currentTimeMillis() - start);
    return collector.getStatements();
}
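
The method separates transient failures (RetryableException, worth retrying) from permanent ones (ContainedException, which must be unchecked here, since the method throws it without declaring it). A minimal caller sketch, assuming hypothetical names such as MAX_RETRIES, repository, and log, might look like this:

// Hypothetical caller, not part of WikibaseRepository: retry transient
// failures a bounded number of times and give up on permanent ones.
Collection<Statement> fetchWithRetry(WikibaseRepository repository, String entityId) {
    for (int attempt = 0; attempt < MAX_RETRIES; attempt++) {
        try {
            return repository.fetchRdfForEntity(entityId);
        } catch (RetryableException e) {
            log.warn("Transient failure fetching RDF for {}, retrying", entityId, e);
        } catch (ContainedException e) {
            // A bad status code or unparseable RDF will not fix itself on retry
            log.error("Permanent failure fetching RDF for {}", entityId, e);
            break;
        }
    }
    return Collections.emptyList();
}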

Example 7 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in the wikidata-query-rdf project by Wikimedia.

From the class KafkaPollerUnitTest, the test method topicSubscribe:

@Test
public void topicSubscribe() throws RetryableException {
    Instant startTime = Instant.ofEpochMilli(BEGIN_DATE);
    Collection<String> topics = ImmutableList.of("topictest", "othertopic");
    ImmutableList<PartitionInfo> twoParts = ImmutableList.of(makePartitionInfo(0), makePartitionInfo(1));
    ArgumentCaptor<String> partitionArgs = ArgumentCaptor.forClass(String.class);
    when(consumer.partitionsFor(partitionArgs.capture())).thenReturn(twoParts);
    ArgumentCaptor<Collection<TopicPartition>> assignArgs = ArgumentCaptor.forClass((Class) Collection.class);
    doNothing().when(consumer).assign(assignArgs.capture());
    when(consumer.offsetsForTimes(any())).thenAnswer(i -> {
        Map<TopicPartition, Long> map = i.getArgumentAt(0, Map.class);
        // Check that timestamps are OK
        map.forEach((k, v) -> assertThat(v, equalTo(BEGIN_DATE)));
        Map<TopicPartition, OffsetAndTimestamp> out = Maps.newHashMapWithExpectedSize(map.size());
        // Make offset 1 for first partition and nothing for second
        map.forEach((k, v) -> out.put(k, k.partition() == 0 ? new OffsetAndTimestamp(1000, v) : null));
        // Using forEach here because collect() can't handle nulls
        return out;
    });
    ArgumentCaptor<TopicPartition> seekArgs = ArgumentCaptor.forClass(TopicPartition.class);
    doNothing().when(consumer).seek(seekArgs.capture(), eq(1000L));
    ArgumentCaptor<Collection<TopicPartition>> seekToEndArgs = ArgumentCaptor.forClass((Class) Collection.class);
    doNothing().when(consumer).seekToEnd(seekToEndArgs.capture());
    when(consumer.poll(anyLong())).thenReturn(EMPTY_CHANGES);
    KafkaPoller poller = new KafkaPoller(consumer, uris, startTime, BATCH_SIZE, topics);
    Batch batch = poller.firstBatch();
    // We get partitions for both topics
    verify(consumer, times(2)).partitionsFor(any());
    assertThat(partitionArgs.getAllValues(), contains("topictest", "othertopic"));
    // We assign to 4 partitions - 2 topics x 2 partitions
    verify(consumer, times(1)).assign(any());
    assertThat(assignArgs.getValue(), hasSize(4));
    // Calling seek on both topics, partition 0
    verify(consumer, times(2)).seek(any(), anyLong());
    assertThat(seekArgs.getAllValues().stream().map(p -> p.topic()).toArray(), arrayContainingInAnyOrder("topictest", "othertopic"));
    Collection<String> sTopics = seekArgs.getAllValues().stream().map(tp -> tp.topic()).collect(Collectors.toList());
    assertThat(sTopics, hasSize(2));
    assertThat(sTopics, containsInAnyOrder("topictest", "othertopic"));
    Collection<Integer> sPartitions = seekArgs.getAllValues().stream().map(tp -> tp.partition()).distinct().collect(Collectors.toList());
    assertThat(sPartitions, hasSize(1));
    assertThat(sPartitions, contains(0));
    // Calling seekToEnd on both topics, partition 1
    verify(consumer, times(2)).seekToEnd(any());
    Collection<String> sbTopics = seekToEndArgs.getAllValues().stream().flatMap(c -> c.stream()).map(tp -> tp.topic()).collect(Collectors.toList());
    assertThat(sbTopics, hasSize(2));
    assertThat(sbTopics, contains("topictest", "othertopic"));
    Collection<Integer> sbPartitions = seekToEndArgs.getAllValues().stream().flatMap(c -> c.stream()).map(tp -> tp.partition()).distinct().collect(Collectors.toList());
    assertThat(sbPartitions, hasSize(1));
    assertThat(sbPartitions, contains(1));
    verify(consumer, times(1)).offsetsForTimes(any());
}
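
The seek/seekToEnd split that the test verifies follows from the offsetsForTimes contract: a non-null OffsetAndTimestamp means Kafka resolved an offset for the requested timestamp, null means it could not. A hedged sketch of that logic (the helper name is made up, and the real KafkaPoller may differ in detail):

// Sketch only: seek partitions with a resolved offset; fall back to the end
// of the log for the rest, one partition at a time (matching the times(2)
// seekToEnd verification above).
static void seekToTimestamps(Consumer<String, String> consumer, Map<TopicPartition, Long> timestamps) {
    Map<TopicPartition, OffsetAndTimestamp> found = consumer.offsetsForTimes(timestamps);
    found.forEach((partition, offset) -> {
        if (offset != null) {
            consumer.seek(partition, offset.offset());        // e.g. partition 0 -> offset 1000
        } else {
            consumer.seekToEnd(ImmutableList.of(partition));  // e.g. partition 1 -> end of log
        }
    });
}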

Example 8 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in the wikidata-query-rdf project by Wikimedia.

From the class Updater, the method run:

@Override
public void run() {
    B batch = null;
    do {
        try {
            batch = changeSource.firstBatch();
        } catch (RetryableException e) {
            log.warn("Retryable error fetching first batch.  Retrying.", e);
        }
    } while (batch == null);
    log.debug("{} changes in batch", batch.changes().size());
    Instant oldDate = null;
    while (!currentThread().isInterrupted()) {
        try {
            handleChanges(batch.changes());
            Instant leftOffDate = batch.leftOffDate();
            if (leftOffDate != null) {
                /*
                 * Back one second because the resolution on our poll isn't
                 * super good, and because it's not a big deal to recheck if
                 * we have some updates.
                 */
                leftOffDate = leftOffDate.minusSeconds(1);
                // Do not update repo with the same date
                if (oldDate == null || !oldDate.equals(leftOffDate)) {
                    syncDate(leftOffDate);
                    oldDate = leftOffDate;
                }
            }
            // TODO wrap all retry-able exceptions in a special exception
            batchAdvanced.mark(batch.advanced());
            log.info("Polled up to {} at {} updates per second and {} {} per second", batch.leftOffHuman(), meterReport(updateMeter), meterReport(batchAdvanced), batch.advancedUnits());
            if (batch.last()) {
                return;
            }
            batch = nextBatch(batch);
        } catch (InterruptedException e) {
            currentThread().interrupt();
        } catch (ExecutionException e) {
            log.error("Syncing encountered a fatal exception", e);
            break;
        }
    }
}
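
The retry-until-success loop at the top of run() is a reusable pattern; here is a hedged sketch of how it could be factored out (RetryableTask and retryUntilSuccess are assumed names, not part of Updater):

// Hypothetical helper: keep running a task until it stops throwing
// RetryableException. Updater.run() retries immediately; the sleep here
// is an assumed backoff, not the original behavior.
interface RetryableTask<T> {
    T run() throws RetryableException;
}

static <T> T retryUntilSuccess(RetryableTask<T> task) throws InterruptedException {
    while (true) {
        try {
            return task.run();
        } catch (RetryableException e) {
            log.warn("Retryable error, retrying", e);
            Thread.sleep(1000);
        }
    }
}

// Usage, mirroring the loop above:
// B batch = retryUntilSuccess(changeSource::firstBatch);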

Example 9 with RetryableException

Use of org.wikidata.query.rdf.tool.exception.RetryableException in the wikidata-query-rdf project by Wikimedia.

From the class WikibaseRepository, the method fetchRecentChanges:

/**
 * Fetch recent changes, starting from nextStartTime or, if lastContinue is
 * non-null, continuing from lastContinue, which is the preferred way to use
 * the MediaWiki API. See RecentChangesPoller for how to poll these.
 *
 * @param nextStartTime if lastContinue is null then this is the start time
 *            of the query
 * @param lastContinue continuation object from the last batch, or null
 * @param batchSize the number of recent changes to fetch
 * @return the result of the query
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
public RecentChangeResponse fetchRecentChanges(Instant nextStartTime, Continue lastContinue, int batchSize) throws RetryableException {
    URI uri = uris.recentChanges(nextStartTime, lastContinue, batchSize);
    log.debug("Polling for changes from {}", uri);
    HttpGet request = new HttpGet(uri);
    request.setConfig(configWithTimeout);
    try {
        return checkApi(getJson(request, RecentChangeResponse.class));
    } catch (UnknownHostException | SocketException e) {
        // We want to bail on this, since it happens to be sticky for some reason
        throw new RuntimeException(e);
    } catch (JsonParseException | JsonMappingException e) {
        // An invalid response will probably not fix itself with a retry, so let's bail
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RetryableException("Error fetching recent changes", e);
    }
}
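
Putting the continuation contract together, a caller could drive fetchRecentChanges in a loop like this sketch; the getContinue() accessor and the five-minute start window are assumptions, and the real driver is RecentChangesPoller:

// Hypothetical polling loop; getContinue() is an assumed accessor on
// RecentChangeResponse.
Continue lastContinue = null;
Instant nextStartTime = Instant.now().minus(Duration.ofMinutes(5));
while (!Thread.currentThread().isInterrupted()) {
    try {
        RecentChangeResponse response = repository.fetchRecentChanges(nextStartTime, lastContinue, 100);
        lastContinue = response.getContinue();
        // ... hand the changes to whatever consumes them ...
    } catch (RetryableException e) {
        // Transient communication error: loop around and try again
    }
}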
