Search in sources:

Example 1 with RecentChangeResponse

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse in project wikidata-query-rdf by wikimedia.

the class RecentChangesPoller method batch.

/**
 * Parse a batch from the api result.
 *
 * <p>Every fetched change advances the next start time, whether or not it
 * is kept. Changes are dropped when they have no namespace, are outside the
 * entity namespaces, have an invalid title, or carry an rcid that is already
 * recorded in {@code seenIDs}. The remaining changes are de-duplicated by
 * entity id.
 *
 * <p>The change type is compared null-safely: a record without a type is
 * handled as an ordinary revision instead of raising a
 * {@link NullPointerException} after {@code seenIDs} has already been updated
 * for this and earlier records of the same response.
 *
 * @param lastNextStartTime start time handed to {@code fetchRecentChanges};
 *        also the baseline from which the advance of this batch is measured
 * @param lastBatch previous batch, handed to {@code fetchRecentChanges}
 * @return batch holding the de-duplicated changes, the number of milliseconds
 *         advanced, the next start time and the continuation of the response
 * @throws RetryableException on parse failure
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch batch(Instant lastNextStartTime, Batch lastBatch) throws RetryableException {
    RecentChangeResponse recentChanges = fetchRecentChanges(lastNextStartTime, lastBatch);
    // Using LinkedHashMap here so that changes come out sorted by order of arrival
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    Continue nextContinue = recentChanges.getContinue();
    Instant nextStartTime = lastNextStartTime;
    List<RecentChange> result = recentChanges.getQuery().getRecentChanges();
    for (RecentChange rc : result) {
        // Does not matter if the change matters for us or not, it
        // still advances the time since we've seen it.
        nextStartTime = Utils.max(nextStartTime, rc.getTimestamp());
        if (rc.getNs() == null) {
            log.warn("Skipping change without a namespace:  {}", rc);
            continue;
        }
        if (!wikibase.isEntityNamespace(rc.getNs())) {
            log.info("Skipping change in irrelevant namespace:  {}", rc);
            continue;
        }
        if (!wikibase.isValidEntity(rc.getTitle())) {
            log.info("Skipping change with bogus title:  {}", rc.getTitle());
            continue;
        }
        if (seenIDs.containsKey(rc.getRcId())) {
            // This change was in the last batch
            log.debug("Skipping repeated change with rcid {}", rc.getRcId());
            continue;
        }
        seenIDs.put(rc.getRcId(), TRUE);
        // Looks like we can not rely on changes appearing in order in RecentChanges,
        // so we have to take them all and let SPARQL sort out the dupes.
        Change change;
        // Constant-first comparison: a missing type must not throw here, because
        // seenIDs was already updated above.
        if ("log".equals(rc.getType()) && rc.getRevId() == 0) {
            // Deletes should always be processed, so put negative revision
            change = new Change(rc.getTitle(), -1L, rc.getTimestamp(), rc.getRcId());
        } else {
            change = new Change(rc.getTitle(), rc.getRevId(), rc.getTimestamp(), rc.getRcId());
        }
        /*
         * Remove duplicate changes by title keeping the latest
         * revision. Note that negative revision means always update, so those
         * are kept.
         */
        Change dupe = changesByTitle.put(change.entityId(), change);
        if (dupe != null && (dupe.revision() > change.revision() || dupe.revision() < 0)) {
            // need to remove so that order will be correct
            // (put() on an existing key keeps that key's position in a
            // LinkedHashMap; remove + put moves the entry to the end)
            changesByTitle.remove(change.entityId());
            changesByTitle.put(change.entityId(), dupe);
        }
    }
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    // Backoff overflow is when:
    // a. We use backoff
    // b. We got full batch of changes.
    // c. None of those were new changes.
    // In this case, sleeping and trying again is obviously useless.
    final boolean backoffOverflow = useBackoff && changes.isEmpty() && result.size() >= batchSize;
    if (backoffOverflow) {
        // We have a problem here - due to backoff, we did not fetch any new items
        // Try to advance one second, even though we risk to lose a change - in hope
        // that trailing poller will pick them up.
        nextStartTime = nextStartTime.plusSeconds(1);
        log.info("Backoff overflow, advancing next time to {}", nextStartTime);
    }
    if (!changes.isEmpty()) {
        log.info("Got {} changes, from {} to {}", changes.size(), changes.get(0), changes.get(changes.size() - 1));
    } else {
        log.info("Got no real changes");
    }
    // Show the user the polled time - one second because we can't
    // be sure we got the whole second
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextStartTime);
    Batch batch = new Batch(changes, advanced, nextStartTime.minusSeconds(1).toString(), nextStartTime, nextContinue);
    if (backoffOverflow && nextContinue != null) {
        // We will not sleep if continue is provided.
        log.info("Got only old changes, next is: {}", nextContinue);
        batch.hasChanges(true);
    }
    return batch;
}
Also used : Instant(java.time.Instant) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) LinkedHashMap(java.util.LinkedHashMap) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse)

Example 2 with RecentChangeResponse

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse in project wikidata-query-rdf by wikimedia.

the class RecentChangesPollerUnitTest method continuePoll.

/**
 * Check that continuing works: the continuation delivered with the first
 * batch must be handed back to the repository when the next batch is
 * requested.
 *
 * @throws RetryableException propagated from the poller calls
 */
@Test
@SuppressWarnings("unchecked")
public void continuePoll() throws RetryableException {
    // Start ten days in the past so that backoff is out of the picture
    final Instant startTime = Instant.now().minus(10, ChronoUnit.DAYS);
    final int batchSize = 10;
    final Instant revDate = startTime.plusSeconds(20);
    final String expectedDate = revDate.toString();

    // Response fixture: two edits plus a continuation marker
    final WikibaseApiError noError = null;
    final Continue aContinue = new Continue(OUTPUT_DATE_FORMATTER.format(revDate) + "|8", "-||");
    final List<RecentChange> apiChanges = new ArrayList<>();
    apiChanges.add(new RecentChange(0L, "Q666", revDate, 1L, 1L, "edit"));
    apiChanges.add(new RecentChange(0L, "Q667", revDate, 7L, 7L, "edit"));
    final RecentChangeResponse response = new RecentChangeResponse(noError, aContinue, new Query(apiChanges));

    firstBatchReturns(startTime, response);
    final RecentChangesPoller poller = new RecentChangesPoller(repository, startTime, batchSize, new MetricRegistry());

    final Batch firstBatch = poller.firstBatch();
    assertThat(firstBatch.changes()).hasSize(2);
    assertThat(firstBatch.changes().get(1).offset()).isEqualTo(7);
    assertThat(firstBatch.leftOffDate().toString()).isEqualTo(expectedDate);
    assertThat(firstBatch.getLastContinue()).isEqualTo(aContinue);

    // The follow-up call gets the same response object, now emptied of changes
    apiChanges.clear();
    final ArgumentCaptor<Instant> dateCaptor = ArgumentCaptor.forClass(Instant.class);
    final ArgumentCaptor<Continue> continueCaptor = ArgumentCaptor.forClass(Continue.class);
    when(repository.fetchRecentChanges(dateCaptor.capture(), continueCaptor.capture(), eq(batchSize))).thenReturn(response);

    // check that poller passes the continue object to the next batch
    final Batch secondBatch = poller.nextBatch(firstBatch);
    assertThat(secondBatch.changes()).hasSize(0);
    assertThat(dateCaptor.getValue().toString()).isEqualTo(expectedDate);
    assertThat(continueCaptor.getValue()).isEqualTo(aContinue);
}
Also used : Query(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query) Instant(java.time.Instant) MetricRegistry(com.codahale.metrics.MetricRegistry) ArrayList(java.util.ArrayList) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) WikibaseApiError(org.wikidata.query.rdf.tool.wikibase.WikibaseApiError) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Batch(org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse) Test(org.junit.Test)

Example 3 with RecentChangeResponse

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse in project wikidata-query-rdf by wikimedia.

the class RecentChangesPollerUnitTest method dedups.

/**
 * Check deduplication.
 * Feeds the poller 20 changes that cover 10 entity ids (two changes per
 * entity) and expects one change per entity, carrying the higher of the
 * two revisions.
 *
 * @throws RetryableException propagated from the poller call
 */
@Test
@SuppressWarnings("unchecked")
public void dedups() throws RetryableException {
    final Instant startTime = Instant.now();

    // Wikibase result with duplicates: revisions 2k and 2k+1 both belong to Qk
    final List<RecentChange> apiChanges = new ArrayList<>();
    for (long rev = 0; rev < 20; rev++) {
        apiChanges.add(new RecentChange(0L, "Q" + (rev / 2), Instant.now(), rev, rev, "edit"));
    }
    final WikibaseApiError noError = null;
    final Continue noContinue = null;
    final RecentChangeResponse response = new RecentChangeResponse(noError, noContinue, new Query(apiChanges));

    firstBatchReturns(startTime, response);
    final RecentChangesPoller poller = new RecentChangesPoller(repository, startTime, batchSize, new MetricRegistry());
    final Batch batch = poller.firstBatch();
    assertThat(batch.changes()).hasSize(10);

    // Sort a copy by entity id so that the assertions do not depend on batch order
    final List<Change> byEntity = new ArrayList<>(batch.changes());
    byEntity.sort(Comparator.comparing(Change::entityId));
    for (int k = 0; k < 10; k++) {
        final Change survivor = byEntity.get(k);
        assertThat(survivor.entityId()).isEqualTo("Q" + k);
        assertThat(survivor.revision()).isEqualTo(2 * k + 1);
    }
}
Also used : Query(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query) Instant(java.time.Instant) MetricRegistry(com.codahale.metrics.MetricRegistry) ArrayList(java.util.ArrayList) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) WikibaseApiError(org.wikidata.query.rdf.tool.wikibase.WikibaseApiError) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Batch(org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse) Test(org.junit.Test)

Example 4 with RecentChangeResponse

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse in project wikidata-query-rdf by wikimedia.

the class RecentChangesPollerUnitTest method backoffOverflow.

/**
 * Backoff overflow check.
 * With a batch size of one, the second poll receives the same single change
 * again, so it yields nothing new; the left-off date must then have moved
 * forward by exactly one second.
 *
 * @throws RetryableException propagated from the poller calls
 */
@Test
public void backoffOverflow() throws RetryableException {
    final Instant startTime = Instant.now();
    batchSize = 1;
    final RecentChangesPoller poller = new RecentChangesPoller(repository, startTime, batchSize, new MetricRegistry());

    // Response fixture: a single edit stamped with the start time
    final WikibaseApiError noError = null;
    final Continue noContinue = null;
    final List<RecentChange> apiChanges = new ArrayList<>();
    apiChanges.add(new RecentChange(0L, "Q424242", startTime, 42L, 42L, "edit"));
    final RecentChangeResponse response = new RecentChangeResponse(noError, noContinue, new Query(apiChanges));
    firstBatchReturns(startTime, response);

    final Batch firstBatch = poller.firstBatch();
    assertThat(firstBatch.changes()).hasSize(1);
    assertThat(firstBatch.leftOffDate()).isEqualTo(startTime);

    final Batch secondBatch = poller.nextBatch(firstBatch);
    assertThat(secondBatch.changes()).hasSize(0);
    final Instant leftOff = secondBatch.leftOffDate();
    assertThat(startTime).isBefore(leftOff);
    assertThat(startTime.plusSeconds(1)).isEqualTo(leftOff);
}
Also used : WikibaseApiError(org.wikidata.query.rdf.tool.wikibase.WikibaseApiError) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Query(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query) Batch(org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch) Instant(java.time.Instant) MetricRegistry(com.codahale.metrics.MetricRegistry) ArrayList(java.util.ArrayList) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse) Test(org.junit.Test)

Example 5 with RecentChangeResponse

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse in project wikidata-query-rdf by wikimedia.

the class RecentChangesPollerUnitTest method backoffTime.

/**
 * Check that recent requests use backoff.
 * The time handed to {@code fetchRecentChangesByTime} is captured on the
 * first and on the second poll and must lie between 7 and 20 seconds before
 * the respective reference time.
 *
 * @throws RetryableException propagated from the poller calls
 */
@Test
@SuppressWarnings("unchecked")
public void backoffTime() throws RetryableException {
    final Instant startTime = Instant.now();
    final RecentChangesPoller poller = new RecentChangesPoller(repository, startTime, batchSize, new MetricRegistry());

    // Response fixture: one edit stamped 20 seconds after the start time
    final Instant nextStartTime = startTime.plusSeconds(20);
    final WikibaseApiError noError = null;
    final Continue noContinue = null;
    final List<RecentChange> apiChanges = new ArrayList<>();
    apiChanges.add(new RecentChange(0L, "Q424242", nextStartTime, 42L, 42L, "edit"));
    final RecentChangeResponse response = new RecentChangeResponse(noError, noContinue, new Query(apiChanges));

    final ArgumentCaptor<Instant> requestedTime = ArgumentCaptor.forClass(Instant.class);
    when(repository.fetchRecentChangesByTime(requestedTime.capture(), eq(batchSize))).thenReturn(response);
    when(repository.isEntityNamespace(0)).thenReturn(true);
    when(repository.isValidEntity(any(String.class))).thenReturn(true);

    final Batch firstBatch = poller.firstBatch();
    // Ensure we backed off at least 7 seconds but no more than 20
    final Instant firstRequest = requestedTime.getValue();
    assertThat(firstRequest).isBefore(startTime.minusSeconds(7));
    assertThat(firstRequest).isAfter(startTime.minusSeconds(20));

    // Verify that backoff still works on the second call
    final Batch secondBatch = poller.nextBatch(firstBatch);
    // verify we're still using fetchRecentChangesByTime
    assertThat(secondBatch).isNotNull();
    final Instant secondRequest = requestedTime.getValue();
    assertThat(secondRequest).isBefore(nextStartTime.minusSeconds(7));
    assertThat(secondRequest).isAfter(nextStartTime.minusSeconds(20));
}
Also used : Query(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query) Instant(java.time.Instant) MetricRegistry(com.codahale.metrics.MetricRegistry) ArrayList(java.util.ArrayList) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) WikibaseApiError(org.wikidata.query.rdf.tool.wikibase.WikibaseApiError) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Batch(org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse) Test(org.junit.Test)

Aggregations

RecentChangeResponse (org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse)8 Instant (java.time.Instant)7 Continue (org.wikidata.query.rdf.tool.wikibase.Continue)7 MetricRegistry (com.codahale.metrics.MetricRegistry)6 Batch (org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch)6 Query (org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query)6 RecentChange (org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange)6 WikibaseApiError (org.wikidata.query.rdf.tool.wikibase.WikibaseApiError)6 ArrayList (java.util.ArrayList)5 Test (org.junit.Test)5 Context (com.codahale.metrics.Timer.Context)1 LinkedHashMap (java.util.LinkedHashMap)1