Search in sources :

Example 1 with RecentChange

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange in project wikidata-query-rdf by wikimedia.

the class WikibaseRepositoryIntegrationTest method editShowsUpInRecentChangesTestCase.

/**
 * Shared body for the "edit shows up in recent changes" checks.
 *
 * <p>Looks up the first entity whose English label starts with {@code label}, asserts that a
 * recent change with the matching page title exists and carries a revision id, and then asserts
 * that the RDF fetched for the entity contains at least one statement whose subject is the
 * entity URI.
 *
 * @param label prefix of the label used to locate the entity
 * @param type entity type passed to the lookup; the value {@code "property"} makes the expected
 *     page title carry the {@code "Property:"} prefix
 */
private void editShowsUpInRecentChangesTestCase(String label, String type) throws RetryableException, ContainedException, IOException, URISyntaxException {
    String entityId = firstEntityIdForLabelStartingWith(baseUri, label, "en", type);
    List<RecentChange> recentChanges = getRecentChanges(START_TIME.minusSeconds(10), 10);

    // Properties are listed under a prefixed page title, everything else under the bare id.
    String expectedTitle = type.equals("property") ? "Property:" + entityId : entityId;

    boolean pageListed = false;
    for (RecentChange candidate : recentChanges) {
        if (!candidate.getTitle().equals(expectedTitle)) {
            continue;
        }
        // Only the first matching change is inspected.
        assertNotNull(candidate.getRevId());
        pageListed = true;
        break;
    }
    assertTrue("Didn't find new page in recent changes", pageListed);

    Collection<Statement> rdf = repo.get().fetchRdfForEntity(entityId);
    boolean entityDescribed = false;
    for (Statement triple : rdf) {
        entityDescribed = triple.getSubject().stringValue().equals(uris.entityIdToURI(entityId));
        if (entityDescribed) {
            break;
        }
    }
    assertTrue("Didn't find entity information in rdf", entityDescribed);
}
Also used : RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Statement(org.openrdf.model.Statement)

Example 2 with RecentChange

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange in project wikidata-query-rdf by wikimedia.

the class RecentChangesPoller method batch.

/**
 * Build a batch out of the recent changes api result.
 *
 * <p>Every returned change advances the next start time, whether or not it is kept. Changes are
 * dropped when they have no namespace, are outside the entity namespaces, have an invalid title,
 * or carry an rcid already recorded in {@code seenIDs}. The remaining changes are de-duplicated
 * by entity id.
 *
 * @param lastNextStartTime start time the previous poll ended on
 * @param lastBatch previous batch, handed to the fetch call
 * @return the batch of de-duplicated changes with the advanced start time and continuation
 * @throws RetryableException on parse failure
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch batch(Instant lastNextStartTime, Batch lastBatch) throws RetryableException {
    RecentChangeResponse response = fetchRecentChanges(lastNextStartTime, lastBatch);
    // LinkedHashMap, so the changes come out in order of arrival.
    Map<String, Change> keptByEntity = new LinkedHashMap<>();
    Continue continuation = response.getContinue();
    List<RecentChange> fetched = response.getQuery().getRecentChanges();
    Instant newestSeen = lastNextStartTime;

    for (RecentChange entry : fetched) {
        // Seeing a change moves the clock forward even when the change
        // itself turns out to be irrelevant below.
        newestSeen = Utils.max(newestSeen, entry.getTimestamp());

        if (entry.getNs() == null) {
            log.warn("Skipping change without a namespace:  {}", entry);
            continue;
        }
        if (!wikibase.isEntityNamespace(entry.getNs())) {
            log.info("Skipping change in irrelevant namespace:  {}", entry);
            continue;
        }
        if (!wikibase.isValidEntity(entry.getTitle())) {
            log.info("Skipping change with bogus title:  {}", entry.getTitle());
            continue;
        }
        if (seenIDs.containsKey(entry.getRcId())) {
            // Already handled as part of the previous batch.
            log.debug("Skipping repeated change with rcid {}", entry.getRcId());
            continue;
        }
        seenIDs.put(entry.getRcId(), TRUE);

        // RecentChanges does not guarantee ordering, so everything is taken
        // and SPARQL is left to sort out the dupes.
        // A "log" entry with revision 0 is a delete: it gets a negative
        // revision so that it is always processed.
        boolean deletion = entry.getType().equals("log") && entry.getRevId() == 0;
        Change candidate = deletion
                ? new Change(entry.getTitle(), -1L, entry.getTimestamp(), entry.getRcId())
                : new Change(entry.getTitle(), entry.getRevId(), entry.getTimestamp(), entry.getRcId());

        /*
         * De-duplicate by entity id, keeping the latest revision. A negative
         * revision means "always update", so such an earlier entry wins.
         */
        String entityId = candidate.entityId();
        Change previous = keptByEntity.put(entityId, candidate);
        boolean previousWins = previous != null
                && (previous.revision() > candidate.revision() || previous.revision() < 0);
        if (previousWins) {
            // Remove before re-inserting so that the order will be correct.
            keptByEntity.remove(entityId);
            keptByEntity.put(entityId, previous);
        }
    }

    final ImmutableList<Change> kept = ImmutableList.copyOf(keptByEntity.values());
    // Backoff overflow means all of:
    //   a. backoff is in use,
    //   b. a full batch of changes came back,
    //   c. none of them was a new change.
    // Sleeping and retrying the same request cannot help in that situation.
    final boolean backoffOverflow = useBackoff && kept.isEmpty() && fetched.size() >= batchSize;
    if (backoffOverflow) {
        // Nothing new was fetched because of the backoff. Move one second
        // ahead, accepting the risk of losing a change, in the hope that
        // the trailing poller picks it up.
        newestSeen = newestSeen.plusSeconds(1);
        log.info("Backoff overflow, advancing next time to {}", newestSeen);
    }

    if (kept.isEmpty()) {
        log.info("Got no real changes");
    } else {
        log.info("Got {} changes, from {} to {}", kept.size(), kept.get(0), kept.get(kept.size() - 1));
    }

    // The time reported to the user is the polled time minus one second,
    // since the last second may not have been fetched completely.
    long advancedMillis = ChronoUnit.MILLIS.between(lastNextStartTime, newestSeen);
    String reportedTime = newestSeen.minusSeconds(1).toString();
    Batch built = new Batch(kept, advancedMillis, reportedTime, newestSeen, continuation);
    if (backoffOverflow && continuation != null) {
        // With a continue token available there will be no sleep.
        log.info("Got only old changes, next is: {}", continuation);
        built.hasChanges(true);
    }
    return built;
}
Also used : Instant(java.time.Instant) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) LinkedHashMap(java.util.LinkedHashMap) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse)

Example 3 with RecentChange

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange in project wikidata-query-rdf by wikimedia.

the class RecentChangesPollerUnitTest method continuePoll.

/**
 * Check that continuing works.
 *
 * <p>The first batch must expose the continue object from the response, and the following
 * {@code nextBatch} call must hand that same continue object and date to the repository.
 *
 * @throws RetryableException declared by the poller calls; not expected in this test
 */
@Test
@SuppressWarnings("unchecked")
public void continuePoll() throws RetryableException {
    int batchSize = 10;
    // An old start date takes backoff out of the picture.
    Instant startTime = Instant.now().minus(10, ChronoUnit.DAYS);
    Instant revDate = startTime.plusSeconds(20);
    String expectedDate = revDate.toString();

    // Canned response: two edits plus a continue object.
    List<RecentChange> recentChanges = new ArrayList<>();
    recentChanges.add(new RecentChange(0L, "Q666", revDate, 1L, 1L, "edit"));
    recentChanges.add(new RecentChange(0L, "Q667", revDate, 7L, 7L, "edit"));
    WikibaseApiError noError = null;
    Continue aContinue = new Continue(OUTPUT_DATE_FORMATTER.format(revDate) + "|8", "-||");
    RecentChangeResponse response = new RecentChangeResponse(noError, aContinue, new Query(recentChanges));
    firstBatchReturns(startTime, response);

    RecentChangesPoller poller = new RecentChangesPoller(repository, startTime, batchSize, new MetricRegistry());
    Batch first = poller.firstBatch();

    assertThat(first.changes()).hasSize(2);
    assertThat(first.changes().get(1).offset()).isEqualTo(7);
    assertThat(first.leftOffDate().toString()).isEqualTo(expectedDate);
    assertThat(first.getLastContinue()).isEqualTo(aContinue);

    // Empty the list handed to the query before stubbing the second fetch with the same response.
    recentChanges.clear();
    ArgumentCaptor<Instant> dateCaptor = ArgumentCaptor.forClass(Instant.class);
    ArgumentCaptor<Continue> continueCaptor = ArgumentCaptor.forClass(Continue.class);
    when(repository.fetchRecentChanges(dateCaptor.capture(), continueCaptor.capture(), eq(batchSize))).thenReturn(response);

    // The poller has to pass the continue object on to the next fetch.
    Batch second = poller.nextBatch(first);

    assertThat(second.changes()).hasSize(0);
    assertThat(dateCaptor.getValue().toString()).isEqualTo(expectedDate);
    assertThat(continueCaptor.getValue()).isEqualTo(aContinue);
}
Also used : Query(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query) Instant(java.time.Instant) MetricRegistry(com.codahale.metrics.MetricRegistry) ArrayList(java.util.ArrayList) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) WikibaseApiError(org.wikidata.query.rdf.tool.wikibase.WikibaseApiError) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Batch(org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse) Test(org.junit.Test)

Example 4 with RecentChange

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange in project wikidata-query-rdf by wikimedia.

the class RecentChangesPollerUnitTest method dedups.

/**
 * Check deduplication.
 *
 * <p>Feeds the poller 20 changes in which every entity id appears twice and checks that one
 * change per entity remains, namely the one with the higher revision.
 *
 * @throws RetryableException declared by the poller call; not expected in this test
 */
@Test
@SuppressWarnings("unchecked")
public void dedups() throws RetryableException {
    Instant startTime = Instant.now();

    // 20 entries spread over 10 Q ids: revisions 2k and 2k + 1 both belong to Qk.
    List<RecentChange> recentChanges = new ArrayList<>();
    for (long rev = 0; rev < 20; rev++) {
        recentChanges.add(new RecentChange(0L, "Q" + (rev / 2), Instant.now(), rev, rev, "edit"));
    }

    WikibaseApiError error = null;
    Continue aContinue = null;
    RecentChangeResponse response = new RecentChangeResponse(error, aContinue, new Query(recentChanges));
    firstBatchReturns(startTime, response);

    RecentChangesPoller poller = new RecentChangesPoller(repository, startTime, batchSize, new MetricRegistry());
    Batch batch = poller.firstBatch();
    assertThat(batch.changes()).hasSize(10);

    // Sort a copy by entity id so the expectations below do not depend on batch order.
    List<Change> byEntityId = new ArrayList<>(batch.changes());
    byEntityId.sort(Comparator.comparing(Change::entityId));

    int expectedIndex = 0;
    for (Change survivor : byEntityId) {
        assertThat(survivor.entityId()).isEqualTo("Q" + expectedIndex);
        // Of the two revisions per entity, the odd (higher) one must remain.
        assertThat(survivor.revision()).isEqualTo(2 * expectedIndex + 1);
        expectedIndex++;
    }
}
Also used : Query(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query) Instant(java.time.Instant) MetricRegistry(com.codahale.metrics.MetricRegistry) ArrayList(java.util.ArrayList) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Continue(org.wikidata.query.rdf.tool.wikibase.Continue) WikibaseApiError(org.wikidata.query.rdf.tool.wikibase.WikibaseApiError) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Batch(org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch) RecentChangeResponse(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse) Test(org.junit.Test)

Example 5 with RecentChange

use of org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange in project wikidata-query-rdf by wikimedia.

the class WikibaseRepositoryIntegrationTest method continueWorks.

/**
 * Checks that fetching recent changes from a later start time no longer returns the edit that
 * was found by the first fetch, while still returning a change for the same entity.
 */
@Test
public void continueWorks() throws ContainedException, InterruptedException, URISyntaxException, IOException, RetryableException {
    String entityId = firstEntityIdForLabelStartingWith(baseUri, "QueryTestItem", "en", "item");

    // Locate the first recent change that belongs to the entity.
    List<RecentChange> initialChanges = getRecentChanges(START_TIME.minusSeconds(10), 10);
    Change firstEdit = null;
    Long firstRevId = 0L;
    Long firstRcId = 0L;
    for (RecentChange rc : initialChanges) {
        if (!rc.getTitle().equals(entityId)) {
            continue;
        }
        firstRevId = rc.getRevId();
        firstRcId = rc.getRcId();
        firstEdit = new Change(rc.getTitle(), firstRevId, rc.getTimestamp(), firstRcId);
        break;
    }
    assertNotNull("Did not find the first edit", firstEdit);

    // Start one second after the first edit so that it falls in a different second.
    // NOTE(review): the original comment says "make new edit now", but no edit is made in
    // this method - presumably one is expected to exist already; confirm.
    List<RecentChange> laterChanges = getRecentChanges(firstEdit.timestamp().plusSeconds(1), 10);

    // The later result must mention the entity, and never with the old revision or rcid.
    boolean sawEntity = false;
    for (RecentChange rc : laterChanges) {
        if (!rc.getTitle().equals(entityId)) {
            continue;
        }
        assertNotEquals("Found old edit after continue: revid", firstRevId, rc.getRevId());
        assertNotEquals("Found old edit after continue: offset", firstRcId, rc.getRcId());
        sawEntity = true;
    }
    assertTrue("Did not find new edit", sawEntity);
}
Also used : RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) RecentChange(org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange) Change(org.wikidata.query.rdf.tool.change.Change) Test(org.junit.Test)

Aggregations

RecentChange (org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.RecentChange)9 Instant (java.time.Instant)7 Test (org.junit.Test)7 Continue (org.wikidata.query.rdf.tool.wikibase.Continue)6 RecentChangeResponse (org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse)6 MetricRegistry (com.codahale.metrics.MetricRegistry)5 ArrayList (java.util.ArrayList)5 Batch (org.wikidata.query.rdf.tool.change.RecentChangesPoller.Batch)5 Query (org.wikidata.query.rdf.tool.wikibase.RecentChangeResponse.Query)5 WikibaseApiError (org.wikidata.query.rdf.tool.wikibase.WikibaseApiError)5 LinkedHashMap (java.util.LinkedHashMap)1 Statement (org.openrdf.model.Statement)1 Change (org.wikidata.query.rdf.tool.change.Change)1