Use of org.wikidata.query.rdf.tool.wikibase.Continue in the project wikidata-query-rdf by Wikimedia.
From the class RecentChangesPoller, method batch().
/**
 * Parse a batch from the api result.
 *
 * Builds a deduplicated, arrival-ordered batch of changes from the recent
 * changes API response and advances the poll window to the newest change
 * timestamp seen, handling the "backoff overflow" case where backoff caused
 * a full batch of only already-seen changes.
 *
 * @param lastNextStartTime where the previous poll left off; the new window
 *        starts here and only moves forward
 * @param lastBatch the previously returned batch; passed through to
 *        fetchRecentChanges (presumably carries the continue token — see
 *        fetchRecentChanges for details)
 * @return the next batch, possibly with no changes
 * @throws RetryableException on parse failure
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity" })
private Batch batch(Instant lastNextStartTime, Batch lastBatch) throws RetryableException {
    RecentChangeResponse recentChanges = fetchRecentChanges(lastNextStartTime, lastBatch);
    // Using LinkedHashMap here so that changes came out sorted by order of arrival
    Map<String, Change> changesByTitle = new LinkedHashMap<>();
    Continue nextContinue = recentChanges.getContinue();
    Instant nextStartTime = lastNextStartTime;
    List<RecentChange> result = recentChanges.getQuery().getRecentChanges();
    for (RecentChange rc : result) {
        // Does not matter if the change matters for us or not, it
        // still advances the time since we've seen it.
        nextStartTime = Utils.max(nextStartTime, rc.getTimestamp());
        if (rc.getNs() == null) {
            log.warn("Skipping change without a namespace: {}", rc);
            continue;
        }
        if (!wikibase.isEntityNamespace(rc.getNs())) {
            log.info("Skipping change in irrelevant namespace: {}", rc);
            continue;
        }
        if (!wikibase.isValidEntity(rc.getTitle())) {
            log.info("Skipping change with bogus title: {}", rc.getTitle());
            continue;
        }
        if (seenIDs.containsKey(rc.getRcId())) {
            // This change was in the last batch
            log.debug("Skipping repeated change with rcid {}", rc.getRcId());
            continue;
        }
        seenIDs.put(rc.getRcId(), TRUE);
        // Looks like we can not rely on changes appearing in order in RecentChanges,
        // so we have to take them all and let SPARQL sort out the dupes.
        Change change;
        if (rc.getType().equals("log") && rc.getRevId() == 0) {
            // Deletes should always be processed, so put negative revision
            change = new Change(rc.getTitle(), -1L, rc.getTimestamp(), rc.getRcId());
        } else {
            change = new Change(rc.getTitle(), rc.getRevId(), rc.getTimestamp(), rc.getRcId());
        }
        /*
         * Remove duplicate changes by title keeping the latest
         * revision. Note that negative revision means always update, so those
         * are kept.
         */
        Change dupe = changesByTitle.put(change.entityId(), change);
        if (dupe != null && (dupe.revision() > change.revision() || dupe.revision() < 0)) {
            // The earlier entry wins, but a plain put() would leave the new
            // (losing) change's position in the LinkedHashMap.
            // need to remove so that order will be correct
            changesByTitle.remove(change.entityId());
            changesByTitle.put(change.entityId(), dupe);
        }
    }
    final ImmutableList<Change> changes = ImmutableList.copyOf(changesByTitle.values());
    // Backoff overflow is when:
    // a. We use backoff
    // b. We got full batch of changes.
    // c. None of those were new changes.
    // In this case, sleeping and trying again is obviously useless.
    final boolean backoffOverflow = useBackoff && changes.isEmpty() && result.size() >= batchSize;
    if (backoffOverflow) {
        // We have a problem here - due to backoff, we did not fetch any new items
        // Try to advance one second, even though we risk to lose a change - in hope
        // that trailing poller will pick them up.
        nextStartTime = nextStartTime.plusSeconds(1);
        log.info("Backoff overflow, advancing next time to {}", nextStartTime);
    }
    if (!changes.isEmpty()) {
        log.info("Got {} changes, from {} to {}", changes.size(), changes.get(0), changes.get(changes.size() - 1));
    } else {
        log.info("Got no real changes");
    }
    // Show the user the polled time - one second because we can't
    // be sure we got the whole second
    long advanced = ChronoUnit.MILLIS.between(lastNextStartTime, nextStartTime);
    Batch batch = new Batch(changes, advanced, nextStartTime.minusSeconds(1).toString(), nextStartTime, nextContinue);
    if (backoffOverflow && nextContinue != null) {
        // We will not sleep if continue is provided.
        log.info("Got only old changes, next is: {}", nextContinue);
        batch.hasChanges(true);
    }
    return batch;
}
Use of org.wikidata.query.rdf.tool.wikibase.Continue in the project wikidata-query-rdf by Wikimedia.
From the class RecentChangesPollerUnitTest, method continuePoll().
/**
 * Check that continuing works.
 * The continuation object returned with the first batch must be handed
 * back to the repository when the next batch is fetched.
 * @throws RetryableException on fetch failure
 */
@Test
@SuppressWarnings("unchecked")
public void continuePoll() throws RetryableException {
    // A start time well in the past keeps backoff logic out of the picture.
    Instant origin = Instant.now().minus(10, ChronoUnit.DAYS);
    int limit = 10;
    Instant changeTime = origin.plusSeconds(20);
    WikibaseApiError noError = null;
    Continue continuation = new Continue(OUTPUT_DATE_FORMATTER.format(changeTime) + "|8", "-||");
    List<RecentChange> edits = new ArrayList<>();
    edits.add(new RecentChange(0L, "Q666", changeTime, 1L, 1L, "edit"));
    edits.add(new RecentChange(0L, "Q667", changeTime, 7L, 7L, "edit"));
    RecentChangeResponse response = new RecentChangeResponse(noError, continuation, new Query(edits));
    String expectedDate = changeTime.toString();
    firstBatchReturns(origin, response);
    RecentChangesPoller poller = new RecentChangesPoller(repository, origin, limit, new MetricRegistry());
    Batch first = poller.firstBatch();
    assertThat(first.changes()).hasSize(2);
    assertThat(first.changes().get(1).offset()).isEqualTo(7);
    assertThat(first.leftOffDate().toString()).isEqualTo(expectedDate);
    assertThat(first.getLastContinue()).isEqualTo(continuation);
    ArgumentCaptor<Instant> dateCaptor = ArgumentCaptor.forClass(Instant.class);
    ArgumentCaptor<Continue> continueCaptor = ArgumentCaptor.forClass(Continue.class);
    edits.clear();
    when(repository.fetchRecentChanges(dateCaptor.capture(), continueCaptor.capture(), eq(limit))).thenReturn(response);
    // The continuation recorded on the first batch must travel to the second fetch.
    Batch second = poller.nextBatch(first);
    assertThat(second.changes()).hasSize(0);
    assertThat(dateCaptor.getValue().toString()).isEqualTo(expectedDate);
    assertThat(continueCaptor.getValue()).isEqualTo(continuation);
}
Use of org.wikidata.query.rdf.tool.wikibase.Continue in the project wikidata-query-rdf by Wikimedia.
From the class RecentChangesPollerUnitTest, method dedups().
/**
 * Check deduplication.
 * Twenty changes sharing ten Q ids must collapse to ten entries, each
 * keeping the later (odd) revision.
 * @throws RetryableException on fetch failure
 */
@Test
@SuppressWarnings("unchecked")
public void dedups() throws RetryableException {
    Instant origin = Instant.now();
    // Two consecutive changes per entity: revisions 2k and 2k+1 both map to "Q" + k.
    List<RecentChange> edits = new ArrayList<>();
    for (long rev = 0; rev < 20; rev++) {
        edits.add(new RecentChange(0L, "Q" + (rev / 2), Instant.now(), rev, rev, "edit"));
    }
    WikibaseApiError noError = null;
    Continue noContinue = null;
    RecentChangeResponse response = new RecentChangeResponse(noError, noContinue, new Query(edits));
    firstBatchReturns(origin, response);
    RecentChangesPoller poller = new RecentChangesPoller(repository, origin, batchSize, new MetricRegistry());
    Batch batch = poller.firstBatch();
    assertThat(batch.changes()).hasSize(10);
    List<Change> sorted = new ArrayList<>(batch.changes());
    sorted.sort(Comparator.comparing(Change::entityId));
    for (int k = 0; k < 10; k++) {
        assertThat(sorted.get(k).entityId()).isEqualTo("Q" + k);
        // The higher of each revision pair must survive deduplication.
        assertThat(sorted.get(k).revision()).isEqualTo(2 * k + 1);
    }
}
Use of org.wikidata.query.rdf.tool.wikibase.Continue in the project wikidata-query-rdf by Wikimedia.
From the class RecentChangesPollerUnitTest, method backoffOverflow().
/**
 * Backoff overflow check.
 * When backoff produces a full batch containing no new changes, the poller
 * must advance its next start time by exactly one second.
 * @throws RetryableException on fetch failure
 */
@Test
public void backoffOverflow() throws RetryableException {
    Instant origin = Instant.now();
    batchSize = 1;
    RecentChangesPoller poller = new RecentChangesPoller(repository, origin, batchSize, new MetricRegistry());
    WikibaseApiError noError = null;
    Continue noContinue = null;
    ArrayList<RecentChange> edits = new ArrayList<>();
    edits.add(new RecentChange(0L, "Q424242", origin, 42L, 42L, "edit"));
    RecentChangeResponse response = new RecentChangeResponse(noError, noContinue, new Query(edits));
    firstBatchReturns(origin, response);
    Batch first = poller.firstBatch();
    assertThat(first.changes()).hasSize(1);
    assertThat(first.leftOffDate()).isEqualTo(origin);
    // Second poll returns the same (already seen) change: no new changes,
    // full batch -> time must be bumped forward by one second.
    Batch second = poller.nextBatch(first);
    assertThat(second.changes()).hasSize(0);
    assertThat(origin).isBefore(second.leftOffDate());
    assertThat(origin.plusSeconds(1)).isEqualTo(second.leftOffDate());
}
Use of org.wikidata.query.rdf.tool.wikibase.Continue in the project wikidata-query-rdf by Wikimedia.
From the class RecentChangesPollerUnitTest, method backoffTime().
/**
 * Check that recent requests use backoff.
 * The timestamp sent to the repository must trail the requested start time
 * by a backoff window of between 7 and 20 seconds, on both the first and
 * subsequent fetches.
 * @throws RetryableException on fetch failure
 */
@Test
@SuppressWarnings("unchecked")
public void backoffTime() throws RetryableException {
    Instant origin = Instant.now();
    RecentChangesPoller poller = new RecentChangesPoller(repository, origin, batchSize, new MetricRegistry());
    Instant changeTime = origin.plusSeconds(20);
    WikibaseApiError noError = null;
    Continue noContinue = null;
    List<RecentChange> edits = new ArrayList<>();
    edits.add(new RecentChange(0L, "Q424242", changeTime, 42L, 42L, "edit"));
    RecentChangeResponse response = new RecentChangeResponse(noError, noContinue, new Query(edits));
    ArgumentCaptor<Instant> dateCaptor = ArgumentCaptor.forClass(Instant.class);
    when(repository.fetchRecentChangesByTime(dateCaptor.capture(), eq(batchSize))).thenReturn(response);
    when(repository.isEntityNamespace(0)).thenReturn(true);
    when(repository.isValidEntity(any(String.class))).thenReturn(true);
    Batch batch = poller.firstBatch();
    // Ensure we backed off at least 7 seconds but no more than 20
    assertThat(dateCaptor.getValue()).isBefore(origin.minusSeconds(7));
    assertThat(dateCaptor.getValue()).isAfter(origin.minusSeconds(20));
    // Verify that backoff still works on the second call
    batch = poller.nextBatch(batch);
    // verify we're still using fetchRecentChangesByTime
    assertThat(batch).isNotNull();
    assertThat(dateCaptor.getValue()).isBefore(changeTime.minusSeconds(7));
    assertThat(dateCaptor.getValue()).isAfter(changeTime.minusSeconds(20));
}
Aggregations