Usage of org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString in the project incubator-gobblin by Apache — class GoogleWebmasterDataFetcherImpl, method getAllPages.
/**
 * Collects every page as a {@link ProducerJob} for the given date range and country.
 *
 * <p>Due to a limitation of the API, a single request returns at most 5000 rows, sorted by click
 * count descending; rows with equal click counts are returned in an arbitrary order (see
 * https://developers.google.com/webmaster-tools/v3/searchanalytics). We therefore fetch pages by
 * partitions: whenever a partition comes back with 5000 rows, we split it into more granular
 * partitions and fetch again.
 */
@Override
public Collection<ProducerJob> getAllPages(String startDate, String endDate, String country, int rowLimit) throws IOException {
log.info("Requested row limit: " + rowLimit);
// Hot start: reuse jobs that were already recovered, skipping the page discovery entirely.
if (!_jobs.isEmpty()) {
log.info("Service got hot started.");
return _jobs;
}
ApiDimensionFilter apiCountryFilter = GoogleWebmasterFilter.countryEqFilter(country);
List<GoogleWebmasterFilter.Dimension> dimensions = new ArrayList<>();
dimensions.add(GoogleWebmasterFilter.Dimension.PAGE);
// The expected size is only meaningful when the data set may exceed the API row limit;
// -1 marks "not computed".
int expectedSize = -1;
if (rowLimit >= GoogleWebmasterClient.API_ROW_LIMIT) {
// expected size only makes sense when the data set size is larger than GoogleWebmasterClient.API_ROW_LIMIT
expectedSize = getPagesSize(startDate, endDate, country, dimensions, Arrays.asList(apiCountryFilter));
log.info(String.format("Expected number of pages is %d for market-%s from %s to %s", expectedSize, GoogleWebmasterFilter.countryFilterToString(apiCountryFilter), startDate, endDate));
}
// Seed the work queue with the whole property; getPages splits it further as needed.
Queue<Pair<String, FilterOperator>> seedJobs = new ArrayDeque<>();
seedJobs.add(Pair.of(_siteProperty, FilterOperator.CONTAINS));
Collection<String> fetchedPages = getPages(startDate, endDate, dimensions, apiCountryFilter, seedJobs, Math.min(rowLimit, GoogleWebmasterClient.API_ROW_LIMIT));
int actualSize = fetchedPages.size();
log.info(String.format("A total of %d pages fetched for property %s at country-%s from %s to %s", actualSize, _siteProperty, country, startDate, endDate));
// A mismatch is logged but tolerated: the API may drop ties when partitions hit the row cap.
if (expectedSize != -1 && actualSize != expectedSize) {
log.warn(String.format("Expected page size is %d, but only able to get %d", expectedSize, actualSize));
}
ArrayDeque<ProducerJob> producerJobs = new ArrayDeque<>(actualSize);
for (String page : fetchedPages) {
producerJobs.add(new SimpleProducerJob(page, startDate, endDate));
}
return producerJobs;
}
Usage of org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString in the project incubator-gobblin by Apache — class GoogleWebmasterDataFetcherImpl, method getPages.
/**
 * Gets all pages asynchronously by repeatedly splitting saturated partitions.
 *
 * <p>Each round drains {@code toProcess}, submitting one asynchronous job per partition. A job
 * that hits the API row cap enqueues finer-grained partitions into {@code nextRound}; the loop
 * repeats until a round produces no further work, or the retry budget is exhausted.
 *
 * @param toProcess initial partitions to fetch; this queue is drained by this method
 * @return all pages collected across every round
 * @throws IOException if an underlying API call fails
 * @throws RuntimeException if the retry budget is exhausted with work remaining, or the
 *         calling thread is interrupted while waiting for a round to finish
 */
private Collection<String> getPages(String startDate, String endDate, List<Dimension> dimensions, ApiDimensionFilter countryFilter, Queue<Pair<String, FilterOperator>> toProcess, int rowLimit) throws IOException {
String country = GoogleWebmasterFilter.countryFilterToString(countryFilter);
// Worker threads append concurrently, so use a concurrent collection.
ConcurrentLinkedDeque<String> allPages = new ConcurrentLinkedDeque<>();
int r = 0;
while (r <= GET_PAGES_RETRIES) {
++r;
log.info(String.format("Get pages at round %d with size %d.", r, toProcess.size()));
ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound = new ConcurrentLinkedDeque<>();
ExecutorService es = Executors.newFixedThreadPool(10, ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of(this.getClass().getSimpleName())));
while (!toProcess.isEmpty()) {
submitJob(toProcess.poll(), countryFilter, startDate, endDate, dimensions, es, allPages, nextRound, rowLimit);
}
// Wait for this round's jobs to finish before deciding whether another round is needed.
try {
es.shutdown();
boolean terminated = es.awaitTermination(5, TimeUnit.MINUTES);
if (!terminated) {
es.shutdownNow();
log.warn(String.format("Timed out while getting all pages for country-%s at round %d. Next round now has size %d.", country, r, nextRound.size()));
}
} catch (InterruptedException e) {
// Restore the interrupt flag so callers up the stack can observe the interruption.
Thread.currentThread().interrupt();
throw new RuntimeException(e);
}
if (nextRound.isEmpty()) {
break;
}
toProcess = nextRound;
}
// Fix: the original check (r == GET_PAGES_RETRIES) never fired on exhaustion — the loop exits
// with r == GET_PAGES_RETRIES + 1 — and fired spuriously when the final round succeeded exactly
// at the retry limit. Instead, fail iff unfinished work remains: on success toProcess was fully
// drained; on exhaustion it still holds the non-empty nextRound.
if (!toProcess.isEmpty()) {
throw new RuntimeException(String.format("Getting all pages reaches the maximum number of retries %d. Date range: %s ~ %s. Country: %s.", GET_PAGES_RETRIES, startDate, endDate, country));
}
return allPages;
}
Aggregations