Search in sources :

Example 1 with GoogleWebmasterFilter.countryFilterToString

use of org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString in project incubator-gobblin by apache.

the class GoogleWebmasterDataFetcherImpl method getAllPages.

/**
 * Collects every page of the site property for the given date range and country, wrapped as producer jobs.
 *
 * <p>The API returns at most 5000 rows per request, sorted by click count descending, with an arbitrary
 * order among rows sharing the same click count
 * (see https://developers.google.com/webmaster-tools/v3/searchanalytics). Pages are therefore fetched
 * partition by partition; a partition that comes back with 5000 rows is split into more granular ones.
 *
 * <p>If jobs are already present in {@code _jobs}, they are returned as-is and nothing is fetched.
 *
 * @param startDate start of the date range
 * @param endDate end of the date range
 * @param country country to restrict the query to
 * @param rowLimit requested row limit; each request is capped at {@code GoogleWebmasterClient.API_ROW_LIMIT}
 * @return one producer job per fetched page, or the pre-existing jobs when the service was hot started
 * @throws IOException if a call made while fetching pages fails
 */
@Override
public Collection<ProducerJob> getAllPages(String startDate, String endDate, String country, int rowLimit) throws IOException {
    log.info("Requested row limit: " + rowLimit);
    if (!_jobs.isEmpty()) {
        log.info("Service got hot started.");
        return _jobs;
    }

    ApiDimensionFilter marketFilter = GoogleWebmasterFilter.countryEqFilter(country);
    List<GoogleWebmasterFilter.Dimension> pageDimensions = new ArrayList<>();
    pageDimensions.add(GoogleWebmasterFilter.Dimension.PAGE);

    // -1 marks "no expectation"; an expected size is only meaningful when the data set may exceed
    // GoogleWebmasterClient.API_ROW_LIMIT.
    int expectedCount = -1;
    if (rowLimit >= GoogleWebmasterClient.API_ROW_LIMIT) {
        expectedCount = getPagesSize(startDate, endDate, country, pageDimensions, Arrays.asList(marketFilter));
        String market = GoogleWebmasterFilter.countryFilterToString(marketFilter);
        log.info(String.format("Expected number of pages is %d for market-%s from %s to %s", expectedCount, market, startDate, endDate));
    }

    // Seed the work queue with a single job covering the whole site property.
    Queue<Pair<String, FilterOperator>> pendingJobs = new ArrayDeque<>();
    pendingJobs.add(Pair.of(_siteProperty, FilterOperator.CONTAINS));
    int cappedRowLimit = Math.min(rowLimit, GoogleWebmasterClient.API_ROW_LIMIT);
    Collection<String> fetchedPages = getPages(startDate, endDate, pageDimensions, marketFilter, pendingJobs, cappedRowLimit);

    int fetchedCount = fetchedPages.size();
    log.info(String.format("A total of %d pages fetched for property %s at country-%s from %s to %s", fetchedCount, _siteProperty, country, startDate, endDate));
    boolean hasExpectation = expectedCount != -1;
    if (hasExpectation && fetchedCount != expectedCount) {
        log.warn(String.format("Expected page size is %d, but only able to get %d", expectedCount, fetchedCount));
    }

    ArrayDeque<ProducerJob> result = new ArrayDeque<>(fetchedCount);
    for (String pageUrl : fetchedPages) {
        result.add(new SimpleProducerJob(pageUrl, startDate, endDate));
    }
    return result;
}
Also used : ApiDimensionFilter(com.google.api.services.webmasters.model.ApiDimensionFilter) ArrayList(java.util.ArrayList) Dimension(org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.Dimension) GoogleWebmasterFilter.countryFilterToString(org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString) ArrayDeque(java.util.ArrayDeque) Pair(org.apache.commons.lang3.tuple.Pair)

Example 2 with GoogleWebmasterFilter.countryFilterToString

use of org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString in project incubator-gobblin by apache.

the class GoogleWebmasterDataFetcherImpl method getPages.

/**
 * Gets all pages asynchronously, processing the filter jobs in rounds on a thread pool.
 *
 * <p>Each round drains {@code toProcess}, passing every entry to {@code submitJob} together with the
 * shared result collection and a fresh {@code nextRound} queue (presumably {@code submitJob} enqueues
 * there whatever needs another attempt or finer partitioning; verify against its implementation).
 * A round that leaves {@code nextRound} empty completes the fetch; otherwise {@code nextRound} becomes
 * the input of the following round. At most {@code GET_PAGES_RETRIES + 1} rounds are run.
 *
 * @param startDate start of the date range
 * @param endDate end of the date range
 * @param dimensions dimensions forwarded to every submitted job
 * @param countryFilter country filter forwarded to every submitted job
 * @param toProcess initial queue of (filter expression, operator) jobs; it is drained by this method
 * @param rowLimit row limit forwarded to every submitted job
 * @return the pages collected across all rounds
 * @throws RuntimeException if work is still pending after the last allowed round, or if the calling
 *         thread is interrupted while waiting for a round to finish
 */
private Collection<String> getPages(String startDate, String endDate, List<Dimension> dimensions, ApiDimensionFilter countryFilter, Queue<Pair<String, FilterOperator>> toProcess, int rowLimit) throws IOException {
    String country = GoogleWebmasterFilter.countryFilterToString(countryFilter);
    ConcurrentLinkedDeque<String> allPages = new ConcurrentLinkedDeque<>();
    // Tracks whether some round finished with nothing left to do. Comparing the round counter against
    // GET_PAGES_RETRIES after the loop is unreliable: the counter ends at GET_PAGES_RETRIES + 1 when
    // every round is used up, and equals GET_PAGES_RETRIES when that round legitimately succeeds.
    boolean completed = false;
    int r = 0;
    while (r <= GET_PAGES_RETRIES) {
        ++r;
        log.info(String.format("Get pages at round %d with size %d.", r, toProcess.size()));
        ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound = new ConcurrentLinkedDeque<>();
        ExecutorService es = Executors.newFixedThreadPool(10, ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of(this.getClass().getSimpleName())));
        while (!toProcess.isEmpty()) {
            submitJob(toProcess.poll(), countryFilter, startDate, endDate, dimensions, es, allPages, nextRound, rowLimit);
        }
        // wait for jobs to finish and start next round if necessary.
        try {
            es.shutdown();
            boolean terminated = es.awaitTermination(5, TimeUnit.MINUTES);
            if (!terminated) {
                es.shutdownNow();
                log.warn(String.format("Timed out while getting all pages for country-%s at round %d. Next round now has size %d.", country, r, nextRound.size()));
            }
        } catch (InterruptedException e) {
            // Stop the in-flight workers and restore the interrupt status so callers can observe it.
            es.shutdownNow();
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        if (nextRound.isEmpty()) {
            completed = true;
            break;
        }
        toProcess = nextRound;
    }
    if (!completed) {
        throw new RuntimeException(String.format("Getting all pages reaches the maximum number of retires %d. Date range: %s ~ %s. Country: %s.", GET_PAGES_RETRIES, startDate, endDate, country));
    }
    return allPages;
}
Also used : ExecutorService(java.util.concurrent.ExecutorService) GoogleWebmasterFilter.countryFilterToString(org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString) ConcurrentLinkedDeque(java.util.concurrent.ConcurrentLinkedDeque) Pair(org.apache.commons.lang3.tuple.Pair)

Aggregations

Pair (org.apache.commons.lang3.tuple.Pair)2 GoogleWebmasterFilter.countryFilterToString (org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString)2 ApiDimensionFilter (com.google.api.services.webmasters.model.ApiDimensionFilter)1 ArrayDeque (java.util.ArrayDeque)1 ArrayList (java.util.ArrayList)1 ConcurrentLinkedDeque (java.util.concurrent.ConcurrentLinkedDeque)1 ExecutorService (java.util.concurrent.ExecutorService)1 Dimension (org.apache.gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.Dimension)1