Example 1 with DatasetResponseWithStatusCodeAndHeaders

Use of DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.

From the class CachingLinkedDataSource, method fetchOnlyOnce.

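/**
 * We may run into fetching the same URI multiple times at once. Make sure we
 * make only one http request and use the response for every client.
 *
 * @param resource the URI of the resource to fetch
 * @param requesterWebID WebID URI that, together with the resource, forms the cache key
 * @param linkedDataCacheEntry cache entry handed on to the actual fetch
 * @param headers HTTP headers handed on to the actual fetch
 * @return the response, either recreated from the cache or obtained by fetching
 */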
private DatasetResponseWithStatusCodeAndHeaders fetchOnlyOnce(final URI resource, final URI requesterWebID, final LinkedDataCacheEntry linkedDataCacheEntry, final HttpHeaders headers) {
    String cacheKey = makeCacheKey(resource, requesterWebID);
    CountDownLatch latch = new CountDownLatch(1);
    CountDownLatch preExistingLatch = countDownLatchMap.putIfAbsent(cacheKey, latch);
    try {
        if (preExistingLatch != null) {
            logger.debug("resource " + cacheKey + " is being fetched in another thread, we wait for its result and use it " + "if it turns out to be cacheable");
            // in this case, another thread is already fetching the URI. Wait.
            try {
            } catch (InterruptedException e) {
                logger.warn("interrupted while waiting for another thread to fetch '" + resource + "'");
            // now, the other thread is done fetching the resource. It may not have been
            // allowed to cache it, in which case
            // we have to fetch it again. We try:
            Element element = cache.get(cacheKey);
            if (element != null) {
                logger.debug("resource " + cacheKey + " turned out to be cacheable, using it");
                // ok, we'll recreate a response from the cache.
                // Caution: this is not a copy, it's the SAME dataset - so manipulating the
                // result causes side-effects.
                LinkedDataCacheEntry entry = (LinkedDataCacheEntry) element.getObjectValue();
                return entry.recreateResponse();
            logger.debug("resource " + cacheKey + " did not turn out to be cacheable - fetching it, too");
        // so the cache still doesn't have it. We think it's better to let every thread
        // fetch it for itself.
        DatasetResponseWithStatusCodeAndHeaders datasetResponse = fetchAndCacheIfAppropriate(resource, requesterWebID, linkedDataCacheEntry, headers);
        return datasetResponse;
    } finally {
        // remove the latch from the map if it is in there
        countDownLatchMap.remove(cacheKey, latch);
        // wake up all threads that might now be waiting at our latch
Example 2 with DatasetResponseWithStatusCodeAndHeaders

Use of DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.

From the class CachingLinkedDataSource, method fetchOrUseCached.

/**
 * This method respects the headers 'Expires', 'Cache-Control', and 'ETag': If a
 * cached resource (indicated by a non-null linkedDataCacheEntry) is expired
 * either according to the expiry date or the cache-control header from the
 * earlier request, the request will be made. When the request is made and an
 * ETag value is known from an earlier request, it will be sent as the
 * 'If-None-Match' header value. In that case the server is expected to answer
 * with status 304 (not modified) and the cached response will be used, updating
 * cache control information if the server chooses to send 'Expires' or
 * 'Cache-Control' headers.
 *
 * @param resource
 *            the URI of the resource to fetch
 * @param requesterWebID
 *            optional WebID URI to use for the request
 * @param linkedDataCacheEntry
 *            optional cache entry to use
 * @return the response, either recreated from the cache entry or fetched from
 *         the server
 */
private DatasetResponseWithStatusCodeAndHeaders fetchOrUseCached(final URI resource, final URI requesterWebID, LinkedDataCacheEntry linkedDataCacheEntry) {
    // Decides between answering from the supplied cache entry and going to the
    // server via fetchOnlyOnce; the crawler callback is informed about remotely
    // fetched data in the "nothing cached" path at the bottom.
    // NOTE(review): this listing has lost the closing braces of its blocks; the
    // indentation is the only remaining guide to the nesting - confirm against
    // the full source before relying on it.
    //
    // check
    // * if we have a cached result
    // * if we can use it
    // * make request, possibly using ETAG
    // * cache the new result if appropriate
    // * if ETAG indicates not modified, return cached result but update caching
    // info
    // * return result
    DatasetResponseWithStatusCodeAndHeaders responseData = null;
    // fresh, empty header set that is handed to every fetchOnlyOnce call below
    HttpHeaders headers = new HttpHeaders();
    if (linkedDataCacheEntry != null) {
        Date now = new Date();
        // a cache entry was passed in; we can use it if we
        // are allowed to do that:
        if (linkedDataCacheEntry.isExpiredAtDate(now)) {
            // cache item is expired. Remove from cache and fetch again
            cache.remove(makeCacheKey(resource, requesterWebID));
            logger.debug("cache item {} expired, fetching again.", resource);
            // the expired entry is still passed along to fetchOnlyOnce
            return fetchOnlyOnce(resource, requesterWebID, linkedDataCacheEntry, headers);
        if (linkedDataCacheEntry.getCacheControlFlags().contains(CacheControlFlag.PRIVATE) && isSharedCache()) {
            // in this case we assume that the response is not publicly visible, so it
            // depends on the specified
            // requesterWebID. The check is performed by the server. We cannot return a
            // cached response
            // immediately, but further down the line the ETAG based system can do that.
            logger.debug("cache item {} is Cache-Control:private and we are a shared cache. Will return cached copy only after server checks ETAG (and client cert), " + "therefore sending request to server.", resource);
            return fetchOnlyOnce(resource, requesterWebID, linkedDataCacheEntry, headers);
        logger.debug("returning cached version of {}", resource);
        // we can use the cached result directly
        return linkedDataCacheEntry.recreateResponse();
    // nothing found in the cache, fetch the resource remotely
    logger.debug("Nothing found in cache for {}, fetching remotely", resource);
    // no cache entry exists, hence null is passed for it
    responseData = fetchOnlyOnce(resource, requesterWebID, null, headers);
    // inform the crawler callback
    if (crawlerCallback != null) {
        try {
            crawlerCallback.onDatasetCrawled(resource, responseData.getDataset());
        } catch (Exception e) {
            // NOTE(review): the beginning of the next statement is missing from this
            // listing; what remains is the tail of a call that receives a formatted
            // message and the exception - presumably a logging call, TODO confirm
            // the method and log level against the full source.
  "error during callback execution for dataset %s", resource.toString()), e);
    return responseData;
Example 3 with DatasetResponseWithStatusCodeAndHeaders

Use of DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.

From the class WorkerCrawlerActor, method crawlUri.

// Processes one crawl message: obtains the dataset (deserialized from the
// message or downloaded through linkedDataSource), sends follow-up crawl
// messages back to the sender, reports the URI as done and publishes a
// NeedEvent when the dataset is a need. Any exception is logged at debug level
// and rethrown wrapped in a CrawlWrapperException that carries uriMsg.
// NOTE(review): this listing has lost the closing braces of its blocks and
// apparently some statements as well (see the notes below); the indentation is
// the only remaining guide to the nesting - confirm against the full source.
private void crawlUri(CrawlUriMessage uriMsg) {
    Dataset ds = null;
    List<String> etags = null;
    Lock lock = null;
    try {
        // check if resource is already downloaded
        if (uriMsg instanceof ResourceCrawlUriMessage) {
            ResourceCrawlUriMessage resMsg = ((ResourceCrawlUriMessage) uriMsg);
            if (resMsg.getSerializedResource() != null && resMsg.getSerializationFormat() != null) {
                // TODO: this should be optimized, why deserialize the resource here when we just want to save it in the RDF
                // store? How to insert this serialized resource into the SPARQL endpoint?
                ds = SparqlService.deserializeDataset(resMsg.getSerializedResource(), resMsg.getSerializationFormat());
        // download resource if not already downloaded
        if (ds == null) {
            // use ETag/If-None-Match Headers to make the process more efficient
            HttpHeaders httpHeaders = new HttpHeaders();
            if (uriMsg.getResourceETagHeaderValues() != null && !uriMsg.getResourceETagHeaderValues().isEmpty()) {
                // all known ETag values are sent as one comma-separated header value
                String ifNoneMatchHeaderValue = StringUtils.collectionToDelimitedString(uriMsg.getResourceETagHeaderValues(), ", ");
                httpHeaders.add("If-None-Match", ifNoneMatchHeaderValue);
            DatasetResponseWithStatusCodeAndHeaders datasetWithHeaders = linkedDataSource.getDatasetWithHeadersForResource(URI.create(uriMsg.getUri()), httpHeaders);
            ds = datasetWithHeaders.getDataset();
            // remember the ETag values of the response; they are passed on with the done message
            etags = datasetWithHeaders.getResponseHeaders().get("ETag");
            // if dataset was not modified (304) we can treat the current crawl uri as done
            if (ds == null && datasetWithHeaders.getStatusCode() == 304) {
                sendDoneUriMessage(uriMsg, uriMsg.getWonNodeUri(), etags);
                // NOTE(review): no early exit is visible after this call in the
                // listing - presumably a return was lost, TODO confirm.
            // if there is paging activated and the won node tells us that there is more data (previous link)
            // to be downloaded, then we add this link to the crawling process too
            String prevLink = linkedDataSource.getPreviousLinkFromDatasetWithHeaders(datasetWithHeaders);
            if (prevLink != null) {
                CrawlUriMessage newUriMsg = new CrawlUriMessage(uriMsg.getBaseUri(), prevLink, uriMsg.getWonNodeUri(), CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null);
                getSender().tell(newUriMsg, getSelf());
        // the lock stays null when there is no dataset; the finally block checks for that
        lock = ds == null ? null : ds.getLock();
        // Save dataset to triple store
        // NOTE(review): neither a call that acquires the lock nor one that stores
        // the dataset is visible in this listing - TODO confirm against the full source.
        String wonNodeUri = extractWonNodeUri(ds, uriMsg.getUri());
        if (wonNodeUri == null) {
            // fall back to the won node URI carried by the message
            wonNodeUri = uriMsg.getWonNodeUri();
        // do nothing more here if the STATUS of the message was SAVE
        if (uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
            log.debug("processed crawl uri event {} with status 'SAVE'", uriMsg);
            // NOTE(review): no early exit is visible after this log call in the
            // listing - presumably a return was lost, TODO confirm.
        // extract URIs from current resource and send extracted URI messages back to sender
        log.debug("Extract URIs from message {}", uriMsg);
        Set<CrawlUriMessage> newCrawlMessages = sparqlService.extractCrawlUriMessages(uriMsg.getBaseUri(), wonNodeUri);
        for (CrawlUriMessage newMsg : newCrawlMessages) {
            getSender().tell(newMsg, getSelf());
        // signal sender that this URI is processed and save meta data about crawling the URI.
        // This needs to be done after all extracted URI messages have been sent to guarantee consistency
        // in case of failure
        sendDoneUriMessage(uriMsg, wonNodeUri, etags);
        // if this URI/dataset was a need then send an event to the distributed event bus
        if (NeedModelWrapper.isANeed(ds)) {
            NeedModelWrapper needModelWrapper = new NeedModelWrapper(ds, false);
            NeedState state = needModelWrapper.getNeedState();
            // every state other than ACTIVE is reported as INACTIVE
            NeedEvent.TYPE type = state.equals(NeedState.ACTIVE) ? NeedEvent.TYPE.ACTIVE : NeedEvent.TYPE.INACTIVE;
            log.debug("Created need event for need uri {}", uriMsg.getUri());
            long crawlDate = System.currentTimeMillis();
            NeedEvent needEvent = new NeedEvent(uriMsg.getUri(), wonNodeUri, type, crawlDate, ds);
            // the event is published under the class name of NeedEvent
            pubSubMediator.tell(new DistributedPubSubMediator.Publish(needEvent.getClass().getName(), needEvent), getSelf());
    } catch (RestClientException e1) {
        // usually happens if the fetch of the dataset fails e.g. HttpServerErrorException, HttpClientErrorException
        log.debug("Exception during crawling: " + e1);
        throw new CrawlWrapperException(e1, uriMsg);
    } catch (Exception e) {
        // every other failure is wrapped the same way
        log.debug("Exception during crawling: " + e);
        throw new CrawlWrapperException(e, uriMsg);
    } finally {
        // NOTE(review): the body of this null check is cut off in the listing -
        // presumably it releases the lock, TODO confirm against the full source.
        if (lock != null) {
Example 4 with DatasetResponseWithStatusCodeAndHeaders

Use of DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.

From the class CachingLinkedDataSource, method fetch.

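/**
 * Performs the actual request via the linkedDataRestClient.
 *
 * @param resource the URI of the resource to fetch
 * @param requesterWebID optional WebID URI to use for the request; when null, the request is made without a WebID
 * @param headers HTTP headers passed on to the linkedDataRestClient
 * @return the response data returned by the linkedDataRestClient
 */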
private DatasetResponseWithStatusCodeAndHeaders fetch(final URI resource, final URI requesterWebID, final HttpHeaders headers) {
    final DatasetResponseWithStatusCodeAndHeaders responseData;
    if (requesterWebID != null) {
        logger.debug("fetching linked data for URI {} with WebID {}", resource, requesterWebID);
        responseData = linkedDataRestClient.readResourceDataWithHeaders(resource, requesterWebID, headers);
        if (logger.isDebugEnabled()) {
            logger.debug("fetched resource {} with requesterWebID {}: ", resource, requesterWebID);
            RDFDataMgr.write(System.out, responseData.getDataset(), Lang.TRIG);
    } else {
        logger.debug("fetching linked data for URI {} without WebID", resource, requesterWebID);
        responseData = linkedDataRestClient.readResourceDataWithHeaders(resource, headers);
        if (logger.isDebugEnabled()) {
            logger.debug("fetched resource {} without requesterWebID:", resource, requesterWebID);
            RDFDataMgr.write(System.out, responseData.getDataset(), Lang.TRIG);
    return responseData;
Example 5 with DatasetResponseWithStatusCodeAndHeaders

Use of DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.

From the class CachingLinkedDataSource, method fetchWithEtagValidation.

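/**
 * Checks if the cached entry has an ETag value set and uses the 'If-None-Match'
 * header if this is the case. If the server responds with 304 - NOT_MODIFIED,
 * the cached dataset replaces the (empty) dataset coming from the server in the
 * DatasetResponseWithStatusCodeAndHeaders.
 *
 * @param resource the URI of the resource to fetch
 * @param requesterWebID optional WebID URI to use for the request
 * @param linkedDataCacheEntry optional cache entry whose ETag is used for validation
 * @param headers optional request headers; when non-null and an ETag is known, the 'If-None-Match' header is added to this object
 * @return the server's response; for status 304 it carries the cached dataset together with the status code and headers of the server's response
 */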
private DatasetResponseWithStatusCodeAndHeaders fetchWithEtagValidation(final URI resource, final URI requesterWebID, final LinkedDataCacheEntry linkedDataCacheEntry, final HttpHeaders headers) {
    if (linkedDataCacheEntry == null || linkedDataCacheEntry.getEtag() == null) {
        logger.debug("fetching from server without ETAG validation: {} ", resource);
        return fetch(resource, requesterWebID, headers);
    // we already have an etag - use it for validating
    HttpHeaders myHeaders = headers != null ? headers : new HttpHeaders();
    myHeaders.add(HttpHeaders.IF_NONE_MATCH, linkedDataCacheEntry.getEtag());
    logger.debug("fetching from server with ETAG validation: {} ", resource);
    DatasetResponseWithStatusCodeAndHeaders datasetResponse = fetch(resource, requesterWebID, myHeaders);
    if (datasetResponse.getStatusCode() == HttpStatus.NOT_MODIFIED.value()) {
        // replace dataset in response with the cached dataset
        logger.debug("server said our ETAG is still valid, using cached dataset for URI {} ", resource);
        datasetResponse = new DatasetResponseWithStatusCodeAndHeaders(readDatasetFromByteArray(linkedDataCacheEntry.getDataset()), datasetResponse.getStatusCode(), datasetResponse.getResponseHeaders());
    } else {
        logger.debug("server said our ETAG is not valid, not using cached result for URI {} ", resource);
    // We would like to remove the item from the cache immediately because it is now
    // outdated. However, we cannot
    // remove the cached result from the cache here because we may have gotten any
    // response from the
    // server (i.e. 1xx, 2xx, 3xx, 4xx, 5xx). However, if the ETAG isn't valid,
    // we'll overwrite the cache entry down
    // the line or remove it if the server decides to forbid caching.
    return datasetResponse;
