Use of won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.
The class CachingLinkedDataSource, method fetchOnlyOnce.
/**
 * We may be asked to fetch the same URI multiple times concurrently. This
 * method makes sure only one HTTP request is made and its response is used
 * for every waiting client.
 *
 * @param resource the URI of the resource to fetch
 * @param requesterWebID optional WebID URI to use for the request
 * @param linkedDataCacheEntry optional cache entry for the resource
 * @param headers HTTP headers to send with the request
 * @return the response, possibly recreated from the cache
 */
private DatasetResponseWithStatusCodeAndHeaders fetchOnlyOnce(final URI resource, final URI requesterWebID,
                final LinkedDataCacheEntry linkedDataCacheEntry, final HttpHeaders headers) {
    String cacheKey = makeCacheKey(resource, requesterWebID);
    CountDownLatch latch = new CountDownLatch(1);
    CountDownLatch preExistingLatch = countDownLatchMap.putIfAbsent(cacheKey, latch);
    try {
        if (preExistingLatch != null) {
            logger.debug("resource {} is being fetched in another thread, we wait for its result and use it if it turns out to be cacheable", cacheKey);
            // another thread is already fetching the URI. Wait.
            try {
                preExistingLatch.await();
            } catch (InterruptedException e) {
                logger.warn("interrupted while waiting for another thread to fetch '{}'", resource);
            }
            // now the other thread is done fetching the resource. It may not have
            // been allowed to cache it, in which case we have to fetch it again.
            // We try the cache first:
            Element element = cache.get(cacheKey);
            if (element != null) {
                logger.debug("resource {} turned out to be cacheable, using it", cacheKey);
                // ok, we'll recreate a response from the cache.
                // Caution: this is not a copy, it's the SAME dataset - so
                // manipulating the result causes side effects.
                LinkedDataCacheEntry entry = (LinkedDataCacheEntry) element.getObjectValue();
                return entry.recreateResponse();
            }
            logger.debug("resource {} did not turn out to be cacheable - fetching it, too", cacheKey);
            // the cache still doesn't have it, so every thread fetches it for itself.
        }
        return fetchAndCacheIfAppropriate(resource, requesterWebID, linkedDataCacheEntry, headers);
    } finally {
        // remove the latch from the map if it is still ours
        countDownLatchMap.remove(cacheKey, latch);
        // wake up all threads that might be waiting at our latch
        latch.countDown();
    }
}
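The latch handshake above is an instance of the general "single flight" idiom: one latch per key, the first registrant does the work, everyone else waits on the latch and then re-checks a shared cache. A minimal, self-contained sketch of that idiom (the SingleFlight class and its loader/cache fields are hypothetical, not part of webofneeds):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CountDownLatch;
import java.util.function.Supplier;

// Hypothetical helper illustrating the latch-per-key "single flight" idiom
// used by fetchOnlyOnce: the first caller for a key does the work; waiters
// block on the latch and then re-check the shared cache.
public class SingleFlight<K, V> {
    private final ConcurrentMap<K, CountDownLatch> inFlight = new ConcurrentHashMap<>();
    private final ConcurrentMap<K, V> cache = new ConcurrentHashMap<>();

    public V fetch(K key, Supplier<V> loader) {
        CountDownLatch latch = new CountDownLatch(1);
        CountDownLatch existing = inFlight.putIfAbsent(key, latch);
        try {
            if (existing != null) {
                try {
                    existing.await(); // another thread is fetching; wait for it
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
                V cached = cache.get(key);
                if (cached != null) {
                    return cached; // the other thread's result was cacheable
                }
                // not cacheable: fall through and fetch for ourselves
            }
            V value = loader.get();
            cache.put(key, value); // the real code caches only if headers permit
            return value;
        } finally {
            inFlight.remove(key, latch); // removes the entry only if it is still our latch
            latch.countDown();           // release any threads waiting on us
        }
    }
}

As in fetchOnlyOnce, the cleanup uses the two-argument remove(key, latch), so a waiter never evicts the latch a later registrant has just installed.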
Use of won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.
The class CachingLinkedDataSource, method fetchOrUseCached.
/**
 * This method respects the headers 'Expires', 'Cache-Control', and 'ETag': if a
 * cached resource (indicated by a non-null linkedDataCacheEntry) is expired,
 * either according to the expiry date or to the cache-control header from the
 * earlier request, the request will be made. When the request is made and an
 * ETag value is known from an earlier request, it is sent as the
 * 'If-None-Match' header value. In that case the server is expected to answer
 * with status 304 (Not Modified), and the cached response will be used,
 * updating the cache control information if the server chooses to send
 * 'Expires' or 'Cache-Control' headers.
 *
 * @param resource the URI of the resource to fetch
 * @param requesterWebID optional WebID URI to use for the request
 * @param linkedDataCacheEntry optional cache entry to use
 * @return the response, either fetched remotely or recreated from the cache
 */
private DatasetResponseWithStatusCodeAndHeaders fetchOrUseCached(final URI resource, final URI requesterWebID,
                LinkedDataCacheEntry linkedDataCacheEntry) {
    // check
    // * if we have a cached result
    // * if we can use it
    // * make the request, possibly using the ETag
    // * cache the new result if appropriate
    // * if the ETag indicates 'not modified', return the cached result but update the caching info
    // * return the result
    DatasetResponseWithStatusCodeAndHeaders responseData = null;
    HttpHeaders headers = new HttpHeaders();
    if (linkedDataCacheEntry != null) {
        Date now = new Date();
        // check whether we can use the cached entry or have to re-fetch:
        if (linkedDataCacheEntry.isExpiredAtDate(now)) {
            // cache item is expired. Remove it from the cache and fetch again
            cache.remove(makeCacheKey(resource, requesterWebID));
            logger.debug("cache item {} expired, fetching again.", resource);
            return fetchOnlyOnce(resource, requesterWebID, linkedDataCacheEntry, headers);
        }
        if (linkedDataCacheEntry.getCacheControlFlags().contains(CacheControlFlag.PRIVATE) && isSharedCache()) {
            // in this case we assume that the response is not publicly visible, so it
            // depends on the specified requesterWebID. The check is performed by the
            // server. We cannot return a cached response immediately, but further
            // down the line the ETag-based system can do that.
            logger.debug("cache item {} is Cache-Control:private and we are a shared cache. Will return cached copy only after server checks ETag (and client cert), therefore sending request to server.", resource);
            return fetchOnlyOnce(resource, requesterWebID, linkedDataCacheEntry, headers);
        }
        logger.debug("returning cached version of {}", resource);
        // we can use the cached result directly
        return linkedDataCacheEntry.recreateResponse();
    }
    // nothing found in the cache, fetch the resource remotely
    logger.debug("Nothing found in cache for {}, fetching remotely", resource);
    responseData = fetchOnlyOnce(resource, requesterWebID, null, headers);
    // inform the crawler callback
    if (crawlerCallback != null) {
        try {
            crawlerCallback.onDatasetCrawled(resource, responseData.getDataset());
        } catch (Exception e) {
            logger.info(String.format("error during callback execution for dataset %s", resource.toString()), e);
        }
    }
    return responseData;
}
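For illustration, the conditional-request dance described in the Javadoc can be reproduced with plain Spring classes. A hedged sketch, assuming Spring's RestTemplate as the HTTP client (webofneeds hides the client behind linkedDataRestClient; ConditionalGetExample and its parameters are made up for the example):

import java.net.URI;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;

// Hypothetical, stand-alone illustration of a conditional GET: send the
// cached ETag as If-None-Match and reuse the cached body on 304 Not Modified.
public class ConditionalGetExample {
    public static String fetch(URI resource, String cachedEtag, String cachedBody) {
        RestTemplate restTemplate = new RestTemplate();
        HttpHeaders headers = new HttpHeaders();
        if (cachedEtag != null) {
            headers.setIfNoneMatch(cachedEtag);
        }
        ResponseEntity<String> response = restTemplate.exchange(
                        resource, HttpMethod.GET, new HttpEntity<>(headers), String.class);
        if (response.getStatusCode() == HttpStatus.NOT_MODIFIED) {
            return cachedBody; // server confirmed our copy is still valid
        }
        return response.getBody(); // fresh representation, would replace the cache entry
    }
}

RestTemplate's default error handler treats only 4xx/5xx responses as errors, so a 304 is returned normally (with an empty body), which is exactly the case the cached copy covers.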
Use of won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.
The class WorkerCrawlerActor, method crawlUri.
private void crawlUri(CrawlUriMessage uriMsg) {
    Dataset ds = null;
    List<String> etags = null;
    Lock lock = null;
    try {
        // check if the resource has already been downloaded
        if (uriMsg instanceof ResourceCrawlUriMessage) {
            ResourceCrawlUriMessage resMsg = ((ResourceCrawlUriMessage) uriMsg);
            if (resMsg.getSerializedResource() != null && resMsg.getSerializationFormat() != null) {
                // TODO: this should be optimized: why deserialize the resource here when we
                // just want to save it in the RDF store? How to insert this serialized
                // resource into the SPARQL endpoint?
                ds = SparqlService.deserializeDataset(resMsg.getSerializedResource(), resMsg.getSerializationFormat());
            }
        }
        // download the resource if not already downloaded
        if (ds == null) {
            // use the ETag/If-None-Match headers to make the process more efficient
            HttpHeaders httpHeaders = new HttpHeaders();
            if (uriMsg.getResourceETagHeaderValues() != null && !uriMsg.getResourceETagHeaderValues().isEmpty()) {
                String ifNoneMatchHeaderValue = StringUtils.collectionToDelimitedString(uriMsg.getResourceETagHeaderValues(), ", ");
                httpHeaders.add("If-None-Match", ifNoneMatchHeaderValue);
            }
            DatasetResponseWithStatusCodeAndHeaders datasetWithHeaders = linkedDataSource
                            .getDatasetWithHeadersForResource(URI.create(uriMsg.getUri()), httpHeaders);
            ds = datasetWithHeaders.getDataset();
            etags = datasetWithHeaders.getResponseHeaders().get("ETag");
            // if the dataset was not modified (304) we can treat the current crawl URI as done
            if (ds == null && datasetWithHeaders.getStatusCode() == 304) {
                sendDoneUriMessage(uriMsg, uriMsg.getWonNodeUri(), etags);
                return;
            }
            // if paging is activated and the won node tells us that there is more data
            // (a 'previous' link) to be downloaded, we add this link to the crawling process, too
            String prevLink = linkedDataSource.getPreviousLinkFromDatasetWithHeaders(datasetWithHeaders);
            if (prevLink != null) {
                CrawlUriMessage newUriMsg = new CrawlUriMessage(uriMsg.getBaseUri(), prevLink, uriMsg.getWonNodeUri(),
                                CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null);
                getSender().tell(newUriMsg, getSelf());
            }
        }
        if (ds == null) {
            // guard against the NPE the original ternary (lock = ds == null ? null : ds.getLock())
            // would cause below; the exception is wrapped into a CrawlWrapperException by the catch clause
            throw new IllegalStateException("no dataset available for URI " + uriMsg.getUri());
        }
        lock = ds.getLock();
        lock.enterCriticalSection(true); // true == Lock.READ
        // save the dataset to the triple store
        sparqlService.updateNamedGraphsOfDataset(ds);
        String wonNodeUri = extractWonNodeUri(ds, uriMsg.getUri());
        if (wonNodeUri == null) {
            wonNodeUri = uriMsg.getWonNodeUri();
        }
        // do nothing more here if the STATUS of the message was SAVE
        if (uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
            log.debug("processed crawl uri event {} with status 'SAVE'", uriMsg);
            return;
        }
        // extract URIs from the current resource and send the extracted URI messages back to the sender
        log.debug("Extract URIs from message {}", uriMsg);
        Set<CrawlUriMessage> newCrawlMessages = sparqlService.extractCrawlUriMessages(uriMsg.getBaseUri(), wonNodeUri);
        for (CrawlUriMessage newMsg : newCrawlMessages) {
            getSender().tell(newMsg, getSelf());
        }
        // signal the sender that this URI is processed and save metadata about crawling the URI.
        // This needs to be done after all extracted URI messages have been sent, to guarantee
        // consistency in case of failure
        sendDoneUriMessage(uriMsg, wonNodeUri, etags);
        // if this URI/dataset was a need, send an event to the distributed event bus
        if (NeedModelWrapper.isANeed(ds)) {
            NeedModelWrapper needModelWrapper = new NeedModelWrapper(ds, false);
            NeedState state = needModelWrapper.getNeedState();
            NeedEvent.TYPE type = state.equals(NeedState.ACTIVE) ? NeedEvent.TYPE.ACTIVE : NeedEvent.TYPE.INACTIVE;
            log.debug("Created need event for need uri {}", uriMsg.getUri());
            long crawlDate = System.currentTimeMillis();
            NeedEvent needEvent = new NeedEvent(uriMsg.getUri(), wonNodeUri, type, crawlDate, ds);
            pubSubMediator.tell(new DistributedPubSubMediator.Publish(needEvent.getClass().getName(), needEvent), getSelf());
        }
    } catch (RestClientException e1) {
        // usually happens if the fetch of the dataset fails, e.g. HttpServerErrorException or HttpClientErrorException
        log.debug("Exception during crawling: " + e1);
        throw new CrawlWrapperException(e1, uriMsg);
    } catch (Exception e) {
        log.debug("Exception during crawling: " + e);
        throw new CrawlWrapperException(e, uriMsg);
    } finally {
        if (lock != null) {
            lock.leaveCriticalSection();
        }
    }
}
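The lock handling in crawlUri follows Jena's critical-section idiom: acquire the dataset's lock, do the work, and release the lock in a finally block. A minimal stand-alone sketch, assuming Apache Jena 3.x package names (JenaLockExample is hypothetical):

import org.apache.jena.query.Dataset;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.shared.Lock;

// Minimal sketch of the Jena locking idiom used above: take the dataset's
// read lock while reading it and release it in a finally block.
// Lock.READ is the boolean 'true' passed to enterCriticalSection(true) in crawlUri.
public class JenaLockExample {
    public static void readSafely(Dataset ds) {
        Lock lock = ds.getLock();
        lock.enterCriticalSection(Lock.READ);
        try {
            ds.listNames().forEachRemaining(System.out::println); // read-only work
        } finally {
            lock.leaveCriticalSection();
        }
    }

    public static void main(String[] args) {
        readSafely(DatasetFactory.create());
    }
}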
Use of won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.
The class CachingLinkedDataSource, method fetch.
/**
 * Performs the actual request via the linkedDataRestClient.
 *
 * @param resource the URI of the resource to fetch
 * @param requesterWebID optional WebID URI to use for the request
 * @param headers HTTP headers to send with the request
 * @return the response obtained from the linkedDataRestClient
 */
private DatasetResponseWithStatusCodeAndHeaders fetch(final URI resource, final URI requesterWebID, final HttpHeaders headers) {
    final DatasetResponseWithStatusCodeAndHeaders responseData;
    if (requesterWebID != null) {
        logger.debug("fetching linked data for URI {} with WebID {}", resource, requesterWebID);
        responseData = linkedDataRestClient.readResourceDataWithHeaders(resource, requesterWebID, headers);
        if (logger.isDebugEnabled()) {
            logger.debug("fetched resource {} with requesterWebID {}:", resource, requesterWebID);
            RDFDataMgr.write(System.out, responseData.getDataset(), Lang.TRIG);
        }
    } else {
        logger.debug("fetching linked data for URI {} without WebID", resource);
        responseData = linkedDataRestClient.readResourceDataWithHeaders(resource, headers);
        if (logger.isDebugEnabled()) {
            logger.debug("fetched resource {} without requesterWebID:", resource);
            RDFDataMgr.write(System.out, responseData.getDataset(), Lang.TRIG);
        }
    }
    return responseData;
}
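The debug branch dumps the dataset to System.out rather than to the logger. If one wanted the TriG dump to go through the logger instead, it could be captured into a string first; a small sketch using the same Jena API (TrigDumpExample is hypothetical):

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.jena.query.Dataset;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;

// Sketch of the debug dump used in fetch(): serialize the fetched Dataset
// as TriG into a string so it can be passed to a logger instead of System.out.
public class TrigDumpExample {
    public static String toTrig(Dataset dataset) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        RDFDataMgr.write(out, dataset, Lang.TRIG);
        return new String(out.toByteArray(), StandardCharsets.UTF_8);
    }

    public static void main(String[] args) {
        System.out.println(toTrig(DatasetFactory.create()));
    }
}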
Use of won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders in project webofneeds by researchstudio-sat.
The class CachingLinkedDataSource, method fetchWithEtagValidation.
/**
 * Checks if the cached entry has an ETag value set and uses the 'If-None-Match'
 * header if that is the case. If the server responds with 304 Not Modified,
 * the cached dataset replaces the (empty) dataset coming from the server in the
 * DatasetResponseWithStatusCodeAndHeaders.
 *
 * @param resource the URI of the resource to fetch
 * @param requesterWebID optional WebID URI to use for the request
 * @param linkedDataCacheEntry optional cache entry holding the ETag of an earlier response
 * @param headers HTTP headers to send with the request
 * @return the response, with the cached dataset substituted on a 304
 */
private DatasetResponseWithStatusCodeAndHeaders fetchWithEtagValidation(final URI resource, final URI requesterWebID,
                final LinkedDataCacheEntry linkedDataCacheEntry, final HttpHeaders headers) {
    if (linkedDataCacheEntry == null || linkedDataCacheEntry.getEtag() == null) {
        logger.debug("fetching from server without ETag validation: {}", resource);
        return fetch(resource, requesterWebID, headers);
    }
    // we already have an ETag - use it for validation
    HttpHeaders myHeaders = headers != null ? headers : new HttpHeaders();
    myHeaders.add(HttpHeaders.IF_NONE_MATCH, linkedDataCacheEntry.getEtag());
    logger.debug("fetching from server with ETag validation: {}", resource);
    DatasetResponseWithStatusCodeAndHeaders datasetResponse = fetch(resource, requesterWebID, myHeaders);
    if (datasetResponse.getStatusCode() == HttpStatus.NOT_MODIFIED.value()) {
        // replace the dataset in the response with the cached dataset
        logger.debug("server said our ETag is still valid, using cached dataset for URI {}", resource);
        datasetResponse = new DatasetResponseWithStatusCodeAndHeaders(readDatasetFromByteArray(linkedDataCacheEntry.getDataset()),
                        datasetResponse.getStatusCode(), datasetResponse.getResponseHeaders());
    } else {
        logger.debug("server said our ETag is not valid, not using cached result for URI {}", resource);
        // We would like to remove the item from the cache immediately because it is
        // now outdated. However, we cannot remove the cached result from the cache
        // here because we may have gotten any response from the server (i.e. 1xx,
        // 2xx, 3xx, 4xx, 5xx). If the ETag isn't valid, we'll overwrite the cache
        // entry down the line, or remove it if the server decides to forbid caching.
    }
    return datasetResponse;
}
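The "forbid caching" case mentioned in the comment ultimately comes down to interpreting the response's Cache-Control directives. A simplified, hypothetical sketch of that decision using Spring's HttpHeaders (CachePolicyExample and its return convention are made up; real Cache-Control parsing handles more directives):

import org.springframework.http.HttpHeaders;

// Hypothetical sketch of the caching decision mentioned above: derive cache
// permission and lifetime from the response's Cache-Control header.
// Only the directives relevant here are handled.
public class CachePolicyExample {
    /** Returns the permitted lifetime in seconds, -1 if caching is forbidden, 0 if unspecified. */
    public static long permittedLifetimeSeconds(HttpHeaders responseHeaders) {
        String cacheControl = responseHeaders.getCacheControl();
        if (cacheControl == null) {
            return 0; // no directive: fall back to the Expires header, if any
        }
        if (cacheControl.contains("no-store")) {
            return -1; // server forbids caching: remove/skip the cache entry
        }
        for (String directive : cacheControl.split(",")) {
            directive = directive.trim();
            if (directive.startsWith("max-age=")) {
                return Long.parseLong(directive.substring("max-age=".length()));
            }
        }
        return 0;
    }
}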