Example 6 with CrawlUriMessage

use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

the class WorkerCrawlerActor method crawlUri.

private void crawlUri(CrawlUriMessage uriMsg) {
    Dataset ds = null;
    List<String> etags = null;
    Lock lock = null;
    try {
        // check if resource is already downloaded
        if (uriMsg instanceof ResourceCrawlUriMessage) {
            ResourceCrawlUriMessage resMsg = ((ResourceCrawlUriMessage) uriMsg);
            if (resMsg.getSerializedResource() != null && resMsg.getSerializationFormat() != null) {
                // TODO: this should be optimized, why deserialize the resource here when we just want to save it in the RDF
                // store? How to insert this serialized resource into the SPARQL endpoint?
                ds = SparqlService.deserializeDataset(resMsg.getSerializedResource(), resMsg.getSerializationFormat());
            }
        }
        // download resource if not already downloaded
        if (ds == null) {
            // use ETag/If-None-Match Headers to make the process more efficient
            HttpHeaders httpHeaders = new HttpHeaders();
            if (uriMsg.getResourceETagHeaderValues() != null && !uriMsg.getResourceETagHeaderValues().isEmpty()) {
                String ifNoneMatchHeaderValue = StringUtils.collectionToDelimitedString(uriMsg.getResourceETagHeaderValues(), ", ");
                httpHeaders.add("If-None-Match", ifNoneMatchHeaderValue);
            }
            DatasetResponseWithStatusCodeAndHeaders datasetWithHeaders = linkedDataSource.getDatasetWithHeadersForResource(URI.create(uriMsg.getUri()), httpHeaders);
            ds = datasetWithHeaders.getDataset();
            etags = datasetWithHeaders.getResponseHeaders().get("ETag");
            // if dataset was not modified (304) we can treat the current crawl uri as done
            if (ds == null && datasetWithHeaders.getStatusCode() == 304) {
                sendDoneUriMessage(uriMsg, uriMsg.getWonNodeUri(), etags);
                return;
            }
            // if there is paging activated and the won node tells us that there is more data (previous link)
            // to be downloaded, then we add this link to the crawling process too
            String prevLink = linkedDataSource.getPreviousLinkFromDatasetWithHeaders(datasetWithHeaders);
            if (prevLink != null) {
                CrawlUriMessage newUriMsg = new CrawlUriMessage(uriMsg.getBaseUri(), prevLink, uriMsg.getWonNodeUri(), CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null);
                getSender().tell(newUriMsg, getSelf());
            }
        }
        if (ds == null) {
            // neither a cached copy nor a fresh download is available: fail with a descriptive
            // message instead of the NullPointerException the old null-check-then-dereference produced
            throw new IllegalStateException("No dataset available for crawl URI " + uriMsg.getUri());
        }
        lock = ds.getLock();
        lock.enterCriticalSection(Lock.READ);
        // Save dataset to triple store
        sparqlService.updateNamedGraphsOfDataset(ds);
        String wonNodeUri = extractWonNodeUri(ds, uriMsg.getUri());
        if (wonNodeUri == null) {
            wonNodeUri = uriMsg.getWonNodeUri();
        }
        // do nothing more here if the STATUS of the message was SAVE
        if (uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
            log.debug("processed crawl uri event {} with status 'SAVE'", uriMsg);
            return;
        }
        // extract URIs from current resource and send extracted URI messages back to sender
        log.debug("Extract URIs from message {}", uriMsg);
        Set<CrawlUriMessage> newCrawlMessages = sparqlService.extractCrawlUriMessages(uriMsg.getBaseUri(), wonNodeUri);
        for (CrawlUriMessage newMsg : newCrawlMessages) {
            getSender().tell(newMsg, getSelf());
        }
        // signal sender that this URI is processed and save meta data about crawling the URI.
        // This needs to be done after all extracted URI messages have been sent to guarantee consistency
        // in case of failure
        sendDoneUriMessage(uriMsg, wonNodeUri, etags);
        // if this URI/dataset was a need then send an event to the distributed event bus
        if (NeedModelWrapper.isANeed(ds)) {
            NeedModelWrapper needModelWrapper = new NeedModelWrapper(ds, false);
            NeedState state = needModelWrapper.getNeedState();
            NeedEvent.TYPE type = state.equals(NeedState.ACTIVE) ? NeedEvent.TYPE.ACTIVE : NeedEvent.TYPE.INACTIVE;
            log.debug("Created need event for need uri {}", uriMsg.getUri());
            long crawlDate = System.currentTimeMillis();
            NeedEvent needEvent = new NeedEvent(uriMsg.getUri(), wonNodeUri, type, crawlDate, ds);
            pubSubMediator.tell(new DistributedPubSubMediator.Publish(needEvent.getClass().getName(), needEvent), getSelf());
        }
    } catch (RestClientException e1) {
        // usually happens if the fetch of the dataset fails e.g. HttpServerErrorException, HttpClientErrorException
        log.debug("Exception during crawling: " + e1);
        throw new CrawlWrapperException(e1, uriMsg);
    } catch (Exception e) {
        log.debug("Exception during crawling: " + e);
        throw new CrawlWrapperException(e, uriMsg);
    } finally {
        if (lock != null) {
            lock.leaveCriticalSection();
        }
    }
}
Also used : HttpHeaders(org.springframework.http.HttpHeaders) CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage) DatasetResponseWithStatusCodeAndHeaders(won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders) DistributedPubSubMediator(akka.cluster.pubsub.DistributedPubSubMediator) Dataset(org.apache.jena.query.Dataset) NeedState(won.protocol.model.NeedState) NeedModelWrapper(won.protocol.util.NeedModelWrapper) NeedEvent(won.matcher.service.common.event.NeedEvent) CrawlWrapperException(won.matcher.service.crawler.exception.CrawlWrapperException) IncorrectPropertyCountException(won.protocol.exception.IncorrectPropertyCountException) RestClientException(org.springframework.web.client.RestClientException) Lock(org.apache.jena.shared.Lock)
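The conditional fetch in crawlUri relies on the HTTP ETag / If-None-Match handshake: previously stored ETag values are replayed in the request header, and a 304 Not Modified response means the resource is unchanged, so the URI can be reported as done without re-saving anything. Below is a minimal, self-contained sketch of that pattern using plain java.net, independent of the project's LinkedDataSource; the class and method names are illustrative only.

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collection;

public class ConditionalFetchSketch {

    /**
     * Fetches a resource, replaying the known ETags as an If-None-Match header.
     * Returns null if the server answered 304 Not Modified (resource unchanged).
     */
    public static String fetchIfModified(String uri, Collection<String> knownEtags) throws IOException {
        HttpURLConnection con = (HttpURLConnection) new URL(uri).openConnection();
        if (knownEtags != null && !knownEtags.isEmpty()) {
            // same comma-separated form as in crawlUri above
            con.setRequestProperty("If-None-Match", String.join(", ", knownEtags));
        }
        if (con.getResponseCode() == HttpURLConnection.HTTP_NOT_MODIFIED) {
            // resource unchanged: the caller can treat this URI as done
            return null;
        }
        // the new ETag would be persisted and replayed on the next crawl cycle (omitted here)
        String newEtag = con.getHeaderField("ETag");
        try (InputStream in = con.getInputStream()) {
            return new String(in.readAllBytes(), StandardCharsets.UTF_8);
        }
    }
}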

Example 7 with CrawlUriMessage

use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

the class MasterCrawlerActor method startCrawling.

/**
 * Starts crawling a won node, beginning at the need list.
 *
 * @param wonNodeInfo meta data about the won node to crawl (won node URI, need list URI, connection URI prefix)
 */
private void startCrawling(WonNodeInfo wonNodeInfo) {
    // get the last known need modification date and start crawling from this point again
    log.info("start crawling won node: {} ...", wonNodeInfo.getWonNodeURI());
    String lastNeedModificationDate = sparqlService.retrieveNeedModificationDateForCrawling(wonNodeInfo.getWonNodeURI());
    if (lastNeedModificationDate != null) {
        String needListUri = removeEndingSlash(wonNodeInfo.getNeedListURI());
        String modifiedUri = needListUri + "?modifiedafter=" + lastNeedModificationDate;
        self().tell(new CrawlUriMessage(modifiedUri, needListUri, wonNodeInfo.getWonNodeURI(), CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null), getSelf());
    } else {
        // otherwise, if no needs have been crawled yet, crawl the whole won node
        String needListUri = removeEndingSlash(wonNodeInfo.getNeedListURI());
        self().tell(new CrawlUriMessage(needListUri, needListUri, wonNodeInfo.getWonNodeURI(), CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null), getSelf());
    }
    // get the last known connection modification date and start crawling from this point again
    String lastConnectionModificationDate = sparqlService.retrieveConnectionModificationDateForCrawling(wonNodeInfo.getWonNodeURI());
    if (lastConnectionModificationDate != null) {
        String connectionPrefixUri = removeEndingSlash(wonNodeInfo.getConnectionURIPrefix());
        String modifiedUri = connectionPrefixUri + "?modifiedafter=" + lastConnectionModificationDate;
        self().tell(new CrawlUriMessage(modifiedUri, connectionPrefixUri, wonNodeInfo.getWonNodeURI(), CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null), getSelf());
    }
}
Also used : CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage)
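startCrawling resumes a crawl incrementally: when a last modification date is known it appends a modifiedafter query parameter to the slash-trimmed list URI, otherwise it starts from the plain list URI for a full crawl. Below is a small self-contained sketch of that decision; removeEndingSlash is reimplemented for self-containment and the example URIs are purely illustrative.

public class CrawlStartUriSketch {

    /** Mirrors removeEndingSlash in MasterCrawlerActor: strips a single trailing slash. */
    static String removeEndingSlash(String uri) {
        return uri.endsWith("/") ? uri.substring(0, uri.length() - 1) : uri;
    }

    /**
     * Returns the URI to start crawling from: incremental if a last modification
     * date is known, otherwise the plain list URI for a full crawl.
     */
    static String buildStartUri(String listUri, String lastModificationDate) {
        String base = removeEndingSlash(listUri);
        return lastModificationDate == null ? base : base + "?modifiedafter=" + lastModificationDate;
    }

    public static void main(String[] args) {
        // full crawl (no previous crawl data)
        System.out.println(buildStartUri("https://node.example.org/won/resource/need/", null));
        // incremental crawl from the last known modification date
        System.out.println(buildStartUri("https://node.example.org/won/resource/need/", "2018-01-01T00:00:00.000Z"));
    }
}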

Example 8 with CrawlUriMessage

use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

the class UpdateMetadataActor method onReceive.

/**
 * Collects messages until the maximum bulk update size is reached or a timer
 * elapses, then executes the meta data bulk update.
 *
 * @param message a {@link CrawlUriMessage} is added to the bulk update list; any String triggers the update
 */
@Override
public void onReceive(final Object message) {
    if (message instanceof CrawlUriMessage) {
        CrawlUriMessage uriMsg = (CrawlUriMessage) message;
        log.debug("Add message to bulk update list: {}", uriMsg);
        bulkMessages.add(uriMsg);
        if (bulkMessages.size() >= config.getMetaDataUpdateMaxBulkSize()) {
            update();
        }
    } else if (message instanceof String) {
        update();
    } else {
        unhandled(message);
    }
}
Also used : CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage)
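The onReceive above has two flush triggers: the bulk-size threshold and a plain String message sent by a timer (the scheduling itself is not part of this example). The following is a hedged sketch of how such a periodic tick could be wired up with the classic Akka scheduler, assuming an UntypedActor-based actor; the interval and the tick text are illustrative assumptions, not the project's actual configuration.

import java.util.concurrent.TimeUnit;

import akka.actor.ActorRef;
import akka.actor.UntypedActor;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;

public abstract class PeriodicFlushActor extends UntypedActor {

    private static final String TICK = "tick"; // any String triggers update() in onReceive above

    @Override
    public void preStart() {
        FiniteDuration interval = Duration.create(30, TimeUnit.SECONDS);
        // send ourselves a String message at a fixed interval; onReceive treats it as a flush signal
        getContext().system().scheduler().schedule(
                interval, interval, getSelf(), TICK,
                getContext().dispatcher(), ActorRef.noSender());
    }
}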

Example 9 with CrawlUriMessage

use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

the class WorkerCrawlerActor method onReceive.

/**
 * Receives messages with a URI and processes them by requesting the resource,
 * saving it to a triple store, extracting URIs from the content and answering the sender.
 *
 * @param msg if type is {@link CrawlUriMessage} then process it
 */
@Override
public void onReceive(Object msg) throws RestClientException {
    if (!(msg instanceof CrawlUriMessage)) {
        unhandled(msg);
        return;
    }
    CrawlUriMessage uriMsg = (CrawlUriMessage) msg;
    if (!uriMsg.getStatus().equals(CrawlUriMessage.STATUS.PROCESS) && !uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
        unhandled(msg);
        return;
    }
    crawlUri(uriMsg);
}
Also used : CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage)
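The guard above only lets CrawlUriMessages with status PROCESS or SAVE through to crawlUri. The sketch below shows how a caller could build such a message with the six-argument constructor used throughout these examples and send it to an already-created worker; obtaining the worker's ActorRef (the project wires its actors via Spring) is omitted, and the helper class and method names are illustrative.

import akka.actor.ActorRef;
import won.matcher.service.crawler.msg.CrawlUriMessage;

public class WorkerMessagingSketch {

    /**
     * Builds a PROCESS message and sends it to a worker crawler actor.
     * Messages with any other status (e.g. DONE) would end up in unhandled().
     */
    public static void sendProcessMessage(ActorRef worker, String uri, String baseUri, String wonNodeUri) {
        CrawlUriMessage msg = new CrawlUriMessage(
                uri, baseUri, wonNodeUri,
                CrawlUriMessage.STATUS.PROCESS,
                System.currentTimeMillis(), null);
        worker.tell(msg, ActorRef.noSender());
    }
}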

Example 10 with CrawlUriMessage

use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

the class WorkerCrawlerActor method sendDoneUriMessage.

private void sendDoneUriMessage(CrawlUriMessage sourceUriMessage, String wonNodeUri, Collection<String> etags) {
    long crawlDate = System.currentTimeMillis();
    CrawlUriMessage uriDoneMsg = new CrawlUriMessage(sourceUriMessage.getUri(), sourceUriMessage.getBaseUri(), wonNodeUri, CrawlUriMessage.STATUS.DONE, crawlDate, etags);
    String ifNoneMatch = sourceUriMessage.getResourceETagHeaderValues() != null ? String.join(", ", sourceUriMessage.getResourceETagHeaderValues()) : "<None>";
    String responseETags = etags != null ? String.join(", ", etags) : "<None>";
    log.debug("Crawling done for URI {} with ETag Header Values {} (If-None-Match request value: {})", uriDoneMsg.getUri(), responseETags, ifNoneMatch);
    getSender().tell(uriDoneMsg, getSelf());
}
Also used : CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage)

Aggregations

CrawlUriMessage (won.matcher.service.crawler.msg.CrawlUriMessage) 10
ResourceCrawlUriMessage (won.matcher.service.crawler.msg.ResourceCrawlUriMessage) 6
DistributedPubSubMediator (akka.cluster.pubsub.DistributedPubSubMediator) 2
Dataset (org.apache.jena.query.Dataset) 1
Lock (org.apache.jena.shared.Lock) 1
HttpHeaders (org.springframework.http.HttpHeaders) 1
RestClientException (org.springframework.web.client.RestClientException) 1
NeedEvent (won.matcher.service.common.event.NeedEvent) 1
WonNodeEvent (won.matcher.service.common.event.WonNodeEvent) 1
CrawlWrapperException (won.matcher.service.crawler.exception.CrawlWrapperException) 1
IncorrectPropertyCountException (won.protocol.exception.IncorrectPropertyCountException) 1
NeedState (won.protocol.model.NeedState) 1
DatasetResponseWithStatusCodeAndHeaders (won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders) 1
NeedModelWrapper (won.protocol.util.NeedModelWrapper) 1