
Example 1 with CrawlUriMessage

Use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

From class CrawlSparqlService, method bulkUpdateCrawlingMetadata:

/**
 * Bulk update of several metadata messages about the crawling process, using a separate graph.
 * @param msgs multiple messages that describe crawling metadata to update
 */
public void bulkUpdateCrawlingMetadata(Collection<CrawlUriMessage> msgs) {
    StringBuilder builder = new StringBuilder();
    for (CrawlUriMessage msg : msgs) {
        builder.append(createUpdateCrawlingMetadataQuery(msg));
    }
    // execute the bulk query
    executeUpdateQuery(builder.toString());
}
Also used: CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage)
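
As a usage sketch, a caller might collect several messages and push them in one round trip. The URIs and the crawlSparqlService variable below are hypothetical; the constructor shape follows the one used in Example 2:

import java.util.ArrayList;
import java.util.Collection;
import won.matcher.service.crawler.msg.CrawlUriMessage;

// Hypothetical usage: re-queue two resource URIs with a single bulk metadata update.
Collection<CrawlUriMessage> updates = new ArrayList<>();
for (String uri : new String[] { "https://node.example.org/won/resource/need/1",
                                 "https://node.example.org/won/resource/need/2" }) {
    updates.add(new CrawlUriMessage(uri, uri,
            "https://node.example.org/won/resource",    // won node URI (hypothetical)
            CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(),
            null));                                     // no ETags known yet
}
crawlSparqlService.bulkUpdateCrawlingMetadata(updates);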

Example 2 with CrawlUriMessage

Use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

From class CrawlSparqlService, method retrieveMessagesForCrawling:

/**
 * Gets all messages saved in the database that have a certain status (e.g. FAILED) and resets
 * them to status PROCESS so that crawling can be executed again.
 *
 * @param status status of the messages to retrieve
 * @return set of messages reset to status PROCESS, ready to be crawled again
 */
public Set<CrawlUriMessage> retrieveMessagesForCrawling(CrawlUriMessage.STATUS status) {
    Set<CrawlUriMessage> msgs = new LinkedHashSet<>();
    String queryString =
        "SELECT ?uri ?base ?wonNode (group_concat(distinct ?etag;separator=\"" + HTTP_HEADER_SEPARATOR + "\") as ?etags)"
        + " WHERE { GRAPH won:crawlMetadata {\n"
        + " ?uri ?p ?status.\n"
        + " ?uri won:crawlBaseUri ?base.\n"
        + " OPTIONAL { ?uri won:wonNodeUri ?wonNode }\n"
        + " OPTIONAL { ?uri won:resourceETagValue ?etag }}}\n"
        + " GROUP BY ?uri ?base ?wonNode\n";
    ParameterizedSparqlString pps = new ParameterizedSparqlString();
    pps.setNsPrefix("won", "http://purl.org/webofneeds/model#");
    pps.setCommandText(queryString);
    pps.setLiteral("status", status.toString());
    log.debug("Query SPARQL Endpoint: {}", sparqlEndpoint);
    log.debug("Execute query: {}", pps.toString());
    try (QueryExecution qexec = QueryExecutionFactory.sparqlService(sparqlEndpoint, pps.asQuery())) {
        ResultSet results = qexec.execSelect();
        while (results.hasNext()) {
            QuerySolution qs = results.nextSolution();
            String uri = qs.get("uri").asResource().getURI();
            String baseUri = qs.get("base").asResource().getURI();
            String wonNode = null;
            Set<String> etags = null;
            if (qs.get("wonNode") != null) {
                wonNode = qs.get("wonNode").asResource().getURI();
            }
            if (qs.get("etags") != null) {
                String etagsString = qs.get("etags").asLiteral().getString();
                etags = commaConcatenatedStringToSet(etagsString);
            }
            // reset the message to status PROCESS so it will be crawled again
            CrawlUriMessage msg = new CrawlUriMessage(uri, baseUri, wonNode, CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), etags);
            log.debug("Created message: {}", msg);
            msgs.add(msg);
        }
    }
    return msgs;
}
Also used: CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage)
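
The helper commaConcatenatedStringToSet is not shown on this page. A minimal sketch, assuming it simply splits on the same HTTP_HEADER_SEPARATOR used by the group_concat above and drops empty entries:

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.stream.Collectors;

// Sketch only: split the group_concat result back into a set of ETag values.
// Assumes HTTP_HEADER_SEPARATOR is a plain character (e.g. ","), not a regex metacharacter.
private Set<String> commaConcatenatedStringToSet(String concatenated) {
    return Arrays.stream(concatenated.split(HTTP_HEADER_SEPARATOR))
                 .map(String::trim)
                 .filter(s -> !s.isEmpty())
                 .collect(Collectors.toCollection(LinkedHashSet::new));
}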

Example 3 with CrawlUriMessage

Use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

From class CrawlSparqlService, method extractCrawlUriMessagesForPropertyPath:

/**
 * Extract URIs linked from a resource URI and create new CrawlUriMessages for a certain property
 * path and base URI. Also extract ETag values where they are available for the URI resources, so
 * that they can be used to make crawling more efficient. The specified property path is used to
 * construct the query.
 *
 * @param baseUri base URI of the currently processed resource URI message
 * @param wonNodeUri won node URI of the currently processed resource URI message
 * @param propertyPath property path used to extract new URIs in conjunction with the base URI
 * @param baseProperty if true, each extracted URI becomes the base URI of its own message; otherwise baseUri is kept
 * @return set of CrawlUriMessages extracted using the base URI and property path, or null if the property path is empty
 */
private Set<CrawlUriMessage> extractCrawlUriMessagesForPropertyPath(String baseUri, String wonNodeUri, String propertyPath, boolean baseProperty) {
    if (propertyPath.trim().length() == 0) {
        return null;
    }
    // select URIs specified by property paths that have not already been crawled
    Set<CrawlUriMessage> newCrawlMessages = new HashSet<>();
    long crawlDate = System.currentTimeMillis();
    // We have to query the baseUri with and without a trailing slash because we don't know exactly
    // how the RDF data is described. Usually the "need" prefix ends with a trailing slash, but we
    // don't assume that is always the case, so we query both variants. Check the need list with its
    // need: rdfs:member entries for an example.
    // The propertyPath has to be concatenated into the query string manually: it contains ">"
    // characters, which ParameterizedSparqlString would reject as an injection risk.
    String queryString = "SELECT ?uri (group_concat(distinct ?etag;separator=\"" + HTTP_HEADER_SEPARATOR + "\") as ?etags) WHERE {\n"
        + "{ ?baseUriWithTrailingSlash " + propertyPath + " ?uri. } \n"
        + "UNION { ?baseUriWithoutTrailingSlash " + propertyPath + " ?uri. } \n"
        + " OPTIONAL {?uri won:resourceETagValue ?etag. }}\n"
        + " GROUP BY ?uri\n";
    ParameterizedSparqlString pps = new ParameterizedSparqlString();
    pps.setNsPrefix("won", "http://purl.org/webofneeds/model#");
    pps.setCommandText(queryString);
    baseUri = baseUri.trim();
    if (baseUri.endsWith("/")) {
        baseUri = baseUri.substring(0, baseUri.length() - 1);
    }
    pps.setIri("baseUriWithoutTrailingSlash", baseUri);
    pps.setIri("baseUriWithTrailingSlash", baseUri + "/");
    log.debug("Query SPARQL Endpoint: {}", sparqlEndpoint);
    log.debug("Execute query: {}", pps.toString());
    try (QueryExecution qexec = QueryExecutionFactory.sparqlService(sparqlEndpoint, pps.asQuery())) {
        ResultSet results = qexec.execSelect();
        while (results.hasNext()) {
            QuerySolution qs = results.nextSolution();
            String extractedUri = qs.get("uri").asResource().getURI();
            Set<String> etags = null;
            if (qs.get("etags") != null) {
                String etagsString = qs.get("etags").asLiteral().getString();
                etags = commaConcatenatedStringToSet(etagsString);
            }
            log.debug("Extracted URI: {}", extractedUri);
            CrawlUriMessage newUriMsg;
            if (baseProperty) {
                // the extracted URI becomes the base URI of the new crawl message
                newUriMsg = new CrawlUriMessage(extractedUri, extractedUri, wonNodeUri, CrawlUriMessage.STATUS.PROCESS, crawlDate, etags);
            } else {
                newUriMsg = new CrawlUriMessage(extractedUri, baseUri, wonNodeUri, CrawlUriMessage.STATUS.PROCESS, crawlDate, etags);
            }
            newCrawlMessages.add(newUriMsg);
        }
    }
    return newCrawlMessages;
}
Also used: CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage)
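
A sketch of how this private method might be invoked from the crawling logic. The property path and URIs below are illustrative assumptions, not values taken from the webofneeds configuration:

// Hypothetical call: follow rdfs:member links from a need list.
Set<CrawlUriMessage> next = extractCrawlUriMessagesForPropertyPath(
        "https://node.example.org/won/resource/need/",   // baseUri (hypothetical)
        "https://node.example.org/won/resource",         // wonNodeUri (hypothetical)
        "<http://www.w3.org/2000/01/rdf-schema#member>", // propertyPath, concatenated into the query
        false);                                          // keep baseUri as the base of the new messages
// callers must handle the null returned for an empty property path
if (next != null) {
    next.forEach(msg -> log.debug("New crawl message: {}", msg));
}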

Example 4 with CrawlUriMessage

Use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

From class MasterCrawlerActor, method preStart:

@Override
public void preStart() {
    // Schedule a recurring tick that regularly triggers re-crawling of the won nodes
    getContext().system().scheduler().schedule(config.getRecrawlIntervalDuration(), config.getRecrawlIntervalDuration(), getSelf(), RECRAWL_TICK, getContext().dispatcher(), null);
    // Create the router/pool with worker actors that do the actual crawling
    crawlingWorker = getContext().actorOf(SpringExtension.SpringExtProvider.get(getContext().system()).fromConfigProps(WorkerCrawlerActor.class), "CrawlingRouter");
    // create a single metadata update actor shared by all worker actors
    updateMetaDataWorker = getContext().actorOf(SpringExtension.SpringExtProvider.get(getContext().system()).props(UpdateMetadataActor.class), "MetaDataUpdateWorker");
    getContext().watch(updateMetaDataWorker);
    // create a need event loading actor
    getContext().actorOf(SpringExtension.SpringExtProvider.get(getContext().system()).props(NeedEventLoaderActor.class), "NeedEventLoader");
    // subscribe to won node events
    pubSubMediator = DistributedPubSub.get(getContext().system()).mediator();
    pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(WonNodeEvent.class.getName(), getSelf()), getSelf());
    // subscribe to crawl events
    pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(CrawlUriMessage.class.getName(), getSelf()), getSelf());
    pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(ResourceCrawlUriMessage.class.getName(), getSelf()), getSelf());
    // load the unfinished URIs and start crawling
    for (CrawlUriMessage msg : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.PROCESS)) {
        pendingMessages.put(msg.getUri(), msg);
        crawlingWorker.tell(msg, getSelf());
    }
    // re-send previously failed messages to self so they run through the normal receive logic
    for (CrawlUriMessage msg : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.FAILED)) {
        getSelf().tell(msg, getSelf());
    }
}
Also used: CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage) DistributedPubSubMediator(akka.cluster.pubsub.DistributedPubSubMediator)
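
Because the master actor subscribes to the CrawlUriMessage topic, any cluster member can feed it URIs through the mediator. A minimal sketch, with hypothetical URIs and an actor system variable named system:

import akka.actor.ActorRef;
import akka.cluster.pubsub.DistributedPubSub;
import akka.cluster.pubsub.DistributedPubSubMediator;
import won.matcher.service.crawler.msg.CrawlUriMessage;

// Publish a crawl request on the topic the MasterCrawlerActor subscribed to above.
ActorRef mediator = DistributedPubSub.get(system).mediator();
CrawlUriMessage msg = new CrawlUriMessage(
        "https://node.example.org/won/resource/need/3", // uri (hypothetical)
        "https://node.example.org/won/resource/need/3", // base uri
        "https://node.example.org/won/resource",        // won node uri (hypothetical)
        CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null);
mediator.tell(new DistributedPubSubMediator.Publish(CrawlUriMessage.class.getName(), msg),
        ActorRef.noSender());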

Example 5 with CrawlUriMessage

Use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.

From class MasterCrawlerActor, method onReceive:

/**
 * Process {@link won.matcher.service.crawler.msg.CrawlUriMessage} objects
 *
 * @param message the received message; CrawlUriMessage instances are passed to processCrawlUriMessage
 */
@Override
public void onReceive(final Object message) throws InterruptedException {
    if (message.equals(RECRAWL_TICK)) {
        askWonNodeInfoForCrawling();
    } else if (message instanceof WonNodeEvent) {
        processWonNodeEvent((WonNodeEvent) message);
    } else if (message instanceof CrawlUriMessage) {
        CrawlUriMessage uriMsg = (CrawlUriMessage) message;
        processCrawlUriMessage(uriMsg);
        log.debug("Number of pending messages: {}", pendingMessages.size());
    } else {
        unhandled(message);
    }
}
Also used: CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage) WonNodeEvent(won.matcher.service.common.event.WonNodeEvent)
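
The same dispatch can be expressed with the receive builder of Akka's newer AbstractActor API. This is a sketch of an equivalent, not the webofneeds implementation:

// Equivalent dispatch with AbstractActor.createReceive (Akka 2.5+ style).
@Override
public Receive createReceive() {
    return receiveBuilder()
            .matchEquals(RECRAWL_TICK, tick -> askWonNodeInfoForCrawling())
            .match(WonNodeEvent.class, this::processWonNodeEvent)
            .match(CrawlUriMessage.class, uriMsg -> {
                processCrawlUriMessage(uriMsg);
                log.debug("Number of pending messages: {}", pendingMessages.size());
            })
            .matchAny(this::unhandled)
            .build();
}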

Aggregations

CrawlUriMessage (won.matcher.service.crawler.msg.CrawlUriMessage): 10 usages
ResourceCrawlUriMessage (won.matcher.service.crawler.msg.ResourceCrawlUriMessage): 6 usages
DistributedPubSubMediator (akka.cluster.pubsub.DistributedPubSubMediator): 2 usages
Dataset (org.apache.jena.query.Dataset): 1 usage
Lock (org.apache.jena.shared.Lock): 1 usage
HttpHeaders (org.springframework.http.HttpHeaders): 1 usage
RestClientException (org.springframework.web.client.RestClientException): 1 usage
NeedEvent (won.matcher.service.common.event.NeedEvent): 1 usage
WonNodeEvent (won.matcher.service.common.event.WonNodeEvent): 1 usage
CrawlWrapperException (won.matcher.service.crawler.exception.CrawlWrapperException): 1 usage
IncorrectPropertyCountException (won.protocol.exception.IncorrectPropertyCountException): 1 usage
NeedState (won.protocol.model.NeedState): 1 usage
DatasetResponseWithStatusCodeAndHeaders (won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders): 1 usage
NeedModelWrapper (won.protocol.util.NeedModelWrapper): 1 usage