Search in sources :

Example 1 with NeedEvent

use of won.matcher.service.common.event.NeedEvent in project webofneeds by researchstudio-sat.

the class CrawlSparqlService method retrieveActiveNeedEvents.

/**
 * Loads active needs from the SPARQL endpoint and bundles them into one bulk event.
 *
 * <p>The query selects needs that are in state {@code won:Active}, have crawl status
 * {@code 'SAVE'} or {@code 'DONE'} and whose crawl date lies in the half-open range
 * {@code [fromDate, toDate)}. For every result row the need's dataset is fetched via
 * {@code retrieveNeedDataset} and serialized as TriG into a {@link NeedEvent}.
 *
 * @param fromDate inclusive lower bound of the crawl date
 * @param toDate exclusive upper bound of the crawl date
 * @param offset value bound to the query's {@code OFFSET}
 * @param limit value bound to the query's {@code LIMIT}
 * @param sortAscending {@code true} to order by crawl date ascending, {@code false} for descending
 * @return a bulk event holding one need event per result row (empty if the query matched nothing)
 */
public BulkNeedEvent retrieveActiveNeedEvents(long fromDate, long toDate, int offset, int limit, boolean sortAscending) {
    // query template to retrieve all active crawled/saved needs in a certain date range
    String orderClause = sortAscending ? "ORDER BY ?date\n" : "ORDER BY DESC(?date)\n";
    log.debug("bulk load need data from sparql endpoint in date range: [{},{}]", fromDate, toDate);
    String queryTemplate = "SELECT ?needUri ?wonNodeUri ?date WHERE {  \n" + "  ?needUri a won:Need. \n" + "  ?needUri won:crawlDate ?date.  \n" + "  ?needUri won:isInState won:Active. \n" + "  ?needUri won:hasWonNode ?wonNodeUri. \n" + "  {?needUri won:crawlStatus 'SAVE'.} UNION {?needUri won:crawlStatus 'DONE'.}\n" + "  FILTER (?date >= ?fromDate && ?date < ?toDate ) \n" + "} " + orderClause + " OFFSET ?offset\n" + " LIMIT ?limit";
    ParameterizedSparqlString pps = new ParameterizedSparqlString();
    pps.setNsPrefix("won", "http://purl.org/webofneeds/model#");
    pps.setCommandText(queryTemplate);
    pps.setLiteral("fromDate", fromDate);
    pps.setLiteral("toDate", toDate);
    pps.setLiteral("offset", offset);
    pps.setLiteral("limit", limit);
    log.debug("Query SPARQL Endpoint: {}", sparqlEndpoint);
    log.debug("Execute query: {}", pps.toString());
    // load all the needs into one bulk need event
    BulkNeedEvent bulkNeedEvent = new BulkNeedEvent();
    QueryExecution qexec = QueryExecutionFactory.sparqlService(sparqlEndpoint, pps.asQuery());
    try {
        ResultSet results = qexec.execSelect();
        while (results.hasNext()) {
            QuerySolution qs = results.nextSolution();
            String needUri = qs.get("needUri").asResource().getURI();
            String wonNodeUri = qs.get("wonNodeUri").asResource().getURI();
            long crawlDate = qs.getLiteral("date").getLong();
            Dataset ds = retrieveNeedDataset(needUri);
            StringWriter sw = new StringWriter();
            RDFDataMgr.write(sw, ds, RDFFormat.TRIG.getLang());
            NeedEvent needEvent = new NeedEvent(needUri, wonNodeUri, NeedEvent.TYPE.ACTIVE, crawlDate, sw.toString(), RDFFormat.TRIG.getLang());
            bulkNeedEvent.addNeedEvent(needEvent);
        }
    } finally {
        // release the query execution even if executing the query or loading a need fails
        qexec.close();
    }
    log.debug("number of need events created: {}", bulkNeedEvent.getNeedEvents().size());
    return bulkNeedEvent;
}
Also used : BulkNeedEvent(won.matcher.service.common.event.BulkNeedEvent) StringWriter(java.io.StringWriter) NeedEvent(won.matcher.service.common.event.NeedEvent) BulkNeedEvent(won.matcher.service.common.event.BulkNeedEvent)

Example 2 with NeedEvent

use of won.matcher.service.common.event.NeedEvent in project webofneeds by researchstudio-sat.

the class NeedConsumerProtocolActor method onReceive.

/**
 * Handles camel messages that announce need events.
 *
 * <p>A message is processed only if it is a {@link CamelMessage} carrying a need URI, a won node
 * URI and a method name header; everything else is passed to {@code unhandled}. For a processed
 * message a {@link NeedEvent} is published for the known method names (created/activated map to
 * ACTIVE, deactivated maps to INACTIVE), and in every case a {@link ResourceCrawlUriMessage}
 * with status SAVE carrying the message body is published as well.
 *
 * @param message the received message
 */
@Override
public void onReceive(final Object message) throws Exception {
    if (!(message instanceof CamelMessage)) {
        unhandled(message);
        return;
    }
    final CamelMessage camelMsg = (CamelMessage) message;
    final String needUri = (String) camelMsg.getHeaders().get(MSG_HEADER_NEED_URI);
    final String wonNodeUri = (String) camelMsg.getHeaders().get(MSG_HEADER_WON_NODE_URI);
    // monitoring code (note: the clock is started before the headers are validated)
    monitoringService.startClock(MonitoringService.NEED_HINT_STOPWATCH, needUri);
    if (needUri == null || wonNodeUri == null) {
        unhandled(message);
        return;
    }
    final Object methodName = camelMsg.getHeaders().get(MSG_HEADER_METHODNAME);
    if (methodName == null) {
        unhandled(message);
        return;
    }
    log.debug("Received event '{}' for needUri '{}' and wonNeedUri '{}' and publish it to matchers", methodName, needUri, wonNodeUri);
    final long crawlDate = System.currentTimeMillis();
    // map the method name to the type of need event to publish; null means "unknown method"
    final NeedEvent.TYPE eventType;
    if (methodName.equals(MSG_HEADER_METHODNAME_NEEDCREATED) || methodName.equals(MSG_HEADER_METHODNAME_NEEDACTIVATED)) {
        eventType = NeedEvent.TYPE.ACTIVE;
    } else if (methodName.equals(MSG_HEADER_METHODNAME_NEEDDEACTIVATED)) {
        eventType = NeedEvent.TYPE.INACTIVE;
    } else {
        eventType = null;
    }
    if (eventType == null) {
        unhandled(message);
    } else {
        // publish a need event to all the (distributed) matchers
        NeedEvent needEvent = new NeedEvent(needUri, wonNodeUri, eventType, crawlDate, camelMsg.body().toString(), Lang.TRIG);
        pubSubMediator.tell(new DistributedPubSubMediator.Publish(needEvent.getClass().getName(), needEvent), getSelf());
    }
    // let the crawler save the data of this event too (this also happens for unknown method names)
    ResourceCrawlUriMessage saveMsg = new ResourceCrawlUriMessage(needUri, needUri, wonNodeUri, CrawlUriMessage.STATUS.SAVE, crawlDate, null);
    saveMsg.setSerializedResource(camelMsg.body().toString());
    saveMsg.setSerializationFormat(Lang.TRIG);
    pubSubMediator.tell(new DistributedPubSubMediator.Publish(saveMsg.getClass().getName(), saveMsg), getSelf());
}
Also used : DistributedPubSubMediator(akka.cluster.pubsub.DistributedPubSubMediator) NeedEvent(won.matcher.service.common.event.NeedEvent) CamelMessage(akka.camel.CamelMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage)

Example 3 with NeedEvent

use of won.matcher.service.common.event.NeedEvent in project webofneeds by researchstudio-sat.

the class SaveNeedEventActor method onReceive.

/**
 * Stores the dataset of every received {@link NeedEvent} in the SPARQL endpoint.
 * Messages of any other type are passed to {@code unhandled}.
 *
 * @param o the received message
 */
@Override
public void onReceive(final Object o) throws Exception {
    if (!(o instanceof NeedEvent)) {
        unhandled(o);
        return;
    }
    final NeedEvent receivedEvent = (NeedEvent) o;
    log.debug("Save need event {} to sparql endpoint {}", receivedEvent, sparqlService.getSparqlEndpoint());
    // deserialize the need's dataset and write its named graphs to the store
    sparqlService.updateNamedGraphsOfDataset(receivedEvent.deserializeNeedDataset());
}
Also used : Dataset(org.apache.jena.query.Dataset) NeedEvent(won.matcher.service.common.event.NeedEvent)

Example 4 with NeedEvent

use of won.matcher.service.common.event.NeedEvent in project webofneeds by researchstudio-sat.

the class WorkerCrawlerActor method crawlUri.

/**
 * Crawls a single URI: obtains its dataset, saves it to the triple store, sends newly
 * extracted crawl messages back to the sender and publishes a {@link NeedEvent} if the
 * dataset describes a need.
 *
 * <p>The dataset is taken from the message itself when it is a {@link ResourceCrawlUriMessage}
 * that already carries a serialized resource; otherwise it is downloaded, using the message's
 * ETag values in an {@code If-None-Match} header. A 304 response without dataset marks the URI
 * as done without further processing. Messages with status SAVE are only stored, nothing is
 * extracted or published for them.
 *
 * @param uriMsg the crawl message describing the URI to process
 * @throws CrawlWrapperException wrapping any exception raised while crawling, including the
 *         case that no dataset could be obtained for the URI
 */
private void crawlUri(CrawlUriMessage uriMsg) {
    Dataset ds = null;
    List<String> etags = null;
    Lock lock = null;
    try {
        // check if resource is already downloaded
        if (uriMsg instanceof ResourceCrawlUriMessage) {
            ResourceCrawlUriMessage resMsg = ((ResourceCrawlUriMessage) uriMsg);
            if (resMsg.getSerializedResource() != null && resMsg.getSerializationFormat() != null) {
                // TODO: this should be optimized, why deserialize the resource here when we just want to save it in the RDF
                // store? How to insert this serialized resource into the SPARQL endpoint?
                ds = SparqlService.deserializeDataset(resMsg.getSerializedResource(), resMsg.getSerializationFormat());
            }
        }
        // download resource if not already downloaded
        if (ds == null) {
            // use ETag/If-None-Match Headers to make the process more efficient
            HttpHeaders httpHeaders = new HttpHeaders();
            if (uriMsg.getResourceETagHeaderValues() != null && !uriMsg.getResourceETagHeaderValues().isEmpty()) {
                String ifNoneMatchHeaderValue = StringUtils.collectionToDelimitedString(uriMsg.getResourceETagHeaderValues(), ", ");
                httpHeaders.add("If-None-Match", ifNoneMatchHeaderValue);
            }
            DatasetResponseWithStatusCodeAndHeaders datasetWithHeaders = linkedDataSource.getDatasetWithHeadersForResource(URI.create(uriMsg.getUri()), httpHeaders);
            ds = datasetWithHeaders.getDataset();
            etags = datasetWithHeaders.getResponseHeaders().get("ETag");
            // if dataset was not modified (304) we can treat the current crawl uri as done
            if (ds == null && datasetWithHeaders.getStatusCode() == 304) {
                sendDoneUriMessage(uriMsg, uriMsg.getWonNodeUri(), etags);
                return;
            }
            // if there is paging activated and the won node tells us that there is more data (previous link)
            // to be downloaded, then we add this link to the crawling process too
            String prevLink = linkedDataSource.getPreviousLinkFromDatasetWithHeaders(datasetWithHeaders);
            if (prevLink != null) {
                CrawlUriMessage newUriMsg = new CrawlUriMessage(uriMsg.getBaseUri(), prevLink, uriMsg.getWonNodeUri(), CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null);
                getSender().tell(newUriMsg, getSelf());
            }
        }
        // without a dataset there is nothing to save or extract; fail with a clear message
        // instead of a NullPointerException on the lock (still wrapped by the catch below)
        if (ds == null) {
            throw new IllegalStateException("No dataset available for crawl uri " + uriMsg.getUri());
        }
        Lock dsLock = ds.getLock();
        dsLock.enterCriticalSection(true);
        // remember the lock only once it is held, so the finally block never releases an unheld lock
        lock = dsLock;
        // Save dataset to triple store
        sparqlService.updateNamedGraphsOfDataset(ds);
        String wonNodeUri = extractWonNodeUri(ds, uriMsg.getUri());
        if (wonNodeUri == null) {
            wonNodeUri = uriMsg.getWonNodeUri();
        }
        // do nothing more here if the STATUS of the message was SAVE
        if (uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
            log.debug("processed crawl uri event {} with status 'SAVE'", uriMsg);
            return;
        }
        // extract URIs from current resource and send extracted URI messages back to sender
        log.debug("Extract URIs from message {}", uriMsg);
        Set<CrawlUriMessage> newCrawlMessages = sparqlService.extractCrawlUriMessages(uriMsg.getBaseUri(), wonNodeUri);
        for (CrawlUriMessage newMsg : newCrawlMessages) {
            getSender().tell(newMsg, getSelf());
        }
        // signal sender that this URI is processed and save meta data about crawling the URI.
        // This needs to be done after all extracted URI messages have been sent to guarantee consistency
        // in case of failure
        sendDoneUriMessage(uriMsg, wonNodeUri, etags);
        // if this URI/dataset was a need then send an event to the distributed event bus
        if (NeedModelWrapper.isANeed(ds)) {
            NeedModelWrapper needModelWrapper = new NeedModelWrapper(ds, false);
            NeedState state = needModelWrapper.getNeedState();
            NeedEvent.TYPE type = state.equals(NeedState.ACTIVE) ? NeedEvent.TYPE.ACTIVE : NeedEvent.TYPE.INACTIVE;
            log.debug("Created need event for need uri {}", uriMsg.getUri());
            long crawlDate = System.currentTimeMillis();
            NeedEvent needEvent = new NeedEvent(uriMsg.getUri(), wonNodeUri, type, crawlDate, ds);
            pubSubMediator.tell(new DistributedPubSubMediator.Publish(needEvent.getClass().getName(), needEvent), getSelf());
        }
    } catch (RestClientException e1) {
        // usually happens if the fetch of the dataset fails e.g. HttpServerErrorException, HttpClientErrorException
        log.debug("Exception during crawling: " + e1);
        throw new CrawlWrapperException(e1, uriMsg);
    } catch (Exception e) {
        log.debug("Exception during crawling: " + e);
        throw new CrawlWrapperException(e, uriMsg);
    } finally {
        // also reached on the early return for status SAVE, so the lock is always released
        if (lock != null) {
            lock.leaveCriticalSection();
        }
    }
}
Also used : HttpHeaders(org.springframework.http.HttpHeaders) CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage) DatasetResponseWithStatusCodeAndHeaders(won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders) DistributedPubSubMediator(akka.cluster.pubsub.DistributedPubSubMediator) Dataset(org.apache.jena.query.Dataset) NeedState(won.protocol.model.NeedState) NeedModelWrapper(won.protocol.util.NeedModelWrapper) NeedEvent(won.matcher.service.common.event.NeedEvent) CrawlWrapperException(won.matcher.service.crawler.exception.CrawlWrapperException) CrawlWrapperException(won.matcher.service.crawler.exception.CrawlWrapperException) IncorrectPropertyCountException(won.protocol.exception.IncorrectPropertyCountException) RestClientException(org.springframework.web.client.RestClientException) Lock(org.apache.jena.shared.Lock) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage) RestClientException(org.springframework.web.client.RestClientException)

Example 5 with NeedEvent

use of won.matcher.service.common.event.NeedEvent in project webofneeds by researchstudio-sat.

the class SolrTest method createNeedEvent.

/**
 * Builds an ACTIVE {@link NeedEvent} from a TriG resource on the classpath.
 *
 * <p>The resource is resolved relative to {@code SolrTest} and read into a fresh dataset; the
 * need URI is taken from that dataset and the won node URI is the placeholder
 * {@code "no_uri"}.
 *
 * @param path classpath location of the TriG file
 * @return the created need event, or {@code null} if an {@link IOException} occurred while
 *         reading or closing the resource (the exception is printed to {@code System.err})
 * @throws IOException declared for callers; I/O failures of the read are handled internally
 */
private static NeedEvent createNeedEvent(String path) throws IOException {
    Dataset dataset = DatasetFactory.create();
    // try-with-resources closes the stream (if one was found) and keeps the primary
    // exception when close() fails as well
    try (InputStream is = SolrTest.class.getResourceAsStream(path)) {
        RDFDataMgr.read(dataset, is, RDFFormat.TRIG.getLang());
    } catch (IOException e) {
        System.err.println(e);
        return null;
    }
    String needUri = WonRdfUtils.NeedUtils.getNeedURI(dataset).toString();
    return new NeedEvent(needUri, "no_uri", NeedEvent.TYPE.ACTIVE, System.currentTimeMillis(), dataset);
}
Also used : InputStream(java.io.InputStream) Dataset(org.apache.jena.query.Dataset) NeedEvent(won.matcher.service.common.event.NeedEvent) IOException(java.io.IOException)

Aggregations

NeedEvent (won.matcher.service.common.event.NeedEvent)6 Dataset (org.apache.jena.query.Dataset)3 DistributedPubSubMediator (akka.cluster.pubsub.DistributedPubSubMediator)2 ResourceCrawlUriMessage (won.matcher.service.crawler.msg.ResourceCrawlUriMessage)2 ActorRef (akka.actor.ActorRef)1 ActorSystem (akka.actor.ActorSystem)1 CamelMessage (akka.camel.CamelMessage)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 StringWriter (java.io.StringWriter)1 Lock (org.apache.jena.shared.Lock)1 AnnotationConfigApplicationContext (org.springframework.context.annotation.AnnotationConfigApplicationContext)1 HttpHeaders (org.springframework.http.HttpHeaders)1 RestClientException (org.springframework.web.client.RestClientException)1 BulkNeedEvent (won.matcher.service.common.event.BulkNeedEvent)1 CrawlWrapperException (won.matcher.service.crawler.exception.CrawlWrapperException)1 CrawlUriMessage (won.matcher.service.crawler.msg.CrawlUriMessage)1 SolrMatcherActor (won.matcher.solr.actor.SolrMatcherActor)1 IncorrectPropertyCountException (won.protocol.exception.IncorrectPropertyCountException)1 NeedState (won.protocol.model.NeedState)1