
Example 1 with AtomEvent

Use of won.matcher.service.common.event.AtomEvent in project webofneeds by researchstudio-sat.

The class SolrMatcherActor, method onReceive:

@Override
public void onReceive(final Object o) throws Exception {
    String eventTypeForLogging = "unknown";
    Optional<String> uriForLogging = Optional.empty();
    try {
        if (o instanceof AtomEvent) {
            eventTypeForLogging = "AtomEvent";
            AtomEvent atomEvent = (AtomEvent) o;
            uriForLogging = Optional.ofNullable(atomEvent.getUri());
            if (atomEvent.getEventType().equals(AtomEvent.TYPE.ACTIVE)) {
                processActiveAtomEvent(atomEvent);
            } else if (atomEvent.getEventType().equals(AtomEvent.TYPE.INACTIVE)) {
                processInactiveAtomEvent(atomEvent);
            } else {
                unhandled(o);
            }
        } else if (o instanceof BulkAtomEvent) {
            eventTypeForLogging = "BulkAtomEvent";
            log.info("received bulk atom event, processing {} atom events ...", ((BulkAtomEvent) o).getAtomEvents().size());
            for (AtomEvent event : ((BulkAtomEvent) o).getAtomEvents()) {
                processActiveAtomEvent(event);
            }
        } else {
            eventTypeForLogging = "unhandled";
            unhandled(o);
        }
    } catch (Exception e) {
        log.info(String.format("Caught exception when processing %s event %s. More info on loglevel 'debug'", eventTypeForLogging, uriForLogging.orElse("[no uri available]")));
        log.debug("caught exception", e);
    }
}
Also used : BulkAtomEvent(won.matcher.service.common.event.BulkAtomEvent) AtomEvent(won.matcher.service.common.event.AtomEvent) SolrServerException(org.apache.solr.client.solrj.SolrServerException) IOException(java.io.IOException)
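For context: AtomEvent and BulkAtomEvent messages reach the matcher via Akka distributed pub-sub, and the publishers (see Examples 3 and 4 below) use the event's class name as the topic. A minimal subscription sketch, using a hypothetical MatcherSubscriberActor; the actual registration in webofneeds may be wired differently:

import akka.actor.ActorRef;
import akka.actor.UntypedActor;
import akka.cluster.pubsub.DistributedPubSub;
import akka.cluster.pubsub.DistributedPubSubMediator;
import won.matcher.service.common.event.AtomEvent;
import won.matcher.service.common.event.BulkAtomEvent;

// hypothetical subscriber actor, shown only to illustrate the topic naming convention
public class MatcherSubscriberActor extends UntypedActor {
    private final ActorRef pubSubMediator = DistributedPubSub.get(getContext().system()).mediator();

    @Override
    public void preStart() {
        // topics are named after the event class, matching the Publish calls of the producers
        pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(AtomEvent.class.getName(), getSelf()), getSelf());
        pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(BulkAtomEvent.class.getName(), getSelf()), getSelf());
    }

    @Override
    public void onReceive(Object o) {
        if (o instanceof DistributedPubSubMediator.SubscribeAck) {
            // subscription confirmed by the mediator
        } else if (o instanceof AtomEvent || o instanceof BulkAtomEvent) {
            // delegate to the matcher-specific processing, e.g. as in SolrMatcherActor above
        } else {
            unhandled(o);
        }
    }
}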

Example 2 with AtomEvent

Use of won.matcher.service.common.event.AtomEvent in project webofneeds by researchstudio-sat.

The class WorkerCrawlerActor, method sendDeletedAtomMessage:

private void sendDeletedAtomMessage(String atomUri, String wonNodeUri) {
    AtomEvent event = new AtomEvent(atomUri, wonNodeUri, AtomEvent.TYPE.DELETED, System.currentTimeMillis(), null, Cause.CRAWLED);
    getSender().tell(event, getSelf());
}
Also used : AtomEvent(won.matcher.service.common.event.AtomEvent)
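For readers unfamiliar with the positional arguments: the constructor takes the atom URI, the WON node URI, the event type, a timestamp, the atom's RDF dataset (null here, since a deleted atom has no content to ship), and the cause. A construction sketch with placeholder URIs, assuming Cause lives next to AtomEvent in won.matcher.service.common.event:

import won.matcher.service.common.event.AtomEvent;
import won.matcher.service.common.event.Cause;

public class DeletedAtomEventSketch {
    public static AtomEvent deletedEvent() {
        // URIs are illustrative placeholders, not real WON resources
        return new AtomEvent(
                "https://node.example.org/won/resource/atom/123", // atom URI
                "https://node.example.org/won/resource",          // WON node URI
                AtomEvent.TYPE.DELETED,                            // event type
                System.currentTimeMillis(),                        // timestamp of the observation
                null,                                              // no RDF dataset for a deleted atom
                Cause.CRAWLED);                                    // the crawler noticed the deletion
    }
}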

Example 3 with AtomEvent

Use of won.matcher.service.common.event.AtomEvent in project webofneeds by researchstudio-sat.

The class WorkerCrawlerActor, method crawlUri:

private void crawlUri(CrawlUriMessage uriMsg) {
    Dataset ds = null;
    List<String> etags = null;
    Lock lock = null;
    try {
        // check if resource is already downloaded
        if (uriMsg instanceof ResourceCrawlUriMessage) {
            ResourceCrawlUriMessage resMsg = ((ResourceCrawlUriMessage) uriMsg);
            if (resMsg.getSerializedResource() != null && resMsg.getSerializationFormat() != null) {
                // TODO: this should be optimized, why deserialize the resource here when we
                // just want to save it in the RDF
                // store? How to insert this serialized resource into the SPARQL endpoint?
                ds = SparqlService.deserializeDataset(resMsg.getSerializedResource(), resMsg.getSerializationFormat());
            }
        }
        // download resource if not already downloaded
        if (ds == null) {
            // use ETag/If-None-Match Headers to make the process more efficient
            HttpHeaders httpHeaders = new HttpHeaders();
            if (uriMsg.getResourceETagHeaderValues() != null && !uriMsg.getResourceETagHeaderValues().isEmpty()) {
                String ifNoneMatchHeaderValue = StringUtils.collectionToDelimitedString(uriMsg.getResourceETagHeaderValues(), ", ");
                httpHeaders.add("If-None-Match", ifNoneMatchHeaderValue);
            }
            DatasetResponseWithStatusCodeAndHeaders datasetWithHeaders = linkedDataSource.getDatasetWithHeadersForResource(URI.create(uriMsg.getUri()), httpHeaders);
            ds = datasetWithHeaders.getDataset();
            etags = datasetWithHeaders.getResponseHeaders().get("ETag");
            // if dataset was not modified (304) we can treat the current crawl uri as done
            if (ds == null && datasetWithHeaders.getStatusCode() == 304) {
                sendDoneUriMessage(uriMsg, uriMsg.getWonNodeUri(), etags);
                return;
            }
            // if there is paging activated and the won node tells us that there is more
            // data (previous link)
            // to be downloaded, then we add this link to the crawling process too
            String prevLink = linkedDataSource.getPreviousLinkFromDatasetWithHeaders(datasetWithHeaders);
            if (prevLink != null) {
                CrawlUriMessage newUriMsg = new CrawlUriMessage(uriMsg.getBaseUri(), prevLink, uriMsg.getWonNodeUri(), CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), null);
                getSender().tell(newUriMsg, getSelf());
            }
        }
        lock = ds == null ? null : ds.getLock();
        lock.enterCriticalSection(true);
        // Save dataset to triple store
        sparqlService.updateNamedGraphsOfDataset(ds);
        String wonNodeUri = extractWonNodeUri(ds, uriMsg.getUri());
        if (wonNodeUri == null) {
            wonNodeUri = uriMsg.getWonNodeUri();
        }
        // do nothing more here if the STATUS of the message was SAVE
        if (uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
            log.debug("processed crawl uri event {} with status 'SAVE'", uriMsg);
            return;
        }
        // extract URIs from current resource and send extracted URI messages back to
        // sender
        log.debug("Extract URIs from message {}", uriMsg);
        Set<CrawlUriMessage> newCrawlMessages = sparqlService.extractCrawlUriMessages(uriMsg.getBaseUri(), wonNodeUri);
        for (CrawlUriMessage newMsg : newCrawlMessages) {
            getSender().tell(newMsg, getSelf());
        }
        // signal sender that this URI is processed and save meta data about crawling
        // the URI.
        // This needs to be done after all extracted URI messages have been sent to
        // guarantee consistency
        // in case of failure
        sendDoneUriMessage(uriMsg, wonNodeUri, etags);
        // build and publish an atom event if the crawled dataset describes an atom
        if (AtomModelWrapper.isAAtom(ds)) {
            AtomModelWrapper atomModelWrapper = new AtomModelWrapper(ds, false);
            AtomState state = atomModelWrapper.getAtomState();
            AtomEvent.TYPE type = state.equals(AtomState.ACTIVE) ? AtomEvent.TYPE.ACTIVE : AtomEvent.TYPE.INACTIVE;
            log.debug("Created atom event for atom uri {}", uriMsg.getUri());
            long crawlDate = System.currentTimeMillis();
            AtomEvent atomEvent = new AtomEvent(uriMsg.getUri(), wonNodeUri, type, crawlDate, ds, Cause.CRAWLED);
            pubSubMediator.tell(new DistributedPubSubMediator.Publish(atomEvent.getClass().getName(), atomEvent), getSelf());
        }
    } catch (RestClientException e1) {
        // usually happens if the fetch of the dataset fails e.g.
        // HttpServerErrorException, HttpClientErrorException
        log.debug("Exception during crawling: " + e1);
        throw new CrawlWrapperException(e1, uriMsg);
    } catch (LinkedDataFetchingException e) {
        log.debug("Exception during crawling: " + e);
        Throwable cause = e.getCause();
        if (cause instanceof HttpClientErrorException && Objects.equals(((HttpClientErrorException) cause).getStatusCode(), HttpStatus.GONE)) {
            log.debug("Uri used to exist, but has been deleted, deleting from rdf store.");
            sendDeletedAtomMessage(uriMsg.getUri(), uriMsg.getWonNodeUri());
            sendDeletedUriMessage(uriMsg, uriMsg.getWonNodeUri(), etags);
        } else if (cause instanceof HttpClientErrorException && Objects.equals(((HttpClientErrorException) cause).getStatusCode(), HttpStatus.FORBIDDEN)) {
            log.debug("Not allowed to access uri, marking as done");
            sendDoneUriMessage(uriMsg, uriMsg.getWonNodeUri(), etags);
        } else {
            throw new CrawlWrapperException(e, uriMsg);
        }
    } catch (Exception e) {
        log.debug("Exception during crawling: " + e);
        throw new CrawlWrapperException(e, uriMsg);
    } finally {
        if (lock != null) {
            lock.leaveCriticalSection();
        }
    }
}
Also used : HttpHeaders(org.springframework.http.HttpHeaders) CrawlUriMessage(won.matcher.service.crawler.msg.CrawlUriMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage) DatasetResponseWithStatusCodeAndHeaders(won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders) HttpClientErrorException(org.springframework.web.client.HttpClientErrorException) AtomState(won.protocol.model.AtomState) DistributedPubSubMediator(akka.cluster.pubsub.DistributedPubSubMediator) Dataset(org.apache.jena.query.Dataset) CrawlWrapperException(won.matcher.service.crawler.exception.CrawlWrapperException) IncorrectPropertyCountException(won.protocol.exception.IncorrectPropertyCountException) RestClientException(org.springframework.web.client.RestClientException) LinkedDataFetchingException(won.protocol.rest.LinkedDataFetchingException) Lock(org.apache.jena.shared.Lock) AtomEvent(won.matcher.service.common.event.AtomEvent) AtomModelWrapper(won.protocol.util.AtomModelWrapper)
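The ETag handling above is a standard HTTP conditional GET: the crawler replays the ETags it stored for the URI as an If-None-Match header, and a 304 response lets it mark the URI as done without reprocessing. The project's linkedDataSource hides the HTTP details; the following plain Spring RestTemplate sketch only illustrates the mechanism (URL and ETag value are placeholders, not the project's API):

import java.net.URI;
import java.util.List;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;

public class ConditionalFetchSketch {
    public static void main(String[] args) {
        RestTemplate rest = new RestTemplate();
        HttpHeaders headers = new HttpHeaders();
        // replay the ETag(s) remembered from the previous crawl of this URI
        headers.add(HttpHeaders.IF_NONE_MATCH, "\"etag-from-last-crawl\"");
        ResponseEntity<String> response = rest.exchange(
                URI.create("https://node.example.org/won/resource/atom/123"),
                HttpMethod.GET, new HttpEntity<>(headers), String.class);
        if (response.getStatusCode() == HttpStatus.NOT_MODIFIED) {
            // 304: the resource is unchanged, mark the crawl URI as done without reprocessing
        } else {
            // 200: parse the body, update the triple store, and remember the new ETags for next time
            List<String> newEtags = response.getHeaders().get(HttpHeaders.ETAG);
        }
    }
}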

Example 4 with AtomEvent

Use of won.matcher.service.common.event.AtomEvent in project webofneeds by researchstudio-sat.

The class AtomConsumerProtocolActor, method onReceive:

@Override
public void onReceive(final Object message) throws Exception {
    if (message instanceof CamelMessage) {
        final CamelMessage camelMsg = (CamelMessage) message;
        final String atomUri = (String) camelMsg.getHeaders().get(MSG_HEADER_ATOM_URI);
        final String wonNodeUri = (String) camelMsg.getHeaders().get(MSG_HEADER_WON_NODE_URI);
        // handle in separate thread so the camel message handling isn't held up
        executor.execute(() -> {
            // monitoring code
            monitoringService.startClock(MonitoringService.ATOM_HINT_STOPWATCH, atomUri);
            // process the incoming atom event
            if (atomUri != null && wonNodeUri != null) {
                Object methodName = camelMsg.getHeaders().get(MSG_HEADER_METHODNAME);
                if (methodName != null) {
                    log.debug("Received event '{}' for atomUri '{}' and wonAtomUri '{}' and publish it to matchers", methodName, atomUri, wonNodeUri);
                    // publish an atom event to all the (distributed) matchers
                    AtomEvent event = null;
                    long crawlDate = System.currentTimeMillis();
                    // check for deletion before fetching as we won't be able to fetch anything
                    if (methodName.equals(MSG_HEADER_METHODNAME_ATOMDELETED)) {
                        event = new AtomEvent(atomUri, wonNodeUri, AtomEvent.TYPE.DELETED, crawlDate, Cause.PUSHED);
                        pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
                    } else {
                        // not a delete message: fetch the data and generate event
                        Dataset ds = linkedDataSource.getDataForPublicResource(URI.create(atomUri));
                        if (AtomModelWrapper.isAAtom(ds)) {
                            if (methodName.equals(MSG_HEADER_METHODNAME_ATOMCREATED)) {
                                event = new AtomEvent(atomUri, wonNodeUri, AtomEvent.TYPE.ACTIVE, crawlDate, ds, Cause.PUSHED);
                                pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
                            } else if (methodName.equals(MSG_HEADER_METHODNAME_ATOMMODIFIED)) {
                                event = new AtomEvent(atomUri, wonNodeUri, AtomEvent.TYPE.ACTIVE, crawlDate, ds, Cause.PUSHED);
                                pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
                            } else if (methodName.equals(MSG_HEADER_METHODNAME_ATOMACTIVATED)) {
                                event = new AtomEvent(atomUri, wonNodeUri, AtomEvent.TYPE.ACTIVE, crawlDate, ds, Cause.PUSHED);
                                pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
                            } else if (methodName.equals(MSG_HEADER_METHODNAME_ATOMDEACTIVATED)) {
                                event = new AtomEvent(atomUri, wonNodeUri, AtomEvent.TYPE.INACTIVE, crawlDate, ds, Cause.PUSHED);
                                pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
                            } else {
                                unhandled(message);
                            }
                        }
                        // let the crawler save the data of this event too
                        ResourceCrawlUriMessage resMsg = new ResourceCrawlUriMessage(atomUri, atomUri, wonNodeUri, CrawlUriMessage.STATUS.SAVE, crawlDate, null);
                        resMsg.setSerializedResource(camelMsg.body().toString());
                        resMsg.setSerializationFormat(Lang.TRIG);
                        pubSubMediator.tell(new DistributedPubSubMediator.Publish(resMsg.getClass().getName(), resMsg), getSelf());
                        return;
                    }
                } else {
                    log.warning("Message not processed; methodName is null");
                }
            } else {
                log.warning("Message not processed; atomURI or wonNodeURI is null");
            }
        });
    } else {
        unhandled(message);
    }
}
Also used : DistributedPubSubMediator(akka.cluster.pubsub.DistributedPubSubMediator) AtomEvent(won.matcher.service.common.event.AtomEvent) Dataset(org.apache.jena.query.Dataset) CamelMessage(akka.camel.CamelMessage) ResourceCrawlUriMessage(won.matcher.service.crawler.msg.ResourceCrawlUriMessage)
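The four ACTIVE/INACTIVE branches differ only in the event type they assign, so the mapping can be factored out. A sketch of a hypothetical helper meant to live inside the actor class above (it assumes the MSG_HEADER_METHODNAME_* constants are the String constants the actor compares against):

// hypothetical helper: maps the Camel method-name header value to the AtomEvent type it implies,
// or returns null for values the actor should leave unhandled
private static AtomEvent.TYPE eventTypeFor(Object methodName) {
    if (MSG_HEADER_METHODNAME_ATOMCREATED.equals(methodName)
            || MSG_HEADER_METHODNAME_ATOMMODIFIED.equals(methodName)
            || MSG_HEADER_METHODNAME_ATOMACTIVATED.equals(methodName)) {
        return AtomEvent.TYPE.ACTIVE;
    }
    if (MSG_HEADER_METHODNAME_ATOMDEACTIVATED.equals(methodName)) {
        return AtomEvent.TYPE.INACTIVE;
    }
    return null;
}

With such a helper, the if/else chain in the non-delete branch shrinks to one lookup followed by a single Publish call.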

Example 5 with AtomEvent

Use of won.matcher.service.common.event.AtomEvent in project webofneeds by researchstudio-sat.

The class SaveAtomEventActor, method onReceive:

@Override
public void onReceive(final Object o) throws Exception {
    if (o instanceof AtomEvent) {
        AtomEvent atomEvent = (AtomEvent) o;
        if (((AtomEvent) o).getEventType() == AtomEvent.TYPE.DELETED) {
            // delete the atom
            sparqlService.deleteAtom(atomEvent.getUri());
            sparqlService.deleteAtomMetadata(atomEvent.getUri());
        } else {
            // save the atom
            log.debug("Save atom event {} to sparql endpoint {}", atomEvent, sparqlService.getSparqlEndpoint());
            Dataset ds = atomEvent.deserializeAtomDataset();
            sparqlService.updateNamedGraphsOfDataset(ds);
        }
    } else {
        unhandled(o);
    }
}
Also used : AtomEvent(won.matcher.service.common.event.AtomEvent) Dataset(org.apache.jena.query.Dataset)
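The event carries the atom's RDF content, and deserializeAtomDataset() turns it back into a Jena Dataset before the named graphs are written to the store. A plain Jena sketch of such a serialize/deserialize round trip in TriG (the format Example 4 uses for pushed resources); the class and helper names here are illustrative, not the project's API:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.jena.query.Dataset;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;

public class DatasetRoundTripSketch {
    public static void main(String[] args) {
        // serialize an (empty) in-memory dataset to TriG, as an event producer might do
        Dataset original = DatasetFactory.create();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        RDFDataMgr.write(out, original, Lang.TRIG);
        String serialized = new String(out.toByteArray(), StandardCharsets.UTF_8);
        // deserialize it again, roughly what deserializeAtomDataset() has to do on the consumer side
        Dataset restored = DatasetFactory.create();
        RDFDataMgr.read(restored, new ByteArrayInputStream(serialized.getBytes(StandardCharsets.UTF_8)), Lang.TRIG);
    }
}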

Aggregations

AtomEvent (won.matcher.service.common.event.AtomEvent): 14
BulkAtomEvent (won.matcher.service.common.event.BulkAtomEvent): 6
Dataset (org.apache.jena.query.Dataset): 5
DistributedPubSubMediator (akka.cluster.pubsub.DistributedPubSubMediator): 4
IOException (java.io.IOException): 3
ActorRef (akka.actor.ActorRef): 2
ActorSystem (akka.actor.ActorSystem): 2
InputStream (java.io.InputStream): 2
StringWriter (java.io.StringWriter): 2
AnnotationConfigApplicationContext (org.springframework.context.annotation.AnnotationConfigApplicationContext): 2
BulkHintEvent (won.matcher.service.common.event.BulkHintEvent): 2
HintEvent (won.matcher.service.common.event.HintEvent): 2
LoadAtomEvent (won.matcher.service.common.event.LoadAtomEvent): 2
ResourceCrawlUriMessage (won.matcher.service.crawler.msg.ResourceCrawlUriMessage): 2
LinkedDataFetchingException (won.protocol.rest.LinkedDataFetchingException): 2
CamelMessage (akka.camel.CamelMessage): 1
HashSet (java.util.HashSet): 1
Lock (org.apache.jena.shared.Lock): 1
SolrServerException (org.apache.solr.client.solrj.SolrServerException): 1
HttpHeaders (org.springframework.http.HttpHeaders): 1