use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.
the class CrawlSparqlService method bulkUpdateCrawlingMetadata.
/**
 * Updates the crawling meta data of several messages with one single update request.
 * The update query text generated for each message is concatenated, in iteration order
 * of the collection, and the combined text is executed once.
 *
 * @param msgs messages describing the crawling meta data to update
 */
public void bulkUpdateCrawlingMetadata(Collection<CrawlUriMessage> msgs) {
    final StringBuilder bulkQuery = new StringBuilder();
    // collect the individual update statements into one query text
    for (final CrawlUriMessage crawlMsg : msgs) {
        bulkQuery.append(createUpdateCrawlingMetadataQuery(crawlMsg));
    }
    // send everything to the endpoint in a single request
    final String combinedUpdate = bulkQuery.toString();
    executeUpdateQuery(combinedUpdate);
}
use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.
the class CrawlSparqlService method retrieveMessagesForCrawling.
/**
 * Reads all crawl entries from the {@code won:crawlMetadata} graph that carry the given
 * status (e.g. FAILED) and returns them as fresh messages in status PROCESS, so that the
 * caller can execute the crawling for them again.
 *
 * <p>This method only builds in-memory messages; it does not itself change the status
 * stored in the SPARQL endpoint. The status is matched as a literal object of any
 * predicate of the uri ({@code ?uri ?p ?status}). The crawl date of every returned
 * message is the time at which the message object is created.
 *
 * <p>The query execution is closed in all cases, including when executing the query or
 * reading the results throws.
 *
 * @param status status the stored crawl entries must have to be selected
 * @return messages for all matching uris in result order, each with status PROCESS;
 *         empty if nothing matches
 */
public Set<CrawlUriMessage> retrieveMessagesForCrawling(CrawlUriMessage.STATUS status) {
    Set<CrawlUriMessage> msgs = new LinkedHashSet<>();
    // all ETag values known for one uri are concatenated into a single ?etags binding
    String queryString = "SELECT ?uri ?base ?wonNode (group_concat(distinct ?etag;separator=\"" + HTTP_HEADER_SEPARATOR + "\") as ?etags)"
            + " WHERE { GRAPH won:crawlMetadata {\n"
            + " ?uri ?p ?status.\n"
            + " ?uri won:crawlBaseUri ?base.\n"
            + " OPTIONAL { ?uri won:wonNodeUri ?wonNode }\n"
            + " OPTIONAL { ?uri won:resourceETagValue ?etag }}}\n"
            + " GROUP BY ?uri ?base ?wonNode\n";
    ParameterizedSparqlString pps = new ParameterizedSparqlString();
    pps.setNsPrefix("won", "http://purl.org/webofneeds/model#");
    pps.setCommandText(queryString);
    pps.setLiteral("status", status.toString());
    log.debug("Query SPARQL Endpoint: {}", sparqlEndpoint);
    log.debug("Execute query: {}", pps.toString());
    QueryExecution qexec = QueryExecutionFactory.sparqlService(sparqlEndpoint, pps.asQuery());
    try {
        ResultSet results = qexec.execSelect();
        while (results.hasNext()) {
            QuerySolution qs = results.nextSolution();
            String uri = qs.get("uri").asResource().getURI();
            String baseUri = qs.get("base").asResource().getURI();
            // won node uri and ETags are optional in the query, so both may be unbound
            String wonNode = null;
            Set<String> etags = null;
            if (qs.get("wonNode") != null) {
                wonNode = qs.get("wonNode").asResource().getURI();
            }
            if (qs.get("etags") != null) {
                String etagsString = qs.get("etags").asLiteral().getString();
                etags = commaConcatenatedStringToSet(etagsString);
            }
            CrawlUriMessage msg = new CrawlUriMessage(uri, baseUri, wonNode, CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis(), etags);
            log.debug("Created message: {}", msg);
            msgs.add(msg);
        }
    } finally {
        // release the connection to the endpoint even if the query or result handling failed
        qexec.close();
    }
    return msgs;
}
use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.
the class CrawlSparqlService method extractCrawlUriMessagesForPropertyPath.
/**
 * Extracts the URIs linked from a resource via a certain property path and creates a new
 * CrawlUriMessage (status PROCESS) for each of them. ETag values are extracted as well if
 * they are available for the linked uri resources, so that they can be used to make
 * crawling more efficient.
 *
 * <p>The query execution is closed in all cases, including when executing the query or
 * reading the results throws.
 *
 * @param baseUri      base uri of the currently processed resource uri message; it is queried
 *                     both with and without a trailing slash
 * @param wonNodeUri   won node uri of the currently processed resource uri message
 * @param propertyPath property path that leads from the base uri to the new uris; it is
 *                     inserted into the query text as is
 * @param baseProperty if {@code true} every extracted uri becomes the base uri of its own
 *                     message, otherwise the (trimmed, slash-free) {@code baseUri} is used
 * @return set of CrawlUriMessages extracted for the base uri and property path, or
 *         {@code null} if {@code propertyPath} is blank
 */
private Set<CrawlUriMessage> extractCrawlUriMessagesForPropertyPath(String baseUri, String wonNodeUri, String propertyPath, boolean baseProperty) {
    // a blank property path cannot be queried; callers receive null in that case
    if (propertyPath.trim().length() == 0) {
        return null;
    }
    Set<CrawlUriMessage> newCrawlMessages = new HashSet<CrawlUriMessage>();
    // all messages created by one call share the same crawl date
    long crawlDate = System.currentTimeMillis();
    // We have to query the baseUri with and without trailing slashes because we don't know how the
    // RDF data is described in detail. Usually the "need" prefix ends with a trailing slash, but we
    // don't assume that this is always the case, so both variants are queried.
    // Check the need list with its need: rdfs:member entries for example.
    //
    // propertyPath is concatenated into the query text instead of being bound as a parameter:
    // it contains the ">" character, which ParameterizedSparqlString refuses because of the
    // injection risk.
    // NOTE(review): this presumes propertyPath comes from trusted configuration - confirm.
    String queryString = "SELECT ?uri (group_concat(distinct ?etag;separator=\"" + HTTP_HEADER_SEPARATOR + "\") as ?etags) WHERE {\n"
            + "{ ?baseUriWithTrailingSlash " + propertyPath + " ?uri. } \n"
            + "UNION { ?baseUriWithoutTrailingSlash " + propertyPath + " ?uri. } \n"
            + " OPTIONAL {?uri won:resourceETagValue ?etag. }}\n"
            + " GROUP BY ?uri\n";
    ParameterizedSparqlString pps = new ParameterizedSparqlString();
    pps.setNsPrefix("won", "http://purl.org/webofneeds/model#");
    pps.setCommandText(queryString);
    // normalize to the slash-free form; the slash variant is derived from it below
    baseUri = baseUri.trim();
    if (baseUri.endsWith("/")) {
        baseUri = baseUri.substring(0, baseUri.length() - 1);
    }
    pps.setIri("baseUriWithoutTrailingSlash", baseUri);
    pps.setIri("baseUriWithTrailingSlash", baseUri + "/");
    log.debug("Query SPARQL Endpoint: {}", sparqlEndpoint);
    log.debug("Execute query: {}", pps.toString());
    QueryExecution qexec = QueryExecutionFactory.sparqlService(sparqlEndpoint, pps.asQuery());
    try {
        ResultSet results = qexec.execSelect();
        while (results.hasNext()) {
            QuerySolution qs = results.nextSolution();
            String extractedUri = qs.get("uri").asResource().getURI();
            Set<String> etags = null;
            if (qs.get("etags") != null) {
                String etagsString = qs.get("etags").asLiteral().getString();
                etags = commaConcatenatedStringToSet(etagsString);
            }
            log.debug("Extracted URI: {}", extractedUri);
            // for base properties the extracted uri is its own base uri
            String newBaseUri = baseProperty ? extractedUri : baseUri;
            newCrawlMessages.add(new CrawlUriMessage(extractedUri, newBaseUri, wonNodeUri, CrawlUriMessage.STATUS.PROCESS, crawlDate, etags));
        }
    } finally {
        // release the connection to the endpoint even if the query or result handling failed
        qexec.close();
    }
    return newCrawlMessages;
}
use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.
the class MasterCrawlerActor method preStart.
@Override
public void preStart() {
    // Periodically send RECRAWL_TICK to this actor; initial delay and interval are both
    // the configured recrawl interval
    getContext().system().scheduler().schedule(
            config.getRecrawlIntervalDuration(),
            config.getRecrawlIntervalDuration(),
            getSelf(),
            RECRAWL_TICK,
            getContext().dispatcher(),
            null);

    // Router/pool of worker actors that do the actual crawling
    crawlingWorker = getContext().actorOf(
            SpringExtension.SpringExtProvider.get(getContext().system()).fromConfigProps(WorkerCrawlerActor.class),
            "CrawlingRouter");

    // One single meta data update actor shared by all worker actors; it is watched by this actor
    updateMetaDataWorker = getContext().actorOf(
            SpringExtension.SpringExtProvider.get(getContext().system()).props(UpdateMetadataActor.class),
            "MetaDataUpdateWorker");
    getContext().watch(updateMetaDataWorker);

    // Actor that loads need events
    getContext().actorOf(
            SpringExtension.SpringExtProvider.get(getContext().system()).props(NeedEventLoaderActor.class),
            "NeedEventLoader");

    // Subscribe to won node events and to both kinds of crawl events, in this order
    pubSubMediator = DistributedPubSub.get(getContext().system()).mediator();
    String[] topics = {
            WonNodeEvent.class.getName(),
            CrawlUriMessage.class.getName(),
            ResourceCrawlUriMessage.class.getName()
    };
    for (String topic : topics) {
        pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(topic, getSelf()), getSelf());
    }

    // Unfinished uris are registered as pending and handed directly to the crawling workers
    for (CrawlUriMessage unfinished : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.PROCESS)) {
        pendingMessages.put(unfinished.getUri(), unfinished);
        crawlingWorker.tell(unfinished, getSelf());
    }

    // Previously failed uris are sent to this actor itself so they pass through onReceive
    for (CrawlUriMessage failed : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.FAILED)) {
        getSelf().tell(failed, getSelf());
    }
}
use of won.matcher.service.crawler.msg.CrawlUriMessage in project webofneeds by researchstudio-sat.
the class MasterCrawlerActor method onReceive.
/**
 * Dispatches incoming messages of this actor:
 * <ul>
 * <li>RECRAWL_TICK: calls {@code askWonNodeInfoForCrawling()}</li>
 * <li>{@link WonNodeEvent}: passed to {@code processWonNodeEvent}</li>
 * <li>{@link won.matcher.service.crawler.msg.CrawlUriMessage}: passed to
 * {@code processCrawlUriMessage}; afterwards the number of pending messages is logged</li>
 * <li>anything else is reported as unhandled</li>
 * </ul>
 *
 * @param message the message received by this actor
 */
@Override
public void onReceive(final Object message) throws InterruptedException {
    if (message.equals(RECRAWL_TICK)) {
        askWonNodeInfoForCrawling();
        return;
    }
    if (message instanceof WonNodeEvent) {
        processWonNodeEvent((WonNodeEvent) message);
        return;
    }
    if (!(message instanceof CrawlUriMessage)) {
        unhandled(message);
        return;
    }
    processCrawlUriMessage((CrawlUriMessage) message);
    log.debug("Number of pending messages: {}", pendingMessages.size());
}
Aggregations