Use of org.apache.clerezza.commons.rdf.Graph in project Stanbol by Apache.
The class Nif20MetadataEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    //now iterate over the AnalysedText data and create the RDF representation
    //TODO: make configurable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        //write the context
        IRI text = writeSpan(metadata, base, at, language, at);
        metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), ci.getUri()));
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            //TODO: filter Spans based on additional requirements
            //(1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            //write the context
            metadata.add(new TripleImpl(current, Nif20.referenceContext.getUri(), text));
            //(2) add the relations between the different spans
            switch (span.getType()) {
                case Sentence:
                    if (sentence != null && writePrevNext) {
                        metadata.add(new TripleImpl(sentence, Nif20.nextSentence.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousSentence.getUri(), sentence));
                    }
                    if (word != null) {
                        //at this point 'word' is still the last token of the previous sentence
                        metadata.add(new TripleImpl(sentence, Nif20.lastWord.getUri(), word));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null && writeHierary) {
                        metadata.add(new TripleImpl(current, Nif20.superString.getUri(), sentence));
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        if (writeHierary) {
                            metadata.add(new TripleImpl(current, Nif20.sentence.getUri(), sentence));
                        }
                        //metadata.add(new TripleImpl(sentence, Nif20.word.getUri(), current));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(sentence, Nif20.firstWord.getUri(), current));
                            firstWordInSentence = false;
                        }
                    }
                    if (writeHierary && phrase != null && !phrase.equals(current)) {
                        metadata.add(new TripleImpl(current, Nif20.subString.getUri(), phrase));
                    }
                    if (word != null && writePrevNext) {
                        metadata.add(new TripleImpl(word, Nif20.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            //(3) add specific information such as POS, chunk type ...
            Nif20Helper.writePhrase(metadata, span, current);
            Nif20Helper.writePos(metadata, span, current);
            //TODO: sentiment support
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
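The engine links neighbouring spans with nif:previousWord/nif:nextWord and nif:previousSentence/nif:nextSentence, so the produced metadata can be walked like a linked list. Below is a minimal sketch of such a traversal with the Clerezza Graph API; the exact import paths are assumptions, and a real caller should hold ci.getLock().readLock() while iterating.

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;
// assumed import path for the Nif20 vocabulary used in the engine above
import org.apache.stanbol.enhancer.nlp.nif.Nif20;

public class NifTraversalExample {

    /**
     * Follows the nif:nextWord chain starting from the given word IRI and
     * prints each word IRI in document order.
     */
    public static void printWordChain(Graph metadata, IRI firstWord) {
        IRI current = firstWord;
        while (current != null) {
            System.out.println(current.getUnicodeString());
            // look up the (at most one) nif:nextWord triple of the current word
            Iterator<Triple> it = metadata.filter(current, Nif20.nextWord.getUri(), null);
            current = it.hasNext() ? (IRI) it.next().getObject() : null;
        }
    }
}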
Use of org.apache.clerezza.commons.rdf.Graph in project Stanbol by Apache.
The class MetaxaEngine, method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get the model from the extraction
    URIImpl docId;
    Model m = null;
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // the extracted plain text from the model
    if (null == m) {
        log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close();
        throw new EngineException("Unable to initialise Blob for storing " + "the plain text content", e);
    }
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    RDF2GoUtils.urifyBlankNodes(m);
    ClosableIterator<Statement> it = m.iterator();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
    //used to detect if some text was extracted
    boolean textExtracted = false;
    try {
        //first add to a temporary graph
        Graph g = new SimpleGraph();
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            //the plain text Blob!
            if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        out.write(text);
                    } catch (IOException e) {
                        throw new EngineException("Unable to write extracted " + "plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                //add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added " + t.toString());
                }
            }
        }
        //add the extracted triples to the metadata of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        it.close();
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        //add the plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
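Note the locking discipline here: all extracted triples are buffered in a detached SimpleGraph, and the ContentItem write lock is only taken for the final addAll, which keeps the critical section short. A stripped-down sketch of that pattern follows; the helper name and import paths are illustrative, not part of the Stanbol API.

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;

public final class MetadataUtils {

    /**
     * Merges triples collected in a detached graph into the ContentItem
     * metadata in a single, short critical section.
     */
    public static void addAllUnderLock(ContentItem ci, Graph collected) {
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(collected);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}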
Use of org.apache.clerezza.commons.rdf.Graph in project Stanbol by Apache.
The class RootResource, method getGraph.
private Graph getGraph(String ontologyId, boolean merged, URI requestUri) {
    long before = System.currentTimeMillis();
    OWLOntologyID key = OntologyUtils.decode(ontologyId);
    log.debug("Will try to retrieve ontology {} from provider.", key);
    /*
     * Export directly to Graph, since the OWLOntologyWriter uses (de-)serializing converters for the
     * other formats.
     *
     * Use oTemp for the "real" graph and o for the graph that will be exported. This is due to the fact
     * that in o we want to change import statements, but we do not want these changes to be stored
     * permanently.
     */
    Graph o = null, oTemp = null;
    try {
        oTemp = ontologyProvider.getStoredOntology(key, Graph.class, merged);
    } catch (Exception ex) {
        log.warn("Retrieval of ontology with ID " + key + " failed.", ex);
    }
    if (oTemp == null) {
        log.debug("Ontology {} missing from provider. Trying libraries...", key);
        // TODO remove once registry supports OWLOntologyID as public key.
        IRI iri = URIUtils.sanitize(IRI.create(ontologyId));
        // See if we can touch a library. TODO: replace with event model on the ontology provider.
        int minSize = -1;
        IRI smallest = null;
        for (Library lib : registryManager.getLibraries(iri)) {
            int size = lib.getChildren().length;
            if (minSize < 1 || size < minSize) {
                smallest = lib.getIRI();
                minSize = size;
            }
        }
        if (smallest != null) {
            log.debug("Selected library for ontology {} is {} .", iri, smallest);
            try {
                oTemp = registryManager.getLibrary(smallest).getOntology(iri, Graph.class);
            } catch (RegistryContentException e) {
                log.warn("The content of library " + smallest + " could not be accessed.", e);
            }
        }
    }
    // A plain SimpleGraph is enough here; no need for the more
    // resource-intensive IndexedGraph, since both o and oTemp will be GC'ed after serialization.
    if (oTemp != null) {
        o = new SimpleGraph(oTemp);
    }
    if (o == null) {
        log.debug("Ontology {} not found in any ontology provider or library.", ontologyId);
        return null;
    }
    log.debug("Retrieved ontology {} .", ontologyId);
    // Rewrite import statements
    String uri = uriInfo.getRequestUri().toString();
    URI base = URI.create(uri.substring(0, uri.lastIndexOf(ontologyId) - 1));
    /*
     * TODO manage import rewrites better once the container ID is fully configurable (i.e. instead of
     * going upOne() add "session" or "ontology" if needed).
     */
    Iterator<Triple> imports = o.filter(null, OWL.imports, null);
    Set<Triple> oldImports = new HashSet<Triple>();
    while (imports.hasNext()) {
        oldImports.add(imports.next());
    }
    for (Triple t : oldImports) {
        // construct the new statement
        String s = ((org.apache.clerezza.commons.rdf.IRI) t.getObject()).getUnicodeString();
        if (s.contains("::")) {
            s = s.substring(s.indexOf("::") + 2, s.length());
        }
        org.apache.clerezza.commons.rdf.IRI target = new org.apache.clerezza.commons.rdf.IRI(base + "/" + s);
        o.add(new TripleImpl(t.getSubject(), OWL.imports, target));
        // remove the old statement
        o.remove(t);
    }
    // Versioning.
    OWLOntologyID id = OWLUtils.extractOntologyID(o);
    if (id != null && !id.isAnonymous() && id.getVersionIRI() == null) {
        org.apache.clerezza.commons.rdf.IRI viri = new org.apache.clerezza.commons.rdf.IRI(requestUri.toString());
        log.debug("Setting version IRI for export : {}", viri);
        o.add(new TripleImpl(new org.apache.clerezza.commons.rdf.IRI(id.getOntologyIRI().toString()),
                new org.apache.clerezza.commons.rdf.IRI(OWL2Constants.OWL_VERSION_IRI), viri));
    }
    log.debug("Exported as Clerezza ImmutableGraph in {} ms. Handing over to writer.", System.currentTimeMillis() - before);
    return o;
}
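The import-rewriting loop deliberately copies the matching triples into a Set before touching the graph: mutating a Graph while a filter() iterator over it is still open risks a ConcurrentModificationException. A generic, self-contained sketch of this collect-then-mutate pattern (class and method names are hypothetical):

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;

public final class GraphRewriteExample {

    /**
     * Replaces the object of every triple with the given predicate.
     * Matches are copied into a Set first, so the graph is never
     * modified while the filter() iterator is open.
     */
    public static void rewriteObjects(Graph g, IRI predicate, IRI newObject) {
        Set<Triple> matches = new HashSet<Triple>();
        for (Iterator<Triple> it = g.filter(null, predicate, null); it.hasNext();) {
            matches.add(it.next());
        }
        for (Triple t : matches) {
            g.remove(t);
            g.add(new TripleImpl(t.getSubject(), predicate, newObject));
        }
    }
}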
Use of org.apache.clerezza.commons.rdf.Graph in project Stanbol by Apache.
The class RootResource, method getStandaloneGraph.
@GET
@Path("/{ontologyId:.+}")
@Produces(value = { APPLICATION_JSON, N3, N_TRIPLE, RDF_JSON })
public Response getStandaloneGraph(@PathParam("ontologyId") String ontologyId,
        @DefaultValue("false") @QueryParam("meta") boolean meta,
        @DefaultValue("false") @QueryParam("merge") boolean merged,
        @Context UriInfo uriInfo, @Context HttpHeaders headers) {
    if (meta) {
        return getMetadata(ontologyId, uriInfo, headers);
    }
    ResponseBuilder rb;
    if (ontologyId == null || ontologyId.isEmpty()) {
        // without an ontology ID there is nothing to resolve
        rb = Response.status(BAD_REQUEST);
    } else {
        OWLOntologyID key = OntologyUtils.decode(ontologyId);
        if (ontologyProvider.listOrphans().contains(key)) {
            rb = Response.status(NO_CONTENT);
        } else {
            Graph o = getGraph(ontologyId, merged, uriInfo.getRequestUri());
            rb = o == null ? Response.status(NOT_FOUND) : Response.ok(o);
        }
    }
    // addCORSOrigin(servletContext, rb, headers);
    return rb.build();
}
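For illustration, here is how a client might fetch such a graph over HTTP with the standard JAX-RS 2.0 client API. The base URL, mount path, ontology ID and media type are assumptions about a local Stanbol deployment, not part of the resource shown above:

import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.core.Response;

public class OntologyClientExample {

    public static void main(String[] args) {
        // placeholder base URL and ontology ID for a local Stanbol instance
        Client client = ClientBuilder.newClient();
        Response response = client.target("http://localhost:8080/ontonet/ontology")
                .path("example.owl")
                .queryParam("merge", "true")
                // "text/rdf+n3" is assumed to match the N3 constant in @Produces
                .request("text/rdf+n3")
                .get();
        System.out.println("Status: " + response.getStatus());
        System.out.println(response.readEntity(String.class));
        client.close();
    }
}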
Use of org.apache.clerezza.commons.rdf.Graph in project Stanbol by Apache.
The class RootResource, method getMetadata.
public Response getMetadata(@PathParam("ontologyId") String ontologyId,
        @Context UriInfo uriInfo, @Context HttpHeaders headers) {
    ResponseBuilder rb;
    org.apache.clerezza.commons.rdf.IRI me =
            new org.apache.clerezza.commons.rdf.IRI(getPublicBaseUri() + "ontonet/" + ontologyId);
    Graph mImmutableGraph = new SimpleGraph();
    for (String alias : getAliases(OntologyUtils.decode(ontologyId))) {
        mImmutableGraph.add(new TripleImpl(
                new org.apache.clerezza.commons.rdf.IRI(getPublicBaseUri() + "ontonet/" + alias),
                OWL.sameAs, me));
    }
    rb = Response.ok(mImmutableGraph);
    // addCORSOrigin(servletContext, rb, headers);
    return rb.build();
}
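getMetadata answers with a small graph that maps every known alias to the canonical ontology IRI via owl:sameAs. A self-contained sketch that builds an equivalent graph; the IRIs and aliases are made up for the example:

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
import org.apache.clerezza.rdf.ontologies.OWL;

public class AliasGraphExample {

    public static void main(String[] args) {
        // hypothetical canonical IRI and aliases, mirroring getMetadata() above
        IRI me = new IRI("http://localhost:8080/ontonet/foaf.rdf");
        Graph g = new SimpleGraph();
        for (String alias : new String[] { "foaf", "foaf-0.1" }) {
            // each alias links to the canonical entry via owl:sameAs
            g.add(new TripleImpl(new IRI("http://localhost:8080/ontonet/" + alias), OWL.sameAs, me));
        }
        System.out.println(g.size() + " alias triples");
    }
}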