use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class MetaxaEngine method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
// get model from the extraction
URIImpl docId;
Model m = null;
ci.getLock().readLock().lock();
try {
docId = new URIImpl(ci.getUri().getUnicodeString());
m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
} catch (ExtractorException e) {
throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
} catch (IOException e) {
throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
} finally {
ci.getLock().readLock().unlock();
}
// the extracted plain text from the model
if (null == m) {
log.debug("Unable to preocess ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
return;
}
ContentSink plainTextSink;
try {
plainTextSink = ciFactory.createContentSink("text/plain");
} catch (IOException e) {
m.close();
throw new EngineException("Unable to initialise Blob for storing" + "the plain text content", e);
}
HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
RDF2GoUtils.urifyBlankNodes(m);
ClosableIterator<Statement> it = m.iterator();
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
// used to detect if some text was extracted
boolean textExtracted = false;
try {
// first add to a temporary graph
Graph g = new SimpleGraph();
while (it.hasNext()) {
Statement oneStmt = it.next();
// the plain text Blob!
if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
String text = oneStmt.getObject().toString();
if (text != null && !text.isEmpty()) {
try {
out.write(oneStmt.getObject().toString());
} catch (IOException e) {
throw new EngineException("Unable to write extracted" + "plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
}
textExtracted = true;
if (includeText) {
BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
g.add(new TripleImpl(subject, predicate, object));
}
}
} else {
// add metadata to the metadata of the contentItem
BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
if (null != subject && null != predicate && null != object) {
Triple t = new TripleImpl(subject, predicate, object);
g.add(t);
log.debug("added " + t.toString());
}
}
}
// add the extracted triples to the metadata of the ContentItem
ci.getLock().writeLock().lock();
try {
ci.getMetadata().addAll(g);
g = null;
} finally {
ci.getLock().writeLock().unlock();
}
} finally {
it.close();
m.close();
IOUtils.closeQuietly(out);
}
if (textExtracted) {
// add plain text to the content item
IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
ci.addPart(blobUri, plainTextSink.getBlob());
}
}
Aggregations