use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class XmpExtractorEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    InputStream in = ci.getBlob().getStream();
    XMPPacketScanner scanner = new XMPPacketScanner();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try {
        scanner.parse(in, baos);
    } catch (IOException e) {
        throw new EngineException(e);
    }
    byte[] bytes = baos.toByteArray();
    if (bytes.length > 0) {
        Graph model = new IndexedGraph();
        parser.parse(model, new ByteArrayInputStream(bytes), "application/rdf+xml");
        GraphNode gn = new GraphNode(new IRI("http://relative-uri.fake/"), model);
        gn.replaceWith(ci.getUri());
        ci.getLock().writeLock().lock();
        try {
            LOG.info("Model: {}", model);
            ci.getMetadata().addAll(model);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
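The write-locked metadata update at the end of this method is a pattern that recurs in every engine on this page: all writes to ci.getMetadata() happen between writeLock().lock() and writeLock().unlock(). A minimal sketch of that idiom as a hypothetical helper (the helper name is an assumption, not part of the Stanbol API):

// Hypothetical helper: adds a result graph to a ContentItem's metadata
// under the write lock, mirroring the pattern used by XmpExtractorEngine.
private void addMetadataSafely(ContentItem ci, Graph model) {
    ci.getLock().writeLock().lock();
    try {
        ci.getMetadata().addAll(model);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}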
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class OpenCalaisEngine method getCalaisAnalysis.
/**
 * Retrieves the annotations from OpenCalais as RDF/XML. From that, a Graph is created.
 *
 * @param text the text to send to OpenCalais
 * @param mimeType the MIME type declared for the content
 *
 * @return a Graph with all annotations
 *
 * @throws EngineException
 */
public Graph getCalaisAnalysis(String text, String mimeType) throws EngineException {
    if (mimeType.equals("text/plain")) {
        mimeType = "text/raw";
    }
    String calaisParams = "<c:params xmlns:c=\"http://s.opencalais.com/1/pred/\" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">"
            + "<c:processingDirectives c:contentType=\"" + mimeType + "\" "
            // + "c:enableMetadataType=\"GenericRelations\" "
            + "c:outputFormat=\"rdf/xml\" "
            // NOTE (rw, 2012-05-29) changed to true while working on STANBOL-630
            + "c:calculateRelevanceScore=\"true\" "
            + "c:omitOutputtingOriginalText=\"true\""
            + ">" + "</c:processingDirectives>" + "</c:params>";
    Graph model = null;
    try {
        StringBuilder postParams = new StringBuilder();
        postParams.append("licenseID=").append(URLEncoder.encode(getLicenseKey(), "UTF-8"))
                .append("&content=").append(URLEncoder.encode(text, "UTF-8"))
                .append("&paramsXML=").append(URLEncoder.encode(calaisParams, "UTF-8"));
        // get annotations from Calais
        log.info("Calais request sent");
        String calaisResult = doPostRequest(this.getCalaisUrl(), null, postParams.toString(),
                "application/x-www-form-urlencoded", "UTF-8");
        log.info("Calais response received: {}", calaisResult.length());
        log.info("Calais response:\n {}", calaisResult);
        log.debug("Calais data:\n{}", calaisResult);
        // build model from Calais result
        InputStream in = new ByteArrayInputStream(calaisResult.getBytes("utf-8"));
        model = readModel(in, "application/rdf+xml");
    } catch (UnsupportedEncodingException e) {
        throw new EngineException(e.getMessage(), e);
    } catch (IOException e) {
        throw new EngineException(e.getMessage(), e);
    }
    return model;
}
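Since UnsupportedEncodingException is a subclass of IOException and both catch blocks wrap the cause identically, the tail of this method could be collapsed to a single catch clause without changing behavior. A sketch of the simplified form:

// Sketch: one catch covers both failure cases, since
// UnsupportedEncodingException extends IOException.
try {
    byte[] raw = calaisResult.getBytes("utf-8"); // may throw UnsupportedEncodingException
    model = readModel(new ByteArrayInputStream(raw), "application/rdf+xml");
} catch (IOException e) {
    throw new EngineException(e.getMessage(), e);
}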
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class FstLinkingEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at;
    if (linkingMode != LinkingModeEnum.PLAIN) {
        // require AnalysedText contentPart
        at = getAnalysedText(this, ci, true);
    } else {
        // AnalysedText is optional in LinkingModeEnum.PLAIN
        try {
            at = AnalysedTextUtils.getAnalysedText(ci);
        } catch (ClassCastException e) {
            // unexpected contentPart found under the URI expecting the AnalysedText
            at = null;
        }
    }
    final String content;
    if (at != null) {
        // we can get the content from the AnalysedText
        content = at.getSpan();
    } else {
        // no AnalysedText ... read the content from the text/plain blob
        try {
            content = ContentItemHelper.getText(NlpEngineHelper.getPlainText(this, ci, true).getValue());
        } catch (IOException e) {
            throw new EngineException(this, ci, "Unable to access plain/text content!", e);
        }
    }
    log.debug(" > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    log.debug(" > Language {}", language);
    if (log.isDebugEnabled()) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}",
                new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(content, 100) });
    }
    // TODO: we need to do the same for the default matching language
    TaggingSession session;
    try {
        session = TaggingSession.createSession(indexConfig, language);
    } catch (CorpusException e) {
        throw new EngineException(this, ci, e);
    }
    if (!session.hasCorpus()) {
        // no corpus available for processing the request
        return;
    }
    long taggingStart = System.currentTimeMillis();
    final NavigableMap<int[], Tag> tags = new TreeMap<int[], Tag>(Tag.SPAN_COMPARATOR);
    try {
        // process the language of the document
        Corpus corpus = null;
        if (session.getLanguageCorpus() != null) {
            corpus = session.getLanguageCorpus();
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, corpus, tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)",
                    new Object[] { corpus.getIndexedField(), System.currentTimeMillis() - t, d });
        }
        if (session.getDefaultCorpus() != null) {
            if (corpus == null) {
                corpus = session.getDefaultCorpus();
            }
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, session.getDefaultCorpus(), tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)",
                    new Object[] { session.getDefaultCorpus().getIndexedField(), System.currentTimeMillis() - t, d });
        }
        long taggingEnd = System.currentTimeMillis();
        if (corpus == null) {
            throw new EngineException(this, ci, "No FST corpus found to process contentItem "
                    + "language '" + session.getLanguage() + "'!", null);
        } else {
            if (session.getLanguageCorpus() != null && session.getDefaultCorpus() != null) {
                log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
            }
        }
        int matches = match(content, tags.values(), session.entityMentionTypes);
        log.debug(" - loaded {} ({} loaded, {} cached, {} appended) Matches in {} ms",
                new Object[] { matches, session.getSessionDocLoaded(), session.getSessionDocCached(),
                        session.getSessionDocAppended(), System.currentTimeMillis() - taggingEnd });
        if (log.isDebugEnabled() && session.getDocumentCache() != null) {
            log.debug("EntityCache Statistics: {}", session.getDocumentCache().printStatistics());
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, e);
    } finally {
        session.close();
    }
    if (log.isTraceEnabled()) {
        log.trace("Tagged Entities:");
        for (Tag tag : tags.values()) {
            log.trace("[{},{}]: {}", new Object[] { tag.getStart(), tag.getEnd(), tag.getMatches() });
        }
    }
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, content, tags.values(), language, elConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
    // help the GC
    tags.clear();
}
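Two details of this engine are worth isolating: it uses the EngineException overloads that carry the engine and the ContentItem as context (new EngineException(this, ci, e) and new EngineException(this, ci, message, cause)), and it releases the TaggingSession in a finally block so a failed tagging run cannot leak the session. A minimal sketch of that skeleton, assuming only the calls visible above:

TaggingSession session = TaggingSession.createSession(indexConfig, language);
try {
    // tag the content against the language and/or default FST corpus ...
} catch (IOException e) {
    // attach engine and ContentItem context to the failure
    throw new EngineException(this, ci, e);
} finally {
    session.close(); // always release the session, even when tagging fails
}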
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class TestOpenCalaisEngine method testCalaisConnection.
@Test
public void testCalaisConnection() throws IOException, EngineException {
    Assume.assumeNotNull(calaisExtractor.getLicenseKey());
    ContentItem ci = wrapAsContentItem(TEST_TEXT);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), Properties.DC_LANGUAGE,
            LiteralFactory.getInstance().createTypedLiteral("en")));
    Graph model;
    try {
        model = calaisExtractor.getCalaisAnalysis(TEST_TEXT, "text/plain");
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    Assert.assertNotNull("No model", model);
    Collection<CalaisEntityOccurrence> entities;
    try {
        entities = calaisExtractor.queryModel(model);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    LOG.info("Found entities: {}", entities.size());
    LOG.debug("Entities:\n{}", entities);
    Assert.assertFalse("No entities found!", entities.isEmpty());
}
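The test combines two guards so that a missing license key or a Calais outage skips the test instead of failing the build: Assume.assumeNotNull(...) and RemoteServiceHelper.checkServiceUnavailable(e), which is expected to swallow outage-like failures and rethrow everything else. A hedged sketch of such a guard as a standalone JUnit 4 helper (the helper name and the inspected cause types are assumptions, not the actual RemoteServiceHelper logic):

// Hypothetical test helper: skip the test on typical connectivity failures,
// rethrow everything else as a genuine test failure.
static void skipIfServiceUnavailable(EngineException e) throws EngineException {
    Throwable cause = e.getCause();
    if (cause instanceof java.net.UnknownHostException
            || cause instanceof java.net.SocketTimeoutException
            || cause instanceof java.net.ConnectException) {
        Assume.assumeNoException(e); // marks the test as skipped, not failed
    }
    throw e;
}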
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class MetaxaEngine method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get model from the extraction
    URIImpl docId;
    Model m = null;
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // the extracted plain text from the model
    if (null == m) {
        log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close();
        throw new EngineException("Unable to initialise Blob for storing the plain text content", e);
    }
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    RDF2GoUtils.urifyBlankNodes(m);
    ClosableIterator<Statement> it = m.iterator();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
    // used to detect if some text was extracted
    boolean textExtracted = false;
    try {
        // first add to a temporary graph
        Graph g = new SimpleGraph();
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            // the plain text Blob!
            if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        out.write(oneStmt.getObject().toString());
                    } catch (IOException e) {
                        throw new EngineException("Unable to write extracted plain text to Blob (blob impl: "
                                + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                // add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added " + t.toString());
                }
            }
        }
        // add the extracted triples to the metadata of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        it.close();
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        // add plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
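Note the lock asymmetry here: extraction reads the content stream under the read lock, the collected triples are added under the write lock, and the extracted plain text is registered as a separate part via ci.addPart(...). A hedged sketch of how a downstream consumer might read that part back (getPart and ContentItemHelper.getText appear in the Stanbol API used elsewhere on this page; treat the exact retrieval as an assumption):

// Hypothetical downstream consumer: fetch the plain-text part that
// MetaxaEngine registered and read it back as a String.
Blob textBlob = ci.getPart(blobUri, Blob.class);
String plainText = ContentItemHelper.getText(textBlob); // may throw IOException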