use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class TikaEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
MediaTypeAndStream mtas = extractMediaType(ci);
if (mtas.mediaType == null) {
//unable to parse and detect content type
return;
}
MediaType plainMediaType = mtas.mediaType.getBaseType();
if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
//no need to process plain text
return;
}
final ParseContext context = new ParseContext();
context.set(Parser.class, parser);
Set<MediaType> supported = parser.getSupportedTypes(context);
if (supported.contains(plainMediaType)) {
final InputStream in;
if (mtas.in == null) {
in = ci.getStream();
} else {
in = mtas.in;
}
final Metadata metadata = new Metadata();
//set the already parsed contentType
metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
//also explicitly set the charset as contentEncoding
String charset = mtas.mediaType.getParameters().get("charset");
if (charset != null) {
metadata.set(Metadata.CONTENT_ENCODING, charset);
}
ContentSink plainTextSink;
try {
plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
} catch (IOException e) {
//close the input stream
IOUtils.closeQuietly(in);
throw new EngineException("Error while initialising Blob for" + "writing the text/plain version of the parsed content", e);
}
final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
//only the Body, skip ignorable
final ContentHandler textHandler = new BodyContentHandler(new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
final ToXMLContentHandler xhtmlHandler;
final ContentHandler mainHandler;
ContentSink xhtmlSink = null;
try {
if (!plainMediaType.equals(XHTML)) {
//only create the XHTML version if the source is not already XHTML
try {
xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
} catch (IOException e) {
throw new EngineException("Error while initialising Blob for" + "writing the application/xhtml+xml version of the parsed content", e);
}
try {
xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
} catch (UnsupportedEncodingException e) {
throw new EngineException("This system does not support the encoding " + UTF8, e);
}
mainHandler = new MultiHandler(textHandler, xhtmlHandler);
} else {
mainHandler = textHandler;
xhtmlHandler = null;
xhtmlSink = null;
}
try {
AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
public Object run() throws IOException, SAXException, TikaException {
/*
* We need to replace the context Classloader with the Bundle ClassLoader
* to ensure that Singleton instances of XML frameworks (such as node4j)
* do not leak into the OSGI environment.
*
* Most Java XML libs prefer to load implementations by using the
* {@link Thread#getContextClassLoader()}. However OSGi has no control over
* this {@link ClassLoader}. Because of that there can be situations where
* interfaces are loaded via the Bundle ClassLoader while the implementations
* are taken from the context ClassLoader, which can cause
* {@link ClassCastException}s, {@link ExceptionInInitializerError}s, ...
*
* Setting the context Classloader to the Bundle classloader helps to avoid
* those situations.
*/
ClassLoader contextClassLoader = updateContextClassLoader();
try {
parser.parse(in, mainHandler, metadata, context);
} finally {
//reset the previous context ClassLoader
Thread.currentThread().setContextClassLoader(contextClassLoader);
}
return null;
}
});
} catch (PrivilegedActionException pae) {
Exception e = pae.getException();
if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
throw new EngineException("Unable to convert ContentItem " + ci.getUri() + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
} else {
//runtime exception
throw RuntimeException.class.cast(e);
}
}
} finally {
//ensure that the writers are closed correctly
IOUtils.closeQuietly(in);
IOUtils.closeQuietly(plainTextWriter);
if (xhtmlSink != null) {
IOUtils.closeQuietly(xhtmlSink.getOutputStream());
}
}
String random = randomUUID().toString();
IRI textBlobUri = new IRI("urn:tika:text:" + random);
ci.addPart(textBlobUri, plainTextSink.getBlob());
if (xhtmlHandler != null) {
IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
}
//add the extracted metadata
if (log.isInfoEnabled()) {
for (String name : metadata.names()) {
log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
}
}
ci.getLock().writeLock().lock();
try {
Graph graph = ci.getMetadata();
IRI id = ci.getUri();
Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
if (includeUnmappedProperties) {
Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
unmapped.removeAll(mapped);
for (String name : unmapped) {
if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
//only unmapped properties with a namespace prefix (or all, if configured)
IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
for (String value : metadata.getValues(name)) {
//TODO: without the Property for the name we have no datatype
// information ... so we add PlainLiterals for now
graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
}
}
}
}
} finally {
ci.getLock().writeLock().unlock();
}
}
//else not supported format
}
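The updateContextClassLoader() helper called above is not part of this snippet. A minimal sketch of what such a helper typically looks like (an assumption for illustration, not the actual Stanbol implementation) is:

private ClassLoader updateContextClassLoader() {
    Thread thread = Thread.currentThread();
    //remember the current context ClassLoader so the caller can restore it in a finally block
    ClassLoader previous = thread.getContextClassLoader();
    //the Tika Parser implementation class was loaded by the Bundle ClassLoader
    thread.setContextClassLoader(parser.getClass().getClassLoader());
    return previous;
}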
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class TestOpenCalaisEngine method testCalaisConnection.
@Test
public void testCalaisConnection() throws IOException, EngineException {
Assume.assumeNotNull(calaisExtractor.getLicenseKey());
ContentItem ci = wrapAsContentItem(TEST_TEXT);
ci.getMetadata().add(new TripleImpl(ci.getUri(), Properties.DC_LANGUAGE, LiteralFactory.getInstance().createTypedLiteral("en")));
Graph model;
try {
model = calaisExtractor.getCalaisAnalysis(TEST_TEXT, "text/plain");
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e);
return;
}
Assert.assertNotNull("No model", model);
Collection<CalaisEntityOccurrence> entities;
try {
entities = calaisExtractor.queryModel(model);
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e);
return;
}
LOG.info("Found entities: {}", entities.size());
LOG.debug("Entities:\n{}", entities);
Assert.assertFalse("No entities found!", entities.isEmpty());
}
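RemoteServiceHelper.checkServiceUnavailable(e) lets the test return silently when the remote Calais service is unreachable and rethrows everything else. A minimal sketch of that guard pattern (the inspected exception types are assumptions, not the actual helper) is:

public static void checkServiceUnavailable(EngineException e) throws EngineException {
    //walk the cause chain and look for typical network problems
    for (Throwable cause = e.getCause(); cause != null; cause = cause.getCause()) {
        if (cause instanceof java.net.UnknownHostException
                || cause instanceof java.net.ConnectException
                || cause instanceof java.net.SocketTimeoutException) {
            //remote service not reachable -> let the test return without failing
            return;
        }
    }
    //any other cause is a real failure -> rethrow so the test fails
    throw e;
}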
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class EntityLinker method lookupEntities.
/**
* Searches for Entities in the {@link #entitySearcher} corresponding to the
* {@link Token#getText() words} of the current {@link #state position} in
* the text.
* @param searchStrings the list of {@link Token#getText() words} to search
* entities for.
* @return The sorted list of suggestions. If there are no suggestions,
* an empty list is returned.
*/
private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
Collection<? extends Representation> results;
try {
results = entitySearcher.lookup(config.getNameField(), config.getSelectedFields(), searchStrings, state.getSentence().getLanguage(), config.getDefaultLanguage());
} catch (RuntimeException e) {
throw new EngineException(e.getMessage(), e);
}
List<Suggestion> suggestions = new ArrayList<Suggestion>();
for (Representation result : results) {
Suggestion match = matchLabels(result);
if (match.getMatch() != MATCH.NONE) {
suggestions.add(match);
}
}
//sort the suggestions
if (suggestions.size() > 1) {
Collections.sort(suggestions, Suggestion.DEFAULT_SUGGESTION_COMPARATOR);
}
//remove all elements > config.getMaxSuggestions()
return suggestions;
}
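Suggestion.DEFAULT_SUGGESTION_COMPARATOR is not shown here. A plausible sketch (the getScore() accessor and the exact criteria are assumptions for illustration) orders the best matches first:

public static final java.util.Comparator<Suggestion> DEFAULT_SUGGESTION_COMPARATOR = new java.util.Comparator<Suggestion>() {

    @Override
    public int compare(Suggestion s1, Suggestion s2) {
        //higher score first; the real comparator may also consider the MATCH
        //type (e.g. FULL before PARTIAL) and the number of matched tokens
        return Double.compare(s2.getScore(), s1.getScore());
    }
};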
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class KuromojiNlpEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. In addition, fise:TextAnnotations for
* detected named entities are written to the metadata of the content item.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, false);
if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates a bug in the used EnhancementJobManager implementation. " + "Please report this on dev@stanbol.apache.org or create a " + "JIRA issue about this.");
}
//start with the Tokenizer
TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
//build the analyzing chain by adding all TokenFilters
for (TokenFilterFactory filterFactory : filterFactories) {
tokenStream = filterFactory.create(tokenStream);
}
//Try to extract sentences based on POS tags ...
int sentStartOffset = -1;
//NER data
List<NerData> nerList = new ArrayList<NerData>();
//the next index where the NerData.context needs to be set
int nerSentIndex = 0;
NerData ner = null;
OffsetAttribute offset = null;
try {
//required with Solr 4
tokenStream.reset();
while (tokenStream.incrementToken()) {
offset = tokenStream.addAttribute(OffsetAttribute.class);
Token token = at.addToken(offset.startOffset(), offset.endOffset());
//Get the POS attribute and init the PosTag
PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = adhocTags.get(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = new PosTag(posAttr.getPartOfSpeech());
adhocTags.put(posAttr.getPartOfSpeech(), posTag);
log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
}
}
//Sentence detection by POS tag
if (sentStartOffset < 0) {
//the last token was a sentence ending
sentStartOffset = offset.startOffset();
}
if (posTag.hasPos(Pos.Point)) {
Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
//add the sentence as context to the NerData instances
while (nerSentIndex < nerList.size()) {
nerList.get(nerSentIndex).context = sent.getSpan();
nerSentIndex++;
}
sentStartOffset = -1;
}
//POS
token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
//NER
NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
//write NER annotation
Chunk chunk = at.addChunk(ner.start, ner.end);
chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
//NOTE that the fise:TextAnnotations are written later based on the nerList
//clean up
ner = null;
}
if (nerTag != null) {
if (ner == null) {
ner = new NerData(nerTag, offset.startOffset());
nerList.add(ner);
}
ner.end = offset.endOffset();
}
BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
MorphoFeatures morpho = null;
if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
//and add the posTag
morpho.addPos(posTag);
}
InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
inflectionAttr.getInflectionForm();
inflectionAttr.getInflectionType();
if (morpho != null) {
//if present add the morpho
token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
}
}
//we still need to write the last sentence
Sentence lastSent = null;
if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
lastSent = at.addSentence(sentStartOffset, offset.endOffset());
}
//and set the context of the remaining named entities
while (nerSentIndex < nerList.size()) {
if (lastSent != null) {
nerList.get(nerSentIndex).context = lastSent.getSpan();
} else {
//no sentence detected
nerList.get(nerSentIndex).context = at.getSpan();
}
nerSentIndex++;
}
} catch (IOException e) {
throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
} finally {
try {
tokenStream.close();
} catch (IOException e) {
/* ignore */
}
}
//finally write the NER annotations to the metadata of the ContentItem
final Graph metadata = ci.getMetadata();
ci.getLock().writeLock().lock();
try {
Language lang = new Language("ja");
for (NerData nerData : nerList) {
IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
}
} finally {
ci.getLock().writeLock().unlock();
}
}
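NerData is a small value holder that is not part of this snippet. Judging from its usage above, a minimal sketch (the field visibility and constructor are assumptions) looks like:

class NerData {

    //the NER tag (type of the named entity)
    final NerTag tag;

    //start/end character offsets of the named entity in the text
    final int start;
    int end;

    //the span of the surrounding sentence, written as fise:selection-context
    String context;

    NerData(NerTag tag, int start) {
        this.tag = tag;
        this.start = start;
        this.end = start;
    }
}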
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class FstLinkingEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText at;
if (linkingMode != LinkingModeEnum.PLAIN) {
//require AnalysedText contentPart
at = getAnalysedText(this, ci, true);
} else {
//AnalysedText is optional in LinkingModeEnum.PLAIN
try {
at = AnalysedTextUtils.getAnalysedText(ci);
} catch (ClassCastException e) {
//an unexpected content part is registered under the URI where the AnalysedText is expected
at = null;
}
}
final String content;
if (at != null) {
//we can get the content from the Analyzed text
content = at.getSpan();
} else {
//no analyzed text ... read it from the text/plain blob
try {
content = ContentItemHelper.getText(NlpEngineHelper.getPlainText(this, ci, true).getValue());
} catch (IOException e) {
throw new EngineException(this, ci, "Unable to access plain/text content!", e);
}
}
log.debug(" > AnalysedText {}", at);
String language = getLanguage(this, ci, true);
log.debug(" > Language {}", language);
if (log.isDebugEnabled()) {
log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(content, 100) });
}
// TODO: we need to do the same for the default matching language
TaggingSession session;
try {
session = TaggingSession.createSession(indexConfig, language);
} catch (CorpusException e) {
throw new EngineException(this, ci, e);
}
if (!session.hasCorpus()) {
//no corpus available for processing the request
return;
}
long taggingStart = System.currentTimeMillis();
final NavigableMap<int[], Tag> tags = new TreeMap<int[], Tag>(Tag.SPAN_COMPARATOR);
try {
//process the language of the document
Corpus corpus = null;
if (session.getLanguageCorpus() != null) {
corpus = session.getLanguageCorpus();
long t = System.currentTimeMillis();
int d = tag(content, at, session, corpus, tags);
log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { corpus.getIndexedField(), System.currentTimeMillis() - t, d });
}
if (session.getDefaultCorpus() != null) {
if (corpus == null) {
corpus = session.getDefaultCorpus();
}
long t = System.currentTimeMillis();
int d = tag(content, at, session, session.getDefaultCorpus(), tags);
log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { session.getDefaultCorpus().getIndexedField(), System.currentTimeMillis() - t, d });
}
long taggingEnd = System.currentTimeMillis();
if (corpus == null) {
throw new EngineException(this, ci, "No FST corpus found to process contentItem " + "language '" + session.getLanguage() + "'!", null);
} else {
if (session.getLanguageCorpus() != null && session.getDefaultCorpus() != null) {
log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
}
}
int matches = match(content, tags.values(), session.entityMentionTypes);
log.debug(" - loaded {} ({} loaded, {} cached, {} appended) Matches in {} ms", new Object[] { matches, session.getSessionDocLoaded(), session.getSessionDocCached(), session.getSessionDocAppended(), System.currentTimeMillis() - taggingEnd });
if (log.isDebugEnabled() && session.getDocumentCache() != null) {
log.debug("EntityCache Statistics: {}", session.getDocumentCache().printStatistics());
}
} catch (IOException e) {
throw new EngineException(this, ci, e);
} finally {
session.close();
}
if (log.isTraceEnabled()) {
log.trace("Tagged Entities:");
for (Tag tag : tags.values()) {
log.trace("[{},{}]: {}", new Object[] { tag.getStart(), tag.getEnd(), tag.getMatches() });
}
}
ci.getLock().writeLock().lock();
try {
writeEnhancements(ci, content, tags.values(), language, elConfig.isWriteEntityRankings());
} finally {
ci.getLock().writeLock().unlock();
}
//help the GC
tags.clear();
}
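Tag.SPAN_COMPARATOR orders the int[]{start, end} keys of the tags map. A plausible sketch (an assumption, not the actual implementation) sorts by start offset and prefers longer spans:

public static final java.util.Comparator<int[]> SPAN_COMPARATOR = new java.util.Comparator<int[]>() {

    @Override
    public int compare(int[] span1, int[] span2) {
        //earlier start offset first
        int c = Integer.compare(span1[0], span2[0]);
        if (c == 0) {
            //for the same start, the longer (more specific) span comes first
            c = Integer.compare(span2[1], span1[1]);
        }
        return c;
    }
};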