use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class KuromojiNlpEngine method computeEnhancements.
/**
* Compute enhancements for the supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, false);
if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
}
//start with the Tokenizer
TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
//build the analyzing chain by adding all TokenFilters
for (TokenFilterFactory filterFactory : filterFactories) {
tokenStream = filterFactory.create(tokenStream);
}
//Try to extract sentences based on POS tags ...
int sentStartOffset = -1;
//NER data
List<NerData> nerList = new ArrayList<NerData>();
//the next index where the NerData.context needs to be set
int nerSentIndex = 0;
NerData ner = null;
OffsetAttribute offset = null;
try {
//required with Solr 4
tokenStream.reset();
while (tokenStream.incrementToken()) {
offset = tokenStream.addAttribute(OffsetAttribute.class);
Token token = at.addToken(offset.startOffset(), offset.endOffset());
//Get the POS attribute and init the PosTag
PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = adhocTags.get(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = new PosTag(posAttr.getPartOfSpeech());
adhocTags.put(posAttr.getPartOfSpeech(), posTag);
log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
}
}
//Sentence detection by POS tag
if (sentStartOffset < 0) {
//the last token was a sentence ending
sentStartOffset = offset.startOffset();
}
if (posTag.hasPos(Pos.Point)) {
Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
//add the sentence as context to the NerData instances
while (nerSentIndex < nerList.size()) {
nerList.get(nerSentIndex).context = sent.getSpan();
nerSentIndex++;
}
sentStartOffset = -1;
}
//POS
token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
//NER
NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
//write NER annotation
Chunk chunk = at.addChunk(ner.start, ner.end);
chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
//NOTE that the fise:TextAnnotation are written later based on the nerList
//clean up
ner = null;
}
if (nerTag != null) {
if (ner == null) {
ner = new NerData(nerTag, offset.startOffset());
nerList.add(ner);
}
ner.end = offset.endOffset();
}
BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
MorphoFeatures morpho = null;
if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
//and add the posTag
morpho.addPos(posTag);
}
InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
inflectionAttr.getInflectionForm();
inflectionAttr.getInflectionType();
if (morpho != null) {
//if present add the morpho
token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
}
}
//we still need to write the last sentence
Sentence lastSent = null;
if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
lastSent = at.addSentence(sentStartOffset, offset.endOffset());
}
//and set the context of the remaining named entities
while (nerSentIndex < nerList.size()) {
if (lastSent != null) {
nerList.get(nerSentIndex).context = lastSent.getSpan();
} else {
//no sentence detected
nerList.get(nerSentIndex).context = at.getSpan();
}
nerSentIndex++;
}
} catch (IOException e) {
throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
} finally {
try {
tokenStream.close();
} catch (IOException e) {
/* ignore */
}
}
//finally write the NER annotations to the metadata of the ContentItem
final Graph metadata = ci.getMetadata();
ci.getLock().writeLock().lock();
try {
Language lang = new Language("ja");
for (NerData nerData : nerList) {
IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
}
} finally {
ci.getLock().writeLock().unlock();
}
}
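
A minimal sketch of the caller-side contract described in the Javadoc above: an EnhancementJobManager (or any other client) is expected to check canEnhance(..) before invoking computeEnhancements(..) and to persist the enhanced ContentItem afterwards. The helper class and method names below are illustrative assumptions, not part of the Stanbol API.

import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;

// Hypothetical helper illustrating the canEnhance/computeEnhancements contract.
public final class EngineCallSketch {

    private EngineCallSketch() {
    }

    /**
     * Calls the engine only if it declares support for the ContentItem,
     * mirroring what an EnhancementJobManager implementation would do.
     */
    public static boolean enhanceIfSupported(EnhancementEngine engine, ContentItem ci) throws EngineException {
        if (engine.canEnhance(ci) == EnhancementEngine.CANNOT_ENHANCE) {
            return false; // e.g. the KuromojiNlpEngine would report this for non-Japanese content
        }
        // results are written to the ContentItem's metadata and content parts
        engine.computeEnhancements(ci);
        return true;
    }
}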
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class TestLocationEnhancementEngine method testLocationEnhancementEngine.
@Test
public void testLocationEnhancementEngine() throws IOException, EngineException {
//create a content item
ContentItem ci = getContentItem("urn:org.apache:stanbol.enhancer:text:content-item:person", CONTEXT);
//add three text annotations to be consumed by this test
getTextAnnotation(ci, PERSON, CONTEXT, DBPEDIA_PERSON);
getTextAnnotation(ci, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
getTextAnnotation(ci, PLACE, CONTEXT, DBPEDIA_PLACE);
//perform the computation of the enhancements
try {
locationEnhancementEngine.computeEnhancements(ci);
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e, "overloaded with requests");
return;
}
Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(locationEnhancementEngine.getClass().getName()));
//adding null as the expected value makes confidence a required property
expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
/*
* Note:
* - Expected results depend on the geonames.org data. So if the test
* fails it may also mean that the data provided by geonames.org have
* changed
*/
int entityAnnotationCount = validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
//two suggestions for New Zealand and one hierarchy entry for the first
//suggestion
//NOTE 2012-10-10: changed expected value back to "3" as geonames.org
// again returns "Oceania" as parent for "New Zealand"
//NOTE: 2012-11-12: deactivated this check, because whether
// "Oceania" is returned as parent for "New Zealand" changes
// every few weeks
//assertEquals(3, entityAnnotationCount);
}
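
The expectedValues map above treats a null value as "property must be present, any value accepted". A simplified, hypothetical stand-in for the validateAllEntityAnnotations(..) helper used by this test could look like the following (class and method names are assumptions):

import java.util.Iterator;
import java.util.Map;

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.RDFTerm;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;

import static org.junit.Assert.assertTrue;

// Hypothetical sketch of the expected-value check used by such tests.
public final class EntityAnnotationCheckSketch {

    private EntityAnnotationCheckSketch() {
    }

    /**
     * Validates every fise:EntityAnnotation in the metadata against the
     * expected values and returns the number of annotations found.
     * A null expected value only requires the property to be present.
     */
    public static int checkEntityAnnotations(Graph metadata, Map<IRI, RDFTerm> expectedValues) {
        int count = 0;
        Iterator<Triple> annotations = metadata.filter(null, Properties.RDF_TYPE,
            TechnicalClasses.ENHANCER_ENTITYANNOTATION);
        while (annotations.hasNext()) {
            IRI annotation = (IRI) annotations.next().getSubject();
            for (Map.Entry<IRI, RDFTerm> expected : expectedValues.entrySet()) {
                // a null object acts as a wildcard in Graph#filter(..)
                assertTrue("missing expected value for " + expected.getKey() + " on " + annotation,
                    metadata.filter(annotation, expected.getKey(), expected.getValue()).hasNext());
            }
            count++;
        }
        return count;
    }
}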
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class HtmlExtractorEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
Graph model = new SimpleGraph();
ci.getLock().readLock().lock();
try {
extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(), null, ci.getMimeType(), model);
} catch (ExtractorException e) {
throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with HtmlExtractor", e);
} finally {
ci.getLock().readLock().unlock();
}
ClerezzaRDFUtils.urifyBlankNodes(model);
// make the model single rooted
if (singleRootRdf) {
ClerezzaRDFUtils.makeConnected(model, ci.getUri(), new IRI(NIE_NS + "contains"));
}
//add the extracted triples to the metadata of the ContentItem
ci.getLock().writeLock().lock();
try {
LOG.info("Model: {}", model);
ci.getMetadata().addAll(model);
model = null;
} finally {
ci.getLock().writeLock().unlock();
}
}
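
The write-lock section above follows the general Stanbol convention that engine-local models are merged into the ContentItem metadata only while holding the write lock. A small hypothetical helper factoring out that pattern (class and method names are assumptions):

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;

// Hypothetical helper factoring out the write-lock merge pattern shown above.
public final class MetadataMergeSketch {

    private MetadataMergeSketch() {
    }

    /**
     * Adds all triples of the (engine-local) model to the ContentItem metadata.
     * The write lock guards against concurrent readers and writers of the graph.
     */
    public static void mergeIntoMetadata(ContentItem ci, Graph model) {
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(model);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}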
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class NamedEntityTaggingEngine method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
final Site site;
if (referencedSiteID != null) {
// lookup the referenced site
site = siteManager.getSite(referencedSiteID);
// ensure that it is present
if (site == null) {
String msg = String.format("Unable to enhance %s because Referenced Site %s is currently not active!", ci.getUri().getUnicodeString(), referencedSiteID);
log.warn(msg);
// throw new EngineException(msg);
return;
}
// and that it supports offline mode if required
if (isOfflineMode() && !site.supportsLocalMode()) {
log.warn("Unable to enhance ci {} because OfflineMode is not supported by ReferencedSite {}.", ci.getUri().getUnicodeString(), site.getId());
return;
}
} else {
// null indicates to use the Entityhub to lookup Entities
site = null;
}
Graph graph = ci.getMetadata();
LiteralFactory literalFactory = LiteralFactory.getInstance();
// Retrieve the existing text annotations (requires read lock)
Map<NamedEntity, List<IRI>> textAnnotations = new HashMap<NamedEntity, List<IRI>>();
// the language extracted for the parsed content or NULL if not
// available
String contentLangauge;
ci.getLock().readLock().lock();
try {
contentLangauge = EnhancementEngineHelper.getLanguage(ci);
for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it.hasNext(); ) {
IRI uri = (IRI) it.next().getSubject();
if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
// this annotation relates to another TextAnnotation via dc:relation -> not a first occurrence, skip
continue;
}
NamedEntity namedEntity = NamedEntity.createFromTextAnnotation(graph, uri);
if (namedEntity != null) {
// This is a first occurrence, collect any subsumed
// annotations
List<IRI> subsumed = new ArrayList<IRI>();
for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2.hasNext(); ) {
subsumed.add((IRI) it2.next().getSubject());
}
textAnnotations.put(namedEntity, subsumed);
}
}
} finally {
ci.getLock().readLock().unlock();
}
// search the suggestions
Map<NamedEntity, List<Suggestion>> suggestions = new HashMap<NamedEntity, List<Suggestion>>(textAnnotations.size());
for (Entry<NamedEntity, List<IRI>> entry : textAnnotations.entrySet()) {
try {
List<Suggestion> entitySuggestions = computeEntityRecommentations(site, entry.getKey(), entry.getValue(), contentLangauge);
if (entitySuggestions != null && !entitySuggestions.isEmpty()) {
suggestions.put(entry.getKey(), entitySuggestions);
}
} catch (EntityhubException e) {
throw new EngineException(this, ci, e);
}
}
// now write the results (requires write lock)
ci.getLock().writeLock().lock();
try {
RdfValueFactory factory = RdfValueFactory.getInstance();
Map<String, Representation> entityData = new HashMap<String, Representation>();
for (Entry<NamedEntity, List<Suggestion>> entitySuggestions : suggestions.entrySet()) {
List<IRI> subsumed = textAnnotations.get(entitySuggestions.getKey());
List<BlankNodeOrIRI> annotationsToRelate = new ArrayList<BlankNodeOrIRI>(subsumed);
annotationsToRelate.add(entitySuggestions.getKey().getEntity());
for (Suggestion suggestion : entitySuggestions.getValue()) {
log.debug("Add Suggestion {} for {}", suggestion.getEntity().getId(), entitySuggestions.getKey());
EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, ci.getUri(), annotationsToRelate, suggestion, nameField, contentLangauge == null ? DEFAULT_LANGUAGE : contentLangauge);
if (dereferenceEntities) {
entityData.put(suggestion.getEntity().getId(), suggestion.getEntity().getRepresentation());
}
}
}
// add the Representations of dereferenced Entities (if dereferenceEntities is false, entityData will be empty)
for (Representation rep : entityData.values()) {
graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph());
}
} finally {
ci.getLock().writeLock().unlock();
}
}
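
The read-lock section above treats a TextAnnotation without an outgoing dc:relation as a "first occurrence" and collects all annotations that reference it via dc:relation as subsumed. A hypothetical standalone version of that subsumption lookup (class and method names are assumptions):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;

// Hypothetical helper isolating the dc:relation subsumption lookup used above.
public final class SubsumedAnnotationSketch {

    private SubsumedAnnotationSketch() {
    }

    /**
     * Returns all TextAnnotations that point to the given "first occurrence"
     * annotation via dc:relation; suggestions are later written relative to it.
     */
    public static List<IRI> collectSubsumed(Graph metadata, IRI firstOccurrence) {
        List<IRI> subsumed = new ArrayList<IRI>();
        for (Iterator<Triple> it = metadata.filter(null, Properties.DC_RELATION, firstOccurrence); it.hasNext(); ) {
            subsumed.add((IRI) it.next().getSubject());
        }
        return subsumed;
    }
}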
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class NlpEngineHelper method initAnalysedText.
/**
* Retrieves - or if not present - creates the {@link AnalysedText} content
* part for the parsed {@link ContentItem}. If no {@link Blob} with the
* mime type '<code>text/plain</code>' is present, this method
* throws an {@link IllegalStateException} (this method internally uses
* {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
* <code>true</code> as third parameter). Users of this method should call
* {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with <code>false</code> as third parameter in their
* {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
* <i>NOTE:</i> This method is intended for Engines that want to create an
* empty {@link AnalysedText} content part. Engines that assume that this
* content part is already present (e.g. if they consume already existing
* annotations) should use the
* {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
* method instead.
* @param engine the EnhancementEngine calling this method (used for logging)
* @param analysedTextFactory the {@link AnalysedTextFactory} used to create
* the {@link AnalysedText} instance (if not present).
* @param ci the {@link ContentItem}
* @return the AnalysedText
* @throws EngineException on any exception while accessing the
* '<code>text/plain</code>' Blob
* @throws IllegalStateException if no '<code>text/plain</code>' Blob is
* present as content part of the parsed {@link ContentItem} or the parsed
* {@link AnalysedTextFactory} is <code>null</code>. <i>NOTE</i> that an
* {@link IllegalStateException} is only thrown if the {@link AnalysedText}
* ContentPart is not yet present in the parsed {@link ContentItem}
*/
public static AnalysedText initAnalysedText(EnhancementEngine engine, AnalysedTextFactory analysedTextFactory, ContentItem ci) throws EngineException {
AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
if (at == null) {
if (analysedTextFactory == null) {
throw new IllegalStateException("Unable to initialise AnalysedText" + "ContentPart because the parsed AnalysedTextFactory is NULL");
}
Entry<IRI, Blob> textBlob = getPlainText(engine, ci, true);
//we need to create
ci.getLock().writeLock().lock();
try {
//try again to retrieve (maybe a concurrent thread has created
//the content part in the meantime)
at = AnalysedTextUtils.getAnalysedText(ci);
if (at == null) {
log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
}
} catch (IOException e) {
throw new EngineException("Unable to create AnalysetText instance for Blob " + textBlob.getKey() + " of ContentItem " + ci.getUri() + "!", e);
} finally {
ci.getLock().writeLock().unlock();
}
} else {
log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
}
return at;
}
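
As the Javadoc recommends, engines typically pair a canEnhance(..) implementation based on getPlainText(engine, ci, false) with a computeEnhancements(..) implementation that calls initAnalysedText(..). A minimal, hypothetical engine skeleton illustrating that split (the class name, field wiring and import packages are assumptions; only the NlpEngineHelper calls are taken from the API referenced above):

import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;

// Hypothetical skeleton of an engine using the helper methods discussed above.
public class MyNlpEngine implements EnhancementEngine {

    private AnalysedTextFactory analysedTextFactory; // typically injected (e.g. by OSGi)

    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        // 'false' as third parameter: do not throw if no 'text/plain' Blob is present
        return NlpEngineHelper.getPlainText(this, ci, false) != null ? ENHANCE_ASYNC : CANNOT_ENHANCE;
    }

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        // initAnalysedText uses 'true' internally: a missing 'text/plain' Blob here
        // indicates a broken EnhancementJobManager and raises an IllegalStateException
        AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
        // ... add Tokens, Sentences and annotations to 'at' here ...
    }

    @Override
    public String getName() {
        return "my-nlp-engine";
    }
}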