use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.
the class CeliNamedEntityExtractionEnhancementEngine method computeEnhancements.
/**
 * Sends the plain text of the ContentItem to the configured client's
 * {@code extractEntities} call and adds one TextAnnotation to the
 * ContentItem metadata for every returned {@code NamedEntity}.
 *
 * @param ci the ContentItem to enhance
 * @throws IllegalStateException if no supported content part or no language
 *         is available for the ContentItem
 * @throws EngineException if the text of the content part cannot be read
 *         (raised as {@code InvalidContentException}) or if the call to the
 *         remote service fails with an IOException or SOAPException
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        // nothing to send to the service
        log.info("No text contained in ContentPart {" + contentPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    if (language == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    // used for the plain literals in TextAnnotations
    Language lang = new Language(language);
    try {
        // the remote call is done before the lock is taken so that the
        // ContentItem is not blocked while waiting for the service
        List<NamedEntity> lista = this.client.extractEntities(text, language);
        LiteralFactory literalFactory = LiteralFactory.getInstance();
        Graph g = ci.getMetadata();
        // writes to the metadata are guarded by the ContentItem write lock
        ci.getLock().writeLock().lock();
        try {
            for (NamedEntity ne : lista) {
                try {
                    IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                    // add selected text as PlainLiteral in the language extracted from the text
                    g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(ne.getFormKind(), lang)));
                    g.add(new TripleImpl(textAnnotation, DC_TYPE, getEntityRefForType(ne.type)));
                    // start/end/context are only written if the service reported both offsets
                    if (ne.getFrom() != null && ne.getTo() != null) {
                        g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(ne.getFrom().intValue())));
                        g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(ne.getTo().intValue())));
                        g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, ne.getFormKind(), ne.getFrom().intValue()), lang)));
                    }
                } catch (NoConvertorException e) {
                    // a failure for one entity is logged; remaining entities are still processed
                    log.error(e.getMessage(), e);
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI NER (Named Entity Recognition)" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI NER (Named Entity Recognition) service!", e);
    }
}
use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.
the class CeliLanguageIdentifierEnhancementEngine method computeEnhancements.
/**
 * Asks the configured client to guess the language of the ContentItem's
 * plain text and writes the first returned guess (language, confidence and
 * type) as a text enhancement to the ContentItem metadata.
 * <p>
 * Texts that split into more than five space separated parts are sent to
 * {@code guessLanguage}, shorter ones to {@code guessQueryLanguage}.
 * If the service returns no guess, nothing is written.
 *
 * @param ci the ContentItem to enhance
 * @throws IllegalStateException if no supported content part is present
 * @throws EngineException if the text of the content part cannot be read
 *         (raised as {@code InvalidContentException}) or if the call to the
 *         remote service fails with an IOException or SOAPException
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {" + contentPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }
    try {
        String[] tmps = text.split(" ");
        List<GuessedLanguage> lista;
        if (tmps.length > 5) {
            lista = this.client.guessLanguage(text);
        } else {
            lista = this.client.guessQueryLanguage(text);
        }
        // without this guard lista.get(0) below would fail with an
        // unchecked exception when the service returns no guess
        if (lista == null || lista.isEmpty()) {
            log.info("No language guessed for ContentPart {" + contentPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
            return;
        }
        Graph g = ci.getMetadata();
        // in ENHANCE_ASYNC we need to use read/write locks on the ContentItem
        ci.getLock().writeLock().lock();
        try {
            // only the first entry of the returned list is used
            GuessedLanguage gl = lista.get(0);
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(gl.getLang())));
            g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(gl.getConfidence())));
            g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI language" + " identifier service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI language identifier service!", e);
    }
}
use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.
the class ZemantaEnhancementEngine method computeEnhancements.
/**
 * Sends the text of the ContentItem to Zemanta via a
 * {@code ZemantaAPIWrapper} and converts the returned results into
 * enhancements in the ContentItem metadata.
 *
 * @param ci the ContentItem to enhance
 * @throws IllegalStateException if no content part with a supported
 *         mime type is present
 * @throws EngineException if the text of the content part cannot be read
 *         (raised as {@code InvalidContentException}) or if the request to
 *         the Zemanta service fails with an IOException
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        // message fragments now carry the separating spaces that were
        // missing between the concatenated literals
        throw new IllegalStateException("No ContentPart with a supported Mime Type " + "found for ContentItem " + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was " + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.warn("ContentPart {} of ContentItem {} does not contain any text to enhance", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph graph = ci.getMetadata();
    IRI ciId = ci.getUri();
    // we need to store the results of Zemanta in a temp graph
    Graph results = new SimpleGraph();
    ZemantaAPIWrapper zemanta = new ZemantaAPIWrapper(key);
    try {
        // the remote request is done before the write lock is acquired
        results.addAll(zemanta.enhance(text));
    } catch (IOException e) {
        throw new EngineException("Unable to get Enhancement from remote Zemanta Service", e);
    }
    // now we need to process the results and convert them into the Enhancer
    // annotation structure
    ci.getLock().writeLock().lock();
    try {
        processRecognition(results, graph, text, ciId);
        processCategories(results, graph, ciId);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.
the class NEREngineCore method computeEnhancements.
/**
 * Runs named entity recognition for the ContentItem using the default
 * model types and the additional language specific models of the
 * configuration.
 * <p>
 * If an {@code AnalysedText} with at least one token is available it is
 * passed to {@code findNamedEntities} and the plain text is {@code null};
 * otherwise the plain text of a supported content part is used.
 *
 * @param ci the ContentItem to enhance
 * @throws IllegalStateException if no language can be extracted, no NER
 *         model is configured for the language, or (in the plain text
 *         fallback) no supported content part is present
 * @throws EngineException if the text of the content part cannot be read
 *         (raised as {@code InvalidContentException}) or if a checked
 *         exception occurs while processing the models
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // first check the language before processing the content (text)
    String language = extractLanguage(ci);
    if (language == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    if (!isNerModel(language)) {
        throw new IllegalStateException("For the language '" + language + "' of ContentItem " + ci.getUri() + " no NER model is configured: This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    // validate data in the AnalysedText
    final String text;
    if (at != null && at.getTokens().hasNext()) {
        // if the AnalysedText is present and tokens are present
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}", ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
        }
        // the AnalysedText is used, so no plain text is handed on
        text = null;
    } else {
        // no AnalysedText with tokens ...
        // fallback to processing the plain text is still supported
        Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
        if (contentPart == null) {
            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
        }
        try {
            text = ContentItemHelper.getText(contentPart.getValue());
        } catch (IOException e) {
            throw new InvalidContentException(this, ci, e);
        }
        if (text.trim().length() == 0) {
            // TODO: make the length of the data a field of the ContentItem
            // interface to be able to filter out empty items in the canEnhance
            // method
            // two placeholders for two arguments (the former message had a
            // third, unfilled placeholder and a missing space)
            log.warn("ContentPart {} of ContentItem {} does not contain any text " + "to extract knowledge from", contentPart.getKey(), ci.getUri());
            return;
        }
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}", new Object[] { contentPart.getKey(), ci.getUri().getUnicodeString(), StringUtils.abbreviate(text, 100) });
        }
    }
    try {
        if (config.isProcessedLangage(language)) {
            for (String defaultModelType : config.getDefaultModelTypes()) {
                TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
                if (nameFinderModel == null) {
                    log.info("No NER Model for {} and language {} available!", defaultModelType, language);
                } else {
                    findNamedEntities(ci, at, text, language, nameFinderModel);
                }
            }
        }
        // process for additional models
        for (String additionalModel : config.getSpecificNerModles(language)) {
            TokenNameFinderModel nameFinderModel;
            try {
                nameFinderModel = openNLP.getModel(TokenNameFinderModel.class, additionalModel, null);
                findNamedEntities(ci, at, text, language, nameFinderModel);
            } catch (IOException e) {
                // a failing additional model is logged and skipped
                log.warn("Unable to load TokenNameFinderModel model for language '" + language + "' (model: " + additionalModel + ")", e);
            } catch (RuntimeException e) {
                log.warn("Error while creating TokenNameFinderModel for language '" + language + "' (model: " + additionalModel + ")", e);
            }
        }
    } catch (Exception e) {
        // runtime exceptions are propagated unchanged, everything else is
        // wrapped in an EngineException
        if (e instanceof RuntimeException) {
            throw (RuntimeException) e;
        } else {
            throw new EngineException(this, ci, e);
        }
    }
}
use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.
the class CeliLemmatizerEnhancementEngine method computeEnhancements.
/**
 * Adds either a morphological analysis or a lemmatization enhancement for
 * the plain text of the ContentItem, depending on the
 * {@code completeMorphoAnalysis} setting of this engine.
 *
 * @param ci the ContentItem to enhance
 * @throws IllegalStateException if the language of the ContentItem is not
 *         supported or no supported content part is present
 * @throws EngineException if the text of the content part cannot be read
 *         (raised as {@code InvalidContentException})
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final String language = EnhancementEngineHelper.getLanguage(ci);
    if (!isLangSupported(language)) {
        StringBuilder msg = new StringBuilder("Call to computeEnhancement with unsupported language '");
        msg.append(language).append(" for ContentItem ").append(ci.getUri());
        msg.append(": This is also checked in the canEnhance method! -> This indicated an Bug in the implementation of the EnhancementJobManager!");
        throw new IllegalStateException(msg.toString());
    }

    final Entry<IRI, Blob> plainTextPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (plainTextPart == null) {
        StringBuilder msg = new StringBuilder("No ContentPart with Mimetype '");
        msg.append(TEXT_PLAIN_MIMETYPE).append("' found for ContentItem ").append(ci.getUri());
        msg.append(": This is also checked in the canEnhance method! -> This indicated an Bug in the implementation of the EnhancementJobManager!");
        throw new IllegalStateException(msg.toString());
    }

    final String plainText;
    try {
        plainText = ContentItemHelper.getText(plainTextPart.getValue());
    } catch (IOException readFailure) {
        throw new InvalidContentException(this, ci, readFailure);
    }

    // whitespace-only content is treated as "no text"
    if (plainText.trim().isEmpty()) {
        log.info("No text contained in ContentPart {" + plainTextPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }

    final Graph metadata = ci.getMetadata();
    if (!this.completeMorphoAnalysis) {
        this.addLemmatizationEnhancement(ci, plainText, language, metadata);
        return;
    }
    this.addMorphoAnalysisEnhancement(ci, plainText, language, metadata);
}
Aggregations