Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class FstLinkingEngine, method canEnhance.
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    log.trace("canEnhance {}", ci.getUri());
    String language = getLanguage(this, ci, false);
    // (1) check if the language is enabled by the config
    if (language == null || !indexConfig.getFstConfig().isLanguage(language)) {
        log.debug("Engine {} ignores ContentItem {} because language {} is not configured.",
            new Object[] { getName(), ci.getUri(), language });
        return CANNOT_ENHANCE;
    }
    // (2) check if we have an FST model for the language
    // NOTE: as of STANBOL-1448 the index configuration is Solr index version
    // dependent. This means that we cannot use information from the
    // current IndexConfiguration to check if we have an FST model for
    // the language of the requested document. That information might
    // already be outdated.
    // if(indexConfig.getCorpus(language) == null && //for the language
    // indexConfig.getDefaultCorpus() == null){ //a default model
    // log.debug("Engine {} ignores ContentItem {} because no FST models for language {} "
    // + "are available", new Object[] {getName(), ci.getUri(), language});
    // return CANNOT_ENHANCE;
    // }
    // we need a detected language and the AnalysedText content part with
    // Tokens.
    AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    if (at == null) {
        if (linkingMode == LinkingModeEnum.PLAIN) {
            return NlpEngineHelper.getPlainText(this, ci, false) != null ? ENHANCE_ASYNC : CANNOT_ENHANCE;
        } else {
            log.warn("Unable to process {} with engine name={} and mode={} "
                + ": Missing AnalyzedText content part. Please ensure that "
                + "NLP processing results are available before FST linking!",
                new Object[] { ci, name, linkingMode });
            return CANNOT_ENHANCE;
        }
    } else {
        if (linkingMode == LinkingModeEnum.PLAIN) {
            return ENHANCE_ASYNC;
        } else if (at.getTokens().hasNext()) {
            return ENHANCE_ASYNC;
        } else {
            log.warn("Unable to process {} with engine name={} and mode={} "
                + "as the AnalyzedText does not contain any Tokens!",
                new Object[] { ci, name, linkingMode });
            return CANNOT_ENHANCE;
        }
    }
}
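The ENHANCE_ASYNC and CANNOT_ENHANCE values returned above are constants defined on the org.apache.stanbol.enhancer.servicesapi.EnhancementEngine interface, and it is the job manager that dispatches on them. The following is a minimal sketch of such a caller; the dispatch helper and class name are hypothetical and this is not Stanbol's actual EnhancementJobManager implementation.

import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;

public class CanEnhanceDispatchSketch {

    // Hypothetical helper: dispatches on the canEnhance(..) result the way an
    // EnhancementJobManager implementation typically would.
    public static void dispatch(EnhancementEngine engine, ContentItem ci) throws EngineException {
        int state = engine.canEnhance(ci);
        if (state == EnhancementEngine.CANNOT_ENHANCE) {
            return; // engine not applicable (e.g. unsupported language) - skip it
        }
        if (state == EnhancementEngine.ENHANCE_ASYNC) {
            // may be queued on a worker thread; executed inline here for brevity
            engine.computeEnhancements(ci);
        } else { // ENHANCE_SYNCHRONOUS: must run in the calling thread
            engine.computeEnhancements(ci);
        }
    }
}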
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class FstLinkingEngineTest, method setupTest.
/**
 * Initialises the {@link #ci} and {@link #content} fields for the tests.
 * It creates a ContentItem containing a '<code>text/plain</code>'
 * {@link Blob} for the {@value #TEST_TEXT_FILE} and an {@link AnalysedText}
 * filled with the NLP analysis results stored in
 * {@link #TEST_TEXT_NLP_FILE}.
 * @throws IOException on any IO related error while reading the test files
 */
@Before
public void setupTest() throws IOException {
    // create a ContentItem for the plain text used for testing
    InputStream is = FstLinkingEngineTest.class.getClassLoader().getResourceAsStream(TEST_TEXT_FILE);
    Assert.assertNotNull("Unable to load '" + TEST_TEXT_FILE + "' via classpath", is);
    ContentItem ci = cif.createContentItem(new StreamSource(is, "text/plain"));
    AnalysedText at = atf.createAnalysedText(ci, ci.getBlob());
    is.close();
    // parse the prepared NLP results and add them to the ContentItem
    is = FstLinkingEngineTest.class.getClassLoader().getResourceAsStream(TEST_TEXT_NLP_FILE);
    Assert.assertNotNull("Unable to load '" + TEST_TEXT_NLP_FILE + "' via classpath", is);
    AnalyzedTextParser.getDefaultInstance().parse(is, Charset.forName("UTF-8"), at);
    is.close();
    // set the language of the ContentItem
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, EN_LANGUAGE));
    // set the ContentItem and also the content
    this.ci = ci;
    this.content = at.getText().toString();
}
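A fixture like TEST_TEXT_NLP_FILE can be produced by serializing an already analysed AnalysedText. The sketch below assumes the AnalyzedTextSerializer companion of the AnalyzedTextParser used above (from the same stanbol-enhancer-nlp-json module); the file name is made up for illustration.

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;

import org.apache.stanbol.enhancer.nlp.json.AnalyzedTextSerializer;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;

public class NlpFixtureWriterSketch {

    // Writes the NLP analysis results of an AnalysedText as JSON - the inverse
    // of the AnalyzedTextParser.parse(..) call used in setupTest().
    public static void write(AnalysedText at) throws IOException {
        OutputStream out = new FileOutputStream("test-nlp.json"); // hypothetical file name
        try {
            AnalyzedTextSerializer.getDefaultInstance().serialize(at, out, Charset.forName("UTF-8"));
        } finally {
            out.close();
        }
    }
}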
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class SmartcnTokenizerEngine, method computeEnhancements.
/**
 * Computes the enhancements for the supplied ContentItem. The results of the
 * process are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager})
 * should take care of persistent storage of the enhanced
 * {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using
 * {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart}
 * from a text/plain part and stores it as a new part in the content item.
 * The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! "
            + "As this is also checked within the #canEnhance(..) method this "
            + "indicates a bug in the used EnhancementJobManager implementation. "
            + "Please report this on dev@stanbol.apache.org or create a "
            + "JIRA issue about this.");
    }
    if (!at.getSentences().hasNext()) {
        // no sentences ... use this engine to detect
        // the sentences first
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            while (sentences.incrementToken()) {
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
                }
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from "
                + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
            log.error(message, e);
            throw new EngineException(this, ci, message, e);
        }
    }
    // now the tokens
    TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
    try {
        tokens.reset();
        while (tokens.incrementToken()) {
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            Token t = at.addToken(offset.startOffset(), offset.endOffset());
            log.trace("detected {}", t);
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from "
            + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
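After the engine has run, the detected Sentences and Tokens are available from the AnalysedText content part. A minimal sketch of reading them back follows; the class and method names are illustrative, not part of Stanbol.

import java.util.Iterator;

import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;

public class TokenizationResultSketch {

    // Reads back the Sentences and Tokens the engine registered on the
    // AnalysedText content part; getSpan() returns the covered text.
    public static void print(AnalysedText at) {
        for (Iterator<Sentence> sentences = at.getSentences(); sentences.hasNext(); ) {
            System.out.println("Sentence: " + sentences.next().getSpan());
        }
        for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
            System.out.println("Token: " + tokens.next().getSpan());
        }
    }
}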
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class SentimentSummarizationEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    String language = NlpEngineHelper.getLanguage(this, ci, true);
    AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
    // configure the spanTypes based on the configuration
    // EnumSet<Span.SpanTypeEnum> spanTypes = EnumSet.noneOf(SpanTypeEnum.class);
    // if(writeSentimentPhrases){
    //     spanTypes.add(SpanTypeEnum.Chunk);
    // }
    // if(writeSentencesSentimet){
    //     spanTypes.add(SpanTypeEnum.Sentence);
    // }
    // if(writeTextSectionSentiments){
    //     spanTypes.add(SpanTypeEnum.TextSection);
    // }
    // if(writeTextSentiments){
    //     spanTypes.add(SpanTypeEnum.Text);
    // }
    List<SentimentPhrase> sentiments = extractSentiments(at, language);
    String detectedLang = EnhancementEngineHelper.getLanguage(ci);
    ci.getLock().writeLock().lock();
    try {
        writeSentimentEnhancements(ci, sentiments, at, detectedLang == null ? null : new Language(detectedLang));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
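writeSentimentEnhancements(..) runs under the ContentItem's write lock; any code that only inspects the metadata graph should hold the matching read lock. A minimal sketch of that read side, assuming the Clerezza MGraph API used by this generation of Stanbol (the class and method names are illustrative):

import org.apache.clerezza.rdf.core.MGraph;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;

public class MetadataReaderSketch {

    // Readers should hold the read lock while accessing the metadata graph,
    // mirroring the writeLock()/unlock() discipline used above.
    public static int countEnhancementTriples(ContentItem ci) {
        ci.getLock().readLock().lock();
        try {
            MGraph metadata = ci.getMetadata();
            return metadata.size();
        } finally {
            ci.getLock().readLock().unlock();
        }
    }
}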
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe.
@Test
public void testEngineDe() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
    // add some Tokens with POS annotations to test the usage of
    // existing POS annotations by the lemmatizer
    Token verbrachten = at.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbrachten.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token schonen = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    schonen.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token urlaub = at.addToken(de_nounStart, de_nounStart + de_noun.length());
    urlaub.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Cannot enhance test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        // deactivate the test if the remote service is unavailable
        return;
    }
    // now validate the enhancements
    boolean foundVerb = false;
    boolean foundAdjective = false;
    boolean foundNoun = false;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            foundVerb = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            foundAdjective = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            foundNoun = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> mf : mfs) {
            log.info(" - {}", mf);
            Assert.assertNotNull(mf.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
}
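The validateMorphFeatureProbability(..) helper called above is not part of this excerpt. A plausible reconstruction follows, checking only the lemma and the preserved probability; the real helper in CeliAnalyzedTextLemmatizerEngineTest may differ, for example by also verifying the LexicalCategory against the MorphoFeatures' POS tags.

import java.util.List;

import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.junit.Assert;

public class MorphoValidationSketch {

    // Plausible reconstruction of the helper used in the test above; the real
    // implementation is not shown in this excerpt and may differ in detail.
    static void validateMorphFeatureProbability(List<Value<MorphoFeatures>> mfs,
            LexicalCategory category, double expectedProb) {
        for (Value<MorphoFeatures> mf : mfs) {
            Assert.assertNotNull(mf.value().getLemma());
            // the probability of the parsed POS annotation is expected to be preserved
            Assert.assertEquals(expectedProb, mf.probability(), 0.001);
            // a check of 'category' against the MorphoFeatures' POS tags would go
            // here; omitted because the exact accessor is not shown in this excerpt
        }
    }
}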