use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
the class SmartcnTokenizerEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates a bug in the used EnhancementJobManager implementation. "
                + "Please report this on dev@stanbol.apache.org or create a "
                + "JIRA issue about this.");
    }
    if (!at.getSentences().hasNext()) {
        // no sentences present ... use the SentenceTokenizer of this engine
        // to detect the sentences first
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            // the Lucene TokenStream contract requires reset() before incrementToken()
            sentences.reset();
            while (sentences.incrementToken()) {
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
                }
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from "
                    + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
            log.error(message, e);
            throw new EngineException(this, ci, message, e);
        }
    }
    // now detect the tokens
    TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
    try {
        tokens.reset();
        while (tokens.incrementToken()) {
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            Token t = at.addToken(offset.startOffset(), offset.endOffset());
            log.trace("detected {}", t);
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from "
                + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
the class AnalyzedTextSerializerAndParserTest method setup.
@BeforeClass
public static final void setup() throws IOException {
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();
    int sentence = text.indexOf('.') + 1;
    Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
    expectedSentences.put(sent1, "The Stanbol enhancer can detect famous "
            + "cities such as Paris and people such as Bob Marley.");
    Token the = sent1.addToken(0, 3);
    expectedTokens.put(the, "The");
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
    Token stanbol = sent1.addToken(4, 11);
    expectedTokens.put(stanbol, "Stanbol");
    stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
    // use the index to create Tokens
    int enhancerStart = sent1.getSpan().indexOf("enhancer");
    Token enhancer = sent1.addToken(enhancerStart, enhancerStart + "enhancer".length());
    expectedTokens.put(enhancer, "enhancer");
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
    MorphoFeatures morpho = new MorphoFeatures("enhance");
    morpho.addCase(new CaseTag("test-case-1", Case.Comitative));
    morpho.addCase(new CaseTag("test-case-2", Case.Abessive));
    morpho.addDefinitness(Definitness.Definite);
    morpho.addPerson(Person.First);
    morpho.addPos(new PosTag("PN", Pos.ProperNoun));
    morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
    morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    morpho.addTense(new TenseTag("test-tense", Tense.Present));
    morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
    // create a chunk
    Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
    expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
    stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
    stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
the class OpenNlpPosTaggingEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    POSTagger posTagger = getPOSTagger(language);
    if (posTagger == null) {
        // the POS tagger became unavailable in-between the call to
        // canEnhance(..) and computeEnhancements(..)
        throw new EngineException("POS tagger for language '" + language + "' is not available.");
    }
    TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No POS TagSet registered for language '{}'. Will build an "
                + "ad-hoc set based on encountered tags!", language);
        // for now only created to avoid checks for tagSet == null
        // TODO: in future we might want to automatically create posModels based
        // on tagged texts. However this makes no sense as long as we can not
        // persist TagSets.
        tagSet = new TagSet<PosTag>("dummy", language);
    }
    // holds PosTags created for POS tags that were not part of the posModel
    // (will hold all PosTags in case tagSet is NULL)
    Map<String, PosTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PosTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    // (1) Sentence detection
    // Try to read existing Sentence annotations
    Iterator<Sentence> sentences = at.getSentences();
    List<Section> sentenceList;
    if (!sentences.hasNext()) {
        // if none, try to detect sentences (detectSentences(..) adds them to
        // the AnalysedText, so the re-check below can pick them up)
        log.trace(" > detect sentences for {}", at);
        sentenceList = detectSentences(at, language);
    }
    if (sentences.hasNext()) {
        // check if we have detected sentences
        log.trace(" > use existing Sentence annotations for {}", at);
        sentenceList = new ArrayList<Section>();
        AnalysedTextUtils.appandToList(sentences, sentenceList);
    } else {
        // no sentence detected ... treat the whole text as a single sentence
        // TODO: maybe apply here a limit to the text size!
        log.trace(" > unable to detect Sentences for {} (language: {})", at, language);
        sentenceList = Collections.singletonList((Section) at);
    }
    // for all sentences (or the whole text - if no sentences are available)
    for (Section sentence : sentenceList) {
        // (2) Tokenize sentences
        List<Token> tokenList;
        // check if there are already tokens
        Iterator<Token> tokens = sentence.getTokens();
        if (!tokens.hasNext()) {
            // no tokens present -> tokenize
            log.trace(" > tokenize {}", sentence);
            tokenList = tokenize(sentence, language);
        } else {
            // use the existing tokens
            log.trace(" > use existing Tokens for {}", sentence);
            // ensure an ArrayList is used
            tokenList = new ArrayList<Token>();
            AnalysedTextUtils.appandToList(tokens, tokenList);
        }
        // (3) POS tagging
        posTag(tokenList, posTagger, tagSet, adhocTags, language);
    }
    if (log.isTraceEnabled()) {
        logAnnotations(at);
    }
}
use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
the class OpenNlpSentenceDetectionEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    SentenceDetector sentenceDetector = getSentenceDetector(language);
    if (sentenceDetector != null) {
        for (opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
            // detect sentences and add them to the AnalysedText
            Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
            log.trace(" > add {}", sentence);
        }
    } else {
        log.warn("SentenceDetector model for language {} is no longer available. "
                + "This might happen if the model becomes unavailable during enhancement. "
                + "If this happens more often it might also indicate a bug in the used "
                + "EnhancementJobManager implementation as the availability is also checked "
                + "in the canEnhance(..) method of this Enhancement Engine.", language);
    }
}
use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
the class EntityCoReferenceEngineTest method testSpatialCoref.
@Test
public void testSpatialCoref() throws EngineException, IOException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int politicianStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(politicianStartIdx, politicianStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanPolitician = sentence2.addChunk(theStartIdx, politicianStartIdx + "politician".length());
    theGermanPolitician.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanPolitician));
    Value<CorefFeature> subordinateCorefValue = theGermanPolitician.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}