Use of org.apache.stanbol.enhancer.nlp.model.Token in the project stanbol by apache.
The class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe.
@Test
public void testEngineDe() throws IOException, EngineException {
    // Set up a ContentItem for the German sample text and attach an AnalysedText
    ContentItem contentItem = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(contentItem);
    AnalysedText analysedText = atFactory.createAnalysedText(contentItem, contentItem.getBlob());
    Assert.assertNotNull(analysedText);
    contentItem.getMetadata().add(new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(contentItem));
    // Pre-annotate a verb, an adjective and a noun with POS tags so the
    // lemmatizer can reuse the already present POS information
    Token verbToken = analysedText.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token adjectiveToken = analysedText.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    adjectiveToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token nounToken = analysedText.addToken(de_nounStart, de_nounStart + de_noun.length());
    nounToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    // Run the engine; deactivate the test if the remote CELI service is unavailable
    try {
        engine.computeEnhancements(contentItem);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    // Validate that MorphoFeatures (with lemmata) were written for all three tokens
    boolean verbSeen = false;
    boolean adjectiveSeen = false;
    boolean nounSeen = false;
    Iterator<Token> tokenIt = analysedText.getTokens();
    while (tokenIt.hasNext()) {
        Token current = tokenIt.next();
        log.info("Token: {}", current);
        List<Value<MorphoFeatures>> morphoAnnotations = current.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        String span = current.getSpan();
        if (de_verb.equals(span)) {
            verbSeen = !morphoAnnotations.isEmpty();
            validateMorphFeatureProbability(morphoAnnotations, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(span)) {
            adjectiveSeen = !morphoAnnotations.isEmpty();
            validateMorphFeatureProbability(morphoAnnotations, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(span)) {
            nounSeen = !morphoAnnotations.isEmpty();
            validateMorphFeatureProbability(morphoAnnotations, LexicalCategory.Noun, de_nounProb);
        }
        // every MorphoFeatures annotation must carry a lemma
        for (Value<MorphoFeatures> morpho : morphoAnnotations) {
            log.info(" - {}", morpho);
            Assert.assertNotNull(morpho.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", verbSeen);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", adjectiveSeen);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", nounSeen);
}
Use of org.apache.stanbol.enhancer.nlp.model.Token in the project stanbol by apache.
The class CeliAnalyzedTextLemmatizerEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    //call the remote CELI morphological analysis service for the whole text
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            //ignore terms without readings
            continue;
        }
        //Add the LexicalEntry as Token to the Text. NOTE that if a
        //Token with the same start/end positions already exist this
        //Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        //Collect the LexicalCategories of existing POS annotations of THIS
        //token. NOTE: this map must be scoped per token - creating it once
        //for the whole text (as the previous version did) leaked categories
        //and probabilities of earlier tokens into later ones.
        Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        //do not override with lower prob (annotations are
                        //assumed to be sorted by descending probability -
                        //TODO confirm against getAnnotations contract)
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            //add the readings (MorphoFeatures)
            if (mf != null) {
                //use the POS tags of the morpho analysis and compare them
                //with the existing POS tags of the token
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                //use the highest probability of a matching category
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                //add the morpho features with the posProbability
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Use of org.apache.stanbol.enhancer.nlp.model.Token in the project stanbol by apache.
The class OpenNlpChunkingEngine, method computeEnhancements.
/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfiguration, language, true);
    ChunkerME chunker = initChunker(language);
    if (chunker == null) {
        //no chunker model available for this language -> nothing to do
        return;
    }
    //init the Phrase TagSet (removed a dead, empty "if (tagSet == null) {}"
    //block that preceded this check)
    TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long we can not
        //persist TagSets.
        tagSet = new TagSet<PhraseTag>("dummy", language);
    }
    //holds PhraseTags created for chunk tags that where not part of the tagSet
    //(will hold all PhraseTags in case the adhoc "dummy" tagSet is used)
    Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PhraseTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    ci.getLock().writeLock().lock();
    try {
        Iterator<? extends Section> sentences = at.getSentences();
        if (!sentences.hasNext()) {
            //no sentences ... iterate over the whole text
            sentences = Collections.singleton(at).iterator();
        }
        List<String> tokenTextList = new ArrayList<String>(64);
        List<String> posList = new ArrayList<String>(64);
        List<Token> tokenList = new ArrayList<Token>(64);
        //process each sentence seperatly
        while (sentences.hasNext()) {
            // (1) get Tokens and POS information for the sentence
            Section sentence = sentences.next();
            Iterator<Token> tokens = sentence.getTokens();
            while (tokens.hasNext()) {
                Token token = tokens.next();
                tokenList.add(token);
                tokenTextList.add(token.getSpan());
                Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                if (posValue == null) {
                    //the chunker requires a POS tag for every token
                    throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
                } else {
                    posList.add(posValue.value().getTag());
                }
            }
            String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
            //size the array from posList (the list it is built from); the
            //previous code used tokenTextList.size() here
            String[] tokenPos = posList.toArray(new String[posList.size()]);
            if (log.isTraceEnabled()) {
                //use the SLF4J parameter instead of string concatenation so
                //the '{}' placeholder is actually substituted
                log.trace("Tokens: {}", Arrays.toString(tokenStrings));
            }
            //free memory
            tokenTextList.clear();
            //free memory
            posList.clear();
            // (2) Chunk the sentence
            String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
            double[] chunkProb = chunker.probs();
            if (log.isTraceEnabled()) {
                log.trace("Chunks: {}", Arrays.toString(chunkTags));
            }
            //free memory
            tokenStrings = null;
            //free memory
            tokenPos = null;
            // (3) Process the results and write the Annotations
            double chunkProbSum = 0;
            int chunkTokenCount = 0;
            PhraseTag tag = null;
            int i;
            /*
             * This assumes:
             *  - 'B-{tag}' ... for start of a new chunk
             *  - '???'     ... anything other for continuing the current chunk
             *  - 'O'       ... no chunk (ends current chunk)
             */
            for (i = 0; i < tokenList.size(); i++) {
                boolean start = chunkTags[i].charAt(0) == 'B';
                boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                if (end) {
                    //add the current phrase
                    //add at AnalysedText level, because offsets are absolute
                    //NOTE we are already at the next token when we detect the end
                    Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                    //annotate with the average probability of the chunk's tokens
                    chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProbSum / (double) chunkTokenCount));
                    //reset the state
                    tag = null;
                    chunkTokenCount = 0;
                    chunkProbSum = 0;
                }
                if (start) {
                    //create the new tag
                    tag = getPhraseTag(tagSet, adhocTags, chunkTags[i].substring(2), //skip 'B-'
                    language);
                }
                if (tag != null) {
                    //count this token for the current chunk
                    chunkProbSum = chunkProbSum + chunkProb[i];
                    chunkTokenCount++;
                }
            }
            //write a chunk that is still open at the end of the sentence
            if (tag != null) {
                Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProbSum / (double) chunkTokenCount));
            }
            // (4) clean up
            tokenList.clear();
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
    if (log.isTraceEnabled()) {
        logChunks(at);
    }
}
Use of org.apache.stanbol.enhancer.nlp.model.Token in the project stanbol by apache.
The class NEREngineCore, method extractNameOccurrences.
/**
 * This method extracts NamedEntity occurrences by using existing {@link Token}s and
 * {@link Sentence}s in the parsed {@link AnalysedText}.
 * @param nameFinderModel the model used to find NamedEntities
 * @param at the Analysed Text
 * @param language the language of the text
 * @return the found named Entity Occurrences (name -> occurrences, in
 * insertion order)
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    //collect the Sentences of the AnalysedText
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) {
        //no sentence annotations
        //process as a single section
        sentences.add(at);
    }
    for (int i = 0; i < sentences.size(); i++) {
        // build a context by concatenating the previous, current and next
        // sentence to be used for similarity ranking / disambiguation +
        // contextual snippet in the extraction structure
        //(removed an unused local List<String> that only collected the
        //current sentence's text)
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(), sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
        // get the tokens, words of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext(); ) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        //run the NameFinder on the word list of the sentence
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            //map the token based Span back to absolute character offsets
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            //confidence is the product of the probabilities of the name's tokens
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            //create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            //add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            //TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    //clear the adaptive data collected by the finder during this document
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Use of org.apache.stanbol.enhancer.nlp.model.Token in the project stanbol by apache.
The class OpenNlpPosTaggingEngine, method computeEnhancements.
/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    POSTagger posTagger = getPOSTagger(language);
    if (posTagger == null) {
        //this should not happen, as canEnhance is expected to reject
        //unsupported languages before computeEnhancements is called
        throw new EngineException("PosTagger for langauge '" + language + "is not available.");
    }
    TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No POS TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long we can not
        //persist TagSets.
        tagSet = new TagSet<PosTag>("dummy", language);
    }
    //holds PosTags created for POS tags that where not part of the posModel
    //(will hold all PosTags in case the adhoc "dummy" tagSet is used)
    Map<String, PosTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PosTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    //(1) Sentence detection
    //Try to read existing Sentence Annotations
    Iterator<Sentence> sentences = at.getSentences();
    List<Section> sentenceList;
    if (sentences.hasNext()) {
        //use the existing Sentence annotations
        log.trace(" > use existing Sentence annotations for {}", at);
        //ensure an ArrayList is used
        sentenceList = new ArrayList<Section>();
        AnalysedTextUtils.appandToList(sentences, sentenceList);
    } else {
        //no Sentence annotations present ... try to detect sentences.
        //NOTE: the previous version re-checked the (already consumed)
        //Sentence iterator after calling detectSentences(..) and could
        //therefore discard the detection result depending on whether the
        //iterator reflects concurrent additions; this version uses the
        //returned list directly.
        log.trace(" > detect sentences for {}", at);
        sentenceList = detectSentences(at, language);
        if (sentenceList == null || sentenceList.isEmpty()) {
            //no sentence detected ... treat the whole text as a single sentence
            //TODO: maybe apply here a limit to the text size!
            log.trace(" > unable to detect Sentences for {} (langauge: {})", at, language);
            sentenceList = Collections.singletonList((Section) at);
        }
    }
    //for all sentences (or the whole Text - if no sentences available)
    for (Section sentence : sentenceList) {
        //(2) Tokenize Sentences
        List<Token> tokenList;
        //check if there are already tokens
        Iterator<Token> tokens = sentence.getTokens();
        if (!tokens.hasNext()) {
            //no tokens present -> tokenize
            log.trace(" > tokenize {}", sentence);
            tokenList = tokenize(sentence, language);
        } else {
            //use existing
            log.trace(" > use existing Tokens for {}", sentence);
            //ensure an ArrayList is used
            tokenList = new ArrayList<Token>();
            AnalysedTextUtils.appandToList(tokens, tokenList);
        }
        //(3) POS Tagging
        posTag(tokenList, posTagger, tagSet, adhocTags, language);
    }
    if (log.isTraceEnabled()) {
        logAnnotations(at);
    }
}
Aggregations