Use of opennlp.tools.postag.POSTagger in the Apache Stanbol project.
Class OpenNlpPosTaggingEngine, method computeEnhancements.
/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This implementation works on the {@code AnalysedText} obtained for the
 * content item and proceeds in three steps:
 * <ol>
 * <li>Sentences: sentence annotations already present on the text are
 * reused. If there are none, sentence detection is run. If detection yields
 * nothing, the whole text is processed as a single section.</li>
 * <li>Tokens: for every section, existing tokens are reused; otherwise the
 * section is tokenized.</li>
 * <li>POS tagging: the tokens of every section are passed to
 * {@code posTag(..)} together with the tagger, the tag set and the
 * per-language map of ad-hoc tags.</li>
 * </ol>
 *
 * @param ci the content item to process
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 * if no POS tagger is available for the language of the content item
 * or if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    POSTagger posTagger = getPOSTagger(language);
    if (posTagger == null) {
        //NOTE(review): presumably canEnhance(..) already rejects languages
        //without a POS tagger, so reaching this point would mean the tagger
        //became unavailable between the two calls - confirm against canEnhance
        throw new EngineException("PosTagger for language '" + language + "' is not available.");
    }
    TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No POS TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long we can not
        //persist TagSets.
        tagSet = new TagSet<PosTag>("dummy", language);
    }
    //holds PosTags created for POS tags that were not part of the posModel
    //(will hold all PosTags in case no TagSet is registered for the language)
    Map<String, PosTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PosTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    //(1) Sentence detection
    //Try to read existing Sentence Annotations
    Iterator<Sentence> sentences = at.getSentences();
    List<Section> sentenceList;
    if (sentences.hasNext()) {
        //sentences are already annotated -> copy them to a list
        log.trace(" > use existing Sentence annotations for {}", at);
        sentenceList = new ArrayList<Section>();
        AnalysedTextUtils.appandToList(sentences, sentenceList);
    } else {
        //if none try to detect sentences and use the detected ones directly,
        //instead of relying on the iterator obtained before the detection
        //to report sentences that were added afterwards
        log.trace(" > detect sentences for {}", at);
        sentenceList = detectSentences(at, language);
        if (sentenceList == null || sentenceList.isEmpty()) {
            //no sentence detected ... treat the whole text as a single sentence
            //TODO: maybe apply here a limit to the text size!
            log.trace(" > unable to detect Sentences for {} (langauge: {})", at, language);
            sentenceList = Collections.singletonList((Section) at);
        }
    }
    //for all sentences (or the whole Text - if no sentences available)
    for (Section sentence : sentenceList) {
        //(2) Tokenize Sentences
        List<Token> tokenList;
        //check if there are already tokens
        Iterator<Token> tokens = sentence.getTokens();
        if (!tokens.hasNext()) {
            //no tokens present -> tokenize
            log.trace(" > tokenize {}", sentence);
            tokenList = tokenize(sentence, language);
        } else {
            //use existing
            log.trace(" > use existing Tokens for {}", sentence);
            //ensure an ArrayList is used
            tokenList = new ArrayList<Token>();
            AnalysedTextUtils.appandToList(tokens, tokenList);
        }
        //(3) POS Tagging
        posTag(tokenList, posTagger, tagSet, adhocTags, language);
    }
    if (log.isTraceEnabled()) {
        logAnnotations(at);
    }
}
Use of opennlp.tools.postag.POSTagger in the Apache Stanbol project.
Class OpenNLPTest, method testLoadEnPOS.
/**
 * Checks that {@code openNLP} returns a non-null part-of-speech model and a
 * non-null part-of-speech tagger when asked for the language "en".
 *
 * @throws IOException declared by the lookup calls on {@code openNLP}
 */
@Test
public void testLoadEnPOS() throws IOException {
    final String lang = "en";
    // the model is looked up (and asserted) before the tagger is requested
    final POSModel englishModel = openNLP.getPartOfSpeechModel(lang);
    Assert.assertNotNull(englishModel);
    final POSTagger englishTagger = openNLP.getPartOfSpeechTagger(lang);
    Assert.assertNotNull(englishTagger);
}
Use of opennlp.tools.postag.POSTagger in the Apache Stanbol project.
Class OpenNLPTest, method testLoadMissingPOS.
/**
 * Checks that {@code openNLP} returns {@code null} for both the
 * part-of-speech model and the part-of-speech tagger when asked for the
 * language "ru".
 *
 * @throws IOException declared by the lookup calls on {@code openNLP}
 */
@Test
public void testLoadMissingPOS() throws IOException {
    final String lang = "ru";
    // the model is looked up (and asserted) before the tagger is requested
    final POSModel missingModel = openNLP.getPartOfSpeechModel(lang);
    Assert.assertNull(missingModel);
    final POSTagger missingTagger = openNLP.getPartOfSpeechTagger(lang);
    Assert.assertNull(missingTagger);
}
Aggregations