Use of com.yahoo.language.process.Tokenizer in project vespa by vespa-engine.
Class LinguisticsAnnotator, method annotate.
/**
 * Tokenizes the given string field value and attaches the resulting annotations
 * to it as a {@code SpanTrees.LINGUISTICS} span tree.
 *
 * <p>A value that already carries a LINGUISTICS span tree is left untouched.
 * Only the first {@code config.getMaxTokenizeLength()} characters are handed to
 * the tokenizer, while the complete string is what is passed on to
 * {@code addAnnotationSpan} for every token.
 *
 * @param text the text to annotate
 * @return {@code true} if the value already had a LINGUISTICS span tree, or if
 *         this call attached one containing at least one annotation;
 *         {@code false} if no annotations were produced (the value is then
 *         left without a span tree)
 */
public boolean annotate(StringFieldValue text) {
    boolean alreadyAnnotated = text.getSpanTree(SpanTrees.LINGUISTICS) != null;
    if (alreadyAnnotated) {
        return true;
    }

    Tokenizer tokenizer = factory.getTokenizer();

    // Cap the tokenizer input at the configured maximum length.
    String fullText = text.getString();
    int maxLength = config.getMaxTokenizeLength();
    String input = fullText;
    if (fullText.length() > maxLength) {
        input = fullText.substring(0, maxLength);
    }

    Iterable<Token> tokens = tokenizer.tokenize(input,
                                                config.getLanguage(),
                                                config.getStemMode(),
                                                config.getRemoveAccents());
    TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
    SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);

    // Spans are added against the untruncated string, not the capped tokenizer input.
    tokens.forEach(token ->
            addAnnotationSpan(fullText, tree.spanList(), tokenizer, token, config.getStemMode(), termOccurrences));

    // Only attach the tree when it actually holds annotations.
    boolean annotated = tree.numAnnotations() != 0;
    if (annotated) {
        text.setSpanTree(tree);
    }
    return annotated;
}