Search in sources :

Example 1 with Tokenizer

Use of com.yahoo.language.process.Tokenizer in project vespa by vespa-engine.

The annotate method of the class LinguisticsAnnotator.

/**
 * Annotates the given string with the appropriate linguistics annotations.
 *
 * <p>If the text already carries a {@code SpanTrees.LINGUISTICS} span tree it is left untouched.
 * Otherwise at most {@code config.getMaxTokenizeLength()} leading chars of the string are
 * tokenized, one annotation span is requested per token, and the resulting tree is attached to
 * the text if it contains at least one annotation.
 *
 * @param text the text to annotate
 * @return {@code true} if the text has a linguistics span tree when this method returns (it was
 *         either already present or has just been attached), {@code false} if tokenizing
 *         produced no annotations and the text was left unchanged
 */
public boolean annotate(StringFieldValue text) {
    // Already annotated with LINGUISTICS.
    if (text.getSpanTree(SpanTrees.LINGUISTICS) != null)
        return true;

    Tokenizer tokenizer = factory.getTokenizer();
    String content = text.getString();
    int maxLength = config.getMaxTokenizeLength();

    String input = content;
    if (content.length() > maxLength) {
        int end = maxLength;
        // Never cut between the two halves of a surrogate pair: that would hand the tokenizer
        // a dangling high surrogate, which is not valid text. Back off by one char instead.
        if (end > 0
                && Character.isHighSurrogate(content.charAt(end - 1))
                && Character.isLowSurrogate(content.charAt(end))) {
            end--;
        }
        input = content.substring(0, end);
    }

    Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), config.getRemoveAccents());
    TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
    SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
    for (Token token : tokens) {
        // The full, untruncated string is passed on, exactly as before; only the tokenizer
        // input is length-limited.
        addAnnotationSpan(content, tree.spanList(), tokenizer, token, config.getStemMode(), termOccurrences);
    }

    // Do not attach an empty tree.
    if (tree.numAnnotations() == 0)
        return false;
    text.setSpanTree(tree);
    return true;
}
Also used : Token(com.yahoo.language.process.Token) Tokenizer(com.yahoo.language.process.Tokenizer) SpanTree(com.yahoo.document.annotation.SpanTree)

Aggregations

SpanTree (com.yahoo.document.annotation.SpanTree)1 Token (com.yahoo.language.process.Token)1 Tokenizer (com.yahoo.language.process.Tokenizer)1