Search in sources :

Example 1 with Token

use of com.yahoo.language.process.Token in project vespa by vespa-engine.

the class LinguisticsAnnotator method annotate.

/**
 * Attaches a linguistics span tree to the given string field value.
 *
 * <p>The text is tokenized (after being cut to the configured maximum tokenize length) and every
 * token is passed to {@code addAnnotationSpan}. The resulting tree is set on the field only when it
 * holds at least one annotation.
 *
 * @param text the text to annotate
 * @return {@code true} if the text already carried a linguistics span tree or a new tree was
 *         attached, {@code false} if no annotations were produced
 */
public boolean annotate(StringFieldValue text) {
    if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) {
        // A linguistics tree is already present; leave it untouched.
        return true;
    }

    Tokenizer tokenizer = factory.getTokenizer();

    // Only the leading part of an over-long text is handed to the tokenizer.
    String toTokenize = text.getString();
    if (toTokenize.length() > config.getMaxTokenizeLength()) {
        toTokenize = toTokenize.substring(0, config.getMaxTokenizeLength());
    }

    Iterable<Token> tokenStream = tokenizer.tokenize(
            toTokenize,
            config.getLanguage(),
            config.getStemMode(),
            config.getRemoveAccents());
    TermOccurrences occurrences = new TermOccurrences(config.getMaxTermOccurrences());
    SpanTree linguisticsTree = new SpanTree(SpanTrees.LINGUISTICS);

    // Note: the full (untruncated) string is what gets passed along with each token.
    for (Token current : tokenStream) {
        addAnnotationSpan(
                text.getString(),
                linguisticsTree.spanList(),
                tokenizer,
                current,
                config.getStemMode(),
                occurrences);
    }

    boolean annotated = linguisticsTree.numAnnotations() != 0;
    if (annotated) {
        text.setSpanTree(linguisticsTree);
    }
    return annotated;
}
Also used : Token(com.yahoo.language.process.Token) Tokenizer(com.yahoo.language.process.Tokenizer) SpanTree(com.yahoo.document.annotation.SpanTree)

Example 2 with Token

use of com.yahoo.language.process.Token in project vespa by vespa-engine.

the class SimpleToken method toString.

/**
 * Renders a token, including all of its components recursively, as an indented multi-line string.
 *
 * @param token  the token to render
 * @param indent the prefix written at the start of every line produced for this token
 * @return the textual representation; every line ends with a newline
 */
private static String toString(Token token, String indent) {
    StringBuilder out = new StringBuilder();

    // Components first, each rendered recursively at a deeper indent.
    out.append(indent);
    out.append("components : {\n");
    int componentCount = token.getNumComponents();
    for (int index = 0; index < componentCount; ++index) {
        Token component = token.getComponent(index);
        out.append(indent);
        out.append("    [");
        out.append(index);
        out.append("] : ");
        out.append(component.getClass().getSimpleName());
        out.append(" {\n");
        out.append(toString(component, indent + "        "));
        out.append(indent);
        out.append("    }\n");
    }
    out.append(indent);
    out.append("}\n");

    // Then the token's own properties, one per line.
    out.append(indent).append("offset : ");
    out.append(token.getOffset()).append("\n");

    out.append(indent).append("orig : ");
    out.append(quoteString(token.getOrig())).append("\n");

    out.append(indent).append("script : ");
    out.append(token.getScript()).append("\n");

    out.append(indent).append("special : ");
    out.append(token.isSpecialToken()).append("\n");

    out.append(indent).append("token string : ");
    out.append(quoteString(token.getTokenString())).append("\n");

    out.append(indent).append("type : ");
    out.append(token.getType()).append("\n");

    return out.toString();
}
Also used : Token(com.yahoo.language.process.Token)

Example 3 with Token

use of com.yahoo.language.process.Token in project vespa by vespa-engine.

the class QueryTestCase method testSimpleFunctionality.

/**
 * Checks basic query parsing from an HTTP request string, then replaces the query tree root
 * with an AND of the indexable tokens of a small sample text and prints the resulting query.
 */
@Test
public void testSimpleFunctionality() {
    Query q = new Query(QueryTestCase.httpEncode("/sdfsd.html?query=this is a simple query&aParameter"));

    // Model and properties parsed from the request.
    assertEquals("this is a simple query", q.getModel().getQueryString());
    assertNotNull(q.getModel().getQueryTree());
    assertNull(q.getModel().getDefaultIndex());
    assertEquals("", q.properties().get("aParameter"));
    assertNull(q.properties().get("notSetParameter"));

    // Build an AND item holding one "body" word per indexable token.
    String body = "a bb. ccc??!";
    Linguistics linguistics = new SimpleLinguistics();
    AndItem root = new AndItem();
    for (Token t : linguistics.getTokenizer().tokenize(body, Language.ENGLISH, StemMode.SHORTEST, true)) {
        if (!t.isIndexable()) {
            continue;
        }
        root.addItem(new WordItem(t.getTokenString(), "body"));
    }

    q.getModel().getQueryTree().setRoot(root);
    System.out.println(q);
}
Also used : SimpleLinguistics(com.yahoo.language.simple.SimpleLinguistics) Query(com.yahoo.search.Query) AndItem(com.yahoo.prelude.query.AndItem) Linguistics(com.yahoo.language.Linguistics) Token(com.yahoo.language.process.Token) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) WordItem(com.yahoo.prelude.query.WordItem) Test(org.junit.Test)

Example 4 with Token

use of com.yahoo.language.process.Token in project vespa by vespa-engine.

the class LinguisticsAnnotatorTestCase method requireThatMaxTermOccurencesIsHonored.

/**
 * Feeds twice the default maximum number of occurrences of one term, for every indexable token
 * type, and expects annotations only for the first {@code DEFAULT_MAX_TERM_OCCURRENCES} of them.
 */
@Test
public void requireThatMaxTermOccurencesIsHonored() {
    final String inputTerm = "foo";
    // Completely different from inputTerm for a safer test.
    final String stemmedInputTerm = "bar";
    final String paddedInputTerm = inputTerm + " ";
    final int stride = paddedInputTerm.length();
    final int totalOccurrences = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;

    // Expected tree: one TERM annotation per occurrence, up to the default maximum.
    final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
    for (int n = 0; n < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++n) {
        expected.spanList()
                .span(n * stride, inputTerm.length())
                .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm)));
    }

    for (TokenType type : TokenType.values()) {
        if (type.isIndexable()) {
            StringBuilder text = new StringBuilder();
            Token[] tokens = new Token[totalOccurrences];
            for (int n = 0; n < totalOccurrences; ++n) {
                SimpleToken token = newToken(inputTerm, stemmedInputTerm, type);
                token.setOffset(n * stride);
                tokens[n] = token;
                text.append(paddedInputTerm);
            }
            assertAnnotations(expected, text.toString(), tokens);
        }
    }
}
Also used : TokenType(com.yahoo.language.process.TokenType) StringFieldValue(com.yahoo.document.datatypes.StringFieldValue) SimpleToken(com.yahoo.language.simple.SimpleToken) Token(com.yahoo.language.process.Token) Annotation(com.yahoo.document.annotation.Annotation) SpanTree(com.yahoo.document.annotation.SpanTree) Test(org.junit.Test)

Example 5 with Token

use of com.yahoo.language.process.Token in project vespa by vespa-engine.

the class TokenizerTester method assertTokens.

/**
 * Tokenizes the input and asserts that the token strings collected from the resulting tokens
 * equal the expected ones, in order.
 *
 * @param input                the text to tokenize
 * @param expectedTokenStrings the token strings expected to be found, in order
 */
public void assertTokens(String input, String... expectedTokenStrings) {
    List<String> found = new ArrayList<>();
    for (Token t : tokenize(input)) {
        // Collects the token strings for this token into the shared list.
        findTokenStrings(t, found);
    }
    List<String> expected = Arrays.asList(expectedTokenStrings);
    assertEquals(expected, found);
}
Also used : ArrayList(java.util.ArrayList) Token(com.yahoo.language.process.Token)

Aggregations

Token (com.yahoo.language.process.Token)5 SpanTree (com.yahoo.document.annotation.SpanTree)2 Test (org.junit.Test)2 Annotation (com.yahoo.document.annotation.Annotation)1 StringFieldValue (com.yahoo.document.datatypes.StringFieldValue)1 Linguistics (com.yahoo.language.Linguistics)1 TokenType (com.yahoo.language.process.TokenType)1 Tokenizer (com.yahoo.language.process.Tokenizer)1 SimpleLinguistics (com.yahoo.language.simple.SimpleLinguistics)1 SimpleToken (com.yahoo.language.simple.SimpleToken)1 AndItem (com.yahoo.prelude.query.AndItem)1 WordItem (com.yahoo.prelude.query.WordItem)1 Query (com.yahoo.search.Query)1 ArrayList (java.util.ArrayList)1 CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString)1