Search in sources :

Example 6 with Tokenizer

use of com.yahoo.prelude.query.parser.Tokenizer in project vespa by vespa-engine.

the class TokenizerTestCase method testPlainTokenization.

/**
 * Tokenizes a plain query string with the test's special tokens installed and
 * compares the leading tokens, position by position, against the expected sequence.
 */
@Test
public void testPlainTokenization() {
    Tokenizer plainTokenizer = new Tokenizer(new SimpleLinguistics());
    plainTokenizer.setSpecialTokens(createSpecialTokens());
    List<?> actual = plainTokenizer.tokenize("drive (to hwy88, 88) +or language:en ugcapi_1");

    // Expected tokens in the order they appear in the query string.
    Token[] expected = {
        new Token(WORD, "drive"),
        new Token(SPACE, " "),
        new Token(LBRACE, "("),
        new Token(WORD, "to"),
        new Token(SPACE, " "),
        new Token(WORD, "hwy88"),
        new Token(COMMA, ","),
        new Token(SPACE, " "),
        new Token(NUMBER, "88"),
        new Token(RBRACE, ")"),
        new Token(SPACE, " "),
        new Token(PLUS, "+"),
        new Token(WORD, "or"),
        new Token(SPACE, " "),
        new Token(WORD, "language"),
        new Token(COLON, ":"),
        new Token(WORD, "en"),
        new Token(SPACE, " "),
        new Token(WORD, "ugcapi"),
        new Token(UNDERSCORE, "_"),
        new Token(NUMBER, "1"),
    };
    for (int position = 0; position < expected.length; position++) {
        assertEquals(expected[position], actual.get(position));
    }
}
Also used : SimpleLinguistics(com.yahoo.language.simple.SimpleLinguistics) Token(com.yahoo.prelude.query.parser.Token) Tokenizer(com.yahoo.prelude.query.parser.Tokenizer) Test(org.junit.Test)

Example 7 with Tokenizer

use of com.yahoo.prelude.query.parser.Tokenizer in project vespa by vespa-engine.

the class TokenizerTestCase method testExactMatchTokenizationWithTerminatorTerminatedByEndOfString.

/**
 * Tokenizes a query against two exact-match indexes where the value of the second
 * exact field runs to the end of the query string, and checks that this final value
 * comes out as a single special WORD token.
 */
@Test
public void testExactMatchTokenizationWithTerminatorTerminatedByEndOfString() {
    // testexact1 is configured with a null terminator, testexact2 with "()/aa*::*&".
    Index exactWithNullTerminator = new Index("testexact1");
    exactWithNullTerminator.setExact(true, null);
    Index exactWithCustomTerminator = new Index("testexact2");
    exactWithCustomTerminator.setExact(true, "()/aa*::*&");

    IndexFacts indexFacts = new IndexFacts();
    indexFacts.addIndex("testsd", exactWithNullTerminator);
    indexFacts.addIndex("testsd", exactWithCustomTerminator);

    Tokenizer exactTokenizer = new Tokenizer(new SimpleLinguistics());
    IndexFacts.Session factsSession = indexFacts.newSession(Collections.emptySet(), Collections.emptySet());
    List<?> actual = exactTokenizer.tokenize(
            "normal a:b (normal testexact1:/,%#%&+-+ ) testexact2:ho_/&%&/()/aa*::*", factsSession);

    // Expected tokens in the order they appear in the query string.
    Token[] expected = {
        new Token(WORD, "normal"),
        new Token(SPACE, " "),
        new Token(WORD, "a"),
        new Token(COLON, ":"),
        new Token(WORD, "b"),
        new Token(SPACE, " "),
        new Token(LBRACE, "("),
        new Token(WORD, "normal"),
        new Token(SPACE, " "),
        new Token(WORD, "testexact1"),
        new Token(COLON, ":"),
        new Token(WORD, "/,%#%&+-+"),
        new Token(SPACE, " "),
        new Token(RBRACE, ")"),
        new Token(SPACE, " "),
        new Token(WORD, "testexact2"),
        new Token(COLON, ":"),
        new Token(WORD, "ho_/&%&/()/aa*::*"),
    };
    for (int position = 0; position < expected.length; position++) {
        assertEquals(expected[position], actual.get(position));
    }

    // The final exact-match value must additionally be flagged as special.
    Token lastExactValue = (Token) actual.get(17);
    assertTrue(lastExactValue.isSpecial());
}
Also used : SimpleLinguistics(com.yahoo.language.simple.SimpleLinguistics) IndexFacts(com.yahoo.prelude.IndexFacts) Index(com.yahoo.prelude.Index) Token(com.yahoo.prelude.query.parser.Token) Tokenizer(com.yahoo.prelude.query.parser.Tokenizer) Test(org.junit.Test)

Example 8 with Tokenizer

use of com.yahoo.prelude.query.parser.Tokenizer in project vespa by vespa-engine.

the class TokenizerTestCase method testSpecialTokenCJK.

/**
 * Checks substring matching of special tokens, as needed for CJK languages: there,
 * special tokens are not delimited by spaces, and special token recognition happens
 * before tokenization, so they must be found inside longer unbroken strings.
 */
@Test
public void testSpecialTokenCJK() {
    assertEquals("Special tokens configured", 6, defaultRegistry.getSpecialTokens("default").size());

    Tokenizer substringTokenizer = new Tokenizer(new SimpleLinguistics());
    substringTokenizer.setSubstringSpecialTokens(true);
    substringTokenizer.setSpecialTokens(defaultRegistry.getSpecialTokens("default"));
    List<?> actual = substringTokenizer.tokenize("fooc#bar,c++with spacebarknowknowknow,knowknownot know");

    // Expected tokens in the order they appear in the query string.
    Token[] expected = {
        new Token(WORD, "foo"),
        new Token(WORD, "c#"),
        new Token(WORD, "bar"),
        new Token(COMMA, ","),
        new Token(WORD, "cpp"),
        new Token(WORD, "with-space"),
        new Token(WORD, "bar"),
        new Token(WORD, "knuwww"),
        new Token(WORD, "knuwww"),
        new Token(WORD, "knuwww"),
        new Token(COMMA, ","),
        new Token(WORD, "knuwww"),
        new Token(WORD, "knuwww"),
        new Token(WORD, "not"),
        new Token(SPACE, " "),
        new Token(WORD, "knuwww"),
    };
    for (int position = 0; position < expected.length; position++) {
        assertEquals(expected[position], actual.get(position));
    }
}
Also used : SimpleLinguistics(com.yahoo.language.simple.SimpleLinguistics) Token(com.yahoo.prelude.query.parser.Token) Tokenizer(com.yahoo.prelude.query.parser.Tokenizer) Test(org.junit.Test)

Example 9 with Tokenizer

use of com.yahoo.prelude.query.parser.Tokenizer in project vespa by vespa-engine.

the class TokenizerTestCase method testExactMatchHeuristics.

/**
 * Exercises tokenization of values in exact-match indexes across many input shapes:
 * embedded and trailing punctuation, quoted values, whitespace after the colon,
 * and exact fields followed by further query terms.
 *
 * Each scenario re-tokenizes a query with the same tokenizer and session and checks
 * the leading tokens by position. The total token count is never asserted, so any
 * unexpected trailing tokens would go unnoticed.
 */
@Test
public void testExactMatchHeuristics() {
    // Two exact-match indexes: testexact1 is given a null terminator,
    // testexact2 the custom terminator "()/aa*::*&".
    Index index1 = new Index("testexact1");
    index1.setExact(true, null);
    Index index2 = new Index("testexact2");
    index2.setExact(true, "()/aa*::*&");
    IndexFacts indexFacts = new IndexFacts();
    indexFacts.addIndex("testsd", index1);
    indexFacts.addIndex("testsd", index2);
    IndexFacts.Session facts = indexFacts.newSession(Collections.emptySet(), Collections.emptySet());
    Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics());
    // Baseline: plain alphanumeric exact values; the ")" right after "foo" is expected as its own RBRACE.
    List<?> tokens = tokenizer.tokenize("normal a:b (normal testexact1:foo) testexact2:bar", facts);
    assertEquals(new Token(WORD, "normal"), tokens.get(0));
    assertEquals(new Token(SPACE, " "), tokens.get(1));
    assertEquals(new Token(WORD, "a"), tokens.get(2));
    assertEquals(new Token(COLON, ":"), tokens.get(3));
    assertEquals(new Token(WORD, "b"), tokens.get(4));
    assertEquals(new Token(SPACE, " "), tokens.get(5));
    assertEquals(new Token(LBRACE, "("), tokens.get(6));
    assertEquals(new Token(WORD, "normal"), tokens.get(7));
    assertEquals(new Token(SPACE, " "), tokens.get(8));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(9));
    assertEquals(new Token(COLON, ":"), tokens.get(10));
    assertEquals(new Token(WORD, "foo"), tokens.get(11));
    assertEquals(new Token(RBRACE, ")"), tokens.get(12));
    assertEquals(new Token(SPACE, " "), tokens.get(13));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(14));
    assertEquals(new Token(COLON, ":"), tokens.get(15));
    assertEquals(new Token(WORD, "bar"), tokens.get(16));
    // A "*" in the middle of the value is expected to stay inside the word.
    tokens = tokenizer.tokenize("testexact1:a*teens", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "a*teens"), tokens.get(2));
    // A double quote in the middle of the value is expected to stay inside the word.
    tokens = tokenizer.tokenize("testexact1:foo\"bar", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo\"bar"), tokens.get(2));
    // A "!" in the middle of the value is expected to stay inside the word.
    tokens = tokenizer.tokenize("testexact1:foo!bar", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo!bar"), tokens.get(2));
    // A trailing "!" followed by a space is expected as a separate EXCLAMATION token.
    tokens = tokenizer.tokenize("testexact1:foo! ", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(3));
    assertEquals(new Token(SPACE, " "), tokens.get(4));
    // Two trailing "!" are expected as two separate EXCLAMATION tokens.
    tokens = tokenizer.tokenize("testexact1:foo!! ", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(3));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(4));
    assertEquals(new Token(SPACE, " "), tokens.get(5));
    // A trailing "!100" is expected to split into EXCLAMATION and NUMBER
    // (presumably the term-weight syntax; confirm against the query grammar).
    tokens = tokenizer.tokenize("testexact1:foo!100 ", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(3));
    assertEquals(new Token(NUMBER, "100"), tokens.get(4));
    assertEquals(new Token(SPACE, " "), tokens.get(5));
    // A trailing "*!100" is expected to split into STAR, EXCLAMATION and NUMBER.
    tokens = tokenizer.tokenize("testexact1:foo*!100 ", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    assertEquals(new Token(STAR, "*"), tokens.get(3));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(4));
    assertEquals(new Token(NUMBER, "100"), tokens.get(5));
    assertEquals(new Token(SPACE, " "), tokens.get(6));
    // Space after the colon, then a star-wrapped quoted phrase with "!100" and a trailing space:
    // no SPACE token is expected after the colon, and the phrase is one WORD without its quotes.
    tokens = tokenizer.tokenize("testexact1: *\"foo bar\"*!100 ", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(STAR, "*"), tokens.get(2));
    assertEquals(new Token(WORD, "foo bar"), tokens.get(3));
    assertEquals(new Token(STAR, "*"), tokens.get(4));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(5));
    assertEquals(new Token(NUMBER, "100"), tokens.get(6));
    assertEquals(new Token(SPACE, " "), tokens.get(7));
    // Same as the previous case, but the query ends right after "!100".
    tokens = tokenizer.tokenize("testexact1: *\"foo bar\"*!100", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(STAR, "*"), tokens.get(2));
    assertEquals(new Token(WORD, "foo bar"), tokens.get(3));
    assertEquals(new Token(STAR, "*"), tokens.get(4));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(5));
    assertEquals(new Token(NUMBER, "100"), tokens.get(6));
    // Same shape with an unquoted value.
    tokens = tokenizer.tokenize("testexact1: *foobar*!100", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(STAR, "*"), tokens.get(2));
    assertEquals(new Token(WORD, "foobar"), tokens.get(3));
    assertEquals(new Token(STAR, "*"), tokens.get(4));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(5));
    assertEquals(new Token(NUMBER, "100"), tokens.get(6));
    // With one more "!" at the end, "*!100" is expected to become part of the word
    // and only the final "!" is a separate EXCLAMATION token.
    tokens = tokenizer.tokenize("testexact1: *foobar*!100!", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(STAR, "*"), tokens.get(2));
    assertEquals(new Token(WORD, "foobar*!100"), tokens.get(3));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(4));
    // Parentheses inside the value are expected to stay inside the word.
    tokens = tokenizer.tokenize("testexact1:foo(bar)", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo(bar)"), tokens.get(2));
    // A fully quoted value is expected as a WORD without the quotes.
    tokens = tokenizer.tokenize("testexact1:\"foo\"", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    // A space between the colon and the value is expected to produce no SPACE token.
    tokens = tokenizer.tokenize("testexact1: foo", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    // Space after the colon combined with a quoted value.
    tokens = tokenizer.tokenize("testexact1: \"foo\"", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    // NOTE(review): this case is identical to the preceding one; possibly a different
    // input was intended here - confirm against the upstream test history.
    tokens = tokenizer.tokenize("testexact1: \"foo\"", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "foo"), tokens.get(2));
    // Two exact fields separated by a space: the space ends the first value.
    tokens = tokenizer.tokenize("testexact1:vespa testexact2:resolved", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "vespa"), tokens.get(2));
    assertEquals(new Token(SPACE, " "), tokens.get(3));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(4));
    assertEquals(new Token(COLON, ":"), tokens.get(5));
    assertEquals(new Token(WORD, "resolved"), tokens.get(6));
    // A quoted phrase containing a space, followed by a second exact field.
    tokens = tokenizer.tokenize("testexact1:\"news search\" testexact2:resolved", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "news search"), tokens.get(2));
    assertEquals(new Token(SPACE, " "), tokens.get(3));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(4));
    assertEquals(new Token(COLON, ":"), tokens.get(5));
    assertEquals(new Token(WORD, "resolved"), tokens.get(6));
    // Parenthesized group of exact fields: the closing ")" at the end is expected as RBRACE.
    tokens = tokenizer.tokenize("(testexact1:\"news search\" testexact1:vespa)", facts);
    assertEquals(new Token(LBRACE, "("), tokens.get(0));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(1));
    assertEquals(new Token(COLON, ":"), tokens.get(2));
    assertEquals(new Token(WORD, "news search"), tokens.get(3));
    assertEquals(new Token(SPACE, " "), tokens.get(4));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(5));
    assertEquals(new Token(COLON, ":"), tokens.get(6));
    assertEquals(new Token(WORD, "vespa"), tokens.get(7));
    assertEquals(new Token(RBRACE, ")"), tokens.get(8));
    // A "*" at the very end of the query is expected as a separate STAR token.
    tokens = tokenizer.tokenize("testexact1:news*", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "news"), tokens.get(2));
    assertEquals(new Token(STAR, "*"), tokens.get(3));
    // Same with a quoted value before the trailing "*".
    tokens = tokenizer.tokenize("testexact1:\"news\"*", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "news"), tokens.get(2));
    assertEquals(new Token(STAR, "*"), tokens.get(3));
    // A quoted phrase followed by "!200" at the end of the query.
    tokens = tokenizer.tokenize("testexact1:\"news search\"!200", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "news search"), tokens.get(2));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(3));
    assertEquals(new Token(NUMBER, "200"), tokens.get(4));
    // An unquoted value followed by "!200" at the end of the query.
    tokens = tokenizer.tokenize("testexact1:vespa!200", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(WORD, "vespa"), tokens.get(2));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(3));
    assertEquals(new Token(NUMBER, "200"), tokens.get(4));
    // Stars on both sides of a quoted value, with no space after the colon.
    tokens = tokenizer.tokenize("testexact1:*\"news\"*", facts);
    assertEquals(new Token(WORD, "testexact1"), tokens.get(0));
    assertEquals(new Token(COLON, ":"), tokens.get(1));
    assertEquals(new Token(STAR, "*"), tokens.get(2));
    assertEquals(new Token(WORD, "news"), tokens.get(3));
    assertEquals(new Token(STAR, "*"), tokens.get(4));
    // An ordinary word directly followed by "(" and an exact field, with no space in between.
    tokens = tokenizer.tokenize("normal(testexact1:foo) testexact2:bar", facts);
    assertEquals(new Token(WORD, "normal"), tokens.get(0));
    assertEquals(new Token(LBRACE, "("), tokens.get(1));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(2));
    assertEquals(new Token(COLON, ":"), tokens.get(3));
    assertEquals(new Token(WORD, "foo"), tokens.get(4));
    assertEquals(new Token(RBRACE, ")"), tokens.get(5));
    assertEquals(new Token(SPACE, " "), tokens.get(6));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(7));
    assertEquals(new Token(COLON, ":"), tokens.get(8));
    assertEquals(new Token(WORD, "bar"), tokens.get(9));
    // A "(" directly after the colon is expected to be part of the exact value "(foo".
    tokens = tokenizer.tokenize("normal testexact1:(foo testexact2:bar", facts);
    assertEquals(new Token(WORD, "normal"), tokens.get(0));
    assertEquals(new Token(SPACE, " "), tokens.get(1));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(2));
    assertEquals(new Token(COLON, ":"), tokens.get(3));
    assertEquals(new Token(WORD, "(foo"), tokens.get(4));
    assertEquals(new Token(SPACE, " "), tokens.get(5));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(6));
    assertEquals(new Token(COLON, ":"), tokens.get(7));
    assertEquals(new Token(WORD, "bar"), tokens.get(8));
    // A trailing "!" on the first exact value, followed by a second exact field.
    tokens = tokenizer.tokenize("normal testexact1:foo! testexact2:bar", facts);
    assertEquals(new Token(WORD, "normal"), tokens.get(0));
    assertEquals(new Token(SPACE, " "), tokens.get(1));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(2));
    assertEquals(new Token(COLON, ":"), tokens.get(3));
    assertEquals(new Token(WORD, "foo"), tokens.get(4));
    assertEquals(new Token(EXCLAMATION, "!"), tokens.get(5));
    assertEquals(new Token(SPACE, " "), tokens.get(6));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(7));
    assertEquals(new Token(COLON, ":"), tokens.get(8));
    assertEquals(new Token(WORD, "bar"), tokens.get(9));
    // A trailing "*" on the first exact value, followed by a second exact field.
    tokens = tokenizer.tokenize("normal testexact1:foo* testexact2:bar", facts);
    assertEquals(new Token(WORD, "normal"), tokens.get(0));
    assertEquals(new Token(SPACE, " "), tokens.get(1));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(2));
    assertEquals(new Token(COLON, ":"), tokens.get(3));
    assertEquals(new Token(WORD, "foo"), tokens.get(4));
    assertEquals(new Token(STAR, "*"), tokens.get(5));
    assertEquals(new Token(SPACE, " "), tokens.get(6));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(7));
    assertEquals(new Token(COLON, ":"), tokens.get(8));
    assertEquals(new Token(WORD, "bar"), tokens.get(9));
    // Same, with a space between the colon and the value; no SPACE token is expected there.
    tokens = tokenizer.tokenize("normal testexact1: foo* testexact2:bar", facts);
    assertEquals(new Token(WORD, "normal"), tokens.get(0));
    assertEquals(new Token(SPACE, " "), tokens.get(1));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(2));
    assertEquals(new Token(COLON, ":"), tokens.get(3));
    assertEquals(new Token(WORD, "foo"), tokens.get(4));
    assertEquals(new Token(STAR, "*"), tokens.get(5));
    assertEquals(new Token(SPACE, " "), tokens.get(6));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(7));
    assertEquals(new Token(COLON, ":"), tokens.get(8));
    assertEquals(new Token(WORD, "bar"), tokens.get(9));
    // A leading space inside the quotes is expected to be kept in the value (" foo").
    tokens = tokenizer.tokenize("normal testexact1:\" foo\"* testexact2:bar", facts);
    assertEquals(new Token(WORD, "normal"), tokens.get(0));
    assertEquals(new Token(SPACE, " "), tokens.get(1));
    assertEquals(new Token(WORD, "testexact1"), tokens.get(2));
    assertEquals(new Token(COLON, ":"), tokens.get(3));
    assertEquals(new Token(WORD, " foo"), tokens.get(4));
    assertEquals(new Token(STAR, "*"), tokens.get(5));
    assertEquals(new Token(SPACE, " "), tokens.get(6));
    assertEquals(new Token(WORD, "testexact2"), tokens.get(7));
    assertEquals(new Token(COLON, ":"), tokens.get(8));
    assertEquals(new Token(WORD, "bar"), tokens.get(9));
}
Also used : SimpleLinguistics(com.yahoo.language.simple.SimpleLinguistics) IndexFacts(com.yahoo.prelude.IndexFacts) Index(com.yahoo.prelude.Index) Token(com.yahoo.prelude.query.parser.Token) Tokenizer(com.yahoo.prelude.query.parser.Tokenizer) Test(org.junit.Test)

Example 10 with Tokenizer

use of com.yahoo.prelude.query.parser.Tokenizer in project vespa by vespa-engine.

the class TokenizerTestCase method testSpecialTokenCaseInsensitive.

/**
 * Checks that a special token is recognized regardless of letter case in the query:
 * the input "AS/400" is expected to come out as the single WORD token "as/400",
 * while the ordinary word "The" keeps its original casing.
 */
@Test
public void testSpecialTokenCaseInsensitive() {
    Tokenizer specialTokenizer = new Tokenizer(new SimpleLinguistics());
    specialTokenizer.setSpecialTokens(createSpecialTokens());
    List<?> actual = specialTokenizer.tokenize("The AS/400 is great");

    // Expected tokens in the order they appear in the query string.
    Token[] expected = {
        new Token(WORD, "The"),
        new Token(SPACE, " "),
        new Token(WORD, "as/400"),
        new Token(SPACE, " "),
        new Token(WORD, "is"),
        new Token(SPACE, " "),
        new Token(WORD, "great"),
    };
    for (int position = 0; position < expected.length; position++) {
        assertEquals(expected[position], actual.get(position));
    }
}
Also used : SimpleLinguistics(com.yahoo.language.simple.SimpleLinguistics) Token(com.yahoo.prelude.query.parser.Token) Tokenizer(com.yahoo.prelude.query.parser.Tokenizer) Test(org.junit.Test)

Aggregations

SimpleLinguistics (com.yahoo.language.simple.SimpleLinguistics): 17, Token (com.yahoo.prelude.query.parser.Token): 17, Tokenizer (com.yahoo.prelude.query.parser.Tokenizer): 17, Test (org.junit.Test): 17, Index (com.yahoo.prelude.Index): 5, IndexFacts (com.yahoo.prelude.IndexFacts): 5, SpecialTokenRegistry (com.yahoo.prelude.query.parser.SpecialTokenRegistry): 3