Example use of com.yahoo.prelude.query.parser.Tokenizer in the vespa project by vespa-engine.
From class TokenizerTestCase, method testOutsideBMPCodepoints.
/**
 * Tokenizes a string consisting of one surrogate pair (a code point outside the
 * Basic Multilingual Plane) and expects the first token to be a single WORD
 * carrying both chars of the pair.
 */
@Test
public void testOutsideBMPCodepoints() {
    // Same literal is used as input and as the expected token image.
    final String surrogatePair = "\ud841\udd47";
    List<?> result = new Tokenizer(new SimpleLinguistics()).tokenize(surrogatePair);
    Token expectedFirst = new Token(WORD, surrogatePair);
    assertEquals(expectedFirst, result.get(0));
}
Example use of com.yahoo.prelude.query.parser.Tokenizer in the vespa project by vespa-engine.
From class TokenizerTestCase, method testOneSpecialToken.
/**
 * With the special tokens from {@code createSpecialTokens()} installed, the
 * leading "c++" of the query is expected to come back as one WORD token.
 */
@Test
public void testOneSpecialToken() {
    Tokenizer specialAware = new Tokenizer(new SimpleLinguistics());
    specialAware.setSpecialTokens(createSpecialTokens());

    String query = "c++ lovers, please apply";
    List<?> result = specialAware.tokenize(query);

    // Only the first token is checked.
    Token expectedFirst = new Token(WORD, "c++");
    assertEquals(expectedFirst, result.get(0));
}
Example use of com.yahoo.prelude.query.parser.Tokenizer in the vespa project by vespa-engine.
From class TokenizerTestCase, method testSingleQuoteAsWordCharacter.
/**
 * Tokenizes a query mixing braces, commas, numbers, index prefixes and single
 * quotes, and checks the first 31 tokens position by position. The expectations
 * treat a single quote as part of the surrounding word ("a'a", "'a'", "'a", "a'").
 */
@Test
public void testSingleQuoteAsWordCharacter() {
    Tokenizer quoteTokenizer = new Tokenizer(new SimpleLinguistics());
    quoteTokenizer.setSpecialTokens(createSpecialTokens());
    List<?> actual = quoteTokenizer.tokenize("drive (to hwy88, 88) +or language:en nalle:a'a ugcapi_1 'a' 'a a'");

    // Expected tokens, listed in output order (index in the array == token position).
    Token[] expected = {
        new Token(WORD, "drive"),
        new Token(SPACE, " "),
        new Token(LBRACE, "("),
        new Token(WORD, "to"),
        new Token(SPACE, " "),
        new Token(WORD, "hwy88"),
        new Token(COMMA, ","),
        new Token(SPACE, " "),
        new Token(NUMBER, "88"),
        new Token(RBRACE, ")"),
        new Token(SPACE, " "),
        new Token(PLUS, "+"),
        new Token(WORD, "or"),
        new Token(SPACE, " "),
        new Token(WORD, "language"),
        new Token(COLON, ":"),
        new Token(WORD, "en"),
        new Token(SPACE, " "),
        new Token(WORD, "nalle"),
        new Token(COLON, ":"),
        new Token(WORD, "a'a"),
        new Token(SPACE, " "),
        new Token(WORD, "ugcapi"),
        new Token(UNDERSCORE, "_"),
        new Token(NUMBER, "1"),
        new Token(SPACE, " "),
        new Token(WORD, "'a'"),
        new Token(SPACE, " "),
        new Token(WORD, "'a"),
        new Token(SPACE, " "),
        new Token(WORD, "a'"),
    };

    // Compare in ascending position; the list size itself is deliberately not asserted.
    for (int position = 0; position < expected.length; position++) {
        assertEquals(expected[position], actual.get(position));
    }
}
Example use of com.yahoo.prelude.query.parser.Tokenizer in the vespa project by vespa-engine.
From class TokenizerTestCase, method testExactMatchTokenizationEndsByColon.
/**
 * Registers two exact-match indexes under "testsd" (one with a null terminator
 * argument, one with the terminator string "()/aa*::*&amp;") and tokenizes a query
 * that addresses both. The first 20 tokens are checked position by position; the
 * query, and the last expected token, is a colon.
 */
@Test
public void testExactMatchTokenizationEndsByColon() {
    Index exactDefault = new Index("testexact1");
    exactDefault.setExact(true, null);
    Index exactCustom = new Index("testexact2");
    exactCustom.setExact(true, "()/aa*::*&");

    IndexFacts indexFacts = new IndexFacts();
    indexFacts.addIndex("testsd", exactDefault);
    indexFacts.addIndex("testsd", exactCustom);

    Tokenizer exactTokenizer = new Tokenizer(new SimpleLinguistics());
    IndexFacts.Session factsSession = indexFacts.newSession(Collections.emptySet(), Collections.emptySet());
    List<?> actual = exactTokenizer.tokenize("normal a:b (normal testexact1:!/%#%&+-+ ) testexact2:ho_/&%&/()/aa*::*&b:", factsSession);

    // Expected tokens, listed in output order (index in the array == token position).
    Token[] expected = {
        new Token(WORD, "normal"),
        new Token(SPACE, " "),
        new Token(WORD, "a"),
        new Token(COLON, ":"),
        new Token(WORD, "b"),
        new Token(SPACE, " "),
        new Token(LBRACE, "("),
        new Token(WORD, "normal"),
        new Token(SPACE, " "),
        new Token(WORD, "testexact1"),
        new Token(COLON, ":"),
        new Token(WORD, "!/%#%&+-+"),
        new Token(SPACE, " "),
        new Token(RBRACE, ")"),
        new Token(SPACE, " "),
        new Token(WORD, "testexact2"),
        new Token(COLON, ":"),
        new Token(WORD, "ho_/&%&/"),
        new Token(WORD, "b"),
        new Token(COLON, ":"),
    };

    // Compare in ascending position; the list size itself is deliberately not asserted.
    for (int position = 0; position < expected.length; position++) {
        assertEquals(expected[position], actual.get(position));
    }
}
Example use of com.yahoo.prelude.query.parser.Tokenizer in the vespa project by vespa-engine.
From class TokenizerTestCase, method testTokenReplacing.
/**
 * Installs the "default" special tokens from {@code defaultRegistry} (asserted to
 * contain 6 entries) and checks the first 15 tokens of a query. The expectations
 * show some inputs coming back under a different image ("with space" as
 * "with-space", "c++" as "cpp", "know" as "knuwww"). Finally checks that token 9
 * reports itself as special while token 12 does not.
 */
@Test
public void testTokenReplacing() {
    // Precondition on the fixture before any tokenizing happens.
    assertEquals("Special tokens configured", 6, defaultRegistry.getSpecialTokens("default").size());

    Tokenizer replacingTokenizer = new Tokenizer(new SimpleLinguistics());
    replacingTokenizer.setSpecialTokens(defaultRegistry.getSpecialTokens("default"));
    List<?> actual = replacingTokenizer.tokenize("with space, c++ or .... know, not b.s.d.");

    // Expected tokens, listed in output order (index in the array == token position).
    Token[] expected = {
        new Token(WORD, "with-space"),
        new Token(COMMA, ","),
        new Token(SPACE, " "),
        new Token(WORD, "cpp"),
        new Token(SPACE, " "),
        new Token(WORD, "or"),
        new Token(SPACE, " "),
        new Token(WORD, "...."),
        new Token(SPACE, " "),
        new Token(WORD, "knuwww"),
        new Token(COMMA, ","),
        new Token(SPACE, " "),
        new Token(WORD, "not"),
        new Token(SPACE, " "),
        new Token(WORD, "b.s.d."),
    };

    for (int position = 0; position < expected.length; position++) {
        assertEquals(expected[position], actual.get(position));
    }

    // "knuwww" (position 9) must be flagged special; plain "not" (position 12) must not.
    Token replaced = (Token) actual.get(9);
    assertTrue(replaced.isSpecial());
    Token ordinary = (Token) actual.get(12);
    assertFalse(ordinary.isSpecial());
}
Aggregations