use of com.yahoo.language.simple.SimpleToken in project vespa by vespa-engine.
the class LinguisticsAnnotatorTestCase method requireThatCompositeTokensAreFlattened.
/**
 * Feeds a nested composite token ("foobarbaz" -> "foo" + ("bar" + "baz")) to
 * {@code assertAnnotations} and expects one TERM annotation per leaf
 * component instead of a single annotation for the composite.
 */
@Test
public void requireThatCompositeTokensAreFlattened() {
    // Expected spans are (0, 3), (3, 3) and (6, 3), one per leaf term.
    String[] leafTerms = {"foo", "bar", "baz"};
    SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
    for (int i = 0; i < leafTerms.length; i++) {
        expected.spanList().span(i * 3, 3)
                .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(leafTerms[i])));
    }

    // Leaf tokens, positioned at their offsets within "foobarbaz".
    SimpleToken foo = newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0);
    SimpleToken bar = newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3);
    SimpleToken baz = newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6);

    // Intermediate composite holding the last two leaves.
    SimpleToken barbaz = newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC)
            .setOffset(3)
            .addComponent(bar)
            .addComponent(baz);

    // Top-level composite: a leaf followed by the nested composite.
    SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
            .addComponent(foo)
            .addComponent(barbaz);

    assertAnnotations(expected, "foobarbaz", token);
}
use of com.yahoo.language.simple.SimpleToken in project vespa by vespa-engine.
the class LinguisticsAnnotatorTestCase method requireThatCompositeSpecialTokensAreNotFlattened.
/**
 * Feeds a nested composite token that is flagged as a special token to
 * {@code assertAnnotations} and expects a single TERM annotation spanning
 * the whole composite, not one per component.
 */
@Test
public void requireThatCompositeSpecialTokensAreNotFlattened() {
    // A single expected span (0, 9) carrying the composite's own term.
    SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
    Annotation wholeTerm = new Annotation(AnnotationTypes.TERM, new StringFieldValue("foobarbaz"));
    expected.spanList().span(0, 9).annotate(wholeTerm);

    // Leaf tokens, positioned at their offsets within "foobarbaz".
    SimpleToken foo = newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0);
    SimpleToken bar = newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3);
    SimpleToken baz = newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6);

    // Intermediate composite holding the last two leaves.
    SimpleToken barbaz = newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC)
            .setOffset(3)
            .addComponent(bar)
            .addComponent(baz);

    // Top-level composite, marked special before its components are attached.
    SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
            .setSpecialToken(true)
            .addComponent(foo)
            .addComponent(barbaz);

    assertAnnotations(expected, "foobarbaz", token);
}
use of com.yahoo.language.simple.SimpleToken in project vespa by vespa-engine.
the class StemmerImplTestCase method requireThatSpecialTokensAreNotDecompounded.
/**
 * Stems "c++" through a {@code StemmerImpl} backed by a mocked tokenizer that
 * always hands back the same composite token. With the special-token flag
 * cleared the expected stems are the components "c", "p", "p"; with the flag
 * set the expected result is the single stem "c++".
 */
@Test
public void requireThatSpecialTokensAreNotDecompounded() {
    // Composite "c++" token; each component is its own token instance.
    SimpleToken token = new SimpleToken("c++")
            .setType(TokenType.ALPHABETIC)
            .setTokenString("c++");
    for (String part : new String[] {"c", "p", "p"}) {
        token = token.addComponent(
                new SimpleToken(part).setType(TokenType.ALPHABETIC).setTokenString(part));
    }

    // The tokenizer is stubbed to return the composite for any arguments.
    Tokenizer tokenizer = Mockito.mock(Tokenizer.class);
    Mockito.when(tokenizer.tokenize(
                    Mockito.anyString(),
                    Mockito.<Language>any(),
                    Mockito.<StemMode>any(),
                    Mockito.anyBoolean()))
            .thenReturn(Arrays.<Token>asList(token));
    Stemmer stemmer = new StemmerImpl(tokenizer);

    // Flag cleared: one stem list per component.
    token.setSpecialToken(false);
    assertEquals(
            Arrays.asList(new StemList("c"), new StemList("p"), new StemList("p")),
            stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));

    // Flag set on the very same token: a single stem list for the whole token.
    token.setSpecialToken(true);
    assertEquals(
            Arrays.asList(new StemList("c++")),
            stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
}
use of com.yahoo.language.simple.SimpleToken in project vespa by vespa-engine.
the class LinguisticsAnnotatorTestCase method requireThatMaxTermOccurencesIsHonored.
/**
 * Supplies twice {@code AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES} tokens of
 * the same term and expects annotations for only the first
 * {@code DEFAULT_MAX_TERM_OCCURRENCES} of them. The check is repeated for
 * every indexable {@code TokenType}.
 */
@Test
public void requireThatMaxTermOccurencesIsHonored() {
    final String term = "foo";
    // Completely different from the input term, for a safer test.
    final String stem = "bar";
    final String termWithSeparator = term + " ";
    final int stride = termWithSeparator.length();
    final int limit = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES;
    final int occurrences = limit * 2;

    // Expected tree: one TERM annotation for each of the first 'limit' occurrences.
    final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
    for (int n = 0, offset = 0; n < limit; n++, offset += stride) {
        expected.spanList().span(offset, term.length())
                .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stem)));
    }

    for (TokenType type : TokenType.values()) {
        if (type.isIndexable()) {
            // Build the input text and a matching token for every occurrence.
            StringBuilder text = new StringBuilder();
            Token[] tokens = new Token[occurrences];
            for (int i = 0; i < tokens.length; i++) {
                tokens[i] = newToken(term, stem, type).setOffset(i * stride);
                text.append(termWithSeparator);
            }
            assertAnnotations(expected, text.toString(), tokens);
        }
    }
}
Aggregations