Use of com.yahoo.language.process.Token in project vespa by vespa-engine.
From the class LinguisticsAnnotator, method annotate.
/**
 * Tokenizes the given string field value and attaches the resulting annotations to it
 * as a {@code SpanTrees.LINGUISTICS} span tree.
 *
 * Only the first {@code config.getMaxTokenizeLength()} characters of the string are handed
 * to the tokenizer. A value which already carries a LINGUISTICS span tree is left untouched.
 *
 * @param text the text to annotate
 * @return true if the text already had a LINGUISTICS span tree or one was attached by this call,
 *         false if tokenization produced no annotations (no span tree is set in that case)
 */
public boolean annotate(StringFieldValue text) {
    if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) {
        // A LINGUISTICS tree is already present; nothing more to do.
        return true;
    }

    Tokenizer tokenizer = factory.getTokenizer();

    // Cap the tokenizer input at the configured maximum length.
    String fullText = text.getString();
    int maxLength = config.getMaxTokenizeLength();
    String toTokenize = fullText;
    if (fullText.length() > maxLength) {
        toTokenize = fullText.substring(0, maxLength);
    }

    Iterable<Token> tokens = tokenizer.tokenize(toTokenize,
                                                config.getLanguage(),
                                                config.getStemMode(),
                                                config.getRemoveAccents());
    TermOccurrences occurrences = new TermOccurrences(config.getMaxTermOccurrences());
    SpanTree linguisticsTree = new SpanTree(SpanTrees.LINGUISTICS);
    for (Token token : tokens) {
        // The span helper receives the complete (untruncated) string.
        addAnnotationSpan(fullText, linguisticsTree.spanList(), tokenizer, token, config.getStemMode(), occurrences);
    }

    // Only attach the tree when at least one annotation was produced.
    boolean annotated = linguisticsTree.numAnnotations() != 0;
    if (annotated) {
        text.setSpanTree(linguisticsTree);
    }
    return annotated;
}
Use of com.yahoo.language.process.Token in project vespa by vespa-engine.
From the class SimpleToken, method toString.
/**
 * Renders the given token as a multi-line string in which every line is prefixed with
 * {@code indent}. The token's components are rendered first, recursively and with one
 * extra space of indentation, followed by the token's own offset, original string, script,
 * special-token flag, token string and type.
 *
 * @param token  the token to render
 * @param indent the prefix to put in front of every line
 * @return the rendered token
 */
private static String toString(Token token, String indent) {
    StringBuilder out = new StringBuilder();

    // Component section: one bracketed entry per component, rendered recursively.
    out.append(indent).append("components : {\n");
    int componentCount = token.getNumComponents();
    for (int index = 0; index < componentCount; index++) {
        Token component = token.getComponent(index);
        out.append(indent)
           .append(" [").append(index).append("] : ")
           .append(component.getClass().getSimpleName())
           .append(" {\n")
           .append(toString(component, indent + " "))
           .append(indent).append(" }\n");
    }
    out.append(indent).append("}\n");

    // Scalar properties of the token itself, one per line.
    out.append(indent).append("offset : ").append(token.getOffset()).append("\n")
       .append(indent).append("orig : ").append(quoteString(token.getOrig())).append("\n")
       .append(indent).append("script : ").append(token.getScript()).append("\n")
       .append(indent).append("special : ").append(token.isSpecialToken()).append("\n")
       .append(indent).append("token string : ").append(quoteString(token.getTokenString())).append("\n")
       .append(indent).append("type : ").append(token.getType()).append("\n");

    return out.toString();
}
Use of com.yahoo.language.process.Token in project vespa by vespa-engine.
From the class QueryTestCase, method testSimpleFunctionality.
@Test
public void testSimpleFunctionality() {
    // Parse a simple request and verify the query string, query tree and request properties.
    Query query = new Query(QueryTestCase.httpEncode("/sdfsd.html?query=this is a simple query&aParameter"));
    assertEquals("this is a simple query", query.getModel().getQueryString());
    assertNotNull(query.getModel().getQueryTree());
    assertNull(query.getModel().getDefaultIndex());
    assertEquals("", query.properties().get("aParameter"));
    assertNull(query.properties().get("notSetParameter"));

    // Replace the query tree root with an AND over the indexable tokens of a text body.
    String body = "a bb. ccc??!";
    Linguistics linguistics = new SimpleLinguistics();
    AndItem bodyTerms = new AndItem();
    Iterable<Token> bodyTokens = linguistics.getTokenizer().tokenize(body, Language.ENGLISH, StemMode.SHORTEST, true);
    for (Token bodyToken : bodyTokens) {
        if (!bodyToken.isIndexable()) {
            continue;
        }
        bodyTerms.addItem(new WordItem(bodyToken.getTokenString(), "body"));
    }
    query.getModel().getQueryTree().setRoot(bodyTerms);
    System.out.println(query);
}
Use of com.yahoo.language.process.Token in project vespa by vespa-engine.
From the class LinguisticsAnnotatorTestCase, method requireThatMaxTermOccurencesIsHonored.
@Test
public void requireThatMaxTermOccurencesIsHonored() {
    final String inputTerm = "foo";
    // Completely different from inputTerm, for a safer test.
    final String stemmedInputTerm = "bar";
    final String paddedInputTerm = inputTerm + " ";
    // Distance between the start offsets of two consecutive terms in the input.
    final int stride = paddedInputTerm.length();
    final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;

    // Expected tree: only the first DEFAULT_MAX_TERM_OCCURRENCES occurrences carry a TERM annotation.
    final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
    for (int occurrence = 0; occurrence < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; occurrence++) {
        Annotation term = new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm));
        expected.spanList().span(occurrence * stride, inputTerm.length()).annotate(term);
    }

    // Feed twice as many occurrences as the limit, once for every indexable token type.
    for (TokenType type : TokenType.values()) {
        if (type.isIndexable()) {
            StringBuilder text = new StringBuilder();
            Token[] tokens = new Token[inputTermOccurence];
            for (int index = 0; index < inputTermOccurence; index++) {
                SimpleToken token = newToken(inputTerm, stemmedInputTerm, type);
                token.setOffset(index * stride);
                tokens[index] = token;
                text.append(paddedInputTerm);
            }
            assertAnnotations(expected, text.toString(), tokens);
        }
    }
}
Use of com.yahoo.language.process.Token in project vespa by vespa-engine.
From the class TokenizerTester, method assertTokens.
/**
 * Tokenizes the input and asserts that the token strings collected from the resulting
 * tokens equal the expected ones, in order.
 *
 * @param input                the text to tokenize
 * @param expectedTokenStrings the token strings expected to be collected, in order
 */
public void assertTokens(String input, String... expectedTokenStrings) {
    List<String> collected = new ArrayList<>();
    for (Token produced : tokenize(input)) {
        // Appends the token strings found for this token to the collected list.
        findTokenStrings(produced, collected);
    }
    List<String> expected = Arrays.asList(expectedTokenStrings);
    assertEquals(expected, collected);
}
Aggregations