use of com.yahoo.document.annotation.SpanTree in project vespa by vespa-engine.
the class LinguisticsAnnotatorTestCase method requireThatMaxTermOccurencesIsHonored.
@Test
public void requireThatMaxTermOccurencesIsHonored() {
    // The stemmed form is completely different from the raw term, which makes the test safer:
    // an annotation that carried the raw term by mistake could not match the expectation.
    final String term = "foo";
    final String stem = "bar";
    final String termWithSeparator = term + " ";
    final int stride = termWithSeparator.length();
    final int cap = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES;
    // Feed twice as many occurrences as the default cap.
    final int totalOccurrences = cap * 2;

    // The expected tree holds annotations for the first 'cap' occurrences only.
    final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
    for (int n = 0; n < cap; ++n) {
        expected.spanList()
                .span(n * stride, term.length())
                .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stem)));
    }

    // Repeat the check for every indexable token type; non-indexable types are skipped.
    for (TokenType type : TokenType.values()) {
        if (type.isIndexable()) {
            StringBuilder text = new StringBuilder();
            Token[] tokens = new Token[totalOccurrences];
            for (int n = 0; n < totalOccurrences; ++n) {
                text.append(termWithSeparator);
                SimpleToken token = newToken(term, stem, type);
                token.setOffset(n * stride);
                tokens[n] = token;
            }
            assertAnnotations(expected, text.toString(), tokens);
        }
    }
}
use of com.yahoo.document.annotation.SpanTree in project vespa by vespa-engine.
the class LinguisticsAnnotatorTestCase method requireThatTermReplacementsAreApplied.
@Test
public void requireThatTermReplacementsAreApplied() {
    final String term = "foo";
    final String replacement = "bar";

    // The single expected annotation carries the replacement, not the token's own stem.
    SpanTree expectedTree = new SpanTree(SpanTrees.LINGUISTICS);
    expectedTree.spanList()
                .span(0, 3)
                .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(replacement)));

    for (boolean specialToken : Arrays.asList(true, false)) {
        for (TokenType type : TokenType.values()) {
            // Every type is exercised for special tokens; otherwise only indexable types.
            if (specialToken || type.isIndexable()) {
                assertAnnotations(expectedTree, term,
                                  newLinguistics(Arrays.asList(newToken(term, term, type, specialToken)),
                                                 Collections.singletonMap(term, replacement)));
            }
        }
    }
}
use of com.yahoo.document.annotation.SpanTree in project vespa by vespa-engine.
the class LinguisticsAnnotatorTestCase method requireThatSpecialTokenStringsAreAnnotatedRegardlessOfType.
@Test
public void requireThatSpecialTokenStringsAreAnnotatedRegardlessOfType() {
    // One TERM annotation with value "bar" over span(0, 3) is expected for every token type.
    Annotation stemAnnotation = new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"));
    SpanTree expectedTree = new SpanTree(SpanTrees.LINGUISTICS);
    expectedTree.spanList().span(0, 3).annotate(stemAnnotation);

    // No isIndexable() filter here: each type is tried with the trailing flag set to true.
    for (TokenType tokenType : TokenType.values()) {
        assertAnnotations(expectedTree, "foo", newToken("foo", "bar", tokenType, true));
    }
}
use of com.yahoo.document.annotation.SpanTree in project vespa by vespa-engine.
the class LinguisticsAnnotatorTestCase method requireThatTokenizeCappingWorks.
@Test
public void requireThatTokenizeCappingWorks() {
    final String shortString = "short string";
    final String longString = shortString + " a longer string";

    // Annotator configured with a max token length of 12, the length of shortString.
    AnnotatorConfig config = new AnnotatorConfig().setMaxTokenLength(12);
    LinguisticsAnnotator annotator = new LinguisticsAnnotator(new SimpleLinguistics(), config);

    // Expected tree for the short input: one TERM annotation per word.
    SpanTree expectedTree = new SpanTree(SpanTrees.LINGUISTICS);
    expectedTree.setStringFieldValue(new StringFieldValue(shortString));
    expectedTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM));
    expectedTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM));

    // Short input: the full tree must match and the tree must still reference the full string.
    StringFieldValue shortValue = new StringFieldValue(shortString);
    assertTrue(annotator.annotate(shortValue));
    assertEquals(expectedTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS));
    assertEquals(shortString,
                 shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());

    // Longer input: annotation still succeeds and the tree's string value is the whole input.
    StringFieldValue cappedValue = new StringFieldValue(longString);
    assertTrue(annotator.annotate(cappedValue));
    assertEquals(longString,
                 cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
}
use of com.yahoo.document.annotation.SpanTree in project vespa by vespa-engine.
the class VespaDocumentDeserializer42 method read.
/**
 * Reads a string field value from the buffer and, when bit 64 of the leading coding byte
 * is set, the span trees serialized after it.
 *
 * @param field not consulted by this method
 * @param value receives the decoded string and any span trees that follow it
 */
public void read(FieldBase field, StringFieldValue value) {
    final byte coding = getByte(null);
    final int lengthWithTerminator = getInt1_4Bytes(null);

    // It seems the length read above includes the 0-termination.
    // NOTE: this is basically parseNullTerminatedString() inlined; that method is not used
    // because the raw UTF-8 bytes are needed again below.
    final byte[] utf8 = new byte[lengthWithTerminator - 1];
    buf.get(utf8);
    buf.get(); // step over the 0-terminator
    value.setUnChecked(Utf8.toString(utf8));

    final boolean hasSpanTrees = (coding & 64) != 0;
    if (!hasSpanTrees) {
        return;
    }

    try {
        // Serialization of nested span trees is not supported, so setting the shared
        // stringPositions field here is safe.
        stringPositions = calculateStringPositions(utf8);

        // Total length of the span tree section, measured from the position right after it.
        final int totalSize = buf.getInt();
        final int sectionStart = buf.position();

        final int treeCount = buf.getInt1_2_4Bytes();
        for (int n = 0; n < treeCount; n++) {
            SpanTree tree = new SpanTree();
            StringFieldValue name = new StringFieldValue();
            name.deserialize(this);
            tree.setName(name.getString());
            // The tree is attached to the value before its contents are read.
            value.setSpanTree(tree);
            readSpanTree(tree, false);
        }

        // Jump to the end of the section regardless of how much was consumed above.
        buf.position(sectionStart + totalSize);
    } finally {
        // Always clear the per-string state, also when reading fails.
        stringPositions = null;
    }
}
Aggregations