use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.
the class LinguisticsAnnotator method addAnnotationSpan.
/**
 * Adds term annotations for the given token to a span created under {@code parent}.
 *
 * <p>A non-special token that has components is never annotated itself; each of its
 * components is processed recursively instead. A non-special token without components
 * is skipped when it is not indexable.
 *
 * <p>With {@link StemMode#ALL} the original form, the (possibly replaced) token string and
 * every stem are annotated on one shared span, skipping forms whose lowercased value equals
 * one that is already covered. With any other mode, a single annotation is added for the
 * (possibly replaced) token string, provided it is non-blank and
 * {@code termOccurrences} reports the term count as below the limit.
 *
 * @param input           the string the token was produced from; used for bounds checking
 * @param parent          the span list that new spans are created in
 * @param tokenizer       supplies replacement terms for token strings
 * @param token           the token to annotate
 * @param mode            the stemming mode selecting which forms are annotated
 * @param termOccurrences occurrence bookkeeping passed on to the annotation helpers
 * @throws IllegalArgumentException if the token starts at or beyond the end of {@code input},
 *                                  or extends past the end of {@code input}
 */
private static void addAnnotationSpan(String input, SpanList parent, Tokenizer tokenizer, Token token, StemMode mode, TermOccurrences termOccurrences) {
    final boolean special = token.isSpecialToken();
    if (!special && token.getNumComponents() > 0) {
        // Compound token: annotate the parts, never the whole.
        for (int c = 0; c < token.getNumComponents(); ++c) {
            addAnnotationSpan(input, parent, tokenizer, token.getComponent(c), mode, termOccurrences);
        }
        return;
    }
    if (!special && !token.isIndexable()) {
        return;
    }

    final String original = token.getOrig();
    final int offset = (int) token.getOffset();
    if (offset >= input.length()) {
        throw new IllegalArgumentException("Token '" + original + "' has offset " + offset
                + ", which is outside the bounds of the input string; " + input);
    }
    final int length = original.length();
    if (offset + length > input.length()) {
        throw new IllegalArgumentException("Token '" + original + "' has offset " + offset
                + ", which makes it overflow the bounds of the input string; " + input);
    }

    if (mode != StemMode.ALL) {
        String singleTerm = token.getTokenString();
        if (singleTerm != null) {
            singleTerm = tokenizer.getReplacementTerm(singleTerm);
        }
        if (singleTerm == null || singleTerm.trim().isEmpty()) {
            return;
        }
        if (termOccurrences.termCountBelowLimit(singleTerm)) {
            parent.span(offset, length).annotate(lowerCaseTermAnnotation(singleTerm, token.getOrig()));
        }
        return;
    }

    // StemMode.ALL: every distinct form is annotated on the same span.
    final Span where = parent.span(offset, length);
    final String lowerOriginal = toLowerCase(original);
    addAnnotation(where, original, original, termOccurrences);

    String term = token.getTokenString();
    if (term != null) {
        term = tokenizer.getReplacementTerm(term);
    }
    // Without a term, fall back to the original so the comparisons below treat it as covered.
    final String lowerTerm = (term == null) ? lowerOriginal : toLowerCase(term);
    if (!lowerOriginal.equals(lowerTerm)) {
        addAnnotation(where, term, original, termOccurrences);
    }

    for (int s = 0; s < token.getNumStems(); s++) {
        final String stem = token.getStem(s);
        final String lowerStem = toLowerCase(stem);
        final boolean alreadyCovered = lowerOriginal.equals(lowerStem) || lowerTerm.equals(lowerStem);
        if (!alreadyCovered) {
            addAnnotation(where, stem, original, termOccurrences);
        }
    }
}
use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.
the class StringTestCase method annotate.
/**
 * Builds a span tree named "testannotations", populates it with company, person and
 * location annotations, and attaches it to the "body" field of the given document.
 *
 * The annotation types "company", "industry", "person" and "location" are looked up in the
 * manager's annotation type registry. All registered type names are also printed to stdout.
 *
 * @param document the document whose "body" string field receives the span tree
 * @param manager  the type manager supplying annotation types and struct data types
 * @return the same document instance, with the "body" field re-set after the tree is attached
 */
public Document annotate(Document document, DocumentTypeManager manager) {
AnnotationTypeRegistry registry = manager.getAnnotationTypeRegistry();
AnnotationType company = registry.getType("company");
AnnotationType industry = registry.getType("industry");
AnnotationType person = registry.getType("person");
AnnotationType location = registry.getType("location");
// Print every registered annotation type name and hand each type to parseAnnotationType.
Map<String, AnnotationType> m = registry.getTypes();
for (String key : m.keySet()) {
System.out.println("Key: " + key);
AnnotationType val = m.get(key);
parseAnnotationType(val);
}
SpanTree tree = new SpanTree("testannotations");
SpanList root = (SpanList) tree.getRoot();
// Four spans added directly under the root.
// NOTE(review): Span arguments are presumably (from, length) - confirm against Span.
SpanNode companySpan = new Span(0, 5);
SpanNode industrySpan = new Span(5, 10);
SpanNode personSpan = new Span(10, 15);
SpanNode locationSpan = new Span(15, 20);
root.add(companySpan);
root.add(industrySpan);
root.add(personSpan);
root.add(locationSpan);
// Company annotation on companySpan; the struct comes from the annotation type's own data type.
Struct companyValue = (Struct) company.getDataType().createFieldValue();
companyValue.setFieldValue("name", new StringFieldValue("Sun"));
companyValue.setFieldValue("ceo", new StringFieldValue("Scott Mcnealy"));
companyValue.setFieldValue("lat", new DoubleFieldValue(37.7));
companyValue.setFieldValue("lon", new DoubleFieldValue(-122.44));
companyValue.setFieldValue("vertical", new StringFieldValue("software"));
Annotation compAn = new Annotation(company, companyValue);
tree.annotate(companySpan, compAn);
// Person annotation on personSpan; here the struct type is looked up by name in the manager.
Struct personValue = new Struct(manager.getDataType("annotation.person"));
personValue.setFieldValue("name", new StringFieldValue("Richard Bair"));
Annotation personAn = new Annotation(person, personValue);
tree.annotate(personSpan, personAn);
// Two separate location annotations are placed on the same span (locationSpan).
Struct locValue = new Struct(manager.getDataType("annotation.location"));
locValue.setFieldValue("name", new StringFieldValue("Prinsens Gate"));
Annotation loc = new Annotation(location, locValue);
tree.annotate(locationSpan, loc);
Struct locValue2 = new Struct(manager.getDataType("annotation.location"));
locValue2.setFieldValue("name", new StringFieldValue("Kongens Gate"));
Annotation locAn = new Annotation(location, locValue2);
tree.annotate(locationSpan, locAn);
// A separate branch whose spans are annotated before the branch is attached to the root.
// Note that span3 is added to the branch before span2.
SpanList branch = new SpanList();
SpanNode span1 = new Span(0, 3);
SpanNode span2 = new Span(1, 9);
SpanNode span3 = new Span(12, 10);
branch.add(span1);
branch.add(span3);
branch.add(span2);
// The industry annotation goes on span1 in the branch; industrySpan above gets no annotation here.
Struct industryValue = new Struct(manager.getDataType("annotation.industry"));
industryValue.setFieldValue("vertical", new StringFieldValue("Manufacturing"));
Annotation ind = new Annotation(industry, industryValue);
tree.annotate(span1, ind);
Struct pValue = new Struct(manager.getDataType("annotation.person"));
pValue.setFieldValue("name", new StringFieldValue("Praveen Mohan"));
Annotation pAn = new Annotation(person, pValue);
tree.annotate(span2, pAn);
Struct lValue = new Struct(manager.getDataType("annotation.location"));
lValue.setFieldValue("name", new StringFieldValue("Embassy Golf Links"));
Annotation locn = new Annotation(location, lValue);
tree.annotate(span3, locn);
// Annotation on the branch list itself rather than on one of its spans.
Struct cValue = (Struct) company.getDataType().createFieldValue();
cValue.setFieldValue("name", new StringFieldValue("Yahoo"));
cValue.setFieldValue("ceo", new StringFieldValue("Carol Bartz"));
cValue.setFieldValue("lat", new DoubleFieldValue(127.7));
cValue.setFieldValue("lon", new DoubleFieldValue(-42.44));
cValue.setFieldValue("vertical", new StringFieldValue("search"));
Annotation cAn = new Annotation(company, cValue);
tree.annotate(branch, cAn);
// Annotation on the root node.
Struct pVal = new Struct(manager.getDataType("annotation.person"));
pVal.setFieldValue("name", new StringFieldValue("Kim Omar"));
Annotation an = new Annotation(person, pVal);
tree.annotate(root, an);
// The branch is attached to the root and then detached again before cleanup() runs.
// NOTE(review): presumably this exercises cleanup() dropping annotations whose nodes are
// no longer part of the tree - confirm against SpanTree.cleanup().
root.add(branch);
StringFieldValue body = (StringFieldValue) document.getFieldValue(document.getDataType().getField("body"));
root.remove(branch);
tree.cleanup();
System.out.println("No. Of Annotations: " + tree.numAnnotations());
// Attach the tree to the body value and write the value back to the document.
body.setSpanTree(tree);
document.setFieldValue(document.getField("body"), body);
return document;
}
use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.
the class StringTestCase method testNestedSpanTreeBug4187377.
/**
 * Verifies that serializing a string whose span tree carries an annotation value that
 * itself has a span tree is rejected with a {@link SerializationException}.
 */
@Test
public void testNestedSpanTreeBug4187377() {
    AnnotationType annotationType = new AnnotationType("ann", DataType.STRING);

    // Outer string with a tree holding a single span.
    StringFieldValue outerValue = new StringFieldValue("Ballooo");
    SpanTree outer = new SpanTree("outer");
    outerValue.setSpanTree(outer);
    Span outerSpan = new Span(0, 1);
    ((SpanList) outer.getRoot()).add(outerSpan);

    // The annotation on the outer span carries a string value ...
    StringFieldValue innerValue = new StringFieldValue("innerBalloooo");
    outer.annotate(outerSpan, new Annotation(annotationType, innerValue));

    // ... which in turn gets a span tree of its own, making the trees nested.
    SpanTree inner = new SpanTree("inner");
    innerValue.setSpanTree(inner);
    Span innerSpan = new Span(0, 1);
    ((SpanList) inner.getRoot()).add(innerSpan);
    inner.annotate(innerSpan, new Annotation(annotationType));

    DocumentSerializer serializer = DocumentSerializerFactory.create42(new GrowableByteBuffer(1024));

    boolean rejected = false;
    try {
        serializer.write(null, outerValue);
    } catch (SerializationException expected) {
        // This is the outcome the test requires.
        rejected = true;
    }
    if (!rejected) {
        fail("Should have failed, nested span trees are not supported.");
    }
}
use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.
the class VespaDocumentDeserializer42 method readSpanTree.
/**
 * Reads a span tree from the buffer into {@code tree}: optionally its name, then the root
 * span node, then the annotations, which are added to the tree once all have been read.
 *
 * <p>The {@code spanNodes} and {@code annotations} fields hold per-tree scratch state for
 * the duration of the call and are always reset to {@code null} on exit.
 *
 * @param tree     the tree to populate
 * @param readName whether a serialized tree name precedes the root node and should be read
 * @throws SerializationException if a span tree is already being read, i.e. the tree is nested
 */
private void readSpanTree(SpanTree tree, boolean readName) {
    // Non-null scratch lists mean another tree is being read; nesting is not supported.
    final boolean alreadyReading = spanNodes != null || annotations != null;
    if (alreadyReading) {
        throw new SerializationException("Deserialization of nested SpanTrees is not supported.");
    }

    // Fresh scratch lists for the tree being read.
    spanNodes = new ArrayList<SpanNode>();
    annotations = new ArrayList<Annotation>();
    try {
        if (readName) {
            StringFieldValue nameValue = new StringFieldValue();
            nameValue.deserialize(this);
            tree.setName(nameValue.getString());
        }

        tree.setRoot(readSpanNode());

        final int annotationCount = buf.getInt1_2_4Bytes();
        // Create all annotation instances up front, before any of them is populated.
        for (int remaining = annotationCount; remaining > 0; remaining--) {
            annotations.add(new Annotation());
        }
        for (int index = 0; index < annotationCount; index++) {
            read(annotations.get(index));
        }
        for (Annotation annotation : annotations) {
            tree.annotate(annotation);
        }

        for (SpanNode candidate : spanNodes) {
            if (!(candidate instanceof Span)) {
                continue;
            }
            correctIndexes((Span) candidate);
        }
    } finally {
        // Release the scratch state so memory is freed and the nested-read check starts clean.
        spanNodes = null;
        annotations = null;
    }
}
use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.
the class VespaDocumentDeserializer42 method readSpanNode.
/**
 * Reads the next span node from the buffer.
 *
 * <p>The type byte is peeked (read, then the buffer position is stepped back by one) and
 * tested against the ID bits of {@link Span}, {@link SpanList} and {@link AlternateSpanList},
 * in that order. The new node is appended to {@code spanNodes}, when that list is set,
 * before its contents are read.
 *
 * @return the node that was read
 * @throws DeserializationException if the type byte matches none of the known node IDs
 */
private SpanNode readSpanNode() {
    // Peek at the type byte without consuming it.
    final byte typeTag = buf.get();
    buf.position(buf.position() - 1);

    if ((typeTag & Span.ID) == Span.ID) {
        Span span = new Span();
        if (spanNodes != null) {
            spanNodes.add(span);
        }
        read(span);
        return span;
    }

    if ((typeTag & SpanList.ID) == SpanList.ID) {
        SpanList list = new SpanList();
        if (spanNodes != null) {
            spanNodes.add(list);
        }
        read(list);
        return list;
    }

    if ((typeTag & AlternateSpanList.ID) == AlternateSpanList.ID) {
        AlternateSpanList alternates = new AlternateSpanList();
        if (spanNodes != null) {
            spanNodes.add(alternates);
        }
        read(alternates);
        return alternates;
    }

    throw new DeserializationException("Cannot read SpanNode of type " + typeTag);
}
Aggregations