Search in sources :

Example 1 with Span

use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.

the class LinguisticsAnnotator method addAnnotationSpan.

/**
 * Annotates the region of {@code input} covered by {@code token} on a span created under {@code parent}.
 *
 * <p>A token that is not a special token is either expanded into its components (each component is
 * handled by a recursive call and the token itself gets no annotation) or, when it has no components
 * and is not indexable, skipped entirely.
 *
 * <p>With {@link StemMode#ALL} one span is created and annotated with the original text, with the
 * replacement term when it differs from the original ignoring case, and with every stem that differs
 * from both ignoring case. With any other mode a single annotation for the replacement term is added,
 * and only while {@code termOccurrences} reports the term as below its limit.
 *
 * @param input           the full text the token offsets refer to
 * @param parent          the span list the new span is created on
 * @param tokenizer       used to look up the replacement term for the token string
 * @param token           the token to annotate
 * @param mode            the stemming mode
 * @param termOccurrences per-term bookkeeping consulted before annotating
 * @throws IllegalArgumentException if the token offset, or the offset plus the length of the token's
 *                                  original text, falls outside {@code input}
 */
private static void addAnnotationSpan(String input, SpanList parent, Tokenizer tokenizer, Token token, StemMode mode, TermOccurrences termOccurrences) {
    boolean special = token.isSpecialToken();
    if (!special && token.getNumComponents() > 0) {
        // Compound token: annotate the parts instead of the whole.
        for (int c = 0; c < token.getNumComponents(); c++) {
            addAnnotationSpan(input, parent, tokenizer, token.getComponent(c), mode, termOccurrences);
        }
        return;
    }
    if (!special && !token.isIndexable()) {
        return;
    }

    String orig = token.getOrig();
    int start = (int) token.getOffset();
    int inputLength = input.length();
    if (start >= inputLength) {
        throw new IllegalArgumentException("Token '" + orig + "' has offset " + start + ", which is outside the " + "bounds of the input string; " + input);
    }
    int length = orig.length();
    if (start + length > inputLength) {
        throw new IllegalArgumentException("Token '" + orig + "' has offset " + start + ", which makes it overflow " + "the bounds of the input string; " + input);
    }

    if (mode != StemMode.ALL) {
        // Single-annotation modes: only the (replaced) token string is indexed.
        String term = token.getTokenString();
        if (term != null) {
            term = tokenizer.getReplacementTerm(term);
        }
        if (term == null || term.trim().isEmpty()) {
            return;
        }
        if (termOccurrences.termCountBelowLimit(term)) {
            Span target = parent.span(start, length);
            target.annotate(lowerCaseTermAnnotation(term, token.getOrig()));
        }
        return;
    }

    // StemMode.ALL: original text, replacement term and all stems share one span.
    Span where = parent.span(start, length);
    String lowerOrig = toLowerCase(orig);
    addAnnotation(where, orig, orig, termOccurrences);

    String term = token.getTokenString();
    if (term != null) {
        term = tokenizer.getReplacementTerm(term);
    }
    // Without a usable term the original stands in, so the comparison below finds no difference.
    String lowerTerm = (term != null) ? toLowerCase(term) : lowerOrig;
    if (!lowerOrig.equals(lowerTerm)) {
        addAnnotation(where, term, orig, termOccurrences);
    }

    for (int s = 0; s < token.getNumStems(); s++) {
        String stem = token.getStem(s);
        String lowerStem = toLowerCase(stem);
        // Skip stems that would only repeat the original or the term.
        if (!lowerOrig.equals(lowerStem) && !lowerTerm.equals(lowerStem)) {
            addAnnotation(where, stem, orig, termOccurrences);
        }
    }
}
Also used: Span (com.yahoo.document.annotation.Span)

Example 2 with Span

use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.

the class StringTestCase method annotate.

/**
 * Builds a span tree named "testannotations", attaches it to the "body" field value of
 * {@code document} and returns the same document.
 *
 * <p>Along the way the method prints every registered annotation type key and the final number of
 * annotations to standard output.
 *
 * @param document the document whose "body" field receives the span tree
 *                 (NOTE(review): the code dereferences the fetched body value without a null
 *                 check, so it presumably expects "body" to be set already; confirm against callers)
 * @param manager  supplies the annotation type registry and the "annotation.*" struct data types
 * @return the {@code document} argument, after its "body" field has been re-set
 */
public Document annotate(Document document, DocumentTypeManager manager) {
    // Look up the four annotation types used below by name.
    AnnotationTypeRegistry registry = manager.getAnnotationTypeRegistry();
    AnnotationType company = registry.getType("company");
    AnnotationType industry = registry.getType("industry");
    AnnotationType person = registry.getType("person");
    AnnotationType location = registry.getType("location");
    // Print every registered type key and hand each type to parseAnnotationType.
    Map<String, AnnotationType> m = registry.getTypes();
    for (String key : m.keySet()) {
        System.out.println("Key: " + key);
        AnnotationType val = m.get(key);
        parseAnnotationType(val);
    }
    // Create the tree and put four spans directly under its root.
    SpanTree tree = new SpanTree("testannotations");
    SpanList root = (SpanList) tree.getRoot();
    SpanNode companySpan = new Span(0, 5);
    SpanNode industrySpan = new Span(5, 10);
    SpanNode personSpan = new Span(10, 15);
    SpanNode locationSpan = new Span(15, 20);
    root.add(companySpan);
    root.add(industrySpan);
    root.add(personSpan);
    root.add(locationSpan);
    // Company annotation on companySpan; its struct comes from the annotation type's own data type.
    Struct companyValue = (Struct) company.getDataType().createFieldValue();
    companyValue.setFieldValue("name", new StringFieldValue("Sun"));
    companyValue.setFieldValue("ceo", new StringFieldValue("Scott Mcnealy"));
    companyValue.setFieldValue("lat", new DoubleFieldValue(37.7));
    companyValue.setFieldValue("lon", new DoubleFieldValue(-122.44));
    companyValue.setFieldValue("vertical", new StringFieldValue("software"));
    Annotation compAn = new Annotation(company, companyValue);
    tree.annotate(companySpan, compAn);
    // Person annotation on personSpan. Note that industrySpan is never annotated in this method.
    Struct personValue = new Struct(manager.getDataType("annotation.person"));
    personValue.setFieldValue("name", new StringFieldValue("Richard Bair"));
    Annotation personAn = new Annotation(person, personValue);
    tree.annotate(personSpan, personAn);
    // locationSpan receives two separate location annotations.
    Struct locValue = new Struct(manager.getDataType("annotation.location"));
    locValue.setFieldValue("name", new StringFieldValue("Prinsens Gate"));
    Annotation loc = new Annotation(location, locValue);
    tree.annotate(locationSpan, loc);
    Struct locValue2 = new Struct(manager.getDataType("annotation.location"));
    locValue2.setFieldValue("name", new StringFieldValue("Kongens Gate"));
    Annotation locAn = new Annotation(location, locValue2);
    tree.annotate(locationSpan, locAn);
    // A separate branch with three spans; they are added in the order span1, span3, span2.
    SpanList branch = new SpanList();
    SpanNode span1 = new Span(0, 3);
    SpanNode span2 = new Span(1, 9);
    SpanNode span3 = new Span(12, 10);
    branch.add(span1);
    branch.add(span3);
    branch.add(span2);
    // Annotate the branch's spans (industry, person, location) while the branch is not yet under root.
    Struct industryValue = new Struct(manager.getDataType("annotation.industry"));
    industryValue.setFieldValue("vertical", new StringFieldValue("Manufacturing"));
    Annotation ind = new Annotation(industry, industryValue);
    tree.annotate(span1, ind);
    Struct pValue = new Struct(manager.getDataType("annotation.person"));
    pValue.setFieldValue("name", new StringFieldValue("Praveen Mohan"));
    Annotation pAn = new Annotation(person, pValue);
    tree.annotate(span2, pAn);
    Struct lValue = new Struct(manager.getDataType("annotation.location"));
    lValue.setFieldValue("name", new StringFieldValue("Embassy Golf Links"));
    Annotation locn = new Annotation(location, lValue);
    tree.annotate(span3, locn);
    // Company annotation on the branch list itself.
    Struct cValue = (Struct) company.getDataType().createFieldValue();
    cValue.setFieldValue("name", new StringFieldValue("Yahoo"));
    cValue.setFieldValue("ceo", new StringFieldValue("Carol Bartz"));
    cValue.setFieldValue("lat", new DoubleFieldValue(127.7));
    cValue.setFieldValue("lon", new DoubleFieldValue(-42.44));
    cValue.setFieldValue("vertical", new StringFieldValue("search"));
    Annotation cAn = new Annotation(company, cValue);
    tree.annotate(branch, cAn);
    // Person annotation on the root list.
    Struct pVal = new Struct(manager.getDataType("annotation.person"));
    pVal.setFieldValue("name", new StringFieldValue("Kim Omar"));
    Annotation an = new Annotation(person, pVal);
    tree.annotate(root, an);
    // The branch is attached to root and then detached again before cleanup() runs.
    // NOTE(review): presumably this exercises cleanup() dropping annotations whose nodes were
    // removed from the tree; confirm against SpanTree.cleanup().
    root.add(branch);
    StringFieldValue body = (StringFieldValue) document.getFieldValue(document.getDataType().getField("body"));
    root.remove(branch);
    tree.cleanup();
    System.out.println("No. Of Annotations: " + tree.numAnnotations());
    // Attach the tree to the body value and write the value back into the document.
    body.setSpanTree(tree);
    document.setFieldValue(document.getField("body"), body);
    return document;
}
Also used : SpanNode(com.yahoo.document.annotation.SpanNode) SpanList(com.yahoo.document.annotation.SpanList) Span(com.yahoo.document.annotation.Span) AnnotationTypeRegistry(com.yahoo.document.annotation.AnnotationTypeRegistry) AnnotationType(com.yahoo.document.annotation.AnnotationType) Annotation(com.yahoo.document.annotation.Annotation) SpanTree(com.yahoo.document.annotation.SpanTree)

Example 3 with Span

use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.

the class StringTestCase method testNestedSpanTreeBug4187377.

/**
 * Verifies that serializing a string whose span tree carries an annotation value that itself has a
 * span tree is rejected with a {@link SerializationException}.
 */
@Test
public void testNestedSpanTreeBug4187377() {
    AnnotationType annotationType = new AnnotationType("ann", DataType.STRING);

    // Outer string with its own tree and a single span under the root.
    StringFieldValue outerValue = new StringFieldValue("Ballooo");
    SpanTree treeOfOuter = new SpanTree("outer");
    outerValue.setSpanTree(treeOfOuter);
    SpanList rootOfOuter = (SpanList) treeOfOuter.getRoot();
    Span spanInOuter = new Span(0, 1);
    rootOfOuter.add(spanInOuter);

    // The outer span's annotation value is another string ...
    StringFieldValue innerValue = new StringFieldValue("innerBalloooo");
    treeOfOuter.annotate(spanInOuter, new Annotation(annotationType, innerValue));

    // ... which gets a span tree of its own, producing the nested structure.
    SpanTree treeOfInner = new SpanTree("inner");
    innerValue.setSpanTree(treeOfInner);
    SpanList rootOfInner = (SpanList) treeOfInner.getRoot();
    Span spanInInner = new Span(0, 1);
    rootOfInner.add(spanInInner);
    treeOfInner.annotate(spanInInner, new Annotation(annotationType));

    GrowableByteBuffer target = new GrowableByteBuffer(1024);
    DocumentSerializer out = DocumentSerializerFactory.create42(target);
    try {
        out.write(null, outerValue);
        fail("Should have failed, nested span trees are not supported.");
    } catch (SerializationException expected) {
        // Expected: the serializer refuses nested span trees.
    }
}
Also used : GrowableByteBuffer(com.yahoo.io.GrowableByteBuffer) SpanList(com.yahoo.document.annotation.SpanList) Span(com.yahoo.document.annotation.Span) AnnotationType(com.yahoo.document.annotation.AnnotationType) Annotation(com.yahoo.document.annotation.Annotation) SpanTree(com.yahoo.document.annotation.SpanTree) Test(org.junit.Test) AbstractTypesTest(com.yahoo.document.annotation.AbstractTypesTest)

Example 4 with Span

use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.

the class VespaDocumentDeserializer42 method readSpanTree.

/**
 * Reads a span tree from this deserializer into {@code tree}: optionally its name, then the root
 * node, then the annotations, which are attached to the tree once all of them have been read.
 *
 * @param tree     the tree to populate
 * @param readName whether a tree name precedes the root node in the input
 * @throws SerializationException if a span tree is already being read, i.e. the input nests span trees
 */
private void readSpanTree(SpanTree tree, boolean readName) {
    // The bookkeeping lists are only non-null while a tree is being read, so finding one here
    // means this call is nested inside another readSpanTree, which is not supported.
    boolean readInProgress = !(spanNodes == null && annotations == null);
    if (readInProgress) {
        throw new SerializationException("Deserialization of nested SpanTrees is not supported.");
    }

    // Fresh lists collecting the nodes and annotations of the tree about to be read.
    spanNodes = new ArrayList<SpanNode>();
    annotations = new ArrayList<Annotation>();
    try {
        if (readName) {
            StringFieldValue nameValue = new StringFieldValue();
            nameValue.deserialize(this);
            tree.setName(nameValue.getString());
        }

        tree.setRoot(readSpanNode());

        int annotationCount = buf.getInt1_2_4Bytes();
        // Create every Annotation instance up front, before any of them is populated.
        // NOTE(review): presumably so annotations can refer to one another by index while being read.
        for (int created = 0; created < annotationCount; created++) {
            annotations.add(new Annotation());
        }
        for (int index = 0; index < annotationCount; index++) {
            read(annotations.get(index));
        }
        for (Annotation annotation : annotations) {
            tree.annotate(annotation);
        }

        // Only plain Span nodes get their indexes corrected.
        for (SpanNode candidate : spanNodes) {
            if (candidate instanceof Span) {
                correctIndexes((Span) candidate);
            }
        }
    } finally {
        // Always reset, also on failure, so the next top-level call passes the guard above
        // and the lists do not linger.
        spanNodes = null;
        annotations = null;
    }
}
Also used : StringFieldValue(com.yahoo.document.datatypes.StringFieldValue) SpanNode(com.yahoo.document.annotation.SpanNode) Span(com.yahoo.document.annotation.Span) Annotation(com.yahoo.document.annotation.Annotation)

Example 5 with Span

use of com.yahoo.document.annotation.Span in project vespa by vespa-engine.

the class VespaDocumentDeserializer42 method readSpanNode.

/**
 * Reads the next span node from the buffer, choosing the concrete node class from the type byte at
 * the current position.
 *
 * <p>The node is recorded in {@code spanNodes} (when that list is set) before its content is read.
 *
 * @return the node that was read: a {@link Span}, {@link SpanList} or {@link AlternateSpanList}
 * @throws DeserializationException if the type byte matches none of the known node ids
 */
private SpanNode readSpanNode() {
    // Look at the type byte without consuming it: read it, then step the position back by one.
    byte type = buf.get();
    buf.position(buf.position() - 1);

    if ((type & Span.ID) == Span.ID) {
        Span span = new Span();
        if (spanNodes != null) {
            spanNodes.add(span);
        }
        read(span);
        return span;
    }
    if ((type & SpanList.ID) == SpanList.ID) {
        SpanList list = new SpanList();
        if (spanNodes != null) {
            spanNodes.add(list);
        }
        read(list);
        return list;
    }
    if ((type & AlternateSpanList.ID) == AlternateSpanList.ID) {
        AlternateSpanList alternates = new AlternateSpanList();
        if (spanNodes != null) {
            spanNodes.add(alternates);
        }
        read(alternates);
        return alternates;
    }
    throw new DeserializationException("Cannot read SpanNode of type " + type);
}
Also used : SpanNode(com.yahoo.document.annotation.SpanNode) AlternateSpanList(com.yahoo.document.annotation.AlternateSpanList) SpanList(com.yahoo.document.annotation.SpanList) Span(com.yahoo.document.annotation.Span)

Aggregations

Span (com.yahoo.document.annotation.Span): 5, Annotation (com.yahoo.document.annotation.Annotation): 3, SpanList (com.yahoo.document.annotation.SpanList): 3, SpanNode (com.yahoo.document.annotation.SpanNode): 3, AnnotationType (com.yahoo.document.annotation.AnnotationType): 2, SpanTree (com.yahoo.document.annotation.SpanTree): 2, AbstractTypesTest (com.yahoo.document.annotation.AbstractTypesTest): 1, AlternateSpanList (com.yahoo.document.annotation.AlternateSpanList): 1, AnnotationTypeRegistry (com.yahoo.document.annotation.AnnotationTypeRegistry): 1, StringFieldValue (com.yahoo.document.datatypes.StringFieldValue): 1, GrowableByteBuffer (com.yahoo.io.GrowableByteBuffer): 1, Test (org.junit.Test): 1