Search in sources :

Example 1 with Span

use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.

the class AnalyzedTextParser method parseSpan.

/**
 * Parses one entry of the JSON 'spans' array and adds the described span
 * (Sentence, Chunk or Token) to the given {@link AnalysedText}.
 * <p>
 * Entries that are not JSON objects, that lack a valid type/start/end, or
 * whose type is not handled here are logged at WARN level and ignored.
 *
 * @param at   the analysed text the parsed span is added to
 * @param node the JSON node expected to hold the span data
 * @throws IOException declared for the parsing helpers called from here
 */
private void parseSpan(AnalysedText at, JsonNode node) throws IOException {
    if (!node.isObject()) {
        log.warn("Unable to parse Span form JsonNode " + node + " (expected JSON object)!");
        return;
    }
    ObjectNode spanJson = (ObjectNode) node;
    // [0] = start, [1] = end; both start at -1 so that positions that were
    // not set by parseSpanData are detected by the check below
    int[] offsets = new int[] { -1, -1 };
    Collection<Entry<String, JsonNode>> annotationFields = new ArrayList<Entry<String, JsonNode>>(4);
    SpanTypeEnum type = parseSpanData(spanJson, offsets, annotationFields);
    boolean complete = type != null && offsets[0] >= 0 && offsets[1] >= 0;
    if (!complete) {
        log.warn("Illegal or missing span type, start and/or end position (ignored, json: " + spanJson);
        return;
    }
    // create the Span for the supported types; everything else is ignored
    final Span created;
    if (type == SpanTypeEnum.Text) {
        log.warn("Encounterd 'Text' span that is not the first span in the 'spans' array (ignored, json: "
                + node + ")");
        return;
    } else if (type == SpanTypeEnum.TextSection) {
        log.warn("Encountered 'TextSection' span. This SpanTypeEnum entry is currently unused. "
                + "If this is no longer the case please update this implementation (ignored, json: "
                + node + ")");
        return;
    } else if (type == SpanTypeEnum.Sentence) {
        created = at.addSentence(offsets[0], offsets[1]);
    } else if (type == SpanTypeEnum.Chunk) {
        created = at.addChunk(offsets[0], offsets[1]);
    } else if (type == SpanTypeEnum.Token) {
        created = at.addToken(offsets[0], offsets[1]);
    } else {
        log.warn("Unsupported SpanTypeEnum  '" + type + "'!. Please update this implementation (ignored, json: "
                + node + ")");
        return;
    }
    // annotations are only parsed when the span actually carries some
    if (!annotationFields.isEmpty()) {
        parseAnnotations(created, annotationFields);
    }
}
Also used : Entry(java.util.Map.Entry) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) ObjectNode(org.codehaus.jackson.node.ObjectNode) ArrayList(java.util.ArrayList) JsonNode(org.codehaus.jackson.JsonNode) SerializedString(org.codehaus.jackson.io.SerializedString) Span(org.apache.stanbol.enhancer.nlp.model.Span)

Example 2 with Span

use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.

the class DependencyRelationSupport method parse.

/**
 * Parses a {@link DependencyRelation} from its JSON representation.
 * <p>
 * The relation tag must be textual, the 'isDependent' field must be a
 * boolean and the partner type must be textual. A partner type equal to
 * {@code ROOT_TAG} yields a relation without a partner span; for
 * {@code Chunk} and {@code Token} partners the span is added to (or looked
 * up in) the given {@link AnalysedText} using the serialized start/end.
 *
 * @param jDependencyRelation the JSON object holding the serialized relation
 * @param at the analysed text the partner span is registered with
 * @return the parsed relation; its partner is {@code null} for ROOT
 *         relations and for partner span types other than Chunk/Token
 * @throws IllegalStateException if the tag or partner type is not textual,
 *         'isDependent' is not a boolean, or the grammatical relation
 *         ordinal is out of range
 * @throws IllegalArgumentException if the partner type is not the name of a
 *         {@link SpanTypeEnum} constant
 */
@Override
public DependencyRelation parse(ObjectNode jDependencyRelation, AnalysedText at) {
    JsonNode tag = jDependencyRelation.path(RELATION_TYPE_TAG);
    if (!tag.isTextual()) {
        throw new IllegalStateException("Unable to parse GrammaticalRelationTag. The value of the " + "'tag' field MUST have a textual value (json: " + jDependencyRelation + ")");
    }
    // The relation is serialized as its enum ordinal; validate the index so
    // malformed input is reported with context instead of a bare
    // ArrayIndexOutOfBoundsException.
    GrammaticalRelation[] relations = GrammaticalRelation.values();
    int relationOrdinal = jDependencyRelation.path(RELATION_STANBOL_TYPE_TAG).asInt();
    if (relationOrdinal < 0 || relationOrdinal >= relations.length) {
        throw new IllegalStateException("Unable to parse GrammaticalRelation. The ordinal " + relationOrdinal
                + " is out of range (json: " + jDependencyRelation + ")");
    }
    GrammaticalRelationTag gramRelTag = new GrammaticalRelationTag(tag.getTextValue(), relations[relationOrdinal]);
    JsonNode isDependent = jDependencyRelation.path(RELATION_IS_DEPENDENT_TAG);
    if (!isDependent.isBoolean()) {
        throw new IllegalStateException("Field 'isDependent' must have a true/false format");
    }
    // getTextValue() returns null for missing/non-textual nodes, which would
    // cause a NullPointerException below - reject such input explicitly.
    JsonNode partnerType = jDependencyRelation.path(RELATION_PARTNER_TYPE_TAG);
    if (!partnerType.isTextual()) {
        throw new IllegalStateException("Unable to parse the partner of the DependencyRelation. The value of the '"
                + RELATION_PARTNER_TYPE_TAG + "' field MUST have a textual value (json: "
                + jDependencyRelation + ")");
    }
    String typeString = partnerType.getTextValue();
    Span partnerSpan = null;
    if (!typeString.equals(ROOT_TAG)) {
        SpanTypeEnum spanType = SpanTypeEnum.valueOf(typeString);
        int spanStart = jDependencyRelation.path(RELATION_PARTNER_START_TAG).asInt();
        int spanEnd = jDependencyRelation.path(RELATION_PARTNER_END_TAG).asInt();
        switch(spanType) {
            case Chunk:
                partnerSpan = at.addChunk(spanStart, spanEnd);
                break;
            case Token:
                partnerSpan = at.addToken(spanStart, spanEnd);
                break;
            default:
                // other span types are not handled here: the partner stays null
                break;
        }
    }
    return new DependencyRelation(gramRelTag, isDependent.asBoolean(), partnerSpan);
}
Also used : SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) GrammaticalRelation(org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelation) JsonNode(org.codehaus.jackson.JsonNode) GrammaticalRelationTag(org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelationTag) Span(org.apache.stanbol.enhancer.nlp.model.Span) DependencyRelation(org.apache.stanbol.enhancer.nlp.dependency.DependencyRelation)

Example 3 with Span

use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.

the class DependencyRelationSupport method serialize.

/**
 * Serializes a {@link DependencyRelation} to a JSON object.
 * <p>
 * The object holds the relation tag, the ordinal of the grammatical
 * relation, the dependent flag and the partner span (type, start, end).
 * A relation without partner is written with {@code ROOT_TAG} as partner
 * type and 0 for both start and end.
 *
 * @param mapper   used to create the resulting JSON object
 * @param relation the relation to serialize
 * @return the JSON object representing the relation
 */
@Override
public ObjectNode serialize(ObjectMapper mapper, DependencyRelation relation) {
    final ObjectNode json = mapper.createObjectNode();
    final GrammaticalRelationTag relationTag = relation.getGrammaticalRelationTag();
    json.put(RELATION_TYPE_TAG, relationTag.getTag());
    json.put(RELATION_STANBOL_TYPE_TAG, relationTag.getGrammaticalRelation().ordinal());
    json.put(RELATION_IS_DEPENDENT_TAG, relation.isDependent());
    final Span partnerSpan = relation.getPartner();
    if (partnerSpan == null) {
        // no partner: mark the relation as pointing to the root
        json.put(RELATION_PARTNER_TYPE_TAG, ROOT_TAG);
        json.put(RELATION_PARTNER_START_TAG, 0);
        json.put(RELATION_PARTNER_END_TAG, 0);
    } else {
        json.put(RELATION_PARTNER_TYPE_TAG, partnerSpan.getType().toString());
        json.put(RELATION_PARTNER_START_TAG, partnerSpan.getStart());
        json.put(RELATION_PARTNER_END_TAG, partnerSpan.getEnd());
    }
    return json;
}
Also used : ObjectNode(org.codehaus.jackson.node.ObjectNode) GrammaticalRelationTag(org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelationTag) Span(org.apache.stanbol.enhancer.nlp.model.Span)

Example 4 with Span

use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.

the class AnalyzedTextSerializerAndParserTest method testSerialization.

/**
 * Round-trip test: serializes the test AnalysedText, checks that the JSON
 * contains a set of expected fragments, parses it back and verifies that
 * the parsed result equals the original - span by span, including the
 * annotation keys and values of every span.
 */
@Test
public void testSerialization() throws IOException {
    AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    serializer.serialize(analysedTextWithData, bout, null);
    byte[] data = bout.toByteArray();
    String serialized = new String(data, Charset.forName("UTF-8"));
    log.info(serialized);
    // fragments that are expected to be present in the serialized String
    String[] expectedFragments = new String[] {
        "\"spans\" : [ {",
        "\"type\" : \"Text\"",
        "\"type\" : \"Sentence\"",
        "\"type\" : \"Token\"",
        "\"stanbol.enhancer.nlp.pos\" : {",
        "\"class\" : \"org.apache.stanbol.enhancer.nlp.pos.PosTag\"",
        "\"stanbol.enhancer.nlp.ner\" : {",
        "\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\"",
        "\"stanbol.enhancer.nlp.morpho\" : {",
        "\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""
    };
    for (String fragment : expectedFragments) {
        Assert.assertTrue(serialized.contains(fragment));
    }
    // parse the serialized data back into a fresh AnalysedText
    AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
    AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null, atFactory.createAnalysedText(textBlob.getValue()));
    Assert.assertEquals(analysedTextWithData, parsedAt);
    // walk both span iterators in lock-step and compare spans and annotations
    Iterator<Span> expectedSpans = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    Iterator<Span> actualSpans = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    while (expectedSpans.hasNext() && actualSpans.hasNext()) {
        Span expected = expectedSpans.next();
        Span actual = actualSpans.next();
        Assert.assertEquals(expected, actual);
        Set<String> expectedKeys = expected.getKeys();
        Set<String> actualKeys = actual.getKeys();
        Assert.assertEquals(expectedKeys, actualKeys);
        for (String annotationKey : expectedKeys) {
            List<Value<?>> expectedValues = expected.getValues(annotationKey);
            List<Value<?>> actualValues = actual.getValues(annotationKey);
            Assert.assertEquals(expectedValues, actualValues);
        }
    }
    // neither side may have spans left over
    Assert.assertFalse("Original AnalyzedText MUST NOT have additional Spans", expectedSpans.hasNext());
    Assert.assertFalse("Parsed AnalyzedText MUST NOT have additional Spans", actualSpans.hasNext());
}
Also used : SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Span(org.apache.stanbol.enhancer.nlp.model.Span) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) ByteArrayInputStream(java.io.ByteArrayInputStream) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value) Test(org.junit.Test)

Example 5 with Span

use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.

the class SectionImpl method getEnclosed.

/**
 * Returns the spans of the given types enclosed by the requested sub-range
 * of this section.
 * <p>
 * Both offsets are relative to the start of this section. A negative
 * {@code startOffset} is treated as the start of the section and an
 * {@code endOffset} beyond the end of the section is clamped to the end.
 * An empty iterator is returned if the start offset is at or beyond the
 * length of the section, or if the resulting range is empty.
 *
 * @param types       the span types to include in the result
 * @param startOffset start of the range, relative to the section start
 * @param endOffset   end of the range, relative to the section start
 * @return an iterator over the matching spans within the range
 */
@Override
@SuppressWarnings("unchecked")
public Iterator<Span> getEnclosed(final Set<SpanTypeEnum> types, int startOffset, int endOffset) {
    if (startOffset >= (span[1] - span[0])) {
        // start is outside the span
        return Collections.<Span>emptySet().iterator();
    }
    int startIdx = startOffset < 0 ? span[0] : (span[0] + startOffset);
    // Use long arithmetic: span[0] + endOffset overflows int for large end
    // offsets (e.g. Integer.MAX_VALUE used as "up to the end"), which made
    // the range look empty instead of being clamped to the section end.
    long requestedEnd = (long) span[0] + endOffset;
    if (requestedEnd <= startIdx) {
        return Collections.<Span>emptySet().iterator();
    }
    // never read past the end of this section
    int endIdx = (int) Math.min(requestedEnd, (long) span[1]);
    return IteratorUtils.filteredIterator(getIterator(new SubSetHelperSpan(startIdx, endIdx)), new Predicate() {

        @Override
        public boolean evaluate(Object candidate) {
            // keep only spans of one of the requested types
            return types.contains(((Span) candidate).getType());
        }
    });
}
Also used : Span(org.apache.stanbol.enhancer.nlp.model.Span) Predicate(org.apache.commons.collections.Predicate) InstanceofPredicate(org.apache.commons.collections.functors.InstanceofPredicate)

Aggregations

Span (org.apache.stanbol.enhancer.nlp.model.Span)21 SpanTypeEnum (org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum)8 IRI (org.apache.clerezza.commons.rdf.IRI)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)5 HashSet (java.util.HashSet)4 ObjectNode (org.codehaus.jackson.node.ObjectNode)4 ArrayList (java.util.ArrayList)3 Graph (org.apache.clerezza.commons.rdf.Graph)3 Language (org.apache.clerezza.commons.rdf.Language)3 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)3 NounPhrase (org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase)3 CorefFeature (org.apache.stanbol.enhancer.nlp.coref.CorefFeature)3 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)3 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)3 PlaceAdjectival (org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival)2 GrammaticalRelationTag (org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelationTag)2 Token (org.apache.stanbol.enhancer.nlp.model.Token)2 Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value)2 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)2 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)2