Search in sources:

Example 1 with SpanTypeEnum

use of org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum in project stanbol by apache.

The class Nlp2RdfMetadataEngine, method computeEnhancements.

/**
 * Writes the NLP analysis results stored in the {@link AnalysedText} content
 * part of the parsed {@link ContentItem} as RDF triples to the item's
 * metadata graph. Sentences, chunks (phrases) and tokens (words) are written
 * together with the relations between them (next sentence, first/last word,
 * parent phrase, next/previous word) using the String/SSO ontologies.
 *
 * @param ci the content item to enhance
 * @throws EngineException propagated from the NLP helper utilities
 *         (presumably when the required AnalysedText part is missing,
 *         signalled by the {@code true} flag — TODO confirm against
 *         {@code NlpEngineHelper.getAnalysedText})
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    // language is only used to tag literals created for spans; may stay null
    Language language = lang == null ? null : new Language(lang);
    //now iterate over the AnalysedText data and create the RDF representation
    //TODO: make configureable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    // collect the span types that should be serialised to RDF
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    // writing triples requires the content item's write lock
    ci.getLock().writeLock().lock();
    try {
        // spans are returned by position, so sentence/phrase/word track the
        // most recently seen span of each type while iterating
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            //TODO: filter Spans based on additional requirements
            //(1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            //(2) add the relations between the different spans
            switch(span.getType()) {
                case Sentence:
                    // link the previous sentence to this one
                    if (sentence != null) {
                        metadata.add(new TripleImpl(sentence, SsoOntology.nextSentence.getUri(), current));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, StringOntology.superString.getUri(), sentence));
                        // NOTE(review): this writes {word sso:lastWord sentence},
                        // whereas the NIF 2.0 variant of this engine writes
                        // {sentence lastWord word} when a NEW sentence starts.
                        // Confirm subject/object order and the placement of this
                        // statement against the SSO ontology definition.
                        if (word != null) {
                            metadata.add(new TripleImpl(word, SsoOntology.lastWord.getUri(), sentence));
                        }
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        // attach the token to its containing sentence
                        metadata.add(new TripleImpl(current, SsoOntology.sentence.getUri(), sentence));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(current, SsoOntology.firstWord.getUri(), sentence));
                            firstWordInSentence = false;
                        }
                    }
                    // NOTE(review): 'phrase' is never reset, so a token after the
                    // last chunk of a sentence is still linked to that chunk —
                    // confirm this is intended.
                    if (phrase != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.parent.getUri(), phrase));
                    }
                    if (word != null) {
                        // doubly link consecutive words
                        metadata.add(new TripleImpl(word, SsoOntology.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, SsoOntology.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    // Text / TextSection spans are not in activeTypes
                    break;
            }
            //(3) add specific information such as POS, chunk type ...
            writePos(metadata, span, current);
            writePhrase(metadata, span, current);
            //OlIA does not include Sentiments
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) NIFHelper.writeSpan(org.apache.stanbol.enhancer.nlp.utils.NIFHelper.writeSpan) Span(org.apache.stanbol.enhancer.nlp.model.Span) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 2 with SpanTypeEnum

use of org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum in project stanbol by apache.

The class AnalyzedTextParser, method parseSpan.

/**
 * Parses a single span (sentence, chunk or token) from its JSON
 * representation and registers it with the parsed {@link AnalysedText}.
 * Unparseable or unsupported spans are logged and skipped.
 *
 * @param at the analysed text the span is added to
 * @param node the JSON node expected to hold the span data
 * @throws IOException propagated from annotation parsing
 */
private void parseSpan(AnalysedText at, JsonNode node) throws IOException {
    // spans must be serialised as JSON objects
    if (!node.isObject()) {
        log.warn("Unable to parse Span form JsonNode " + node + " (expected JSON object)!");
        return;
    }
    ObjectNode spanNode = (ObjectNode) node;
    // offsets[0]/offsets[1] receive the start/end position (-1 = unset)
    int[] offsets = new int[] { -1, -1 };
    Collection<Entry<String, JsonNode>> annotationEntries = new ArrayList<Entry<String, JsonNode>>(4);
    SpanTypeEnum type = parseSpanData(spanNode, offsets, annotationEntries);
    if (type == null || offsets[0] < 0 || offsets[1] < 0) {
        log.warn("Illegal or missing span type, start and/or end position (ignored, json: " + spanNode);
        return;
    }
    //now create the Span
    Span createdSpan;
    switch (type) {
        case Sentence:
            createdSpan = at.addSentence(offsets[0], offsets[1]);
            break;
        case Chunk:
            createdSpan = at.addChunk(offsets[0], offsets[1]);
            break;
        case Token:
            createdSpan = at.addToken(offsets[0], offsets[1]);
            break;
        case Text:
            // the 'Text' span is only valid as the first entry of the array
            log.warn("Encounterd 'Text' span that is not the first span in the " + "'spans' array (ignored, json: " + node + ")");
            return;
        case TextSection:
            log.warn("Encountered 'TextSection' span. This SpanTypeEnum entry " + "is currently unused. If this is no longer the case please " + "update this implementation (ignored, json: " + node + ")");
            return;
        default:
            log.warn("Unsupported SpanTypeEnum  '" + type + "'!. Please " + "update this implementation (ignored, json: " + node + ")");
            return;
    }
    if (!annotationEntries.isEmpty()) {
        parseAnnotations(createdSpan, annotationEntries);
    }
}
Also used : Entry(java.util.Map.Entry) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) ObjectNode(org.codehaus.jackson.node.ObjectNode) ArrayList(java.util.ArrayList) JsonNode(org.codehaus.jackson.JsonNode) SerializedString(org.codehaus.jackson.io.SerializedString) Span(org.apache.stanbol.enhancer.nlp.model.Span)

Example 3 with SpanTypeEnum

use of org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum in project stanbol by apache.

The class DependencyRelationSupport, method parse.

@Override
public DependencyRelation parse(ObjectNode jDependencyRelation, AnalysedText at) {
    JsonNode tag = jDependencyRelation.path(RELATION_TYPE_TAG);
    if (!tag.isTextual()) {
        throw new IllegalStateException("Unable to parse GrammaticalRelationTag. The value of the " + "'tag' field MUST have a textual value (json: " + jDependencyRelation + ")");
    }
    GrammaticalRelation grammaticalRelation = GrammaticalRelation.class.getEnumConstants()[jDependencyRelation.path(RELATION_STANBOL_TYPE_TAG).asInt()];
    GrammaticalRelationTag gramRelTag = new GrammaticalRelationTag(tag.getTextValue(), grammaticalRelation);
    JsonNode isDependent = jDependencyRelation.path(RELATION_IS_DEPENDENT_TAG);
    if (!isDependent.isBoolean()) {
        throw new IllegalStateException("Field 'isDependent' must have a true/false format");
    }
    Span partnerSpan = null;
    String typeString = jDependencyRelation.path(RELATION_PARTNER_TYPE_TAG).getTextValue();
    if (!typeString.equals(ROOT_TAG)) {
        SpanTypeEnum spanType = SpanTypeEnum.valueOf(jDependencyRelation.path(RELATION_PARTNER_TYPE_TAG).getTextValue());
        int spanStart = jDependencyRelation.path(RELATION_PARTNER_START_TAG).asInt();
        int spanEnd = jDependencyRelation.path(RELATION_PARTNER_END_TAG).asInt();
        switch(spanType) {
            case Chunk:
                partnerSpan = at.addChunk(spanStart, spanEnd);
                break;
            // break;
            case Token:
                partnerSpan = at.addToken(spanStart, spanEnd);
                break;
        }
    }
    return new DependencyRelation(gramRelTag, isDependent.asBoolean(), partnerSpan);
}
Also used : SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) GrammaticalRelation(org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelation) JsonNode(org.codehaus.jackson.JsonNode) GrammaticalRelationTag(org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelationTag) Span(org.apache.stanbol.enhancer.nlp.model.Span) DependencyRelation(org.apache.stanbol.enhancer.nlp.dependency.DependencyRelation)

Example 4 with SpanTypeEnum

use of org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum in project stanbol by apache.

The class Nif20MetadataEngine, method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    //now iterate over the AnalysedText data and create the RDF representation
    //TODO: make configureable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        //write the context
        IRI text = writeSpan(metadata, base, at, language, at);
        metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), ci.getUri()));
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            //TODO: filter Spans based on additional requirements
            //(1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            //write the context
            metadata.add(new TripleImpl(current, Nif20.referenceContext.getUri(), text));
            //(2) add the relations between the different spans
            switch(span.getType()) {
                case Sentence:
                    if (sentence != null && writePrevNext) {
                        metadata.add(new TripleImpl(sentence, Nif20.nextSentence.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousSentence.getUri(), sentence));
                    }
                    if (word != null) {
                        metadata.add(new TripleImpl(sentence, Nif20.lastWord.getUri(), word));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null && writeHierary) {
                        metadata.add(new TripleImpl(current, Nif20.superString.getUri(), sentence));
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        if (writeHierary) {
                            metadata.add(new TripleImpl(current, Nif20.sentence.getUri(), sentence));
                        }
                        //metadata.add(new TripleImpl(sentence, Nif20.word.getUri(), current));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(sentence, Nif20.firstWord.getUri(), current));
                            firstWordInSentence = false;
                        }
                    }
                    if (writeHierary && phrase != null && !phrase.equals(current)) {
                        metadata.add(new TripleImpl(current, Nif20.subString.getUri(), phrase));
                    }
                    if (word != null && writePrevNext) {
                        metadata.add(new TripleImpl(word, Nif20.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            //(3) add specific information such as POS, chunk type ...
            Nif20Helper.writePhrase(metadata, span, current);
            Nif20Helper.writePos(metadata, span, current);
            //TODO: sentiment support
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) Span(org.apache.stanbol.enhancer.nlp.model.Span) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 5 with SpanTypeEnum

use of org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum in project stanbol by apache.

The class AnalyzedTextParser, method parseAnalyzedTextSpan.

/**
 * Parses the top-level 'Text' span of a serialised AnalyzedText, validates
 * it against the local text and applies its annotations.
 *
 * @param node the JSON node expected to hold the AnalyzedText span
 * @param at the analysed text the annotations are applied to
 * @throws IOException if the node is no JSON object, is not a 'Text' span
 *         covering [0, end], or its end does not match the local text size
 */
private void parseAnalyzedTextSpan(JsonNode node, AnalysedText at) throws IOException {
    // the AnalyzedText span must be serialised as a JSON object
    if (!node.isObject()) {
        throw new IOException("Unable to parse AnalyzedText span form JsonNode " + node + " (expected JSON object)!");
    }
    ObjectNode spanNode = (ObjectNode) node;
    // offsets[0]/offsets[1] receive the start/end position (-1 = unset)
    int[] offsets = new int[] { -1, -1 };
    Collection<Entry<String, JsonNode>> annotationEntries = new ArrayList<Entry<String, JsonNode>>(4);
    SpanTypeEnum type = parseSpanData(spanNode, offsets, annotationEntries);
    // the top-level span must be of type 'Text' and cover [0, end]
    if (type != SpanTypeEnum.Text || offsets[0] != 0 || offsets[1] < 0) {
        throw new IOException("The AnalyzedText span MUST have the SpanType 'text', a " + "start position of '0' and an end position (ignored, json: " + spanNode);
    }
    // the serialised span must match the size of the local text
    if (at.getEnd() != offsets[1]) {
        throw new IOException("The size of the local text '" + at.getEnd() + "' does not " + "match the span of the parsed AnalyzedText [" + offsets[0] + "," + offsets[1] + "]!");
    }
    parseAnnotations(at, annotationEntries);
}
Also used : Entry(java.util.Map.Entry) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) ObjectNode(org.codehaus.jackson.node.ObjectNode) ArrayList(java.util.ArrayList) JsonNode(org.codehaus.jackson.JsonNode) SerializedString(org.codehaus.jackson.io.SerializedString) IOException(java.io.IOException)

Aggregations

SpanTypeEnum (org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum)6 Span (org.apache.stanbol.enhancer.nlp.model.Span)5 JsonNode (org.codehaus.jackson.JsonNode)4 ObjectNode (org.codehaus.jackson.node.ObjectNode)3 ArrayList (java.util.ArrayList)2 Entry (java.util.Map.Entry)2 Graph (org.apache.clerezza.commons.rdf.Graph)2 IRI (org.apache.clerezza.commons.rdf.IRI)2 Language (org.apache.clerezza.commons.rdf.Language)2 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)2 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)2 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)2 SerializedString (org.codehaus.jackson.io.SerializedString)2 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 CorefFeature (org.apache.stanbol.enhancer.nlp.coref.CorefFeature)1 DependencyRelation (org.apache.stanbol.enhancer.nlp.dependency.DependencyRelation)1 GrammaticalRelation (org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelation)1 GrammaticalRelationTag (org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelationTag)1 NIFHelper.writeSpan (org.apache.stanbol.enhancer.nlp.utils.NIFHelper.writeSpan)1