Example 6 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class TestKuromojiNlpEngine, method testEngine:

@Test
public void testEngine() throws EngineException {
    LiteralFactory lf = LiteralFactory.getInstance();
    Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    engine.computeEnhancements(contentItem);
    //assert the results
    Map<IRI, RDFTerm> expected = new HashMap<IRI, RDFTerm>();
    expected.put(Properties.DC_CREATOR, lf.createTypedLiteral(engine.getClass().getName()));
    expected.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    Assert.assertEquals(16, EnhancementStructureHelper.validateAllTextAnnotations(contentItem.getMetadata(), text, expected));
    AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
    Assert.assertNotNull(at);
    List<Sentence> sentences = AnalysedTextUtils.asList(at.getSentences());
    Assert.assertNotNull(sentences);
    Assert.assertEquals(7, sentences.size());
    //TODO: the values in the following arrays are based on the first run of
    // the engine, so they can only detect changes in the results. They cannot
    // validate that the tokenization and NER detections are correct - sorry,
    // I do not speak Japanese ...
    int[] expectedChunks = new int[] { 5, 3, 1, 0, 1, 2, 4 };
    int[] expectedTokens = new int[] { 25, 25, 25, 24, 33, 17, 32 };
    int sentIndex = 0;
    for (Sentence sent : sentences) {
        List<Chunk> sentenceNer = AnalysedTextUtils.asList(sent.getChunks());
        Assert.assertEquals(expectedChunks[sentIndex], sentenceNer.size());
        for (Chunk chunk : sentenceNer) {
            Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            Assert.assertNotNull(nerValue);
            Assert.assertNotNull(nerValue.value().getType());
        }
        List<Token> tokens = AnalysedTextUtils.asList(sent.getTokens());
        Assert.assertEquals(expectedTokens[sentIndex], tokens.size());
        for (Token token : tokens) {
            Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            Assert.assertNotNull(posValue);
        }
        sentIndex++;
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) HashMap(java.util.HashMap) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) Test(org.junit.Test)
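
For reference, the consumer side of this API: once an engine has populated the AnalysedText content part, sentences, tokens, and their POS annotations can be read back as sketched below. This is a minimal sketch rather than project code; it assumes the ContentItem has already been enhanced, that PosTag#getTag() returns the raw tag string, and the usual import locations for NlpAnnotations and Value.

import java.util.Iterator;

import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;

public class AnalysedTextReader {

    /** Prints every token of every sentence together with its POS tag (if any). */
    public static void dumpPosTags(ContentItem ci) {
        AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
        if (at == null) {
            // no NLP engine has processed this content item yet
            return;
        }
        Iterator<Sentence> sentences = at.getSentences();
        while (sentences.hasNext()) {
            Sentence sentence = sentences.next();
            Iterator<Token> tokens = sentence.getTokens();
            while (tokens.hasNext()) {
                Token token = tokens.next();
                Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
                System.out.println(token.getSpan() + " -> "
                        + (pos == null ? "n/a" : pos.value().getTag()));
            }
        }
    }
}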

Example 7 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class CorefFeatureSupportTest, method initCorefAnnotations:

private static void initCorefAnnotations() {
    Sentence sentence1 = at.addSentence(0, sentenceText1.indexOf(".") + 1);
    Token obama = sentence1.addToken(0, "Obama".length());
    Sentence sentence2 = at.addSentence(sentenceText1.indexOf(".") + 2, sentenceText2.indexOf(".") + 1);
    int heStartIdx = sentence2.getSpan().indexOf("He");
    Token he = sentence2.addToken(heStartIdx, heStartIdx + "He".length());
    Set<Span> obamaMentions = new HashSet<Span>();
    obamaMentions.add(he);
    obama.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(true, obamaMentions)));
    Set<Span> heMentions = new HashSet<Span>();
    heMentions.add(obama);
    he.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(false, heMentions)));
}
Also used : CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Token(org.apache.stanbol.enhancer.nlp.model.Token) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) Span(org.apache.stanbol.enhancer.nlp.model.Span) HashSet(java.util.HashSet)
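
Reading such annotations back might look as follows. This is a sketch only: it assumes CorefFeature exposes isRepresentative() and getMentions() accessors matching the (representative, mentions) constructor arguments used above.

    // Sketch: resolve the coreference chain starting from a token.
    // Accessor names are assumptions based on the constructor used above.
    Value<CorefFeature> coref = obama.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    if (coref != null) {
        CorefFeature feature = coref.value();
        System.out.println(obama.getSpan()
                + (feature.isRepresentative() ? " (representative)" : "")
                + " is co-referent with:");
        for (Span mention : feature.getMentions()) {
            System.out.println("  " + mention.getSpan());
        }
    }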

Example 8 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class DependencyRelationSupportTest, method initDepTreeAnnotations:

private static void initDepTreeAnnotations() {
    Sentence sentence = at.addSentence(0, text.indexOf(".") + 1);
    Token obama = sentence.addToken(0, "Obama".length());
    int visitedStartIdx = sentence.getSpan().indexOf("visited");
    Token visited = sentence.addToken(visitedStartIdx, visitedStartIdx + "visited".length());
    int chinaStartIdx = sentence.getSpan().indexOf("China");
    Token china = sentence.addToken(chinaStartIdx, chinaStartIdx + "China".length());
    GrammaticalRelationTag nSubjGrammRelTag = new GrammaticalRelationTag("nsubj", GrammaticalRelation.NominalSubject);
    obama.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(nSubjGrammRelTag, true, visited)));
    GrammaticalRelationTag rootGrammRelTag = new GrammaticalRelationTag("root", GrammaticalRelation.Root);
    GrammaticalRelationTag dobjGrammRelTag = new GrammaticalRelationTag("dobj", GrammaticalRelation.DirectObject);
    visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(rootGrammRelTag, true, null)));
    visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(nSubjGrammRelTag, false, obama)));
    visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(dobjGrammRelTag, false, china)));
    china.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(dobjGrammRelTag, true, visited)));
}
Also used : Token(org.apache.stanbol.enhancer.nlp.model.Token) GrammaticalRelationTag(org.apache.stanbol.enhancer.nlp.dependency.GrammaticalRelationTag) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) DependencyRelation(org.apache.stanbol.enhancer.nlp.dependency.DependencyRelation)
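
Reading the tree back is symmetric, except that a token can carry several dependency annotations (visited above carries three), so a plural accessor is needed. The sketch below assumes a getAnnotations(..) method returning all values for an annotation key, DependencyRelation accessors named after its constructor arguments (getGrammaticalRelationTag(), isDependent(), getPartner()), and a getTag() accessor on GrammaticalRelationTag.

    // Sketch: print every dependency relation attached to a token.
    // Accessor names are assumptions derived from the constructor calls above.
    List<Value<DependencyRelation>> relations =
            visited.getAnnotations(NlpAnnotations.DEPENDENCY_ANNOTATION);
    for (Value<DependencyRelation> value : relations) {
        DependencyRelation relation = value.value();
        Token partner = relation.getPartner(); // null for the root relation
        System.out.println(visited.getSpan()
                + " --" + relation.getGrammaticalRelationTag().getTag() + "--> "
                + (partner == null ? "ROOT" : partner.getSpan())
                + (relation.isDependent() ? " [dependent]" : " [governor]"));
    }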

Example 9 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class SmartcnSentenceEngine, method computeEnhancements:

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method retrieves (or creates) the {@link org.apache.stanbol.enhancer.nlp.model.AnalysedText} content part
     * for the text/plain part of the content item and adds the detected sentence boundaries to it. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates a bug in the EnhancementJobManager implementation in use. " + "Please report this on dev@stanbol.apache.org or create a " + "JIRA issue about this.");
    }
    //first the sentences
    TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
    try {
        sentences.reset();
        while (sentences.incrementToken()) {
            OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
            Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
            if (log.isTraceEnabled()) {
                log.trace("detected {}:{}", s, s.getSpan());
            }
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from " + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) TokenStream(org.apache.lucene.analysis.TokenStream) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) SentenceTokenizer(org.apache.lucene.analysis.cn.smart.SentenceTokenizer) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
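
The same Lucene tokenizer can be exercised outside of Stanbol. Below is a self-contained sketch against the Lucene 4.x-era API matching the constructor used above (newer Lucene versions wire the Reader via setReader(..) instead); the sample text is arbitrary.

import java.io.IOException;

import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class SentenceOffsetDemo {

    public static void main(String[] args) throws IOException {
        String text = "今天天气很好。我们去公园散步吧。";
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(text));
        // each "token" emitted by the SentenceTokenizer is a whole sentence
        OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
        try {
            sentences.reset();
            while (sentences.incrementToken()) {
                System.out.println(offset.startOffset() + ".." + offset.endOffset()
                        + ": " + text.substring(offset.startOffset(), offset.endOffset()));
            }
            sentences.end();
        } finally {
            sentences.close();
        }
    }
}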

Example 10 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class SentimentSummarizationEngine, method writeSentimentEnhancements:

private void writeSentimentEnhancements(ContentItem ci, List<SentimentPhrase> sentimentPhrases, AnalysedText at, Language lang) {
    Graph metadata = ci.getMetadata();
    Sentence currentSentence = null;
    final List<SentimentPhrase> sentencePhrases = new ArrayList<SentimentPhrase>();
    for (SentimentPhrase sentPhrase : sentimentPhrases) {
        Sentence sentence = sentPhrase.getSentence();
        if (log.isDebugEnabled()) {
            //debug sentiment info
            CharSequence phraseText = at.getText().subSequence(sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
            log.debug("Write SentimentPhrase for {} (sentence: {})", phraseText, sentence == null ? "none" : sentence.getSpan().length() > 17 ? (sentence.getSpan().subSequence(0, 17) + "...") : sentence.getSpan());
            List<Sentiment> sentiments = sentPhrase.getSentiments();
            log.debug(" > {} Sentiments:", sentiments.size());
            for (int i = 0; i < sentiments.size(); i++) {
                log.debug("    {}. {}", i + 1, sentiments.get(i));
            }
        }
        if (writeSentimentPhrases) {
            IRI enh = createTextEnhancement(ci, this);
            String phraseText = at.getSpan().substring(sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
            metadata.add(new TripleImpl(enh, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(phraseText, lang)));
            if (sentPhrase.getSentence() == null) {
                metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(at.getSpan(), phraseText, sentPhrase.getStartIndex()), lang)));
            } else {
                metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(sentPhrase.getSentence().getSpan(), lang)));
            }
            metadata.add(new TripleImpl(enh, ENHANCER_START, lf.createTypedLiteral(sentPhrase.getStartIndex())));
            metadata.add(new TripleImpl(enh, ENHANCER_END, lf.createTypedLiteral(sentPhrase.getEndIndex())));
            if (sentPhrase.getPositiveSentiment() != null) {
                metadata.add(new TripleImpl(enh, POSITIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(sentPhrase.getPositiveSentiment())));
            }
            if (sentPhrase.getNegativeSentiment() != null) {
                metadata.add(new TripleImpl(enh, NEGATIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(sentPhrase.getNegativeSentiment())));
            }
            metadata.add(new TripleImpl(enh, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentPhrase.getSentiment())));
            //add the Sentiment type as well as the type of the SSO Ontology
            metadata.add(new TripleImpl(enh, DC_TYPE, SENTIMENT_TYPE));
            IRI ssoType = NIFHelper.SPAN_TYPE_TO_SSO_TYPE.get(SpanTypeEnum.Chunk);
            if (ssoType != null) {
                metadata.add(new TripleImpl(enh, DC_TYPE, ssoType));
            }
        }
        if (writeSentencesSentiment && sentence != null) {
            if (sentence.equals(currentSentence)) {
                sentencePhrases.add(sentPhrase);
            } else {
                writeSentiment(ci, currentSentence, sentencePhrases);
                //reset
                currentSentence = sentence;
                sentencePhrases.clear();
                sentencePhrases.add(sentPhrase);
            }
        }
    }
    if (!sentencePhrases.isEmpty()) {
        writeSentiment(ci, currentSentence, sentencePhrases);
    }
    if (writeDocumentSentiment) {
        writeSentiment(ci, at, sentimentPhrases);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) Graph(org.apache.clerezza.commons.rdf.Graph) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
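
Once written, the enhancement triples can be read back from the ContentItem metadata with plain Clerezza graph filtering. A minimal sketch, reusing the engine-internal SENTIMENT_PROPERTY IRI from above:

    // Sketch: list the sentiment value of every enhancement the engine added.
    // SENTIMENT_PROPERTY is the same engine-internal property IRI used above.
    Graph metadata = ci.getMetadata();
    Iterator<Triple> triples = metadata.filter(null, SENTIMENT_PROPERTY, null);
    while (triples.hasNext()) {
        Triple triple = triples.next();
        System.out.println(triple.getSubject() + " has sentiment " + triple.getObject());
    }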

Aggregations

Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence) 14
Token (org.apache.stanbol.enhancer.nlp.model.Token) 9
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText) 8
PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag) 6
IRI (org.apache.clerezza.commons.rdf.IRI) 5
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag) 5
NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) 5
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException) 5
IOException (java.io.IOException) 4
Graph (org.apache.clerezza.commons.rdf.Graph) 4
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) 4
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) 4
Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk) 4
ArrayList (java.util.ArrayList) 3
CharSequenceReader (org.apache.commons.io.input.CharSequenceReader) 3
TokenStream (org.apache.lucene.analysis.TokenStream) 3
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute) 3
Span (org.apache.stanbol.enhancer.nlp.model.Span) 3
SentenceDetector (opennlp.tools.sentdetect.SentenceDetector) 2
Language (org.apache.clerezza.commons.rdf.Language) 2