Example 1 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class SmartcnTokenizerEngine, method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method obtains the {@link org.apache.stanbol.enhancer.nlp.model.AnalysedText} content part of the
 * ContentItem and adds the Sentence and Token annotations detected by the Smartcn analyzers to it.
 * The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates a bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@stanbol.apache.org mailing list or " + "create a JIRA issue about this.");
    }
    if (!at.getSentences().hasNext()) {
        // no Sentence annotations present ... use the Smartcn
        // SentenceTokenizer to detect the sentences first
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            // Lucene TokenStreams must be reset before the first incrementToken()
            sentences.reset();
            while (sentences.incrementToken()) {
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
                }
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from " + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
            log.error(message, e);
            throw new EngineException(this, ci, message, e);
        }
    }
    // now the tokens
    TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
    try {
        tokens.reset();
        while (tokens.incrementToken()) {
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            Token t = at.addToken(offset.startOffset(), offset.endOffset());
            log.trace("detected {}", t);
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from " + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException) WordTokenFilter(org.apache.lucene.analysis.cn.smart.WordTokenFilter) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) SentenceTokenizer(org.apache.lucene.analysis.cn.smart.SentenceTokenizer) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
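
For context, here is a minimal sketch of how a caller could read back the results this engine writes into the AnalysedText content part. It only uses accessors that appear in the examples on this page (getSentences(), getTokens(), getSpan(), getStart(), getEnd()); the at variable stands for the same AnalysedText as above.

// Minimal consumer sketch: iterate the detected sentences and their tokens.
Iterator<Sentence> sentences = at.getSentences();
while (sentences.hasNext()) {
    Sentence sentence = sentences.next();
    System.out.println("Sentence [" + sentence.getStart() + ".." + sentence.getEnd() + "]: " + sentence.getSpan());
    Iterator<Token> tokens = sentence.getTokens();
    while (tokens.hasNext()) {
        // each Token covers a start/end offset range of the underlying text
        System.out.println("  Token: " + tokens.next().getSpan());
    }
}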

Example 2 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class AnalyzedTextSerializerAndParserTest, method setup.

@BeforeClass
public static final void setup() throws IOException {
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();
    int sentenceEnd = text.indexOf('.') + 1;
    Sentence sent1 = analysedTextWithData.addSentence(0, sentenceEnd);
    expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley.");
    Token the = sent1.addToken(0, 3);
    expectedTokens.put(the, "The");
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
    Token stanbol = sent1.addToken(4, 11);
    expectedTokens.put(stanbol, "Stanbol");
    stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
    // use index to create Tokens
    int enhancerStart = sent1.getSpan().indexOf("enhancer");
    Token enhancer = sent1.addToken(enhancerStart, enhancerStart + "enhancer".length());
    expectedTokens.put(enhancer, "enhancer");
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
    MorphoFeatures morpho = new MorphoFeatures("enhance");
    morpho.addCase(new CaseTag("test-case-1", Case.Comitative));
    morpho.addCase(new CaseTag("test-case-2", Case.Abessive));
    morpho.addDefinitness(Definitness.Definite);
    morpho.addPerson(Person.First);
    morpho.addPos(new PosTag("PN", Pos.ProperNoun));
    morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
    morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    morpho.addTense(new TenseTag("test-tense", Tense.Present));
    morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
    // create a chunk
    Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
    expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
    stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
    stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
Also used : CaseTag(org.apache.stanbol.enhancer.nlp.morpho.CaseTag) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) Token(org.apache.stanbol.enhancer.nlp.model.Token) VerbMoodTag(org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) NumberTag(org.apache.stanbol.enhancer.nlp.morpho.NumberTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TenseTag(org.apache.stanbol.enhancer.nlp.morpho.TenseTag) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) GenderTag(org.apache.stanbol.enhancer.nlp.morpho.GenderTag) BeforeClass(org.junit.BeforeClass)
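
To show what this fixture produces, here is a hedged sketch of reading one of the annotations back. getAnnotation(..) and Value#value() appear in Example 5 below; Value#probability() is an assumption about the API and is expected to return the confidence passed to Value.value(tag, 0.95) above.

// Read the POS annotation added to the `stanbol` token in setup().
Value<PosTag> posValue = stanbol.getAnnotation(NlpAnnotations.POS_ANNOTATION);
if (posValue != null) {
    PosTag tag = posValue.value();
    // probability() is assumed here; it should yield the 0.95 set above
    System.out.println(tag.getTag() + " @ " + posValue.probability());
}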

Example 3 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class OpenNlpPosTaggingEngine, method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    POSTagger posTagger = getPOSTagger(language);
    if (posTagger == null) {
        // the POS tagger was present when canEnhance(..) was called, but is no
        // longer available; this can happen if the model is removed between
        // the call to canEnhance and computeEnhancements
        throw new EngineException("PosTagger for language '" + language + "' is not available.");
    }
    TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No POS TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        // for now only created to avoid checks for tagSet == null
        // TODO: in future we might want to automatically create posModels based
        // on tagged texts. However this makes no sense as long we can not
        // persist TagSets.
        tagSet = new TagSet<PosTag>("dummy", language);
    }
    // holds PosTags created for POS tags that were not part of the posModel
    // (will hold all PosTags in case no TagSet is registered for the language)
    Map<String, PosTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PosTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    // (1) Sentence detection
    // Try to read existing Sentence Annotations
    Iterator<Sentence> sentences = at.getSentences();
    List<Section> sentenceList;
    if (!sentences.hasNext()) {
        // none present, so try to detect sentences
        log.trace(" > detect sentences for {}", at);
        sentenceList = detectSentences(at, language);
    }
    // the Iterator returned by getSentences() also reflects Sentences added by
    // detectSentences(..) above, so this covers both pre-existing and newly
    // detected sentences
    if (sentences.hasNext()) {
        log.trace(" > use existing Sentence annotations for {}", at);
        sentenceList = new ArrayList<Section>();
        AnalysedTextUtils.appandToList(sentences, sentenceList);
    } else {
        // no sentence detected ... treat the whole text as a single sentence
        // TODO: maybe apply here a limit to the text size!
        log.trace(" > unable to detect Sentences for {} (langauge: {})", at, language);
        sentenceList = Collections.singletonList((Section) at);
    }
    // for all sentences (or the whole Text - if no sentences available)
    for (Section sentence : sentenceList) {
        // (2) Tokenize Sentences
        List<Token> tokenList;
        // check if there are already tokens
        Iterator<Token> tokens = sentence.getTokens();
        if (!tokens.hasNext()) {
            // no tokens present -> tokenize
            log.trace(" > tokenize {}", sentence);
            tokenList = tokenize(sentence, language);
        } else {
            // use existing
            log.trace(" > use existing Tokens for {}", sentence);
            // ensure an ArrayList is used
            tokenList = new ArrayList<Token>();
            AnalysedTextUtils.appandToList(tokens, tokenList);
        }
        // (3) POS Tagging
        posTag(tokenList, posTagger, tagSet, adhocTags, language);
    }
    if (log.isTraceEnabled()) {
        logAnnotations(at);
    }
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) POSTagger(opennlp.tools.postag.POSTagger) Section(org.apache.stanbol.enhancer.nlp.model.Section) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
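
The adhocTags map above caches PosTag instances for tag strings the registered TagSet does not know. Below is a minimal sketch of that lookup pattern; the helper name resolvePosTag, TagSet#getTag(String), and the single-argument PosTag constructor are assumptions for illustration and may differ from the actual Stanbol API.

// Resolve a raw tag string to a PosTag: registered TagSet first, then the
// per-language adhoc cache, and only then create and cache a new PosTag.
private PosTag resolvePosTag(TagSet<PosTag> tagSet, Map<String, PosTag> adhocTags, String tag) {
    PosTag posTag = tagSet.getTag(tag); // assumed lookup by tag string
    if (posTag == null) {
        posTag = adhocTags.get(tag);
    }
    if (posTag == null) {
        posTag = new PosTag(tag); // adhoc tag without a mapped LexicalCategory
        adhocTags.put(tag, posTag);
    }
    return posTag;
}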

Example 4 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class OpenNlpSentenceDetectionEngine, method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method obtains the {@link org.apache.stanbol.enhancer.nlp.model.AnalysedText} content part of the
 * ContentItem and adds the detected Sentence annotations to it. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    SentenceDetector sentenceDetector = getSentenceDetector(language);
    if (sentenceDetector != null) {
        for (opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
            // detect sentences and add them to the AnalysedText
            Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
            log.trace(" > add {}", sentence);
        }
    } else {
        log.warn("SentenceDetector model for language {} is no longer available. " + "This might happen if the model becomes unavailable during enhancement. " + "If this happens more often it might also indicate an bug in the used " + "EnhancementJobManager implementation as the availability is also checked " + "in the canEnhance(..) method of this Enhancement Engine.");
    }
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) SentenceDetector(opennlp.tools.sentdetect.SentenceDetector) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
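
For reference, the Span offsets used above map directly onto the analysed text. The following sketch uses the same OpenNLP API as the engine (SentenceDetector#sentPosDetect(String) returning opennlp.tools.util.Span[]) to recover the sentence substrings:

// Stand-alone sentence detection over a plain String with OpenNLP.
String text = at.getSpan(); // the full text, exactly as passed to sentPosDetect above
for (opennlp.tools.util.Span span : sentenceDetector.sentPosDetect(text)) {
    String sentenceText = text.substring(span.getStart(), span.getEnd());
    System.out.println("Sentence [" + span.getStart() + ".." + span.getEnd() + "]: " + sentenceText);
}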

Example 5 with Sentence

Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.

The class EntityCoReferenceEngineTest, method testSpatialCoref.

@Test
public void testSpatialCoref() throws EngineException, IOException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int politicianStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(politicianStartIdx, politicianStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanChancellor = sentence2.addChunk(theStartIdx, politicianStartIdx + "politician".length());
    theGermanChancellor.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanChancellor));
    Value<CorefFeature> subordinateCorefValue = theGermanChancellor.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)
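
Finally, a hedged sketch of walking the coreference chain that the assertions above verify. CorefFeature#isRepresentative() and #getMentions() appear in the test; treating each mention as an org.apache.stanbol.enhancer.nlp.model.Span (with getSpan() for the covered text) is an assumption based on the API names used on this page.

// Print the representative mention and every Span that co-refers with it.
Value<CorefFeature> corefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
if (corefValue != null && corefValue.value().isRepresentative()) {
    System.out.println("representative: " + angelaMerkel.getSpan());
    for (Span mention : corefValue.value().getMentions()) { // Span as mention type is assumed
        System.out.println("  co-referring mention: " + mention.getSpan());
    }
}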

Aggregations

Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence): 14
Token (org.apache.stanbol.enhancer.nlp.model.Token): 9
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText): 8
PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag): 6
IRI (org.apache.clerezza.commons.rdf.IRI): 5
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag): 5
NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText): 5
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 5
IOException (java.io.IOException): 4
Graph (org.apache.clerezza.commons.rdf.Graph): 4
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 4
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 4
Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk): 4
ArrayList (java.util.ArrayList): 3
CharSequenceReader (org.apache.commons.io.input.CharSequenceReader): 3
TokenStream (org.apache.lucene.analysis.TokenStream): 3
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 3
Span (org.apache.stanbol.enhancer.nlp.model.Span): 3
SentenceDetector (opennlp.tools.sentdetect.SentenceDetector): 2
Language (org.apache.clerezza.commons.rdf.Language): 2