Example 6 with Token

use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

the class CeliAnalyzedTextLemmatizerEngineTest method testEngineDe.

@Test
public void testEngineDe() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
    //Add some Tokens with POS annotations to test the usage of
    //existing POS annotations by the lemmatizer
    Token verbrachten = at.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbrachten.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token schonen = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    schonen.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token urlaub = at.addToken(de_nounStart, de_nounStart + de_noun.length());
    urlaub.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    //compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        //deactivate test
        return;
    }
    //now validate the enhancements
    boolean foundVerb = false;
    boolean foundAdjective = false;
    boolean foundNoun = false;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            foundVerb = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            foundAdjective = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            foundNoun = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> mf : mfs) {
            log.info("  - {}", mf);
            Assert.assertNotNull(mf.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
}
Also used : PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)
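
A minimal companion sketch for the assertions above, reading the lemma back from a single Token. It assumes only the Stanbol classes already imported in the test, and that getAnnotation returns the single most probable value (matching how the chunking engine below reads POS_ANNOTATION); this is an illustration, not part of the test itself.

import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;

public final class LemmaReader {

    /** Returns the lemma of the most probable morpho reading, or null. */
    public static String lemma(Token token) {
        //getAnnotation(..) is assumed to return the most probable value
        Value<MorphoFeatures> mf = token.getAnnotation(NlpAnnotations.MORPHO_ANNOTATION);
        return mf == null ? null : mf.value().getLemma();
    }
}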

Example 7 with Token

use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

the class CeliAnalyzedTextLemmatizerEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            //ignore terms without readings
            continue;
        }
        //Add the LexicalEntry as Token to the Text. NOTE that if a
        //Token with the same start/end positions already exists this
        //method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        //Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        //do not override with a lower probability
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            //add the readings (MorphoFeatures)
            if (mf != null) {
                //use the POS tags of the morpho analysis and compare it
                //with existing POS tags.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                //add the morpho features with the posProbability
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) SOAPException(javax.xml.soap.SOAPException) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) CeliMorphoFeatures(org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures) EnumMap(java.util.EnumMap)
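
The note above about addToken matters for this engine: the CELI service may report a LexicalEntry for a span that an earlier engine already tokenized, and the POS annotations on that existing Token must survive. A toy sketch of the assumed contract (the hypothetical offsets and tag are invented for illustration; this is not the Stanbol implementation itself):

import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;

public final class AddTokenContractDemo {

    /** Toy sketch of the assumed AnalysedText.addToken(..) contract. */
    public static void demo(AnalysedText at) {
        Token first = at.addToken(10, 21);
        first.addAnnotation(NlpAnnotations.POS_ANNOTATION,
                Value.value(new PosTag("V", LexicalCategory.Verb), 0.9));
        //adding a Token over the same span returns the existing instance,
        //so POS annotations added by earlier engines are preserved
        Token second = at.addToken(10, 21);
        assert first == second;
        assert second.getAnnotation(NlpAnnotations.POS_ANNOTATION) != null;
    }
}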

Example 8 with Token

use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

the class OpenNlpChunkingEngine method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfiguration, language, true);
    ChunkerME chunker = initChunker(language);
    if (chunker == null) {
        return;
    }
    //init the Phrase TagSet
    TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long as we cannot
        //persist TagSets.
        tagSet = new TagSet<PhraseTag>("dummy", language);
    }
    //holds PhraseTags created for chunk tags that were not part of the model
    //(will hold all PhraseTags in case no TagSet is registered for the language)
    Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PhraseTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    ci.getLock().writeLock().lock();
    try {
        Iterator<? extends Section> sentences = at.getSentences();
        if (!sentences.hasNext()) {
            //no sentences ... iterate over the whole text
            sentences = Collections.singleton(at).iterator();
        }
        List<String> tokenTextList = new ArrayList<String>(64);
        List<String> posList = new ArrayList<String>(64);
        List<Token> tokenList = new ArrayList<Token>(64);
        //process each sentence separately
        while (sentences.hasNext()) {
            // (1) get Tokens and POS information for the sentence
            Section sentence = sentences.next();
            Iterator<Token> tokens = sentence.getTokens();
            while (tokens.hasNext()) {
                Token token = tokens.next();
                tokenList.add(token);
                tokenTextList.add(token.getSpan());
                Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                if (posValue == null) {
                    throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
                } else {
                    posList.add(posValue.value().getTag());
                }
            }
            String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
            String[] tokenPos = posList.toArray(new String[posList.size()]);
            if (log.isTraceEnabled()) {
                log.trace("Tokens: {}" + Arrays.toString(tokenStrings));
            }
            //free memory
            tokenTextList.clear();
            //free memory
            posList.clear();
            // (2) Chunk the sentence
            String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
            double[] chunkProb = chunker.probs();
            if (log.isTraceEnabled()) {
                log.trace("Chunks: {}" + Arrays.toString(chunkTags));
            }
            //free memory
            tokenStrings = null;
            //free memory
            tokenPos = null;
            // (3) Process the results and write the Annotations
            double chunkProps = 0;
            int chunkTokenCount = 0;
            PhraseTag tag = null;
            int i;
            /*
             * This assumes:
             *  - 'B-{tag}' ... starts a new chunk
             *  - 'O' ... no chunk (ends the current chunk)
             *  - anything else ... continues the current chunk
             */
            for (i = 0; i < tokenList.size(); i++) {
                boolean start = chunkTags[i].charAt(0) == 'B';
                boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                if (end) {
                    //add the current phrase
                    //add at AnalysedText level, because offsets are absolute
                    //NOTE we are already at the next token when we detect the end
                    Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                    chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
                    //reset the state
                    tag = null;
                    chunkTokenCount = 0;
                    chunkProps = 0;
                }
                if (start) {
                    //create the new tag
                    tag = getPhraseTag(tagSet, adhocTags,
                            chunkTags[i].substring(2), //skip 'B-'
                            language);
                }
                if (tag != null) {
                    //count this token for the current chunk
                    chunkProps = chunkProps + chunkProb[i];
                    chunkTokenCount++;
                }
            }
            if (tag != null) {
                Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
            }
            // (4) clean up
            tokenList.clear();
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
    if (log.isTraceEnabled()) {
        logChunks(at);
    }
}
Also used : ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) ChunkerME(opennlp.tools.chunker.ChunkerME)
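
To make the decoding rules in step (3) concrete, here is a standalone toy example (invented tokens and tags, no Stanbol or OpenNLP dependencies) that applies the same start/continue/end logic to a four-token sentence:

public class IobChunkDemo {

    public static void main(String[] args) {
        String[] tokens = { "The", "red", "fox", "runs" };
        String[] tags = { "B-NP", "I-NP", "I-NP", "B-VP" };
        String tag = null;
        int start = -1;
        //iterate one position past the end so a trailing chunk is closed
        for (int i = 0; i <= tokens.length; i++) {
            boolean begins = i < tokens.length && tags[i].charAt(0) == 'B';
            boolean outside = i == tokens.length || tags[i].charAt(0) == 'O';
            if (tag != null && (begins || outside)) {
                System.out.println(tag + ": tokens " + start + ".." + (i - 1));
                tag = null;
            }
            if (begins) {
                tag = tags[i].substring(2); //skip 'B-'
                start = i;
            }
        }
        //prints: "NP: tokens 0..2" and "VP: tokens 3..3"
    }
}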

Example 9 with Token

use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

the class NEREngineCore method extractNameOccurrences.

/**
     * This method extracts NamedEntity occurrences by using existing {@link Token}s and 
     * {@link Sentence}s in the parsed {@link AnalysedText}.
     * @param nameFinderModel the model used to find NamedEntities
     * @param at the Analysed Text
     * @param language the language of the text
     * @return the found NamedEntity occurrences
     */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    //collect all sentences of the text; used below to build the
    //previous/current/next sentence context window
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) {
        //no sentence annotations
        //process as a single section
        sentences.add(at);
    }
    for (int i = 0; i < sentences.size(); i++) {
        String sentence = sentences.get(i).getSpan();
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        contextElements.add(sentence);
        //up to three sentences (previous, current, next) as context
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(), sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
        // get the tokens, words of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext(); ) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        //int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            //create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            //add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            //TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) ArrayList(java.util.ArrayList) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) Span(opennlp.tools.util.Span) LinkedHashMap(java.util.LinkedHashMap) NameFinderME(opennlp.tools.namefind.NameFinderME) List(java.util.List) ArrayList(java.util.ArrayList)
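
The confidence of a NameOccurrence above is the product of the per-token probabilities over the name's span. With hypothetical numbers, a two-token name matched at 0.9 and 0.8 ends up at 0.72, so longer names tend toward lower confidences:

public class NameConfidenceDemo {

    public static void main(String[] args) {
        //hypothetical per-token probabilities for a two-token name span
        double[] probs = { 0.9, 0.8 };
        double confidence = 1.0;
        for (double p : probs) {
            confidence *= p;
        }
        System.out.println(confidence); //0.7200000000000001 (~0.72)
    }
}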

Example 10 with Token

use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

the class OpenNlpPosTaggingEngine method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    POSTagger posTagger = getPOSTagger(language);
    if (posTagger == null) {
        //the POS tagger became unavailable between the calls to canEnhance
        //and computeEnhancements
        throw new EngineException("PosTagger for language '" + language + "' is not available.");
    }
    TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No POS TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long as we cannot
        //persist TagSets.
        tagSet = new TagSet<PosTag>("dummy", language);
    }
    //holds PosTags created for POS tags that were not part of the posModel
    //(will hold all PosTags in case no TagSet is registered for the language)
    Map<String, PosTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PosTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    //(1) Sentence detection
    //Try to read existing Sentence Annotations
    Iterator<Sentence> sentences = at.getSentences();
    List<Section> sentenceList = null;
    if (!sentences.hasNext()) {
        //if none are present try to detect sentences
        log.trace(" > detect sentences for {}", at);
        sentenceList = detectSentences(at, language);
    }
    if (sentences.hasNext()) {
        //use the already existing Sentence annotations
        log.trace(" > use existing Sentence annotations for {}", at);
        sentenceList = new ArrayList<Section>();
        AnalysedTextUtils.appandToList(sentences, sentenceList);
    } else if (sentenceList == null) {
        //no sentences detected ... treat the whole text as a single sentence
        //TODO: maybe apply a limit to the text size here!
        log.trace(" > unable to detect Sentences for {} (language: {})", at, language);
        sentenceList = Collections.singletonList((Section) at);
    }
    //for all sentences (or the whole Text - if no sentences available)
    for (Section sentence : sentenceList) {
        //(2) Tokenize Sentences
        List<Token> tokenList;
        //check if there are already tokens
        Iterator<Token> tokens = sentence.getTokens();
        if (!tokens.hasNext()) {
            //no tokens present -> tokenize
            log.trace(" > tokenize {}", sentence);
            tokenList = tokenize(sentence, language);
        } else {
            //use existing
            log.trace(" > use existing Tokens for {}", sentence);
            //ensure an ArrayList is used
            tokenList = new ArrayList<Token>();
            AnalysedTextUtils.appandToList(tokens, tokenList);
        }
        //(3) POS Tagging
        posTag(tokenList, posTagger, tagSet, adhocTags, language);
    }
    if (log.isTraceEnabled()) {
        logAnnotations(at);
    }
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) POSTagger(opennlp.tools.postag.POSTagger) Section(org.apache.stanbol.enhancer.nlp.model.Section) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
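
Both OpenNLP engines above share the same "adhoc tag" caching pattern: consult the registered TagSet first, then the per-language cache, and only then create (and cache) an unmapped tag. The helpers getPhraseTag and posTag are not part of the excerpts, so the following is an assumed shape of that lookup, not the actual Stanbol helper:

import java.util.Map;
import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;

public final class AdhocTagSupport {

    /** Assumed shape of the adhoc-tag lookup; not the actual Stanbol helper. */
    public static PosTag getPosTag(TagSet<PosTag> tagSet,
            Map<String, PosTag> adhocTags, String tag) {
        //mapped tag from the registered TagSet (or the adhoc "dummy" set)
        PosTag posTag = tagSet.getTag(tag);
        if (posTag == null) {
            //previously created adhoc tag for this language
            posTag = adhocTags.get(tag);
        }
        if (posTag == null) {
            //unmapped tag: create an adhoc PosTag and cache it
            posTag = new PosTag(tag);
            adhocTags.put(tag, posTag);
        }
        return posTag;
    }
}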

Aggregations

Token (org.apache.stanbol.enhancer.nlp.model.Token) 23
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText) 13
PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag) 12
Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence) 9
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException) 8
ArrayList (java.util.ArrayList) 7
Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk) 7
Section (org.apache.stanbol.enhancer.nlp.model.Section) 5
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag) 5
NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) 5
IOException (java.io.IOException) 4
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) 4
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) 4
NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) 4
StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource) 4
IRI (org.apache.clerezza.commons.rdf.IRI) 3
Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value) 3
MorphoFeatures (org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) 3
PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) 3
ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem) 3