
Example 1 with Section

use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

the class OpenNlpChunkingEngine method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfiguration, language, true);
    ChunkerME chunker = initChunker(language);
    if (chunker == null) {
        return;
    }
    //init the Phrase TagSet
    TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long we can not
        //persist TagSets.
        tagSet = new TagSet<PhraseTag>("dummy", language);
    }
    //holds PhraseTags created for chunk tags that were not part of the TagSet
    //(will hold all PhraseTags in case no TagSet was registered)
    Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PhraseTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    ci.getLock().writeLock().lock();
    try {
        Iterator<? extends Section> sentences = at.getSentences();
        if (!sentences.hasNext()) {
            //no sentences ... iterate over the whole text
            sentences = Collections.singleton(at).iterator();
        }
        List<String> tokenTextList = new ArrayList<String>(64);
        List<String> posList = new ArrayList<String>(64);
        List<Token> tokenList = new ArrayList<Token>(64);
        //process each sentence separately
        while (sentences.hasNext()) {
            // (1) get Tokens and POS information for the sentence
            Section sentence = sentences.next();
            Iterator<Token> tokens = sentence.getTokens();
            while (tokens.hasNext()) {
                Token token = tokens.next();
                tokenList.add(token);
                tokenTextList.add(token.getSpan());
                Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                if (posValue == null) {
                    throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
                } else {
                    posList.add(posValue.value().getTag());
                }
            }
            String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
            String[] tokenPos = posList.toArray(new String[tokenTextList.size()]);
            if (log.isTraceEnabled()) {
                log.trace("Tokens: {}" + Arrays.toString(tokenStrings));
            }
            //free memory
            tokenTextList.clear();
            //free memory
            posList.clear();
            // (2) Chunk the sentence
            String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
            double[] chunkProb = chunker.probs();
            if (log.isTraceEnabled()) {
                log.trace("Chunks: {}" + Arrays.toString(chunkTags));
            }
            //free memory
            tokenStrings = null;
            //free memory
            tokenPos = null;
            // (3) Process the results and write the Annotations
            double chunkProps = 0;
            int chunkTokenCount = 0;
            PhraseTag tag = null;
            int i;
            /*
             * This assumes:
             *  - 'B-{tag}' ... start of a new chunk
             *  - anything else (e.g. 'I-{tag}') ... continues the current chunk
             *  - 'O' ... no chunk (ends the current chunk)
             */
            for (i = 0; i < tokenList.size(); i++) {
                boolean start = chunkTags[i].charAt(0) == 'B';
                boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                if (end) {
                    //add the current phrase
                    //add at AnalysedText level, because offsets are absolute
                    //NOTE we are already at the next token when we detect the end
                    Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                    chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
                    //reset the state
                    tag = null;
                    chunkTokenCount = 0;
                    chunkProps = 0;
                }
                if (start) {
                    //create the new tag
                    tag = getPhraseTag(tagSet, adhocTags, chunkTags[i].substring(2) /*skip 'B-'*/, language);
                }
                if (tag != null) {
                    //count this token for the current chunk
                    chunkProps = chunkProps + chunkProb[i];
                    chunkTokenCount++;
                }
            }
            if (tag != null) {
                Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
            }
            // (4) clean up
            tokenList.clear();
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
    if (log.isTraceEnabled()) {
        logChunks(at);
    }
}
Also used : ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) ChunkerME(opennlp.tools.chunker.ChunkerME)
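
A note on step (3) above: OpenNLP's chunker emits one tag per token ('B-{type}' starts a phrase, 'I-{type}' continues it, 'O' lies outside any phrase), and the engine folds runs of tokens into Chunk spans whose confidence is the average of the per-token probabilities. Below is a minimal, self-contained sketch of that B/I/O folding outside of Stanbol; the ChunkSpan holder and the sample tag/probability arrays are invented for illustration and are not part of the engine.

import java.util.ArrayList;
import java.util.List;

public class BioChunkDecoderSketch {

    /** Simple holder for a decoded phrase: token range [startToken, endToken) plus averaged probability. */
    static class ChunkSpan {
        final int startToken, endToken;
        final String type;
        final double confidence;
        ChunkSpan(int startToken, int endToken, String type, double confidence) {
            this.startToken = startToken;
            this.endToken = endToken;
            this.type = type;
            this.confidence = confidence;
        }
        @Override
        public String toString() {
            return type + "[" + startToken + "," + endToken + ") conf=" + confidence;
        }
    }

    /** Decodes chunk tags: 'B-{type}' starts a chunk, 'I-{type}' continues it, 'O' ends it. */
    static List<ChunkSpan> decode(String[] chunkTags, double[] probs) {
        List<ChunkSpan> chunks = new ArrayList<ChunkSpan>();
        String type = null;   // type of the currently open chunk, null if none
        int chunkStart = -1;  // index of the first token of the open chunk
        int tokenCount = 0;   // tokens consumed by the open chunk
        double probSum = 0;   // summed per-token probabilities of the open chunk
        int i;
        for (i = 0; i < chunkTags.length; i++) {
            boolean start = chunkTags[i].charAt(0) == 'B';
            boolean end = type != null && (start || chunkTags[i].charAt(0) == 'O');
            if (end) { // close the open chunk (we are already one token past its end)
                chunks.add(new ChunkSpan(chunkStart, i, type, probSum / tokenCount));
                type = null;
                tokenCount = 0;
                probSum = 0;
            }
            if (start) { // open a new chunk, stripping the leading "B-"
                type = chunkTags[i].substring(2);
                chunkStart = i;
            }
            if (type != null) { // count this token for the open chunk
                probSum += probs[i];
                tokenCount++;
            }
        }
        if (type != null) { // flush a chunk that runs to the end of the sentence
            chunks.add(new ChunkSpan(chunkStart, i, type, probSum / tokenCount));
        }
        return chunks;
    }

    public static void main(String[] args) {
        // "The quick fox" / "jumps" / "over" / "the dog" (sample values only)
        String[] tags = { "B-NP", "I-NP", "I-NP", "B-VP", "B-PP", "B-NP", "I-NP" };
        double[] probs = { 0.9, 0.8, 0.95, 0.7, 0.85, 0.9, 0.92 };
        System.out.println(decode(tags, probs));
    }
}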

Example 2 with Section

use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

the class NEREngineCore method extractNameOccurrences.

/**
     * This method extracts NamedEntity occurrences by using existing {@link Token}s and 
     * {@link Sentence}s in the parsed {@link AnalysedText}.
     * @param nameFinderModel the model used to find NamedEntities
     * @param at the Analysed Text
     * @param language the language of the text
     * @return the found named Entity Occurrences
     */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    //Collect all sentences of the text (the context below uses the previous, current and next one)
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) {
        //no sentence annotations
        //process as a single section
        sentences.add(at);
    }
    for (int i = 0; i < sentences.size(); i++) {
        String sentence = sentences.get(i).getSpan();
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        contextElements.add(sentence);
        //three sentences as context
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(), sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
        // get the tokens, words of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext(); ) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        //int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            //create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            //add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            //TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) ArrayList(java.util.ArrayList) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) Span(opennlp.tools.util.Span) LinkedHashMap(java.util.LinkedHashMap) NameFinderME(opennlp.tools.namefind.NameFinderME) List(java.util.List) ArrayList(java.util.ArrayList)
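
One detail worth isolating from this example is the confidence of a detected name: NameFinderME.probs() returns one probability per token of the sentence, and the engine multiplies the probabilities of all tokens covered by a name Span. A small sketch of just that computation, assuming opennlp-tools is on the classpath; the sample span and probability values are invented.

import opennlp.tools.util.Span;

public class NameConfidenceSketch {

    /** Multiplies the per-token probabilities covered by a name Span, as in the loop above. */
    static double confidence(Span nameSpan, double[] tokenProbs) {
        double confidence = 1.0;
        for (int k = nameSpan.getStart(); k < nameSpan.getEnd(); k++) {
            confidence *= tokenProbs[k];
        }
        return confidence;
    }

    public static void main(String[] args) {
        // a two-token "person" name covering tokens 3 and 4 (sample values only)
        Span span = new Span(3, 5, "person");
        double[] probs = { 0.99, 0.98, 0.97, 0.9, 0.8, 0.95 };
        System.out.println(confidence(span, probs)); // ~0.72
    }
}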

Example 3 with Section

use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

the class OpenNlpPosTaggingEngine method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    POSTagger posTagger = getPOSTagger(language);
    if (posTagger == null) {
        //this means that the POS tagger became unavailable in-between
        //the call to canEnhance and computeEnhancements
        throw new EngineException("PosTagger for language '" + language + "' is not available.");
    }
    TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No POS TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long we can not
        //persist TagSets.
        tagSet = new TagSet<PosTag>("dummy", language);
    }
    //holds PosTags created for POS tags that were not part of the posModel
    //(will hold all PosTags in case no TagSet was registered)
    Map<String, PosTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PosTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    //(1) Sentence detection
    //Try to read existing Sentence Annotations
    Iterator<Sentence> sentences = at.getSentences();
    List<Section> sentenceList;
    if (!sentences.hasNext()) {
        //if none exist, try to detect sentences
        log.trace(" > detect sentences for {}", at);
        sentenceList = detectSentences(at, language);
    }
    if (sentences.hasNext()) {
        //check if we have detected sentences
        log.trace(" > use existing Sentence annotations for {}", at);
        sentenceList = new ArrayList<Section>();
        AnalysedTextUtils.appandToList(sentences, sentenceList);
    } else {
        //no sentence detected ... treat the whole text as a single sentence
        //TODO: maybe apply here a limit to the text size!
        log.trace(" > unable to detect Sentences for {} (langauge: {})", at, language);
        sentenceList = Collections.singletonList((Section) at);
    }
    //for all sentences (or the whole Text - if no sentences available)
    for (Section sentence : sentenceList) {
        //(2) Tokenize Sentences
        List<Token> tokenList;
        //check if there are already tokens
        Iterator<Token> tokens = sentence.getTokens();
        if (!tokens.hasNext()) {
            //no tokens present -> tokenize
            log.trace(" > tokenize {}", sentence);
            tokenList = tokenize(sentence, language);
        } else {
            //use existing
            log.trace(" > use existing Tokens for {}", sentence);
            //ensure an ArrayList is used
            tokenList = new ArrayList<Token>();
            AnalysedTextUtils.appandToList(tokens, tokenList);
        }
        //(3) POS Tagging
        posTag(tokenList, posTagger, tagSet, adhocTags, language);
    }
    if (log.isTraceEnabled()) {
        logAnnotations(at);
    }
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) POSTagger(opennlp.tools.postag.POSTagger) Section(org.apache.stanbol.enhancer.nlp.model.Section) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
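
Both this engine and the chunking engine in Example 1 share the same fallback for unknown tags: look the tag up in the registered TagSet first, otherwise reuse or create an entry in the per-language adhocTags map so each unknown tag is only instantiated (and warned about) once. The sketch below shows that lookup pattern in isolation; TagModel, TagFactory and resolveTag are hypothetical stand-ins, not the real Stanbol TagSet API.

import java.util.HashMap;
import java.util.Map;

public class AdhocTagLookupSketch {

    /** Hypothetical stand-in for a registered tag model (plays the role of a Stanbol TagSet). */
    interface TagModel<T> {
        T getTag(String name); // returns null if the tag is not part of the model
    }

    /** Hypothetical factory for creating adhoc tag instances. */
    interface TagFactory<T> {
        T create(String name);
    }

    /**
     * Resolves a tag name: registered model first, then the per-language adhoc cache,
     * finally creating and caching a new tag so the warning is only issued once.
     */
    static <T> T resolveTag(String name, TagModel<T> model, Map<String, T> adhocTags, TagFactory<T> factory) {
        T tag = model.getTag(name);
        if (tag != null) {
            return tag;
        }
        tag = adhocTags.get(name);
        if (tag == null) {
            tag = factory.create(name);
            adhocTags.put(name, tag);
            System.out.println("WARN: unknown tag '" + name + "' - created adhoc instance");
        }
        return tag;
    }

    public static void main(String[] args) {
        Map<String, String> adhoc = new HashMap<String, String>();
        TagModel<String> model = name -> "NN".equals(name) ? "NN(model)" : null;
        System.out.println(resolveTag("NN", model, adhoc, n -> n + "(adhoc)")); // served by the model
        System.out.println(resolveTag("XY", model, adhoc, n -> n + "(adhoc)")); // created and cached
        System.out.println(resolveTag("XY", model, adhoc, n -> n + "(adhoc)")); // served from the cache
    }
}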

Example 4 with Section

use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

the class EntityLinker method process.

/**
     * Steps over the sentences, chunks, tokens of the {@link #sentences}
     */
public void process() throws EntitySearcherException {
    long startTime = System.currentTimeMillis();
    //int debugedIndex = 0;
    Section sentence = null;
    textProcessingStats.begin();
    while (state.next()) {
        //   need to react on the state of the Linking process
        if (linkingStateAware != null) {
            if (!state.getSentence().equals(sentence)) {
                if (sentence != null) {
                    linkingStateAware.endSection(sentence);
                }
                //set the next sentence
                sentence = state.getSentence();
                //notify its start
                linkingStateAware.startSection(sentence);
            }
            //notify the current token
            linkingStateAware.startToken(state.getToken().token);
        }
        TokenData token = state.getToken();
        if (log.isDebugEnabled()) {
            log.debug("--- preocess Token {}: {} (lemma: {}) linkable={}, matchable={} | chunk: {}", new Object[] { token.index, token.getTokenText(), token.getTokenLemma(), token.isLinkable, token.isMatchable, token.inChunk != null ? (token.inChunk.chunk + " " + token.inChunk.chunk.getSpan()) : "none" });
        }
        List<TokenData> searchStrings = new ArrayList<TokenData>(linkerConfig.getMaxSearchTokens());
        getSearchString(token);
        searchStrings.add(token);
        //Determine the range we are allowed to search for tokens
        final int minIncludeIndex;
        final int maxIndcludeIndex;
        int consumedIndex = state.getConsumedIndex();
        //NOTE: testing has shown that using Chunks to restrict search for
        //      additional matchable tokens does have an negative impact on
        //      recall. Because of that this restriction is for now deactivated
        //            if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks()){
        //                minIncludeIndex = token.inChunk.getStartTokenIndex();
        //                maxIndcludeIndex = token.inChunk.getEndTokenIndex();
        //                log.debug("  - restrict context to chunk[{}, {}]",
        //                    minIncludeIndex, maxIndcludeIndex);
        //            } else {
        maxIndcludeIndex = state.getTokens().size() - 1;
        minIncludeIndex = 0;
        //            }
        int prevIndex = token.index;
        int pastIndex = token.index;
        int pastNonMatchable = 0;
        int prevNonMatchable = 0;
        int distance = 0;
        do {
            //keep track of the distance
            distance++;
            //get the past token at the given distance (However ignore
            //non AlphaNumeric tokens when calculating the distance)
            pastIndex++;
            TokenData pastToken = null;
            while (pastToken == null && maxIndcludeIndex >= pastIndex && pastNonMatchable <= 1) {
                TokenData td = state.getTokens().get(pastIndex);
                if (td.hasAlphaNumeric) {
                    pastToken = td;
                } else {
                    pastIndex++;
                }
            }
            //get the previous token at the given distance (However ignore
            //non AlphaNumeric tokens when calculating the distance)
            prevIndex--;
            TokenData prevToken = null;
            //allow one non matchable token, but none once we reach the last consumed one
            while (prevToken == null && minIncludeIndex <= prevIndex
                    && ((prevIndex > consumedIndex && prevNonMatchable <= 1)
                            || prevIndex <= consumedIndex && prevNonMatchable < 1)) {
                TokenData td = state.getTokens().get(prevIndex);
                if (td.hasAlphaNumeric) {
                    prevToken = td;
                } else {
                    prevIndex--;
                }
            }
            //First, the past token
            if (pastToken != null) {
                if (log.isDebugEnabled()) {
                    log.debug("    {} {}:'{}' (lemma: {}) linkable={}, matchable={}", new Object[] { pastToken.isMatchable ? '+' : '-', pastToken.index, pastToken.getTokenText(), pastToken.getTokenLemma(), pastToken.isLinkable, pastToken.isMatchable });
                }
                if (pastToken.isMatchable) {
                    searchStrings.add(pastToken);
                } else {
                    pastNonMatchable++;
                }
            }
            //Second, the previous token
            if (prevToken != null) {
                if (log.isDebugEnabled()) {
                    log.debug("    {} {}:'{}' (lemma: {}) linkable={}, matchable={}", new Object[] { prevToken.isMatchable ? '+' : '-', prevToken.index, prevToken.getTokenText(), prevToken.getTokenLemma(), prevToken.isLinkable, prevToken.isMatchable });
                }
                if (prevToken.isMatchable) {
                    getSearchString(prevToken);
                    searchStrings.add(0, prevToken);
                } else {
                    prevNonMatchable++;
                }
            }
        } while (searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance < linkerConfig.getMaxSearchDistance() && (prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex) && (prevNonMatchable <= 1 || pastNonMatchable <= 1));
        //we might have an additional element in the list
        if (searchStrings.size() > linkerConfig.getMaxSearchTokens()) {
            //keep only the last part of the list
            searchStrings = searchStrings.subList(
                    searchStrings.size() - linkerConfig.getMaxSearchTokens(), searchStrings.size());
        }
        if (log.isDebugEnabled()) {
            List<String> list = new ArrayList<String>(searchStrings.size());
            for (TokenData dt : searchStrings) {
                list.add(dt.token.getSpan());
            }
            log.debug("  >> searchStrings {}", list);
        }
        textProcessingStats.complete();
        //search for Entities
        List<Suggestion> suggestions = lookupEntities(searchStrings);
        //Treat partial matches that match more tokens than the best FULL
        //match differently
        List<Suggestion> partialMatches = new ArrayList<Suggestion>();
        if (!suggestions.isEmpty()) {
            rankingStats.begin();
            //update the suggestions based on the best match
            int bestMatchCount = suggestions.get(0).getLabelMatch().getMatchCount();
            Iterator<Suggestion> it = suggestions.iterator();
            while (it.hasNext()) {
                Suggestion suggestion = it.next();
                //suggestions that match fewer tokens than the best match
                //need to be updated to PARTIAL
                int matchCount = suggestion.getLabelMatch().getMatchCount();
                if (matchCount < bestMatchCount) {
                    suggestion.setMatch(MATCH.PARTIAL);
                } else if (matchCount > bestMatchCount) {
                    //selects more tokens
                    //but only a PARTIAL MATCH
                    partialMatches.add(suggestion);
                    //remove from the main suggestion list
                    it.remove();
                }
                //if the match count is less than that of the best match and below the configured minimum
                if (matchCount < bestMatchCount && matchCount < linkerConfig.getMinFoundTokens()) {
                    it.remove();
                } else {
                    //calculate the score
                    //how good is the current match in relation to the best one
                    double spanScore = matchCount >= bestMatchCount ? 1.0d : matchCount / (double) bestMatchCount;
                    suggestion.setScore(spanScore * spanScore * suggestion.getLabelMatch().getMatchScore());
                }
            }
            //for debugging
            Suggestion oldBestRanked = suggestions.get(0);
            //resort by score
            Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
            Collections.sort(partialMatches, Suggestion.SCORE_COMPARATOR);
            //after the sort by score!
            if (bestMatchCount != suggestions.get(0).getLabelMatch().getMatchCount()) {
                log.warn("The match count for the top Ranked Suggestion for {} " + "changed after resorting based on Scores!", state.getTokenText(suggestions.get(0).getLabelMatch().getStart(), bestMatchCount));
                log.warn("  originalbest   : {}", oldBestRanked);
                log.warn(" currnet ranking : {}", suggestions);
                log.warn("  ... this will result in worng confidence values relative to the best match");
            }
            int maxSuggestions = linkerConfig.getMaxSuggestions();
            if ((suggestions.size() + 1) > maxSuggestions && linkerConfig.isIncludeSuggestionsWithSimilarScore()) {
                //include suggestions with similar score
                double minIncludeScore = suggestions.get(maxSuggestions).getScore();
                //the next element
                int numInclude = maxSuggestions + 1;
                double actScore;
                do {
                    actScore = suggestions.get(numInclude).getScore();
                    //increase for the next iteration
                    numInclude++;
                } while (numInclude < suggestions.size() && actScore >= minIncludeScore);
                maxSuggestions = numInclude - 1;
            }
            //remove all suggestions > maxSuggestions
            if (suggestions.size() > maxSuggestions) {
                suggestions.subList(maxSuggestions, suggestions.size()).clear();
            }
            //rank suggestions with equal scores based on the entity rankings
            if (linkerConfig.isRankEqualScoresBasedOnEntityRankings()) {
                adaptScoresForEntityRankings(suggestions);
                adaptScoresForEntityRankings(partialMatches);
            }
            if (log.isDebugEnabled()) {
                log.debug("  >> Suggestions:");
                int i = 0;
                for (Suggestion s : suggestions) {
                    log.debug("   - {}: {}", i, s);
                    i++;
                }
            }
            //process redirects
            if (linkerConfig.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE) {
                for (Suggestion suggestion : suggestions) {
                    processRedirects(suggestion);
                }
                for (Suggestion suggestion : partialMatches) {
                    processRedirects(suggestion);
                }
            }
            //create LinkedEntities for the main suggestions
            int start = suggestions.get(0).getLabelMatch().getStart();
            int span = suggestions.get(0).getLabelMatch().getSpan();
            //Store the linking results
            String selectedText = state.getTokenText(start, span);
            //float score;
            LinkedEntity linkedEntity = linkedEntities.get(selectedText);
            if (linkedEntity == null) {
                linkedEntity = new LinkedEntity(selectedText, suggestions, getLinkedEntityTypes(suggestions));
                linkedEntities.put(selectedText, linkedEntity);
            }
            // else Assumption: The list of suggestions is the SAME
            //NOTE: the end Token is at "start + span - 1"
            linkedEntity.addOccurrence(state.getSentence(),
                    state.getTokens().get(start).token, state.getTokens().get(start + span - 1).token);
            //in case of a FULL match, consume the span so linking continues with the
            //word after the currently found suggestion
            if (suggestions.get(0).getMatch().ordinal() >= MATCH.FULL.ordinal()) {
                state.setConsumed(start + span - 1);
            }
            //also create a LinkedEntity for partial matches, based on those that select the most tokens
            if (!partialMatches.isEmpty()) {
                start = partialMatches.get(0).getLabelMatch().getStart();
                span = partialMatches.get(0).getLabelMatch().getSpan();
                selectedText = state.getTokenText(start, span);
                linkedEntity = linkedEntities.get(selectedText);
                if (linkedEntity == null) {
                    linkedEntity = new LinkedEntity(selectedText, partialMatches, getLinkedEntityTypes(suggestions));
                    linkedEntities.put(selectedText, linkedEntity);
                }
                // else Assumption: The list of suggestions is the SAME
                //NOTE: the end Token is at "start + span - 1"
                linkedEntity.addOccurrence(state.getSentence(),
                        state.getTokens().get(start).token, state.getTokens().get(start + span - 1).token);
            }
            rankingStats.complete();
        }
        // else suggestions are empty
        if (linkingStateAware != null) {
            linkingStateAware.endToken(state.getToken().token);
        }
        textProcessingStats.begin();
    }
    //do not count the last call
    textProcessingStats.cancel();
    if (linkingStateAware != null && sentence != null) {
        linkingStateAware.endSection(sentence);
    }
    this.processingTime = System.currentTimeMillis() - startTime;
}
Also used : ArrayList(java.util.ArrayList) Section(org.apache.stanbol.enhancer.nlp.model.Section)
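
The ranking block in the middle of this method is easiest to follow in isolation: each suggestion is scored by the squared ratio of its matched-token count to the best match's count, multiplied by its label match score, and suggestions matching fewer tokens than both the best match and the configured minimum are dropped. A self-contained sketch of that arithmetic follows; the Suggestion class here is a simplified stand-in (only the fields needed for scoring) and the sample entities are invented.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class SuggestionScoringSketch {

    /** Simplified stand-in for EntityLinker's Suggestion, reduced to the fields used for scoring. */
    static class Suggestion {
        final String entity;
        final int matchCount;      // number of tokens matched by the label
        final double matchScore;   // quality of the label match itself
        double score;
        Suggestion(String entity, int matchCount, double matchScore) {
            this.entity = entity;
            this.matchCount = matchCount;
            this.matchScore = matchScore;
        }
        @Override
        public String toString() {
            return entity + " score=" + score;
        }
    }

    /** Scores suggestions relative to the best match (assumed first) and drops weak partial matches. */
    static void score(List<Suggestion> suggestions, int minFoundTokens) {
        int bestMatchCount = suggestions.get(0).matchCount;
        Iterator<Suggestion> it = suggestions.iterator();
        while (it.hasNext()) {
            Suggestion s = it.next();
            if (s.matchCount < bestMatchCount && s.matchCount < minFoundTokens) {
                it.remove(); // matches too few tokens to be kept at all
            } else {
                // how good is this match relative to the best one (1.0 for the best match itself)
                double spanScore = s.matchCount >= bestMatchCount
                        ? 1.0d : s.matchCount / (double) bestMatchCount;
                s.score = spanScore * spanScore * s.matchScore;
            }
        }
        suggestions.sort((a, b) -> Double.compare(b.score, a.score)); // best score first
    }

    public static void main(String[] args) {
        List<Suggestion> suggestions = new ArrayList<Suggestion>();
        suggestions.add(new Suggestion("dbpedia:Paris", 2, 0.9));        // best match: 2 of 2 tokens
        suggestions.add(new Suggestion("dbpedia:Paris_Texas", 1, 0.95)); // 1 of 2 tokens matched
        suggestions.add(new Suggestion("dbpedia:Paris_Hilton", 1, 0.2)); // 1 of 2 tokens, weak label match
        score(suggestions, 1);
        System.out.println(suggestions);
    }
}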

Example 5 with Section

use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

the class PosChunkerEngine method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfiguration, language, true);
    //init the PhraseBuilder
    ChunkFactory chunkFactory = new ChunkFactoryImpl(at, ci.getLock());
    List<PhraseBuilder> phraseBuilders = new ArrayList<PhraseBuilder>(phraseTypeDefinitions.size());
    for (PhraseTypeDefinition ptd : phraseTypeDefinitions) {
        phraseBuilders.add(new PhraseBuilder(ptd, chunkFactory, minPosScore));
    }
    Iterator<? extends Section> sentences = at.getSentences();
    if (!sentences.hasNext()) {
        //no sentences ... iterate over the whole text
        sentences = Collections.singleton(at).iterator();
    }
    while (sentences.hasNext()) {
        // (1) get Tokens and POS information for the sentence
        Section sentence = sentences.next();
        for (PhraseBuilder pb : phraseBuilders) {
            pb.nextSection(sentence);
        }
        Iterator<Token> tokens = sentence.getTokens();
        while (tokens.hasNext()) {
            Token token = tokens.next();
            for (PhraseBuilder pb : phraseBuilders) {
                pb.nextToken(token);
            }
        }
    }
    //signal the end of the document
    for (PhraseBuilder pb : phraseBuilders) {
        pb.nextSection(null);
    }
//        if(log.isTraceEnabled()){
//            logChunks(at);
//        }
}
Also used : NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) ArrayList(java.util.ArrayList) Token(org.apache.stanbol.enhancer.nlp.model.Token) PhraseBuilder(org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder) ChunkFactory(org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder.ChunkFactory) Section(org.apache.stanbol.enhancer.nlp.model.Section) PhraseTypeDefinition(org.apache.stanbol.enhancer.engines.poschunker.PhraseTypeDefinition)
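
Unlike the OpenNLP chunker in Example 1, this engine does not decode tag sequences itself; it drives each PhraseBuilder through a simple feeding protocol: nextSection(sentence) before a sentence's tokens, nextToken(token) for every token, and a final nextSection(null) to flush the last phrase at the end of the document. The sketch below is a toy consumer that honours the same feeding contract; the SentenceCollector class and its String-based tokens are invented for illustration and are not the real PhraseBuilder API.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SectionFeedingSketch {

    /** Minimal consumer following the nextSection/nextToken feeding contract used above. */
    static class SentenceCollector {
        private final List<List<String>> sentences = new ArrayList<List<String>>();
        private List<String> current;

        /** Called before the tokens of each section; null signals end-of-document and flushes. */
        void nextSection(String sectionLabel) {
            if (current != null && !current.isEmpty()) {
                sentences.add(current);
            }
            current = sectionLabel == null ? null : new ArrayList<String>();
        }

        void nextToken(String token) {
            current.add(token);
        }

        List<List<String>> getSentences() {
            return sentences;
        }
    }

    public static void main(String[] args) {
        SentenceCollector collector = new SentenceCollector();
        List<List<String>> doc = Arrays.asList(
                Arrays.asList("Stanbol", "enhances", "content", "."),
                Arrays.asList("It", "is", "an", "Apache", "project", "."));
        for (List<String> sentence : doc) {
            collector.nextSection("sentence"); // announce the section before its tokens
            for (String token : sentence) {
                collector.nextToken(token);
            }
        }
        collector.nextSection(null); // end of document: flush the last section
        System.out.println(collector.getSentences());
    }
}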

Aggregations

Section (org.apache.stanbol.enhancer.nlp.model.Section) 8
ArrayList (java.util.ArrayList) 5
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText) 5
Token (org.apache.stanbol.enhancer.nlp.model.Token) 5
Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk) 2
Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence) 2
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag) 2
PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) 2
PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag) 2
NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) 2
NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) 2
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException) 2
LinkedHashMap (java.util.LinkedHashMap) 1
List (java.util.List) 1
ChunkerME (opennlp.tools.chunker.ChunkerME) 1
NameFinderME (opennlp.tools.namefind.NameFinderME) 1
POSTagger (opennlp.tools.postag.POSTagger) 1
SentenceDetector (opennlp.tools.sentdetect.SentenceDetector) 1
SimpleTokenizer (opennlp.tools.tokenize.SimpleTokenizer) 1
Tokenizer (opennlp.tools.tokenize.Tokenizer) 1