Search in sources :

Example 1 with Span

use of opennlp.tools.util.Span in project lucida by claritylab.

the class NETagger method tagNes.

/**
	 * Performs named entity tagging on an array of full parses of sentences.
	 * Each configured finder is run over the token nodes of every parse; the
	 * resulting entity spans are attached to the parse via {@code addNames}.
	 * 
	 * @param parses array of full parses of sentences
	 */
// TODO only works with OpenNLP taggers so far
// NOTE(review): raw Map/HashMap types are used here (pre-generics style);
// the @SuppressWarnings below silences the resulting unchecked warnings.
@SuppressWarnings("unchecked")
public static void tagNes(Parse[] parses) {
    String[] results = new String[parses.length];
    for (int s = 0; s < results.length; s++) results[s] = "";
    // initialize prevTokenMaps: one token->tag history map per finder, carried
    // across sentences so each finder can use previous decisions as context
    Map[] prevTokenMaps = new HashMap[finders.length];
    for (int i = 0; i < finders.length; i++) prevTokenMaps[i] = new HashMap();
    for (Parse parse : parses) {
        // get tokens (the POS-tag leaf nodes of the parse tree)
        Parse[] tokens = parse.getTagNodes();
        // find named entities: finderTags[i][j] is finder i's tag for token j
        String[][] finderTags = new String[finders.length][];
        for (int i = 0; i < finders.length; i++) finderTags[i] = finders[i].find(tokens, prevTokenMaps[i]);
        // update prevTokenMaps with this sentence's decisions
        for (int i = 0; i < prevTokenMaps.length; i++) for (int j = 0; j < tokens.length; j++) prevTokenMaps[i].put(tokens[j], finderTags[i][j]);
        for (int i = 0; i < finders.length; i++) {
            // start of the entity currently being collected, or -1 if none open
            int start = -1;
            List<Span> names = new ArrayList<Span>(5);
            // determine spans of tokens that are named entities; tags follow the
            // START/CONTINUE/OTHER scheme: START opens an entity, OTHER closes it
            for (int j = 0; j < tokens.length; j++) {
                if ((finderTags[i][j].equals(NameFinderME.START) || finderTags[i][j].equals(NameFinderME.OTHER))) {
                    // a new entity or a non-entity token ends any open entity;
                    // NOTE(review): span end index is inclusive here (j - 1) —
                    // older OpenNLP convention; newer Span uses exclusive ends
                    if (start != -1)
                        names.add(new Span(start, j - 1));
                    start = -1;
                }
                if (finderTags[i][j].equals(NameFinderME.START))
                    start = j;
            }
            // close an entity that runs to the end of the sentence
            if (start != -1)
                names.add(new Span(start, tokens.length - 1));
            // add named entity information to parse
            addNames(finderNames[i], names, tokens);
        }
    }
}
Also used : HashMap(java.util.HashMap) Parse(opennlp.tools.parser.Parse) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map) Span(opennlp.tools.util.Span)

Example 2 with Span

use of opennlp.tools.util.Span in project elasticsearch-opennlp-plugin by spinscale.

the class OpenNlpService method tokenize.

/**
 * Tokenizes the given content and runs every configured name finder over it,
 * collecting the detected entities (after conflict resolution) into a map
 * keyed by entity type.
 *
 * @param content the raw text to analyze
 * @return named entities grouped by finder type
 */
public Map<String, Set<String>> tokenize(String content) {
    Map<String, Set<String>> namedEntities = Maps.newHashMap();
    List<TextAnnotation> annotations = new ArrayList<TextAnnotation>();
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
    // one pass per registered finder model; each span is paired with its probability
    for (Map.Entry<String, TokenNameFinderModel> entry : finders.entrySet()) {
        NameFinderME nameFinder = new NameFinderME(entry.getValue());
        Span[] found = nameFinder.find(tokens);
        double[] probabilities = nameFinder.probs(found);
        int idx = 0;
        for (Span span : found) {
            annotations.add(new TextAnnotation(entry.getKey(), span, probabilities[idx++]));
        }
    }
    // overlapping annotations from different finders are resolved before conversion
    if (!annotations.isEmpty()) {
        removeConflicts(annotations);
    }
    convertTextAnnotationsToNamedEntities(tokens, annotations, namedEntities);
    return namedEntities;
}
Also used : TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) PooledTokenNameFinderModel(org.elasticsearch.service.opennlp.models.PooledTokenNameFinderModel) Span(opennlp.tools.util.Span) NameFinderME(opennlp.tools.namefind.NameFinderME) TextAnnotation(org.elasticsearch.service.opennlp.models.TextAnnotation)

Example 3 with Span

use of opennlp.tools.util.Span in project stanbol by apache.

the class NEREngineCore method extractNameOccurrences.

/**
     * This method extracts NamedEntity occurrences by using existing {@link Token}s and 
     * {@link Sentence}s in the parsed {@link AnalysedText}.
     * @param nameFinderModel the model used to find NamedEntities
     * @param at the Analysed Text
     * @param language the language of the text
     * @return the found named Entity Occurrences, keyed by surface form
     */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) {
        // no sentence annotations: process the whole text as a single section
        sentences.add(at);
    }
    for (int i = 0; i < sentences.size(); i++) {
        // build a context snippet spanning the previous, current and next
        // sentence, used for similarity ranking / disambiguation downstream.
        // (The unused contextElements list of the original was removed — the
        // context is taken directly from the character offsets.)
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(), sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
        // collect the tokens and their surface forms for the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext(); ) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        // probs() returns the probabilities of the most recent find() call
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            // map the token-index span back to character offsets in the text;
            // Span ends are exclusive, hence getEnd() - 1 for the last token
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            // confidence = product of the per-token probabilities of the span
            // (computed in a primitive to avoid boxing on every iteration)
            double prob = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                prob *= probs[k];
            }
            Double confidence = Double.valueOf(prob);
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            // create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context, confidence);
            nameOccurrences.computeIfAbsent(name, k -> new ArrayList<NameOccurrence>()).add(occurrence);
            // add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            // TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    // forget per-document adaptive state so the next call starts clean
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) ArrayList(java.util.ArrayList) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) Span(opennlp.tools.util.Span) LinkedHashMap(java.util.LinkedHashMap) NameFinderME(opennlp.tools.namefind.NameFinderME) List(java.util.List) ArrayList(java.util.ArrayList)

Example 4 with Span

use of opennlp.tools.util.Span in project stanbol by apache.

the class NEREngineCore method extractNameOccurrences.

/**
 * Extracts NamedEntity occurrences from plain text by first detecting
 * sentences, then tokenizing and running the name finder per sentence.
 *
 * @param nameFinderModel the model used to find NamedEntities
 * @param text the raw text to analyze
 * @param language the language of the text (used to pick the tokenizer)
 * @return the found named Entity Occurrences, keyed by surface form
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    // NOTE(review): sentence spans are detected on textWithDots but covered
    // text is taken from the cleaned `text`; this assumes the cleaning below
    // is length-preserving — confirm removeNonUtf8CompliantCharacters only
    // replaces characters in place.
    text = removeNonUtf8CompliantCharacters(text);
    // NOTE(review): the sentence model is hard-coded to "en" despite the
    // language parameter — confirm whether this is intentional.
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
        // build a context by concatenating up to three sentences to be used
        // for similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");
        // extract the names in the current sentence and
        // store them together with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(), tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            // NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            // prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) {
                // fall back to the old per-token product if it is not set.
                // BUG FIX: prob is necessarily 0.0 in this branch, so the
                // original `prob *= probs[k]` always yielded 0.0; start the
                // product from 1.0 (as the legacy implementation did).
                double product = 1.0;
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    product *= probs[k];
                }
                confidence = Double.valueOf(product);
            } else if (confidence < 0.5d) {
                // It looks like as if perceptron based models do return
                // invalid probabilities. As it is expected the Named Entities
                // with a probability < 50% are not even returned by finder.find(..)
                // we will just ignore confidence values < 0.5 here
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    // forget per-document adaptive state so the next call starts clean
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) ArrayList(java.util.ArrayList) SentenceDetectorME(opennlp.tools.sentdetect.SentenceDetectorME) Span(opennlp.tools.util.Span) LinkedHashMap(java.util.LinkedHashMap) NameFinderME(opennlp.tools.namefind.NameFinderME) List(java.util.List) ArrayList(java.util.ArrayList) Tokenizer(opennlp.tools.tokenize.Tokenizer)

Example 5 with Span

use of opennlp.tools.util.Span in project stanbol by apache.

the class PosTypeChunker method chunkAsSpans.

/**
     * Builds chunks over the parsed tokens based on their POS tags. <p>
     * This method is the equivalent to 
     * {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])}
     * @param tokens the tokens
     * @param tags the POS tags for the tokens
     * @return the chunks as spans over the parsed tokens
     */
public Span[] chunkAsSpans(String[] tokens, String[] tags) {
    // lower bound for backwards extension; kept for parity with the original
    // implementation, where the "consumed = end" update is commented out
    int consumed = -1;
    List<Span> result = new ArrayList<Span>();
    int idx = 0;
    while (idx < tokens.length) {
        // tokens whose tag is not an "include" POS start no chunk
        if (!includePOS(null, tags[idx])) {
            idx++;
            continue;
        }
        // extend the chunk backwards over follow-POS tokens
        int begin = idx;
        while (begin - 1 > consumed && followPOS(null, tags[begin - 1])) {
            begin--;
        }
        // probe forward over follow-POS tokens; the chunk end only advances
        // when the probed token is itself an "include" POS
        int probe = idx;
        int last = idx;
        while (probe + 1 < tokens.length && followPOS(null, tags[probe + 1])) {
            probe++;
            if (includePOS(null, tags[probe])) {
                last = probe;
            }
        }
        result.add(new Span(begin, last));
        // resume scanning after everything the probe covered
        idx = probe + 1;
    }
    return result.toArray(new Span[result.size()]);
}
Also used : ArrayList(java.util.ArrayList) Span(opennlp.tools.util.Span)

Aggregations

Span (opennlp.tools.util.Span)10 ArrayList (java.util.ArrayList)6 NameFinderME (opennlp.tools.namefind.NameFinderME)4 LinkedHashMap (java.util.LinkedHashMap)2 List (java.util.List)2 TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel)2 Parse (opennlp.tools.parser.Parse)2 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)2 TextAnnotation (org.elasticsearch.service.opennlp.models.TextAnnotation)2 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 InputStream (java.io.InputStream)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 Scanner (java.util.Scanner)1 PerformanceMonitor (opennlp.tools.cmdline.PerformanceMonitor)1 SentenceDetectorME (opennlp.tools.sentdetect.SentenceDetectorME)1 Tokenizer (opennlp.tools.tokenize.Tokenizer)1 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)1 Section (org.apache.stanbol.enhancer.nlp.model.Section)1