Search in sources :

Example 1 with NameFinderME

use of opennlp.tools.namefind.NameFinderME in project stanbol by apache.

From the class NEREngineCore, method extractNameOccurrences:

/**
 * Extracts NamedEntity occurrences from raw text with the given OpenNLP
 * name finder model. Sentences are detected first, each sentence is
 * tokenized, and the name finder is run per sentence. A three-sentence
 * window around each hit is stored as the occurrence context.
 *
 * @param nameFinderModel the model used to find NamedEntities
 * @param text the plain text to analyse
 * @param language the language of the text (used to select the tokenizer)
 * @return the found named entity occurrences, keyed by entity surface form
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);
    // NOTE(review): the sentence model is hard-coded to "en" although a
    // 'language' parameter is available - confirm whether this is intentional
    // (e.g. only an English sentence model is shipped).
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    // Sentence boundaries are detected on textWithDots but the covered text
    // is later taken from the cleaned 'text'. This only works because both
    // transformations are length-preserving ("\n\n" -> ".\n" keeps length;
    // removeNonUtf8CompliantCharacters presumably replaces characters
    // in place - TODO confirm).
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");
        // extract the names in the current sentence and
        // store them together with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            // map the token-based name span back to character offsets in the sentence
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(), tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            // NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            // prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) {
                // fall back to the old per-token probabilities if the span
                // probability is not set: multiply them over the name's tokens.
                // BUGFIX: the product must start at 1.0 - prob is 0.0 on this
                // path (that is why the fallback ran), so the original
                // 'prob *= probs[k]' always produced a confidence of 0.0.
                prob = 1.0;
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                // It looks like as if perceptron based models do return
                // invalid probabilities. As it is expected that Named Entities
                // with a probability < 50% are not even returned by finder.find(..)
                // we will just ignore confidence values < 0.5 here
                confidence = null;
            }
            // convert the sentence-relative start offset to an absolute one
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence);
            // group occurrences of the same surface form under one key
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    // reset the finder's adaptive data so state does not leak between documents
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) ArrayList(java.util.ArrayList) SentenceDetectorME(opennlp.tools.sentdetect.SentenceDetectorME) Span(opennlp.tools.util.Span) LinkedHashMap(java.util.LinkedHashMap) NameFinderME(opennlp.tools.namefind.NameFinderME) List(java.util.List) ArrayList(java.util.ArrayList) Tokenizer(opennlp.tools.tokenize.Tokenizer)

Example 2 with NameFinderME

use of opennlp.tools.namefind.NameFinderME in project stanbol by apache.

From the class NEREngineCore, method extractNameOccurrences (AnalysedText variant):

/**
 * This method extracts NamedEntity occurrences by using existing {@link Token}s and
 * {@link Sentence}s in the parsed {@link AnalysedText}.
 * @param nameFinderModel the model used to find NamedEntities
 * @param at the Analysed Text
 * @param language the language of the text
 * @return the found named Entity Occurrences
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    // NOTE(review): the 'language' parameter is unused in this variant;
    // tokens and sentences come pre-computed from the AnalysedText.
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    // collect the sentence annotations of the analysed text
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) {
        // no sentence annotations
        // process as a single section
        sentences.add(at);
    }
    for (int i = 0; i < sentences.size(); i++) {
        String sentence = sentences.get(i).getSpan();
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        contextElements.add(sentence);
        // three sentences as context: from the start of the previous sentence
        // (or this one, at the beginning) to the end of the next sentence
        // (or this one, at the end)
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(), sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
        // get the tokens, words of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext(); ) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        // int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            // map the token-based span back to absolute character offsets
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            // confidence is the product of the per-token probabilities
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            // create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context, confidence);
            // group occurrences of the same surface form under one key
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            // add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            // TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    // reset the finder's adaptive data so state does not leak between calls
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) ArrayList(java.util.ArrayList) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) Span(opennlp.tools.util.Span) LinkedHashMap(java.util.LinkedHashMap) NameFinderME(opennlp.tools.namefind.NameFinderME) List(java.util.List) ArrayList(java.util.ArrayList)

Example 3 with NameFinderME

use of opennlp.tools.namefind.NameFinderME in project textdb by TextDB.

From the class NameFinderExample, method main:

/**
 * Command-line demo: runs an OpenNLP location name finder over each line of a
 * sample abstracts file, printing the tokens and detected spans and finally
 * the total number of entity tokens found.
 *
 * @param args unused
 * @throws IOException if the data or model file cannot be read
 */
public static void main(String[] args) throws IOException {
    String dataFile = "./src/main/resources/abstract_100.txt";
    TokenNameFinderModel model;
    // BUGFIX: the original leaked the stream if model construction threw;
    // try-with-resources closes it on every path.
    try (InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/texera/sandbox/OpenNLPexample/en-ner-location.bin")) {
        model = new TokenNameFinderModel(is);
    }
    NameFinderME nameFinder = new NameFinderME(model);
    int counter = 0;
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    perfMon.start();
    try (Scanner scan = new Scanner(new File(dataFile))) {
        while (scan.hasNextLine()) {
            String[] sentence = Tokenize(scan.nextLine());
            Span[] spans = nameFinder.find(sentence);
            perfMon.incrementCounter();
            // Print the tokens of the sentence when at least one name was found
            if (spans.length != 0) {
                for (String s : sentence) {
                    System.out.print("[" + s + "] ");
                }
                // BUGFIX: the original printed the literal string "/n";
                // the intended escape was clearly the newline "\n".
                System.out.println("\n");
            }
            // Print each span and the tokens it covers
            for (Span s : spans) {
                System.out.println(s.toString());
                for (int i = s.getStart(); i < s.getEnd(); i++) {
                    System.out.println(sentence[i]);
                    counter++;
                }
            }
            if (spans.length != 0) {
                System.out.println();
            }
        }
    }
    perfMon.stopAndPrintFinalResult();
    System.out.println("Number of Results: " + counter);
}
Also used : Scanner(java.util.Scanner) TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) NameFinderME(opennlp.tools.namefind.NameFinderME) PerformanceMonitor(opennlp.tools.cmdline.PerformanceMonitor) File(java.io.File) Span(opennlp.tools.util.Span) FileInputStream(java.io.FileInputStream)

Example 4 with NameFinderME

use of opennlp.tools.namefind.NameFinderME in project elasticsearch-opennlp-plugin by spinscale.

From the class OpenNlpService, method tokenize:

/**
 * Runs every configured name finder model over the given text and collects
 * the detected named entities, grouped by entity type.
 *
 * @param content the raw text to analyse
 * @return a map from entity type to the set of entity strings found
 */
public Map<String, Set<String>> tokenize(String content) {
    Map<String, Set<String>> namedEntities = Maps.newHashMap();
    List<TextAnnotation> annotations = new ArrayList<TextAnnotation>();
    // Tokenize once; every finder operates on the same token sequence.
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
    for (Map.Entry<String, TokenNameFinderModel> entry : finders.entrySet()) {
        String entityType = entry.getKey();
        NameFinderME nameFinder = new NameFinderME(entry.getValue());
        Span[] spans = nameFinder.find(tokens);
        double[] probabilities = nameFinder.probs(spans);
        for (int idx = 0; idx < spans.length; idx++) {
            annotations.add(new TextAnnotation(entityType, spans[idx], probabilities[idx]));
        }
    }
    // Resolve overlapping detections from different finders before
    // converting the spans back into entity strings.
    if (!annotations.isEmpty()) {
        removeConflicts(annotations);
    }
    convertTextAnnotationsToNamedEntities(tokens, annotations, namedEntities);
    return namedEntities;
}
Also used : TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) PooledTokenNameFinderModel(org.elasticsearch.service.opennlp.models.PooledTokenNameFinderModel) Span(opennlp.tools.util.Span) NameFinderME(opennlp.tools.namefind.NameFinderME) TextAnnotation(org.elasticsearch.service.opennlp.models.TextAnnotation)

Example 5 with NameFinderME

use of opennlp.tools.namefind.NameFinderME in project elasticsearch-opennlp-plugin by spinscale.

From the class SimpleNlpTest, method loadFinders:

/**
 * Loads one NameFinderME per configured model name from the test resources
 * directory into the {@code finders} array, timing the load with a StopWatch.
 *
 * @throws Exception if a model file is missing or cannot be parsed
 */
public void loadFinders() throws Exception {
    finders = new NameFinderME[names.length];
    StopWatch sw = new StopWatch("Loading models").start();
    for (int mi = 0; mi < names.length; mi++) {
        // BUGFIX: the original never closed the FileInputStream, leaking one
        // file handle per model; try-with-resources closes it reliably.
        try (FileInputStream in = new FileInputStream(new File("src/test/resources/models", "en-ner-" + names[mi] + ".bin"))) {
            finders[mi] = new NameFinderME(new PooledTokenNameFinderModel(in));
        }
    }
    sw.stop();
}
Also used : NameFinderME(opennlp.tools.namefind.NameFinderME) File(java.io.File) FileInputStream(java.io.FileInputStream) StopWatch(org.elasticsearch.common.StopWatch) PooledTokenNameFinderModel(org.elasticsearch.service.opennlp.models.PooledTokenNameFinderModel)

Aggregations

NameFinderME (opennlp.tools.namefind.NameFinderME)9 TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel)6 Span (opennlp.tools.util.Span)6 File (java.io.File)3 FileInputStream (java.io.FileInputStream)3 InputStream (java.io.InputStream)2 ArrayList (java.util.ArrayList)2 LinkedHashMap (java.util.LinkedHashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 GZIPInputStream (java.util.zip.GZIPInputStream)2 PerformanceMonitor (opennlp.tools.cmdline.PerformanceMonitor)2 SentenceDetectorME (opennlp.tools.sentdetect.SentenceDetectorME)2 TokenizerME (opennlp.tools.tokenize.TokenizerME)2 TokenizerModel (opennlp.tools.tokenize.TokenizerModel)2 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)2 PooledTokenNameFinderModel (org.elasticsearch.service.opennlp.models.PooledTokenNameFinderModel)2 IOException (java.io.IOException)1 URISyntaxException (java.net.URISyntaxException)1 SentenceModel (opennlp.tools.sentdetect.SentenceModel)1