
Example 1 with TokenNameFinderModel

Use of opennlp.tools.namefind.TokenNameFinderModel in project elasticsearch-opennlp-plugin by spinscale.

Class OpenNlpService, method tokenize.

public Map<String, Set<String>> tokenize(String content) {
    Map<String, Set<String>> namedEntities = Maps.newHashMap();
    List<TextAnnotation> allTextAnnotations = new ArrayList<TextAnnotation>();
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
    for (Map.Entry<String, TokenNameFinderModel> finderEntry : finders.entrySet()) {
        String type = finderEntry.getKey();
        NameFinderME finder = new NameFinderME(finderEntry.getValue());
        Span[] spans = finder.find(tokens);
        double[] probs = finder.probs(spans);
        for (int ni = 0; ni < spans.length; ni++) {
            allTextAnnotations.add(new TextAnnotation(type, spans[ni], probs[ni]));
        }
    }
    if (allTextAnnotations.size() > 0) {
        removeConflicts(allTextAnnotations);
    }
    convertTextAnnotationsToNamedEntities(tokens, allTextAnnotations, namedEntities);
    return namedEntities;
}
Also used: TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel), PooledTokenNameFinderModel (org.elasticsearch.service.opennlp.models.PooledTokenNameFinderModel), Span (opennlp.tools.util.Span), NameFinderME (opennlp.tools.namefind.NameFinderME), TextAnnotation (org.elasticsearch.service.opennlp.models.TextAnnotation)
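
Note: the finders map used above (entity type mapped to its TokenNameFinderModel) is built elsewhere in OpenNlpService and is not shown here. A minimal sketch of how such a map could be populated with the plain OpenNLP API, assuming hypothetical model file paths:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import opennlp.tools.namefind.TokenNameFinderModel;

public class FinderModelLoader {
    // The model file names below are illustrative assumptions, not taken from the plugin.
    public static Map<String, TokenNameFinderModel> loadFinders() throws IOException {
        Map<String, TokenNameFinderModel> finders = new HashMap<>();
        for (String type : new String[] { "person", "location", "organization" }) {
            try (InputStream in = new FileInputStream("models/en-ner-" + type + ".bin")) {
                finders.put(type, new TokenNameFinderModel(in));
            }
        }
        return finders;
    }
}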

Example 2 with TokenNameFinderModel

Use of opennlp.tools.namefind.TokenNameFinderModel in project stanbol by apache.

Class NEREngineCore, method computeEnhancements.

public void computeEnhancements(ContentItem ci) throws EngineException {
    //first check the language before processing the content (text)
    String language = extractLanguage(ci);
    if (language == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    if (!isNerModel(language)) {
        throw new IllegalStateException("For the language '" + language + "' of ContentItem " + ci.getUri() + " no NER model is configured: This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    //validate data in the AnalysedText
    final String text;
    if (at != null && at.getTokens().hasNext()) {
        //if the AnalysedText is present and tokens are present
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}", ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
        }
        text = null;
    } else {
        //no AnalysedText with tokens ...
        //fallback to processing the plain text is still supported
        Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
        if (contentPart == null) {
            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
        }
        try {
            text = ContentItemHelper.getText(contentPart.getValue());
        } catch (IOException e) {
            throw new InvalidContentException(this, ci, e);
        }
        if (text.trim().length() == 0) {
            // TODO: make the length of the data a field of the ContentItem
            // interface to be able to filter out empty items in the canEnhance
            // method
            log.warn("ContentPart {} of ContentItem {} does not contain any text" + "to extract knowledge from in ContentItem {}", contentPart.getKey(), ci);
            return;
        }
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}", new Object[] { contentPart.getKey(), ci.getUri().getUnicodeString(), StringUtils.abbreviate(text, 100) });
        }
    }
    try {
        if (config.isProcessedLangage(language)) {
            for (String defaultModelType : config.getDefaultModelTypes()) {
                TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
                if (nameFinderModel == null) {
                    log.info("No NER Model for {} and language {} available!", defaultModelType, language);
                } else {
                    findNamedEntities(ci, at, text, language, nameFinderModel);
                }
            }
        }
        //process for additional models
        for (String additionalModel : config.getSpecificNerModles(language)) {
            TokenNameFinderModel nameFinderModel;
            try {
                nameFinderModel = openNLP.getModel(TokenNameFinderModel.class, additionalModel, null);
                findNamedEntities(ci, at, text, language, nameFinderModel);
            } catch (IOException e) {
                log.warn("Unable to load TokenNameFinderModel model for language '" + language + "' (model: " + additionalModel + ")", e);
            } catch (RuntimeException e) {
                log.warn("Error while creating ChunkerModel for language '" + language + "' (model: " + additionalModel + ")", e);
            }
        }
    } catch (Exception e) {
        if (e instanceof RuntimeException) {
            throw (RuntimeException) e;
        } else {
            throw new EngineException(this, ci, e);
        }
    }
}
Also used: AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText), IRI (org.apache.clerezza.commons.rdf.IRI), TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel), Blob (org.apache.stanbol.enhancer.servicesapi.Blob), InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException), EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), IOException (java.io.IOException), InvalidFormatException (opennlp.tools.util.InvalidFormatException)
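
Note: findNamedEntities is a method of NEREngineCore whose body is not shown above. At its core, applying a TokenNameFinderModel to already tokenized sentences looks roughly like the following sketch (the enhancement/RDF output the Stanbol engine produces is replaced here by simple printing):

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;

public class NerSketch {
    // Runs the name finder sentence by sentence over pre-tokenized text.
    public static void findEntities(TokenNameFinderModel model, String[][] sentences) {
        NameFinderME finder = new NameFinderME(model);
        for (String[] tokens : sentences) {
            Span[] spans = finder.find(tokens);
            // convert each detected span back to its surface form
            for (String name : Span.spansToStrings(spans, tokens)) {
                System.out.println(name);
            }
        }
        // reset the document-level adaptive data once the document is processed
        finder.clearAdaptiveData();
    }
}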

Example 3 with TokenNameFinderModel

Use of opennlp.tools.namefind.TokenNameFinderModel in project stanbol by apache.

Class OpenNLPTest, method testLoadModelByName.

@Test
public void testLoadModelByName() throws IOException {
    TokenizerModel tokenModel = openNLP.getModel(TokenizerModel.class, "en-token.bin", null);
    Assert.assertNotNull(tokenModel);
    SentenceModel sentModel = openNLP.getModel(SentenceModel.class, "en-sent.bin", null);
    Assert.assertNotNull(sentModel);
    POSModel posModel = openNLP.getModel(POSModel.class, "en-pos-maxent.bin", null);
    Assert.assertNotNull(posModel);
    ChunkerModel chunkModel = openNLP.getModel(ChunkerModel.class, "en-chunker.bin", null);
    Assert.assertNotNull(chunkModel);
    TokenNameFinderModel nerModel = openNLP.getModel(TokenNameFinderModel.class, "en-ner-person.bin", null);
    Assert.assertNotNull(nerModel);
    //unavailable model
    tokenModel = openNLP.getModel(TokenizerModel.class, "ru-token.bin", null);
    Assert.assertNull(tokenModel);
}
Also used: TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel), ChunkerModel (opennlp.tools.chunker.ChunkerModel), SentenceModel (opennlp.tools.sentdetect.SentenceModel), POSModel (opennlp.tools.postag.POSModel), TokenizerModel (opennlp.tools.tokenize.TokenizerModel), Test (org.junit.Test)
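
Note: openNLP here is Stanbol's OpenNLP helper service, which resolves and loads the model files; the loading itself is not part of the test. Once loaded, such models are typically chained into a sentence detection, tokenization and name finding pipeline. A minimal sketch of that chain using the plain OpenNLP classes:

import java.util.Arrays;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

public class NerPipelineSketch {
    // Chains sentence detection, tokenization and name finding over raw text.
    public static void run(SentenceModel sentModel, TokenizerModel tokenModel,
                           TokenNameFinderModel nerModel, String text) {
        SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
        TokenizerME tokenizer = new TokenizerME(tokenModel);
        NameFinderME nameFinder = new NameFinderME(nerModel);
        for (String sentence : sentenceDetector.sentDetect(text)) {
            String[] tokens = tokenizer.tokenize(sentence);
            for (Span span : nameFinder.find(tokens)) {
                String[] entityTokens = Arrays.copyOfRange(tokens, span.getStart(), span.getEnd());
                System.out.println(span.getType() + ": " + String.join(" ", entityTokens));
            }
        }
        nameFinder.clearAdaptiveData();
    }
}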

Example 4 with TokenNameFinderModel

Use of opennlp.tools.namefind.TokenNameFinderModel in project stanbol by apache.

Class OpenNLPTest, method testLoadMissingNER.

@Test
public void testLoadMissingNER() throws IOException {
    //first unknown type
    TokenNameFinderModel model = openNLP.getNameModel("person2", "en");
    Assert.assertNull(model);
    TokenNameFinder ner = openNLP.getNameFinder("person2", "en");
    Assert.assertNull(ner);
    //unknown language
    model = openNLP.getNameModel("person", "ru");
    Assert.assertNull(model);
    ner = openNLP.getNameFinder("person", "ru");
    Assert.assertNull(ner);
}
Also used: TokenNameFinder (opennlp.tools.namefind.TokenNameFinder), TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel), Test (org.junit.Test)

Example 5 with TokenNameFinderModel

Use of opennlp.tools.namefind.TokenNameFinderModel in project textdb by TextDB.

Class NameFinderExample, method main.

public static void main(String[] args) throws IOException {
    String dataFile = "./src/main/resources/abstract_100.txt";
    Scanner scan = new Scanner(new File(dataFile));
    InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-ner-location.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(is);
    is.close();
    NameFinderME nameFinder = new NameFinderME(model);
    int counter = 0;
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    perfMon.start();
    while (scan.hasNextLine()) {
        String[] sentence = Tokenize(scan.nextLine());
        Span[] spans = nameFinder.find(sentence);
        perfMon.incrementCounter();
        //Print out the tokens of the sentence
        if (spans.length != 0) {
            for (String s : sentence) {
                System.out.print("[" + s + "] ");
            }
            System.out.println("/n");
        }
        //Print out the offset of each detected named entity and its covered tokens
        for (Span s : spans) {
            System.out.println(s.toString());
            for (int i = s.getStart(); i < s.getEnd(); i++) {
                System.out.println(sentence[i]);
                counter++;
            }
        }
        if (spans.length != 0)
            System.out.println();
    }
    perfMon.stopAndPrintFinalResult();
    System.out.println("Number of Results: " + counter);
    scan.close();
}
Also used: Scanner (java.util.Scanner), TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel), FileInputStream (java.io.FileInputStream), InputStream (java.io.InputStream), NameFinderME (opennlp.tools.namefind.NameFinderME), PerformanceMonitor (opennlp.tools.cmdline.PerformanceMonitor), File (java.io.File), Span (opennlp.tools.util.Span)
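
Note: Tokenize is a helper defined elsewhere in NameFinderExample and is not shown here. A minimal stand-in using OpenNLP's rule-based SimpleTokenizer (the original may instead use a TokenizerME with en-token.bin):

import opennlp.tools.tokenize.SimpleTokenizer;

public class TokenizeSketch {
    // Stand-in for the example's Tokenize(String) helper: splits one sentence into tokens.
    public static String[] Tokenize(String sentence) {
        return SimpleTokenizer.INSTANCE.tokenize(sentence);
    }
}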

Aggregations

TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel): 7
NameFinderME (opennlp.tools.namefind.NameFinderME): 3
Test (org.junit.Test): 3
IOException (java.io.IOException): 2
TokenNameFinder (opennlp.tools.namefind.TokenNameFinder): 2
Span (opennlp.tools.util.Span): 2
File (java.io.File): 1
FileInputStream (java.io.FileInputStream): 1
InputStream (java.io.InputStream): 1
URISyntaxException (java.net.URISyntaxException): 1
Scanner (java.util.Scanner): 1
ChunkerModel (opennlp.tools.chunker.ChunkerModel): 1
PerformanceMonitor (opennlp.tools.cmdline.PerformanceMonitor): 1
POSModel (opennlp.tools.postag.POSModel): 1
SentenceModel (opennlp.tools.sentdetect.SentenceModel): 1
TokenizerModel (opennlp.tools.tokenize.TokenizerModel): 1
InvalidFormatException (opennlp.tools.util.InvalidFormatException): 1
IRI (org.apache.clerezza.commons.rdf.IRI): 1
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText): 1
Blob (org.apache.stanbol.enhancer.servicesapi.Blob): 1