Search in sources :

Example 1 with SentenceDetectorME

Use of opennlp.tools.sentdetect.SentenceDetectorME in project stanbol by apache.

From the class TextAnalyzer, method getSentenceDetector.

/**
 * Returns the sentence detector for the configured language, creating it
 * on first use.
 *
 * @return the detector, or <code>null</code> if sentence detection is
 * disabled in the config, no model exists for the language, or loading
 * the model failed
 */
protected final SentenceDetector getSentenceDetector() {
    if (!config.enableSentenceDetector) {
        return null;
    }
    // Already created, or an earlier attempt established that no model can be used.
    if (sentenceDetector != null || sentenceDetectorNotAvailable) {
        return sentenceDetector;
    }
    try {
        SentenceModel model = openNLP.getSentenceModel(language);
        if (model == null) {
            log.debug("No Sentence Detection Model for language '{}'", language);
            // remember the miss so the lookup is not repeated on every call
            sentenceDetectorNotAvailable = true;
        } else {
            sentenceDetector = new SentenceDetectorME(model);
        }
    } catch (IOException e) {
        log.info("Unable to load Sentence Detection Model for language '" + language + "'", e);
        sentenceDetectorNotAvailable = true;
    }
    return sentenceDetector;
}
Also used : SentenceModel(opennlp.tools.sentdetect.SentenceModel) SentenceDetectorME(opennlp.tools.sentdetect.SentenceDetectorME) IOException(java.io.IOException)

Example 2 with SentenceDetectorME

Use of opennlp.tools.sentdetect.SentenceDetectorME in project stanbol by apache.

From the class NEREngineCore, method extractNameOccurrences.

/**
 * Runs the name finder over every sentence of the text and collects the
 * detected names together with their position, type, surrounding context
 * and confidence.
 *
 * @param nameFinderModel the model used to create the {@link NameFinderME}
 * @param text the text to analyse
 * @param language the language used to look up the tokenizer
 * @return the occurrences grouped by the name's text, in the order the
 * names were first encountered
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);
    // NOTE(review): the sentence model is always the English one, regardless
    // of the language argument - confirm this is intended.
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    // NOTE(review): spans are detected on textWithDots but applied to the
    // cleaned text below; this assumes removeNonUtf8CompliantCharacters
    // keeps character offsets unchanged - TODO confirm.
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        // already trimmed above
        contextElements.add(sentence);
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");
        // extract the names in the current sentence and
        // store them with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            Span nameSpan = nameSpans[j];
            // name spans index tokens; map them back to character offsets
            // within the (trimmed) sentence
            int start = tokenSpans[nameSpan.getStart()].getStart();
            int end = tokenSpans[nameSpan.getEnd() - 1].getEnd();
            String name = sentence.substring(start, end);
            // NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpan.getProb();
            Double confidence;
            if (prob == 0.0) {
                // prob == 0.0 := unspecified. Fall back to the old way: the
                // product of the probabilities of the covered tokens. The
                // product has to start at 1.0 - multiplying into the
                // unspecified 0.0 would always result in 0.0.
                double tokenProduct = 1.0d;
                for (int k = nameSpan.getStart(); k < nameSpan.getEnd(); k++) {
                    tokenProduct *= probs[k];
                }
                confidence = Double.valueOf(tokenProduct);
            } else if (prob < 0.5d) {
                // It looks as if perceptron based models do return
                // invalid probabilities. As it is expected that Named Entities
                // with a probability < 50% are not even returned by finder.find(..)
                // we will just ignore confidence values < 0.5 here
                confidence = null;
            } else {
                confidence = Double.valueOf(prob);
            }
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpan.getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
                nameOccurrences.put(name, occurrences);
            }
            occurrences.add(occurrence);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) ArrayList(java.util.ArrayList) SentenceDetectorME(opennlp.tools.sentdetect.SentenceDetectorME) Span(opennlp.tools.util.Span) LinkedHashMap(java.util.LinkedHashMap) NameFinderME(opennlp.tools.namefind.NameFinderME) List(java.util.List) ArrayList(java.util.ArrayList) Tokenizer(opennlp.tools.tokenize.Tokenizer)

Example 3 with SentenceDetectorME

Use of opennlp.tools.sentdetect.SentenceDetectorME in project stanbol by apache.

From the class OpenNlpSentenceDetectionEngine, method getSentenceDetector.

/**
 * Creates a {@link SentenceDetectorME} for the given language using a
 * model obtained from the {@link #openNLP} service. If a model name is
 * configured for the language (parameter {@code MODEL_NAME_PARAM}), that
 * model is loaded via {@link OpenNLP#getModel(Class, String, Map)};
 * otherwise the default model is requested with
 * {@code openNLP.getSentenceModel(language)}.
 * @param language the language
 * @return the sentence detector or <code>null</code> if no model is
 * available or an exception was encountered while loading it
 */
private SentenceDetector getSentenceDetector(String language) {
    final String configuredModel = languageConfig.getParameter(language, MODEL_NAME_PARAM);
    final SentenceModel sentenceModel;
    if (configuredModel != null) {
        // a custom model is configured for this language
        try {
            sentenceModel = openNLP.getModel(SentenceModel.class, configuredModel, null);
        } catch (Exception e) {
            log.warn("Unable to load Sentence Detection model for language '" + language + "' from the configured model '" + configuredModel + "'!", e);
            return null;
        }
    } else {
        // no custom model: use the default one for the language
        try {
            sentenceModel = openNLP.getSentenceModel(language);
        } catch (Exception e) {
            log.warn("Unable to load default Sentence Detection model for language '" + language + "'!", e);
            return null;
        }
    }
    if (sentenceModel == null) {
        log.debug("Sentence Detection Model for Language '{}' not available.", language);
        return null;
    }
    Object modelType = sentenceModel.getClass().getSimpleName();
    Object modelLanguage = sentenceModel.getLanguage();
    Object modelVersion = sentenceModel.getVersion();
    if (modelVersion == null) {
        modelVersion = "undefined";
    }
    log.debug("Sentence Detection Model {} for lanugage '{}' version: {}", new Object[] { modelType, modelLanguage, modelVersion });
    return new SentenceDetectorME(sentenceModel);
}
Also used : SentenceModel(opennlp.tools.sentdetect.SentenceModel) SentenceDetectorME(opennlp.tools.sentdetect.SentenceDetectorME) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ConfigurationException(org.osgi.service.cm.ConfigurationException)

Example 4 with SentenceDetectorME

Use of opennlp.tools.sentdetect.SentenceDetectorME in project epadd by ePADD.

From the class NER, method initialize.

/**
 * Loads the OpenNLP models (person / location / organization name finders,
 * sentence detector and tokenizer) from the bundled resources and stores
 * them in the static fields of this class.
 * <p>
 * The call returns immediately when the person name finder has already
 * been set. Any exception raised while loading is logged and not rethrown,
 * so the corresponding fields may stay unset afterwards.
 */
public static synchronized void initialize() throws ClassCastException, IOException, ClassNotFoundException {
    if (pFinder != null) {
        return;
    }
    long startTimeMillis = System.currentTimeMillis();
    log.info("Initializing NER models");
    try {
        // Each model stream is closed as soon as its model has been read,
        // so no stream is leaked, even when a later model fails to load.
        try (InputStream pis = Config.getResourceAsStream("models/en-ner-person.bin")) {
            pFinder = new NameFinderME(new TokenNameFinderModel(pis));
        }
        try (InputStream lis = Config.getResourceAsStream("models/en-ner-location.bin")) {
            lFinder = new NameFinderME(new TokenNameFinderModel(lis));
        }
        try (InputStream ois = Config.getResourceAsStream("models/en-ner-organization.bin")) {
            oFinder = new NameFinderME(new TokenNameFinderModel(ois));
        }
    } catch (Exception e) {
        // failures are only logged (original note: don't bother about
        // this, instead try not to use it)
        Util.print_exception(e, log);
    }
    try {
        try (InputStream modelIn = Config.getResourceAsStream("models/en-sent.bin")) {
            sFinder = new SentenceDetectorME(new SentenceModel(modelIn));
        }
        try (InputStream tokenStream = Config.getResourceAsStream("models/en-token.bin")) {
            tokenizer = new TokenizerME(new TokenizerModel(tokenStream));
        }
    } catch (Exception e) {
        // report through the class logger, consistent with the handler above
        Util.print_exception(e, log);
    }
    long endTimeMillis = System.currentTimeMillis();
    log.info("Done initializing NER model in " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
}
Also used : TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) GZIPInputStream(java.util.zip.GZIPInputStream) SentenceModel(opennlp.tools.sentdetect.SentenceModel) NameFinderME(opennlp.tools.namefind.NameFinderME) SentenceDetectorME(opennlp.tools.sentdetect.SentenceDetectorME) TokenizerME(opennlp.tools.tokenize.TokenizerME) TokenizerModel(opennlp.tools.tokenize.TokenizerModel)

Example 5 with SentenceDetectorME

Use of opennlp.tools.sentdetect.SentenceDetectorME in project useful-java-links by Vedenin.

From the class OpenNLPSentenceDetectors, method testOpenNLP.

/**
 * Splits the given text into sentences using an OpenNLP sentence model
 * read from the class path resource {@code RESOURCES_EN_SENT_BIN}.
 *
 * @param text the text to split
 * @return the detected sentences
 * @throws Exception if reading the model or closing its stream fails
 */
private String[] testOpenNLP(String text) throws Exception {
    // the model stream is closed automatically when the block is left
    try (InputStream sentenceModelStream = getClass().getResourceAsStream(RESOURCES_EN_SENT_BIN)) {
        final SentenceDetectorME detector = new SentenceDetectorME(new SentenceModel(sentenceModelStream));
        final String[] sentences = detector.sentDetect(text);
        return sentences;
    }
}
Also used : InputStream(java.io.InputStream) SentenceModel(opennlp.tools.sentdetect.SentenceModel) SentenceDetectorME(opennlp.tools.sentdetect.SentenceDetectorME)

Aggregations

SentenceDetectorME (opennlp.tools.sentdetect.SentenceDetectorME)6 SentenceModel (opennlp.tools.sentdetect.SentenceModel)5 InputStream (java.io.InputStream)2 NameFinderME (opennlp.tools.namefind.NameFinderME)2 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 LinkedHashMap (java.util.LinkedHashMap)1 List (java.util.List)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel)1 Tokenizer (opennlp.tools.tokenize.Tokenizer)1 TokenizerME (opennlp.tools.tokenize.TokenizerME)1 TokenizerModel (opennlp.tools.tokenize.TokenizerModel)1 Span (opennlp.tools.util.Span)1 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)1 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)1 ConfigurationException (org.osgi.service.cm.ConfigurationException)1