use of opennlp.tools.sentdetect.SentenceDetectorME in project stanbol by apache.
the class TextAnalyzer method getSentenceDetector.
/**
 * Returns the sentence detector for the current {@code language}, creating
 * it on first use.
 * <p>
 * The detector is built from the model returned by
 * {@code openNLP.getSentenceModel(language)} and kept in the
 * {@code sentenceDetector} field. If no model is returned, or loading fails
 * with an {@link IOException}, {@code sentenceDetectorNotAvailable} is set
 * so that later calls do not try to load the model again.
 *
 * @return the sentence detector, or <code>null</code> if sentence detection
 * is disabled in the configuration or no model could be loaded
 */
protected final SentenceDetector getSentenceDetector() {
    if (!config.enableSentenceDetector) {
        return null;
    }
    // a lookup is only needed if nothing was created yet and no earlier
    // attempt has been marked as failed
    final boolean lookupPending = sentenceDetector == null && !sentenceDetectorNotAvailable;
    if (!lookupPending) {
        return sentenceDetector;
    }
    try {
        SentenceModel loadedModel = openNLP.getSentenceModel(language);
        if (loadedModel == null) {
            log.debug("No Sentence Detection Model for language '{}'", language);
            sentenceDetectorNotAvailable = true;
        } else {
            sentenceDetector = new SentenceDetectorME(loadedModel);
        }
    } catch (IOException e) {
        log.info("Unable to load Sentence Detection Model for language '" + language + "'", e);
        sentenceDetectorNotAvailable = true;
    }
    return sentenceDetector;
}
use of opennlp.tools.sentdetect.SentenceDetectorME in project stanbol by apache.
the class NEREngineCore method extractNameOccurrences.
/**
 * Runs the parsed name finder model over the sentences of the text and
 * collects every detected name together with its position, type, context
 * and confidence.
 * <p>
 * Sentences are detected on a copy of the text in which double line breaks
 * are replaced by <code>".\n"</code>; the resulting spans are then applied to
 * the text returned by {@code removeNonUtf8CompliantCharacters(text)}. The
 * context of an occurrence is the previous, the current and the next
 * sentence joined by a single space.
 *
 * @param nameFinderModel the model used to create the {@link NameFinderME}
 * @param text the text to analyse
 * @param language the language used to obtain the tokenizer
 * @return the occurrences grouped by the name text, in the order the names
 * were first encountered. Empty if no names were found.
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);
    // NOTE(review): the sentence model is always requested for "en", regardless
    // of the parsed language - confirm that this is intended
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String coveredText = sentenceSpans[i].getCoveredText(text).toString();
        String sentence = coveredText.trim();
        // number of leading characters removed by trim(); needed to map
        // offsets within the trimmed sentence back to offsets within the text
        int leadingTrimmed = coveredText.indexOf(sentence);
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence);
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");
        // extract the names in the current sentence and
        // store them with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            // name spans are token indexes (end exclusive); convert them to
            // character offsets within the sentence
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int end = tokenSpans[nameSpans[j].getEnd() - 1].getEnd();
            String name = sentence.substring(start, end);
            // NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            Double confidence;
            if (prob == 0.0) {
                // prob == 0.0 := unspecified
                // fall back to the product of the probabilities of the tokens
                // of the name. The product needs to start with 1.0 as
                // multiplying into the (zero) span probability always
                // results in 0.0
                double tokenProbProduct = 1.0d;
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    tokenProbProduct *= probs[k];
                }
                confidence = Double.valueOf(tokenProbProduct);
            } else if (prob < 0.5d) {
                // It looks as if perceptron based models do return
                // invalid probabilities. As it is expected that Named Entities
                // with a probability < 50% are not even returned by finder.find(..)
                // we will just ignore confidence values < 0.5 here
                confidence = null;
            } else {
                confidence = Double.valueOf(prob);
            }
            int absoluteStart = sentenceSpans[i].getStart() + leadingTrimmed + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
                nameOccurrences.put(name, occurrences);
            }
            occurrences.add(occurrence);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
use of opennlp.tools.sentdetect.SentenceDetectorME in project stanbol by apache.
the class OpenNlpSentenceDetectionEngine method getSentenceDetector.
/**
 * Creates a {@link SentenceDetectorME} for the parsed language.
 * <p>
 * If a model name is configured for the language (parameter
 * {@code MODEL_NAME_PARAM} of the language configuration) that model is
 * loaded via {@code openNLP.getModel(SentenceModel.class, modelName, null)}.
 * Otherwise the default model is requested via
 * {@code openNLP.getSentenceModel(language)}. Any exception thrown while
 * loading is logged as a warning and results in <code>null</code>.
 *
 * @param language the language
 * @return a sentence detector using the loaded model, or <code>null</code>
 * if no model is available or loading the model failed
 */
private SentenceDetector getSentenceDetector(String language) {
    String configuredModelName = languageConfig.getParameter(language, MODEL_NAME_PARAM);
    SentenceModel sentenceModel;
    if (configuredModelName != null) {
        // an explicitly configured model takes precedence over the default
        try {
            sentenceModel = openNLP.getModel(SentenceModel.class, configuredModelName, null);
        } catch (Exception e) {
            log.warn("Unable to load Sentence Detection model for language '" + language + "' from the configured model '" + configuredModelName + "'!", e);
            return null;
        }
    } else {
        try {
            sentenceModel = openNLP.getSentenceModel(language);
        } catch (Exception e) {
            log.warn("Unable to load default Sentence Detection model for language '" + language + "'!", e);
            return null;
        }
    }
    if (sentenceModel == null) {
        log.debug("Sentence Detection Model for Language '{}' not available.", language);
        return null;
    }
    Object[] modelInfo = new Object[] {
        sentenceModel.getClass().getSimpleName(),
        sentenceModel.getLanguage(),
        sentenceModel.getVersion() != null ? sentenceModel.getVersion() : "undefined"
    };
    log.debug("Sentence Detection Model {} for lanugage '{}' version: {}", modelInfo);
    return new SentenceDetectorME(sentenceModel);
}
use of opennlp.tools.sentdetect.SentenceDetectorME in project epadd by ePADD.
the class NER method initialize.
/**
 * Loads the English person, location and organization name finder models
 * as well as the sentence detector and tokenizer models, and stores the
 * resulting components in the static fields of this class.
 * <p>
 * The method returns immediately if the person name finder is already set.
 * Failures while loading are logged and not propagated; components that
 * could not be loaded are left unset. Every model stream is closed as soon
 * as its model has been read.
 *
 * @throws ClassCastException declared for callers; load failures are caught
 * and logged within this method
 * @throws IOException declared for callers; load failures are caught and
 * logged within this method
 * @throws ClassNotFoundException declared for callers; load failures are
 * caught and logged within this method
 */
public static synchronized void initialize() throws ClassCastException, IOException, ClassNotFoundException {
    if (pFinder != null)
        return;
    long startTimeMillis = System.currentTimeMillis();
    log.info("Initializing NER models");
    try {
        try (InputStream pis = Config.getResourceAsStream("models/en-ner-person.bin")) {
            pFinder = new NameFinderME(new TokenNameFinderModel(pis));
        }
        try (InputStream lis = Config.getResourceAsStream("models/en-ner-location.bin")) {
            lFinder = new NameFinderME(new TokenNameFinderModel(lis));
        }
        try (InputStream ois = Config.getResourceAsStream("models/en-ner-organization.bin")) {
            oFinder = new NameFinderME(new TokenNameFinderModel(ois));
        }
    } catch (Exception e) {
        // don't bother about this, instead try not to use the name finders
        Util.print_exception(e, log);
    }
    try {
        try (InputStream modelIn = Config.getResourceAsStream("models/en-sent.bin")) {
            sFinder = new SentenceDetectorME(new SentenceModel(modelIn));
        }
        try (InputStream tokenStream = Config.getResourceAsStream("models/en-token.bin")) {
            tokenizer = new TokenizerME(new TokenizerModel(tokenStream));
        }
    } catch (Exception e) {
        // report through the class logger, same as for the name finder models
        Util.print_exception(e, log);
    }
    long endTimeMillis = System.currentTimeMillis();
    log.info("Done initializing NER model in " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
}
use of opennlp.tools.sentdetect.SentenceDetectorME in project useful-java-links by Vedenin.
the class OpenNLPSentenceDetectors method testOpenNLP.
/**
 * Splits the text into sentences using an OpenNLP sentence detector whose
 * model is read from the class path resource {@code RESOURCES_EN_SENT_BIN}.
 *
 * @param text the text to split
 * @return the detected sentences
 * @throws Exception if the model cannot be read or sentence detection fails
 */
private String[] testOpenNLP(String text) throws Exception {
    // the stream is closed automatically once the sentences were detected
    try (InputStream sentenceModelStream = getClass().getResourceAsStream(RESOURCES_EN_SENT_BIN)) {
        return new SentenceDetectorME(new SentenceModel(sentenceModelStream)).sentDetect(text);
    }
}
Aggregations