use of opennlp.tools.namefind.NameFinderME in project stanbol by apache.
the class NEREngineCore method extractNameOccurrences.
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");
        // extract the names in the current sentence and
        // store them together with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        // int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(), tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            // NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            // prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) {
                // fall back to the old way (product of the per-token
                // probabilities) if the span probability is not set;
                // prob must be reset to 1.0 or the product stays 0.0
                prob = 1.0;
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                // It looks as if perceptron based models do return invalid
                // probabilities. As Named Entities with a probability < 50%
                // are not expected to be returned by finder.find(..) at all,
                // we just ignore confidence values < 0.5 here
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
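Two details in this method are easy to get wrong when reusing the pattern: mapping token indices back to character offsets, and the OpenNLP 1.6 probability fallback. Below is a minimal, self-contained sketch of both against the plain OpenNLP API; the model path and the example sentence are placeholders, everything else uses only opennlp.tools classes.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

public class NameFinderOffsetDemo {

    public static void main(String[] args) throws IOException {
        // placeholder model path; any en-ner-*.bin model works here
        try (InputStream in = new FileInputStream("en-ner-person.bin")) {
            NameFinderME finder = new NameFinderME(new TokenNameFinderModel(in));

            String sentence = "Pierre Vinken is 61 years old.";
            // tokenizePos keeps character offsets relative to the sentence
            Span[] tokenSpans = SimpleTokenizer.INSTANCE.tokenizePos(sentence);
            String[] tokens = Span.spansToStrings(tokenSpans, sentence);

            Span[] nameSpans = finder.find(tokens);
            double[] tokenProbs = finder.probs();

            for (Span name : nameSpans) {
                // map token indices back to character offsets in the sentence
                int start = tokenSpans[name.getStart()].getStart();
                int end = tokenSpans[name.getEnd() - 1].getEnd();

                // OpenNLP 1.6+ stores the span probability directly; 0.0 means
                // "unspecified", in which case we fall back to the product of
                // the per-token probabilities, as the Stanbol code does
                double prob = name.getProb();
                if (prob == 0.0) {
                    prob = 1.0;
                    for (int k = name.getStart(); k < name.getEnd(); k++) {
                        prob *= tokenProbs[k];
                    }
                }
                System.out.printf("%s [%d,%d) %s %.3f%n",
                        sentence.substring(start, end), start, end, name.getType(), prob);
            }
            // forget document-scoped adaptive features before the next document
            finder.clearAdaptiveData();
        }
    }
}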
use of opennlp.tools.namefind.NameFinderME in project stanbol by apache.
the class NEREngineCore method extractNameOccurrences.
/**
 * This method extracts NamedEntity occurrences by using existing {@link Token}s and
 * {@link Sentence}s in the parsed {@link AnalysedText}.
 * @param nameFinderModel the model used to find NamedEntities
 * @param at the AnalysedText
 * @param language the language of the text
 * @return the found NamedEntity occurrences
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, AnalysedText at, String language) {
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    // collect the Sentence annotations of the AnalysedText
    List<Section> sentences = new ArrayList<Section>();
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) {
        // no sentence annotations: process the whole text as a single section
        sentences.add(at);
    }
    for (int i = 0; i < sentences.size(); i++) {
        String sentence = sentences.get(i).getSpan();
        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        contextElements.add(sentence);
        // previous, current and next sentence as context
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(), sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
        // get the tokens (words) of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext(); ) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        // int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            // confidence is the product of the per-token probabilities
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            // create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            // add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            // TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
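Note that both Stanbol variants call finder.find(..) once per sentence but clearAdaptiveData() only once, after the whole text: NameFinderME collects document-scoped adaptive data across find(..) calls and should be reset at document boundaries. A compile-only sketch of that contract, assuming the finder was created from a TokenNameFinderModel as in the snippets above:

import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.util.Span;

public class AdaptiveDataDemo {

    // documents: each document is a list of pre-tokenized sentences;
    // the finder is assumed to be created as in the snippets above
    static void process(NameFinderME finder, List<List<String[]>> documents) {
        for (List<String[]> document : documents) {
            for (String[] sentenceTokens : document) {
                Span[] names = finder.find(sentenceTokens);
                System.out.println(names.length + " name spans in sentence");
            }
            // NameFinderME keeps adaptive data between find(..) calls to stay
            // consistent within a document; reset it at document boundaries only
            finder.clearAdaptiveData();
        }
    }
}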
use of opennlp.tools.namefind.NameFinderME in project textdb by TextDB.
the class NameFinderExample method main.
public static void main(String[] args) throws IOException {
    String dataFile = "./src/main/resources/abstract_100.txt";
    Scanner scan = new Scanner(new File(dataFile));
    InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/texera/sandbox/OpenNLPexample/en-ner-location.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(is);
    is.close();
    NameFinderME nameFinder = new NameFinderME(model);
    int counter = 0;
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    perfMon.start();
    while (scan.hasNextLine()) {
        String[] sentence = Tokenize(scan.nextLine());
        Span[] spans = nameFinder.find(sentence);
        perfMon.incrementCounter();
        // Print out the tokens of the sentence
        if (spans.length != 0) {
            for (String s : sentence) {
                System.out.print("[" + s + "] ");
            }
            System.out.println("\n");
        }
        // Print out each span and the tokens it covers
        for (Span s : spans) {
            System.out.println(s.toString());
            for (int i = s.getStart(); i < s.getEnd(); i++) {
                System.out.println(sentence[i]);
                counter++;
            }
        }
        if (spans.length != 0)
            System.out.println();
    }
    perfMon.stopAndPrintFinalResult();
    System.out.println("Number of Results: " + counter);
    scan.close();
}
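The inner loop above reconstructs each entity token by token; Span.spansToStrings(..) does the same in one call. A self-contained variant of the core loop, with the project-specific Tokenize(..) helper replaced by OpenNLP's WhitespaceTokenizer and the model path reduced to a placeholder:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.Span;

public class NameFinderStringsDemo {

    public static void main(String[] args) throws IOException {
        // placeholder path; the original example ships its own en-ner-location.bin
        try (InputStream is = new FileInputStream("en-ner-location.bin")) {
            NameFinderME nameFinder = new NameFinderME(new TokenNameFinderModel(is));

            // stand-in for Tokenize(..): whitespace tokenization of one line
            String[] sentence = WhitespaceTokenizer.INSTANCE.tokenize(
                    "The conference moved from San Francisco to New York .");

            Span[] spans = nameFinder.find(sentence);
            // Span.spansToStrings rebuilds each entity from its token range,
            // replacing the manual getStart()/getEnd() loop of the example
            String[] names = Span.spansToStrings(spans, sentence);
            for (int i = 0; i < spans.length; i++) {
                System.out.println(spans[i] + " -> " + names[i]);
            }
        }
    }
}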
use of opennlp.tools.namefind.NameFinderME in project elasticsearch-opennlp-plugin by spinscale.
the class OpenNlpService method tokenize.
public Map<String, Set<String>> tokenize(String content) {
    Map<String, Set<String>> namedEntities = Maps.newHashMap();
    List<TextAnnotation> allTextAnnotations = new ArrayList<TextAnnotation>();
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
    for (Map.Entry<String, TokenNameFinderModel> finderEntry : finders.entrySet()) {
        String type = finderEntry.getKey();
        NameFinderME finder = new NameFinderME(finderEntry.getValue());
        Span[] spans = finder.find(tokens);
        double[] probs = finder.probs(spans);
        for (int ni = 0; ni < spans.length; ni++) {
            allTextAnnotations.add(new TextAnnotation(type, spans[ni], probs[ni]));
        }
    }
    if (allTextAnnotations.size() > 0) {
        removeConflicts(allTextAnnotations);
    }
    convertTextAnnotationsToNamedEntities(tokens, allTextAnnotations, namedEntities);
    return namedEntities;
}
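This variant runs one finder per entity type over the same token array and uses the probs(Span[]) overload, which returns one probability per found span rather than per token. A minimal sketch of that loop without the plugin-internal TextAnnotation, removeConflicts(..) and conversion helpers; the 0.5 threshold and the pre-loaded models map are illustrative assumptions:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

public class MultiTypeFinderDemo {

    // one model per entity type, e.g. "person", "location"; loading is assumed
    static Map<String, Set<String>> findEntities(Map<String, TokenNameFinderModel> models,
                                                 String content) {
        Map<String, Set<String>> namedEntities = new HashMap<>();
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
        for (Map.Entry<String, TokenNameFinderModel> e : models.entrySet()) {
            NameFinderME finder = new NameFinderME(e.getValue());
            Span[] spans = finder.find(tokens);
            // probs(Span[]) yields one probability per span, not per token
            double[] probs = finder.probs(spans);
            String[] names = Span.spansToStrings(spans, tokens);
            for (int i = 0; i < spans.length; i++) {
                if (probs[i] > 0.5) { // illustrative confidence threshold
                    namedEntities.computeIfAbsent(e.getKey(), k -> new HashSet<>())
                                 .add(names[i]);
                }
            }
        }
        return namedEntities;
    }
}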
use of opennlp.tools.namefind.NameFinderME in project elasticsearch-opennlp-plugin by spinscale.
the class SimpleNlpTest method loadFinders.
public void loadFinders() throws Exception {
    finders = new NameFinderME[names.length];
    StopWatch sw = new StopWatch("Loading models").start();
    for (int mi = 0; mi < names.length; mi++) {
        // note: this FileInputStream is never closed
        finders[mi] = new NameFinderME(new PooledTokenNameFinderModel(new FileInputStream(new File("src/test/resources/models", "en-ner-" + names[mi] + ".bin"))));
    }
    sw.stop();
}
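A sketch of the same loader with try-with-resources to avoid the stream leak, using the standard TokenNameFinderModel (PooledTokenNameFinderModel is a plugin-local variant that shares model data to reduce memory); the model directory and naming scheme are taken from the snippet above:

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;

public class FinderLoader {

    // names is e.g. {"person", "location", "organization"}
    static NameFinderME[] loadFinders(String[] names) throws IOException {
        NameFinderME[] finders = new NameFinderME[names.length];
        for (int mi = 0; mi < names.length; mi++) {
            File modelFile = new File("src/test/resources/models", "en-ner-" + names[mi] + ".bin");
            // try-with-resources closes the stream the original test leaks
            try (InputStream in = new FileInputStream(modelFile)) {
                finders[mi] = new NameFinderME(new TokenNameFinderModel(in));
            }
        }
        return finders;
    }
}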