use of opennlp.tools.tokenize.Tokenizer in project textdb by TextDB.
In class POSTagexample, method Tokenize:
/**
 * Tokenizes the given sentence using the OpenNLP maximum-entropy tokenizer.
 *
 * @param sentence the text to tokenize
 * @return the tokens of {@code sentence} in order
 * @throws InvalidFormatException if the model file is not a valid tokenizer model
 * @throws IOException if the model file cannot be read
 */
public static String[] Tokenize(String sentence) throws InvalidFormatException, IOException {
    // try-with-resources closes the model stream even when TokenizerModel's
    // constructor or tokenize(..) throws; the original only closed it on the
    // success path and leaked the stream on any exception.
    try (InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);
        return tokenizer.tokenize(sentence);
    }
}
use of opennlp.tools.tokenize.Tokenizer in project stanbol by apache.
In class NEREngineCore, method extractNameOccurrences:
/**
 * Extracts named entities from {@code text} with the given OpenNLP name-finder
 * model and groups them by surface form.
 *
 * @param nameFinderModel the trained NER model to apply
 * @param text the plain text to analyse
 * @param language the language used to pick the tokenizer (sentence splitting
 *        always uses the "en" model — see below)
 * @return map from entity name to its occurrences, each with absolute character
 *         offsets into {@code text} and a three-sentence context snippet
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text.
    // "\n\n" and ".\n" are both two characters, so this replacement is
    // length-preserving: spans detected on textWithDots remain valid
    // offsets into the original text.
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    // NOTE(review): assumes removeNonUtf8CompliantCharacters is also
    // length-preserving (replaces bad characters in place) — otherwise the
    // sentence spans below would not line up with 'text'; confirm.
    text = removeNonUtf8CompliantCharacters(text);
    // sentence detection is hard-wired to the English model regardless of
    // 'language'; only tokenization below is language-specific
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    // LinkedHashMap keeps entities in first-seen order
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
        // build a context by concatenating three sentences (previous, current,
        // next) to be used for similarity ranking / disambiguation and as the
        // contextual snippet in the extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");
        // extract the names in the current sentence and store them together
        // with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            // map the token-level name span back to character offsets within
            // the sentence to recover the exact surface form
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(), tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            // NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            // prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) {
                // fall back to the per-token probabilities if the span does
                // not carry one. BUGFIX: start the product at 1.0 — the
                // original multiplied into 'prob', which is known to be 0.0
                // in this branch, so the fallback confidence was always 0.0.
                prob = 1.0d;
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                // It looks like as if perceptron based models do return
                // invalid probabilities. As it is expected that Named Entities
                // with a probability < 50% are not even returned by finder.find(..)
                // we will just ignore confidence values < 0.5 here
                confidence = null;
            }
            // convert the sentence-relative start into an absolute offset
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    // forget document-level adaptive data so the next document starts clean
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
use of opennlp.tools.tokenize.Tokenizer in project stanbol by apache.
In class OpenNlpPosTaggingEngine, method tokenize:
/**
 * Tokenizes the text of the given section, registering each token with the
 * section and returning the tokens in document order.
 *
 * @param section the section whose text is tokenized
 * @param language the language used to select the tokenizer
 * @return the tokens added to {@code section}
 */
private List<Token> tokenize(Section section, String language) {
    // fixed misspelled parameter name 'langauge' -> 'language'
    Tokenizer tokenizer = getTokenizer(language);
    String text = section.getSpan();
    // assume avr. token length is 5 — pre-size the list to avoid regrowth
    List<Token> tokens = new ArrayList<Token>(text.length() / 5);
    // reuse 'text' instead of calling section.getSpan() a second time
    opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(text);
    for (int i = 0; i < tokenSpans.length; i++) {
        Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
        log.trace(" > add {}", token);
        tokens.add(token);
    }
    return tokens;
}
use of opennlp.tools.tokenize.Tokenizer in project textdb by TextDB.
In class NameFinderExample, method Tokenize:
/**
 * Tokenizes the given sentence using the OpenNLP maximum-entropy tokenizer.
 *
 * @param sentence the text to tokenize
 * @return the tokens of {@code sentence} in order
 * @throws InvalidFormatException if the model file is not a valid tokenizer model
 * @throws IOException if the model file cannot be read
 */
public static String[] Tokenize(String sentence) throws InvalidFormatException, IOException {
    // try-with-resources closes the model stream even when TokenizerModel's
    // constructor or tokenize(..) throws; the original only closed it on the
    // success path and leaked the stream on any exception.
    try (InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/texera/sandbox/OpenNLPexample/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);
        return tokenizer.tokenize(sentence);
    }
}
use of opennlp.tools.tokenize.Tokenizer in project textdb by TextDB.
In class NameFinderExample, method Tokenize:
/**
 * Tokenizes the given sentence using the OpenNLP maximum-entropy tokenizer.
 *
 * @param sentence the text to tokenize
 * @return the tokens of {@code sentence} in order
 * @throws InvalidFormatException if the model file is not a valid tokenizer model
 * @throws IOException if the model file cannot be read
 */
public static String[] Tokenize(String sentence) throws InvalidFormatException, IOException {
    // try-with-resources closes the model stream even when TokenizerModel's
    // constructor or tokenize(..) throws; the original only closed it on the
    // success path and leaked the stream on any exception.
    try (InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);
        return tokenizer.tokenize(sentence);
    }
}
Aggregations