Use of opennlp.tools.util.Span in project lucida by claritylab.
Class NETagger, method addNames.
// ==========
// NE tagging
// ==========
/**
 * Inserts named-entity nodes into the parse tree.
 *
 * @param tag    the named entity type to insert
 * @param names  spans of tokens (elements are {@code Span}) that form named entities
 * @param tokens token-level parses, indexed by token position
 */
private static void addNames(String tag, List names, Parse[] tokens) {
    for (Object name : names) {
        Span tokenSpan = (Span) name;
        Parse first = tokens[tokenSpan.getStart()];
        Parse last = tokens[tokenSpan.getEnd()];
        Parse parent = first.getCommonParent(last);
        if (parent == null) {
            continue;
        }
        Span entitySpan = new Span(first.getSpan().getStart(), last.getSpan().getEnd());
        if (entitySpan.equals(parent.getSpan())) {
            // the common parent covers exactly the named entity
            parent.insert(new Parse(parent.getText(), entitySpan, tag, 1.0));
        } else {
            // the common parent is wider than the named entity
            Parse[] children = parent.getChildren();
            boolean crossesChild = false;
            for (Parse child : children) {
                if (entitySpan.crosses(child.getSpan())) {
                    crossesChild = true;
                }
            }
            if (!crossesChild) {
                // entity boundaries line up with child boundaries: insert as-is
                parent.insert(new Parse(parent.getText(), entitySpan, tag, 1.0));
            } else if (parent.getType().equals("NP")) {
                // entity straddles children of an NP: tag the whole NP when the
                // entity covers the last grandchild of the first child
                Parse[] grandKids = children[0].getChildren();
                Parse lastGrandKid = grandKids[grandKids.length - 1];
                if (grandKids.length > 1 && entitySpan.contains(lastGrandKid.getSpan())) {
                    parent.insert(new Parse(parent.getText(), parent.getSpan(), tag, 1.0));
                }
            }
        }
    }
}
Use of opennlp.tools.util.Span in project elasticsearch-opennlp-plugin by spinscale.
Class SimpleNlpTest, method testThatMultipleFindersWork.
@Test
public void testThatMultipleFindersWork() throws Exception {
    loadFinders();
    Map<String, Set<String>> namedEntities = Maps.newHashMap();
    // run every finder over every sentence, merging the surviving
    // annotations into namedEntities
    for (String sentence : sentences) {
        List<TextAnnotation> annotations = new ArrayList<TextAnnotation>();
        String[] tokens = tokenizer.tokenize(sentence);
        // finders and names are parallel arrays, so keep an index here
        for (int f = 0; f < finders.length; f++) {
            Span[] found = finders[f].find(tokens);
            double[] probabilities = finders[f].probs(found);
            for (int s = 0; s < found.length; s++) {
                annotations.add(new TextAnnotation(names[f], found[s], probabilities[s]));
            }
        }
        removeConflicts(annotations);
        convertTextAnnotationsToNamedEntities(tokens, annotations, namedEntities);
    }
    assertThat(namedEntities.get("person"), hasSize(3));
    assertThat(namedEntities.get("person"), containsInAnyOrder("Nancy Reagan", "Reagan", "Joanne Drake"));
    assertThat(namedEntities.get("location"), hasSize(3));
    assertThat(namedEntities.get("location"), containsInAnyOrder("Los Angeles", "Santa Monica", "California"));
    assertThat(namedEntities.get("date"), hasSize(1));
    assertThat(namedEntities.get("date"), containsInAnyOrder("Sunday"));
}
Use of opennlp.tools.util.Span in project textdb by TextDB.
Class NameFinderExample, method main.
/**
 * Runs the pre-trained OpenNLP location name finder over each line of
 * {@code abstract_100.txt}, printing the tokenized sentence, the span and
 * tokens of every detected entity, and finally the total entity-token count.
 *
 * @param args unused
 * @throws IOException if the model file or data file cannot be read
 */
public static void main(String[] args) throws IOException {
    String dataFile = "./src/main/resources/abstract_100.txt";
    // load the NER model; try-with-resources closes the stream even if
    // model construction throws (the original leaked it on failure)
    TokenNameFinderModel model;
    try (InputStream is = new FileInputStream(
            "./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-ner-location.bin")) {
        model = new TokenNameFinderModel(is);
    }
    NameFinderME nameFinder = new NameFinderME(model);
    int counter = 0;
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    perfMon.start();
    try (Scanner scan = new Scanner(new File(dataFile))) {
        while (scan.hasNextLine()) {
            String[] sentence = Tokenize(scan.nextLine());
            Span[] spans = nameFinder.find(sentence);
            perfMon.incrementCounter();
            // Print out the tokens of the sentence
            if (spans.length != 0) {
                for (String s : sentence) {
                    System.out.print("[" + s + "] ");
                }
                // bug fix: was println("/n"), which printed a literal "/n"
                // instead of the intended blank line
                System.out.println("\n");
            }
            // Print out the offset and the tokens of each entity
            for (Span s : spans) {
                System.out.println(s.toString());
                for (int i = s.getStart(); i < s.getEnd(); i++) {
                    System.out.println(sentence[i]);
                    counter++;
                }
            }
            if (spans.length != 0)
                System.out.println();
        }
    }
    perfMon.stopAndPrintFinalResult();
    System.out.println("Number of Results: " + counter);
}
Use of opennlp.tools.util.Span in project stanbol by apache.
Class PosTypeChunker, method chunkAsSpans.
/**
 * Build the chunks based on the parsed tokens and the one or more detected
 * POS tags alternatives for the tokens. <p>
 * A chunk starts at a token whose POS tags satisfy {@code includePOS}, is
 * extended backwards and forwards over tokens whose tags satisfy
 * {@code followPOS}, and ends at the last <em>included</em> token seen.
 * @param tokens the tokens
 * @param tags the POS tags for the tokens (1D:tokens; 2D:POS tags)
 * @param props the per-token probabilities parallel to {@code tags}
 * @return the chunks as spans over the parsed tokens
 */
public Span[] chunkAsSpans(String[] tokens, String[][] tags, double[][] props) {
//NOTE: this is a 1:1 copy of the above method!! However this is the
// only solution, because merging them into a single one would
// need to copy the Stirng[] of the other into a String[][1] as
// used by this one :(
// If someone has a better Idea feel free to change!
// Rupert Westenthaler (28.Sep.2011)
// NOTE(review): consumed is never advanced — the "consumed = end" below is
// commented out — so the backwards scan is bounded only by index 0; confirm
// whether overlapping backward extension into a prior chunk is intended.
int consumed = -1;
List<Span> chunks = new ArrayList<Span>();
for (int i = 0; i < tokens.length; i++) {
if (includePOS(props[i], tags[i])) {
int start = i;
//do not follow backwards!
// extend the chunk start leftwards while tokens are follow-able
while (start - 1 > consumed && followPOS(props[start - 1], tags[start - 1])) {
//follow backwards until consumed
start--;
}
// followEnd tracks how far we scanned; end tracks the last token that
// actually qualifies for inclusion (trailing follow-only tokens are cut)
int followEnd = i;
int end = i;
while (followEnd + 1 < tokens.length && followPOS(props[followEnd + 1], tags[followEnd + 1])) {
//follow
followEnd++;
if (includePOS(props[followEnd], tags[followEnd])) {
//extend end only if act is include
end = followEnd;
}
}
chunks.add(new Span(start, end));
// consumed = end;
// resume the outer scan after everything already examined
i = followEnd;
}
//build no chunk for this token
}
return chunks.toArray(new Span[chunks.size()]);
}
Use of opennlp.tools.util.Span in project stanbol by apache.
Class KeywordTokenizer, method tokenizePos.
/**
 * Splits the input into whitespace-separated tokens and returns their
 * character offsets as {@link Span}s. A token of length &gt;= 2 whose last
 * character is one of {@code . , ! ? ; :} is split into the word and a
 * separate one-character punctuation token.
 *
 * @param s the text to tokenize
 * @return the spans of the tokens found in {@code s}
 */
public Span[] tokenizePos(String s) {
    boolean isWhitespace;
    List<Span> tokens = new ArrayList<Span>();
    int sl = s.length();
    int start = -1; // start offset of the current token; -1 while in whitespace
    char pc = 0; // previous character, used to detect trailing punctuation
    // iterate one position past the end, treating it as a virtual space so
    // the final token is flushed
    for (int ci = 0; ci <= sl; ci++) {
        char c = ci < sl ? s.charAt(ci) : ' ';
        isWhitespace = StringUtil.isWhitespace(c);
        // bug fix: was bitwise '&' — same result for booleans, but '&&' is
        // the intended short-circuit form
        if (!isWhitespace && start < 0) {
            // new token starts
            start = ci;
        }
        if (isWhitespace && start >= 0) {
            // limited support for punctuations at the end of words
            if (start < ci - 1 && (pc == '.' || pc == ',' || pc == '!' || pc == '?' || pc == ';' || pc == ':')) {
                tokens.add(new Span(start, ci - 1));
                tokens.add(new Span(ci - 1, ci));
            } else {
                tokens.add(new Span(start, ci));
            }
            start = -1;
        }
        // bug fix: pc was declared but never assigned, so the trailing-
        // punctuation branch above could never trigger (pc stayed 0)
        pc = c;
    }
    return tokens.toArray(new Span[tokens.size()]);
}
Aggregations