Search in sources :

Example 6 with Span

use of opennlp.tools.util.Span in project lucida by claritylab.

From the class NETagger, method addNames:

// ==========
// NE tagging
// ==========
/**
 * Adds named entity information to parses.
 * 
 * @param tag named entity type
 * @param names spans of tokens that are named entities
 * @param tokens parses for the tokens
 */
private static void addNames(String tag, List names, Parse[] tokens) {
    for (int i = 0; i < names.size(); i++) {
        Span nameTokenSpan = (Span) names.get(i);
        Parse startToken = tokens[nameTokenSpan.getStart()];
        Parse endToken = tokens[nameTokenSpan.getEnd()];
        Parse commonP = startToken.getCommonParent(endToken);
        // Skip entities whose tokens share no common parent in the parse tree.
        if (commonP == null) {
            continue;
        }
        Span nameSpan = new Span(startToken.getSpan().getStart(), endToken.getSpan().getEnd());
        if (nameSpan.equals(commonP.getSpan())) {
            // Common parent matches exactly the named entity: annotate it directly.
            commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
        } else {
            // Common parent is wider than the entity: check whether the entity
            // crosses any child boundary.
            Parse[] kids = commonP.getChildren();
            boolean crossingKids = false;
            for (Parse kid : kids) {
                if (nameSpan.crosses(kid.getSpan())) {
                    crossingKids = true;
                    break; // one crossing child is enough to decide
                }
            }
            if (!crossingKids) {
                // Named entity does not cross children; insert it under the parent.
                commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
            } else if (commonP.getType().equals("NP")) {
                // NE crosses children of an NP: tag the whole NP when the entity
                // covers the last grandchild of the first child.
                Parse[] grandKids = kids[0].getChildren();
                // BUGFIX: check the length BEFORE indexing. The original read
                // grandKids[grandKids.length - 1] first, which throws
                // ArrayIndexOutOfBoundsException when the first child is a leaf.
                if (grandKids.length > 1 && nameSpan.contains(grandKids[grandKids.length - 1].getSpan())) {
                    commonP.insert(new Parse(commonP.getText(), commonP.getSpan(), tag, 1.0));
                }
            }
        }
    }
}
Also used : Parse(opennlp.tools.parser.Parse) Span(opennlp.tools.util.Span)

Example 7 with Span

use of opennlp.tools.util.Span in project elasticsearch-opennlp-plugin by spinscale.

From the class SimpleNlpTest, method testThatMultipleFindersWork:

@Test
public void testThatMultipleFindersWork() throws Exception {
    loadFinders();
    Map<String, Set<String>> namedEntities = Maps.newHashMap();
    // Annotate each sentence with every configured finder, then resolve
    // overlapping annotations before collecting the named entities.
    for (String sentence : sentences) {
        String[] tokens = tokenizer.tokenize(sentence);
        List<TextAnnotation> annotations = new ArrayList<TextAnnotation>();
        for (int f = 0; f < finders.length; f++) {
            Span[] found = finders[f].find(tokens);
            double[] probabilities = finders[f].probs(found);
            for (int s = 0; s < found.length; s++) {
                annotations.add(new TextAnnotation(names[f], found[s], probabilities[s]));
            }
        }
        removeConflicts(annotations);
        convertTextAnnotationsToNamedEntities(tokens, annotations, namedEntities);
    }
    assertThat(namedEntities.get("person"), hasSize(3));
    assertThat(namedEntities.get("person"), containsInAnyOrder("Nancy Reagan", "Reagan", "Joanne Drake"));
    assertThat(namedEntities.get("location"), hasSize(3));
    assertThat(namedEntities.get("location"), containsInAnyOrder("Los Angeles", "Santa Monica", "California"));
    assertThat(namedEntities.get("date"), hasSize(1));
    assertThat(namedEntities.get("date"), containsInAnyOrder("Sunday"));
}
Also used : TextAnnotation(org.elasticsearch.service.opennlp.models.TextAnnotation) Span(opennlp.tools.util.Span) Test(org.junit.Test)

Example 8 with Span

use of opennlp.tools.util.Span in project textdb by TextDB.

From the class NameFinderExample, method main:

/**
 * Runs the OpenNLP location name finder over a file of sentences, printing
 * each sentence that contains at least one entity together with the entity
 * spans and their tokens.
 *
 * @param args unused
 * @throws IOException if the model file or the data file cannot be read
 */
public static void main(String[] args) throws IOException {
    String dataFile = "./src/main/resources/abstract_100.txt";
    // Load the pre-trained location NER model; try-with-resources guarantees
    // the stream is closed even when model loading fails.
    TokenNameFinderModel model;
    try (InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-ner-location.bin")) {
        model = new TokenNameFinderModel(is);
    }
    NameFinderME nameFinder = new NameFinderME(model);
    int counter = 0;
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    perfMon.start();
    // The Scanner is also closed on all paths, including exceptions.
    try (Scanner scan = new Scanner(new File(dataFile))) {
        while (scan.hasNextLine()) {
            String[] sentence = Tokenize(scan.nextLine());
            Span[] spans = nameFinder.find(sentence);
            perfMon.incrementCounter();
            // Print out the tokens of any sentence containing an entity.
            if (spans.length != 0) {
                for (String s : sentence) {
                    System.out.print("[" + s + "] ");
                }
                // BUGFIX: was println("/n"), which printed the literal text
                // "/n"; a plain println() emits the intended line break.
                System.out.println();
            }
            // Print out the offset of each entity and the tokens it covers.
            for (Span s : spans) {
                System.out.println(s.toString());
                for (int i = s.getStart(); i < s.getEnd(); i++) {
                    System.out.println(sentence[i]);
                    counter++;
                }
            }
            if (spans.length != 0)
                System.out.println();
        }
    }
    perfMon.stopAndPrintFinalResult();
    System.out.println("Number of Results: " + counter);
}
Also used : Scanner(java.util.Scanner) TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) NameFinderME(opennlp.tools.namefind.NameFinderME) PerformanceMonitor(opennlp.tools.cmdline.PerformanceMonitor) File(java.io.File) Span(opennlp.tools.util.Span) FileInputStream(java.io.FileInputStream)

Example 9 with Span

use of opennlp.tools.util.Span in project stanbol by apache.

From the class PosTypeChunker, method chunkAsSpans:

/**
     * Build the chunks based on the parsed tokens and the one or more detected
     * POS tags alternatives for the tokens. <p>
     * @param tokens the tokens
     * @param tags the POS tags for the tokens (1D:tokens; 2D:POS tags)
     * @param props the probabilities of the POS tags
     * @return the chunks as spans over the parsed tokens
     */
public Span[] chunkAsSpans(String[] tokens, String[][] tags, double[][] props) {
    // NOTE: this duplicates the single-tag variant of this method; merging
    //       them would require copying the other's String[] into a String[][1]
    //       as used here, so they are kept separate.
    //       Rupert Westenthaler (28.Sep.2011)
    List<Span> chunks = new ArrayList<Span>();
    // NOTE(review): never advanced (the "consumed = end" assignment was
    // commented out upstream), so the backward walk is only bounded by 0.
    int consumed = -1;
    for (int i = 0; i < tokens.length; i++) {
        if (!includePOS(props[i], tags[i])) {
            // this token starts no chunk
            continue;
        }
        // Extend the chunk start backwards over "follow" tokens, but never
        // past the consumed boundary.
        int begin = i;
        while (begin - 1 > consumed && followPOS(props[begin - 1], tags[begin - 1])) {
            begin--;
        }
        // Probe forwards over "follow" tokens; the chunk end only advances
        // when the probed token is itself an "include" token.
        int probe = i;
        int end = i;
        while (probe + 1 < tokens.length && followPOS(props[probe + 1], tags[probe + 1])) {
            probe++;
            if (includePOS(props[probe], tags[probe])) {
                end = probe;
            }
        }
        chunks.add(new Span(begin, end));
        i = probe; // skip tokens already examined
    }
    return chunks.toArray(new Span[chunks.size()]);
}
Also used : ArrayList(java.util.ArrayList) Span(opennlp.tools.util.Span)

Example 10 with Span

use of opennlp.tools.util.Span in project stanbol by apache.

From the class KeywordTokenizer, method tokenizePos:

/**
 * Splits the text into whitespace-separated tokens and returns their
 * character offsets. A single trailing punctuation character (.,!?;:) on a
 * multi-character token is split off as its own token.
 *
 * @param s the text to tokenize
 * @return the token offsets as spans over {@code s}
 */
public Span[] tokenizePos(String s) {
    List<Span> tokens = new ArrayList<Span>();
    int sl = s.length();
    int start = -1; // start offset of the current token; -1 while in whitespace
    char pc = 0; // previous character, used to detect trailing punctuation
    // Iterate one position past the end with a virtual trailing space so the
    // final token is flushed without duplicating the emit logic.
    for (int ci = 0; ci <= sl; ci++) {
        char c = ci < sl ? s.charAt(ci) : ' ';
        boolean isWhitespace = StringUtil.isWhitespace(c);
        // (&& replaces the original non-short-circuit &; equivalent for
        // booleans but idiomatic)
        if (!isWhitespace && start < 0) {
            // new token starts
            start = ci;
        }
        if (isWhitespace && start >= 0) {
            // limited support for punctuation at the end of words
            if (start < ci - 1 && (pc == '.' || pc == ',' || pc == '!' || pc == '?' || pc == ';' || pc == ':')) {
                tokens.add(new Span(start, ci - 1));
                tokens.add(new Span(ci - 1, ci));
            } else {
                tokens.add(new Span(start, ci));
            }
            start = -1;
        }
        // BUGFIX: pc was declared but never assigned, so the punctuation
        // branch above could never fire; remember the current character.
        pc = c;
    }
    return tokens.toArray(new Span[tokens.size()]);
}
Also used : ArrayList(java.util.ArrayList) Span(opennlp.tools.util.Span)

Aggregations

Span (opennlp.tools.util.Span)10 ArrayList (java.util.ArrayList)6 NameFinderME (opennlp.tools.namefind.NameFinderME)4 LinkedHashMap (java.util.LinkedHashMap)2 List (java.util.List)2 TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel)2 Parse (opennlp.tools.parser.Parse)2 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)2 TextAnnotation (org.elasticsearch.service.opennlp.models.TextAnnotation)2 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 InputStream (java.io.InputStream)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 Scanner (java.util.Scanner)1 PerformanceMonitor (opennlp.tools.cmdline.PerformanceMonitor)1 SentenceDetectorME (opennlp.tools.sentdetect.SentenceDetectorME)1 Tokenizer (opennlp.tools.tokenize.Tokenizer)1 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)1 Section (org.apache.stanbol.enhancer.nlp.model.Section)1