Search in sources :

Example 1 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class BracketFileReader method parseTextWithBrackets.

/**
 * Builds an {@link NERDocument} from bracket-annotated text.
 *
 * <p>The bracket tokens and their labels are obtained from
 * {@code parseBracketsAnnotatedText}; the tokens are then joined with single spaces and
 * re-tokenized / sentence-split with {@code PlainTextReader.sentenceSplitAndTokenizeText}.
 * Finally each bracket token is aligned to one or more of the re-tokenized tokens so that
 * every re-tokenized token receives a label.
 *
 * @param annotatedText the bracket-annotated text; if it contains nothing but spaces,
 *        newlines and tabs an empty document is returned
 * @param docname the name given to the resulting document
 * @return a document holding one {@code LinkedVector} per sentence produced by the splitter
 * @throws Exception if the bracket tokens cannot be aligned with the re-tokenized tokens
 */
public static NERDocument parseTextWithBrackets(String annotatedText, String docname) throws Exception {
    if (annotatedText.replace(" ", "").replace("\n", "").replace("\t", "").length() == 0)
        return new NERDocument(new ArrayList<LinkedVector>(), docname);
    // can include newlines!!!!
    Vector<String> bracketTokens = new Vector<>();
    Vector<String> bracketTokensTags = new Vector<>();
    parseBracketsAnnotatedText(annotatedText, bracketTokensTags, bracketTokens);
    StringBuilder buff = new StringBuilder(bracketTokens.size() * 20);
    for (String bracketToken : bracketTokens) buff.append(bracketToken).append(" ");
    // the tokens below will have no newline characters.
    Vector<Vector<String>> parsedTokens = PlainTextReader.sentenceSplitAndTokenizeText(buff.toString());
    // now we need to align the bracket tokens to the sentence split and tokenized tokens.
    // there are two issues to be careful with -
    // 1) The bracket tokens may have newline characters as individual tokens, the others will
    // not
    // 2) The tokenized/sentence split tokens may be bracket tokens broken into separate tokens.
    Vector<String> parsedTokensFlat = new Vector<>();
    for (Vector<String> sentenceTokens : parsedTokens) parsedTokensFlat.addAll(sentenceTokens);
    // filled during alignment; always holds exactly one tag per consumed parsed token
    Vector<String> parsedTokensTagsFlat = new Vector<>();
    StringBuilder bracketTokensText = new StringBuilder(bracketTokens.size() * 20);
    StringBuilder parsedTokensText = new StringBuilder(parsedTokensFlat.size() * 20);
    int bracketsTokensPos = 0;
    int parsedTokensPos = 0;
    while (bracketsTokensPos < bracketTokens.size()) {
        // newline bracket tokens have no counterpart among the parsed tokens
        while (bracketsTokensPos < bracketTokens.size() && bracketTokens.elementAt(bracketsTokensPos).equals("\n")) bracketsTokensPos++;
        if (bracketsTokensPos < bracketTokens.size()) {
            bracketTokensText.append(" ").append(bracketTokens.elementAt(bracketsTokensPos));
            // the bracket-side text does not change while parsed tokens are being consumed
            String bracketTextSoFar = bracketTokensText.toString();
            // report exhausted parsed tokens as an alignment error rather than letting
            // elementAt fail with an ArrayIndexOutOfBoundsException
            if (parsedTokensPos >= parsedTokensFlat.size())
                throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens\nBrackets token text till now:\n" + bracketTokensText + "\nTokenized text till now:\n" + parsedTokensText);
            String currentLabel = bracketTokensTags.elementAt(bracketsTokensPos);
            parsedTokensTagsFlat.addElement(currentLabel);
            parsedTokensText.append(" ").append(parsedTokensFlat.elementAt(parsedTokensPos));
            parsedTokensPos++;
            // the bracket token was split into several parsed tokens: keep consuming (without
            // a separating space) until both texts agree
            while ((!bracketTextSoFar.equals(parsedTokensText.toString())) && parsedTokensPos < parsedTokensFlat.size()) {
                // continuation pieces of a "B-" token become "I-" of the same type
                if (currentLabel.startsWith("B-"))
                    parsedTokensTagsFlat.addElement("I-" + currentLabel.substring(2));
                else
                    parsedTokensTagsFlat.addElement(currentLabel);
                parsedTokensText.append(parsedTokensFlat.elementAt(parsedTokensPos));
                parsedTokensPos++;
            }
            if (!bracketTextSoFar.equals(parsedTokensText.toString()))
                throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens\nBrackets token text till now:\n" + bracketTokensText + "\nTokenized text till now:\n" + parsedTokensText);
            bracketsTokensPos++;
        }
    }
    // every parsed token must have received a label, otherwise building the sentences below
    // would fail with an index error
    if (parsedTokensTagsFlat.size() != parsedTokensFlat.size())
        throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens: " + (parsedTokensFlat.size() - parsedTokensTagsFlat.size()) + " tokenized token(s) were left without a label");
    // ok, we're done, just building the output sentences
    ArrayList<LinkedVector> res = new ArrayList<>();
    parsedTokensPos = 0;
    for (int i = 0; i < parsedTokens.size(); i++) {
        LinkedVector sentence = new LinkedVector();
        for (int j = 0; j < parsedTokens.elementAt(i).size(); j++) {
            NEWord.addTokenToSentence(sentence, parsedTokensFlat.elementAt(parsedTokensPos), parsedTokensTagsFlat.elementAt(parsedTokensPos));
            parsedTokensPos++;
        }
        res.add(sentence);
    }
    return new NERDocument(res, docname);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Example 2 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class TaggedDataReader method readFolder.

/**
 * Reads every regular file directly inside a folder into an {@link NERDocument}.
 *
 * <p>Entries named {@code .DS_Store} and anything that is not a regular file are skipped.
 * File names are sorted first when
 * {@code ParametersForLbjCode.currentParameters.sortLexicallyFilesInFolders} is set. When
 * {@code treatAllFilesInFolderAsOneBigDocument} is set, the last word of each document is
 * linked to the first word of the following document.
 *
 * @param path the folder to read
 * @param format the file format flag, passed through to {@code readFile}
 * @return the documents that were read, in processing order
 * @throws IllegalArgumentException if the entries of {@code path} cannot be listed
 * @throws Exception if reading one of the files fails
 */
public static Vector<NERDocument> readFolder(String path, String format) throws Exception {
    Vector<NERDocument> res = new Vector<>();
    String[] files = (new File(path)).list();
    // File.list() returns null when the path is not a directory or cannot be read;
    // fail with a clear message instead of a NullPointerException further down
    if (files == null) {
        throw new IllegalArgumentException("Cannot list files in folder: " + path);
    }
    // sort the files so we can get deterministic order.
    if (ParametersForLbjCode.currentParameters.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }
    for (String file1 : files) {
        String file = path + "/" + file1;
        if ((new File(file)).isFile() && (!file1.equals(".DS_Store"))) {
            res.addElement(readFile(file, format, file1));
        }
    }
    if (ParametersForLbjCode.currentParameters.treatAllFilesInFolderAsOneBigDocument) {
        // connecting sentence boundaries
        for (int i = 0; i < res.size() - 1; i++) {
            ArrayList<LinkedVector> ss1 = res.elementAt(i).sentences;
            ArrayList<LinkedVector> ss2 = res.elementAt(i + 1).sentences;
            // only link when both the last sentence of the first document and the first
            // sentence of the next document are non-empty
            if (ss1.size() > 0 && ss1.get(ss1.size() - 1).size() > 0 && ss2.size() > 0 && ss2.get(0).size() > 0) {
                LinkedVector lastSentence1 = ss1.get(ss1.size() - 1);
                NEWord lastWord1 = (NEWord) lastSentence1.get(lastSentence1.size() - 1);
                NEWord firstWord2 = (NEWord) ss2.get(0).get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }
    return res;
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) File(java.io.File) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 3 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class ReferenceUtils method createNerDataStructuresForText.

/**
 * Converts a {@code TextAnnotation} into the NER {@code Data} structure.
 *
 * <p>Each sentence of the annotation becomes one {@code LinkedVector} whose words are added
 * with the label {@code "unlabeled"}. Sentences that yield no words are left out. The
 * resulting document is named {@code "input"}.
 *
 * @param ta the text annotation whose sentences and tokens are converted
 * @return the data wrapping a single {@link NERDocument} built from {@code ta}
 * @throws IllegalStateException if a sentence contains a zero-length token
 */
public Data createNerDataStructuresForText(TextAnnotation ta) {
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        LinkedVector words = new LinkedVector();
        for (String w : sentence.getTokens()) {
            if (w.length() == 0) {
                throw new IllegalStateException("Bad (zero length) token.");
            }
            NEWord.addTokenToSentence(words, w, "unlabeled");
        }
        if (words.size() > 0)
            sentences.add(words);
    }
    return new Data(new NERDocument(sentences, "input"));
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) Data(edu.illinois.cs.cogcomp.ner.LbjTagger.Data) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Sentence(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)

Example 4 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class TaggedDataReader method readFile.

/**
 * Reads a single tagged file into an {@link NERDocument} and connects its sentence
 * boundaries.
 *
 * @param path the path of the file to read
 * @param format {@code "-c"} to read with {@code ColumnFileReader}, {@code "-r"} to read with
 *        {@code BracketFileReader}; any other value (including {@code null}) is reported on
 *        standard error and terminates the JVM with a non-zero exit status
 * @param documentName the name given to the resulting document
 * @return the document that was read
 * @throws Exception if the underlying reader fails
 */
public static NERDocument readFile(String path, String format, String documentName) throws Exception {
    NERDocument res = null;
    // constant-first equals so that a null format reaches the error branch instead of
    // failing with a NullPointerException
    if ("-c".equals(format)) {
        res = (new ColumnFileReader(path)).read(documentName);
    } else if ("-r".equals(format)) {
        res = BracketFileReader.read(path, documentName);
    } else {
        System.err.println("Fatal error: unrecognized file format: " + format);
        // a fatal error must not report success to the calling process
        System.exit(1);
    }
    connectSentenceBoundaries(res.sentences);
    return res;
}
Also used : NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)

Example 5 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class BracketFileReader method parseTextWithBrackets.

/**
 * Builds an {@link NERDocument} from bracket-annotated text, using the supplied parameters.
 *
 * <p>The bracket tokens and their labels are obtained from
 * {@code parseBracketsAnnotatedText}; the tokens are then joined with single spaces and
 * re-tokenized / sentence-split with {@code PlainTextReader.sentenceSplitAndTokenizeText}.
 * Finally each bracket token is aligned to one or more of the re-tokenized tokens so that
 * every re-tokenized token receives a label.
 *
 * @param annotatedText the bracket-annotated text; if it contains nothing but spaces,
 *        newlines and tabs an empty document is returned
 * @param docname the name given to the resulting document
 * @param cp the parameters handed on to the bracket parser, the tokenizer and
 *        {@code NEWord.addTokenToSentence}
 * @return a document holding one {@code LinkedVector} per sentence produced by the splitter
 * @throws Exception if the bracket tokens cannot be aligned with the re-tokenized tokens
 */
public static NERDocument parseTextWithBrackets(String annotatedText, String docname, ParametersForLbjCode cp) throws Exception {
    if (annotatedText.replace(" ", "").replace("\n", "").replace("\t", "").length() == 0)
        return new NERDocument(new ArrayList<LinkedVector>(), docname);
    // can include newlines!!!!
    Vector<String> bracketTokens = new Vector<>();
    Vector<String> bracketTokensTags = new Vector<>();
    parseBracketsAnnotatedText(annotatedText, bracketTokensTags, bracketTokens, cp);
    StringBuilder buff = new StringBuilder(bracketTokens.size() * 20);
    for (String bracketToken : bracketTokens) buff.append(bracketToken).append(" ");
    // the tokens below will have no newline characters.
    Vector<Vector<String>> parsedTokens = PlainTextReader.sentenceSplitAndTokenizeText(buff.toString(), cp);
    // now we need to align the bracket tokens to the sentence split and tokenized tokens.
    // there are two issues to be careful with -
    // 1) The bracket tokens may have newline characters as individual tokens, the others will
    // not
    // 2) The tokenized/sentence split tokens may be bracket tokens broken into separate tokens.
    Vector<String> parsedTokensFlat = new Vector<>();
    for (Vector<String> sentenceTokens : parsedTokens) parsedTokensFlat.addAll(sentenceTokens);
    // filled during alignment; always holds exactly one tag per consumed parsed token
    Vector<String> parsedTokensTagsFlat = new Vector<>();
    StringBuilder bracketTokensText = new StringBuilder(bracketTokens.size() * 20);
    StringBuilder parsedTokensText = new StringBuilder(parsedTokensFlat.size() * 20);
    int bracketsTokensPos = 0;
    int parsedTokensPos = 0;
    while (bracketsTokensPos < bracketTokens.size()) {
        // newline bracket tokens have no counterpart among the parsed tokens
        while (bracketsTokensPos < bracketTokens.size() && bracketTokens.elementAt(bracketsTokensPos).equals("\n")) bracketsTokensPos++;
        if (bracketsTokensPos < bracketTokens.size()) {
            bracketTokensText.append(" ").append(bracketTokens.elementAt(bracketsTokensPos));
            // the bracket-side text does not change while parsed tokens are being consumed
            String bracketTextSoFar = bracketTokensText.toString();
            // report exhausted parsed tokens as an alignment error rather than letting
            // elementAt fail with an ArrayIndexOutOfBoundsException
            if (parsedTokensPos >= parsedTokensFlat.size())
                throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens\nBrackets token text till now:\n" + bracketTokensText + "\nTokenized text till now:\n" + parsedTokensText);
            String currentLabel = bracketTokensTags.elementAt(bracketsTokensPos);
            parsedTokensTagsFlat.addElement(currentLabel);
            parsedTokensText.append(" ").append(parsedTokensFlat.elementAt(parsedTokensPos));
            parsedTokensPos++;
            // the bracket token was split into several parsed tokens: keep consuming (without
            // a separating space) until both texts agree
            while ((!bracketTextSoFar.equals(parsedTokensText.toString())) && parsedTokensPos < parsedTokensFlat.size()) {
                // continuation pieces of a "B-" token become "I-" of the same type
                if (currentLabel.startsWith("B-"))
                    parsedTokensTagsFlat.addElement("I-" + currentLabel.substring(2));
                else
                    parsedTokensTagsFlat.addElement(currentLabel);
                parsedTokensText.append(parsedTokensFlat.elementAt(parsedTokensPos));
                parsedTokensPos++;
            }
            if (!bracketTextSoFar.equals(parsedTokensText.toString()))
                throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens\nBrackets token text till now:\n" + bracketTokensText + "\nTokenized text till now:\n" + parsedTokensText);
            bracketsTokensPos++;
        }
    }
    // every parsed token must have received a label, otherwise building the sentences below
    // would fail with an index error
    if (parsedTokensTagsFlat.size() != parsedTokensFlat.size())
        throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens: " + (parsedTokensFlat.size() - parsedTokensTagsFlat.size()) + " tokenized token(s) were left without a label");
    // ok, we're done, just building the output sentences
    ArrayList<LinkedVector> res = new ArrayList<>();
    parsedTokensPos = 0;
    for (int i = 0; i < parsedTokens.size(); i++) {
        LinkedVector sentence = new LinkedVector();
        for (int j = 0; j < parsedTokens.elementAt(i).size(); j++) {
            NEWord.addTokenToSentence(sentence, parsedTokensFlat.elementAt(parsedTokensPos), parsedTokensTagsFlat.elementAt(parsedTokensPos), cp);
            parsedTokensPos++;
        }
        res.add(sentence);
    }
    return new NERDocument(res, docname);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Aggregations

NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument): 10, LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector): 7, ArrayList (java.util.ArrayList): 5, Vector (java.util.Vector): 4, Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence): 3, Data (edu.illinois.cs.cogcomp.ner.LbjTagger.Data): 3, NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord): 3, File (java.io.File): 2, SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView): 1, TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 1, IOException (java.io.IOException): 1, HashMap (java.util.HashMap): 1