Search in sources :

Example 6 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class WordEmbeddings method printOovData.

/**
 * Scans every token of {@code data} and, for each embedding resource, gathers the word
 * forms that have no embedding entry: once by exact form, and once for forms that are
 * missing both as written and after lowercasing.
 *
 * <p>The gathered sets are not reported anywhere at the moment; the log statements that
 * used to print their sizes are disabled.
 *
 * @param data the documents whose tokens are checked against the loaded embeddings
 */
public static void printOovData(Data data) {
    // Vocabulary of the data itself: exact forms and their lowercased variants.
    HashMap<String, Boolean> seenForms = new HashMap<>();
    HashMap<String, Boolean> seenFormsLowercased = new HashMap<>();
    for (NERDocument document : data.documents) {
        int sentenceCount = document.sentences.size();
        for (int s = 0; s < sentenceCount; s++) {
            int wordCount = document.sentences.get(s).size();
            for (int w = 0; w < wordCount; w++) {
                NEWord word = (NEWord) document.sentences.get(s).get(w);
                String surface = word.form;
                seenForms.put(surface, true);
                seenFormsLowercased.put(surface.toLowerCase(), true);
            }
        }
    }
    // (disabled) report: total unique tokens ignoring case = seenFormsLowercased.size()

    // Per resource: which forms of the data are out of vocabulary.
    for (int r = 0; r < resources.size(); r++) {
        HashMap<String, double[]> vectors = embeddingByResource.elementAt(r);
        HashMap<String, Boolean> missingExact = new HashMap<>();
        HashMap<String, Boolean> missingEvenLowercased = new HashMap<>();
        for (NERDocument document : data.documents) {
            int sentenceCount = document.sentences.size();
            for (int s = 0; s < sentenceCount; s++) {
                int wordCount = document.sentences.get(s).size();
                for (int w = 0; w < wordCount; w++) {
                    String surface = ((NEWord) document.sentences.get(s).get(w)).form;
                    if (vectors.containsKey(surface)) {
                        // Known as written: not OOV under either criterion.
                        continue;
                    }
                    missingExact.put(surface, true);
                    String lowered = surface.toLowerCase();
                    if (!vectors.containsKey(lowered)) {
                        missingEvenLowercased.put(lowered, true);
                    }
                }
            }
        }
        // (disabled) report: OOV tokens without repetition, case sensitive
        //   = missingExact.size()
        // (disabled) report: OOV tokens without repetition, even after lowercasing
        //   = missingEvenLowercased.size()
    }
}
Also used : HashMap(java.util.HashMap) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)

Example 7 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class NERAnnotator method addView.

/**
 * Generate the view representing the list of extracted entities and add it to the
 * {@link TextAnnotation}.
 *
 * <p>The tokens of {@code ta} are converted sentence by sentence into the NER package's
 * word structures, annotated, and the resulting BIO-style predictions are turned into
 * labeled spans. If annotation throws, the error is logged and the method returns without
 * adding any view.
 *
 * @param ta the text annotation to read tokens from and to attach the entity view to
 */
@Override
public void addView(TextAnnotation ta) {
    // convert this data structure into one the NER package can deal with.
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    String[] tokens = ta.getTokens();
    // tokenindices[k] = running token position (counting every sentence token, including
    // skipped empty ones) of the k-th non-empty token handed to the NER package.
    int[] tokenindices = new int[tokens.length];
    int tokenIndex = 0;
    int neWordIndex = 0;
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        String[] wtoks = sentence.getTokens();
        LinkedVector words = new LinkedVector();
        for (String w : wtoks) {
            if (w.length() > 0) {
                NEWord.addTokenToSentence(words, w, "unlabeled", this.params);
                tokenindices[neWordIndex] = tokenIndex;
                neWordIndex++;
            } else {
                // Empty tokens are skipped (not passed to NER) but still advance tokenIndex.
                logger.error("Bad (zero length) token.");
            }
            tokenIndex++;
        }
        // Sentences that ended up with no words are not passed on at all.
        if (words.size() > 0)
            sentences.add(words);
    }
    // Do the annotation.
    Data data = new Data(new NERDocument(sentences, "input"));
    try {
        ExpressiveFeaturesAnnotator.annotate(data, this.params);
        Decoder.annotateDataBIO(data, params);
    } catch (Exception e) {
        logger.error("Cannot annotate the text, the exception was: ", e);
        return;
    }
    // now we have the parsed entities, construct the view object.
    ArrayList<LinkedVector> nerSentences = data.documents.get(0).sentences;
    SpanLabelView nerView = new SpanLabelView(getViewName(), ta);
    // the data always has a single document
    // each LinkedVector in data corresponds to a sentence.
    // tokenoffset counts NE words across all sentences; it indexes into tokenindices.
    int tokenoffset = 0;
    for (LinkedVector vector : nerSentences) {
        // Span state is reset per sentence, so a span never crosses a sentence boundary.
        boolean open = false;
        // there should be a 1:1 mapping between sentence tokens in record and
        // words/predictions from NER.
        int startIndex = -1;
        String label = null;
        for (int j = 0; j < vector.size(); j++, tokenoffset++) {
            NEWord neWord = (NEWord) (vector.get(j));
            String prediction = neWord.neTypeLevel2;
            // inefficient, use enums, or nominalized indexes for this sort of thing.
            if (prediction.startsWith("B-")) {
                // Explicit begin tag: start a new span here.
                startIndex = tokenoffset;
                label = prediction.substring(2);
                open = true;
            } else if (j > 0) {
                // An "I-" tag whose type does not match the previous prediction's suffix
                // is treated as the start of a new span.
                // NOTE(review): because of the j > 0 guard, an "I-" prediction on the first
                // word of a sentence does not open a span - confirm this is intended.
                String previous_prediction = ((NEWord) vector.get(j - 1)).neTypeLevel2;
                if (prediction.startsWith("I-") && (!previous_prediction.endsWith(prediction.substring(2)))) {
                    startIndex = tokenoffset;
                    label = prediction.substring(2);
                    open = true;
                }
            }
            if (open) {
                // Decide whether the current word is the last one of the open span.
                boolean close = false;
                if (j == vector.size() - 1) {
                    // Last word of the sentence always closes the span.
                    close = true;
                } else {
                    String next_prediction = ((NEWord) vector.get(j + 1)).neTypeLevel2;
                    if (next_prediction.startsWith("B-"))
                        close = true;
                    if (next_prediction.equals("O"))
                        close = true;
                    // Next word carries a tag (contains '-') of a different type.
                    if (next_prediction.indexOf('-') > -1 && (!prediction.endsWith(next_prediction.substring(2))))
                        close = true;
                }
                if (close) {
                    int s = tokenindices[startIndex];
                    /*
                     * MS: fixed bug. Originally, e was set using tokenindices[tokenoffset], but
                     * tokenoffset can reach tokens.length and this exceeds the array length.
                     * The Constituent constructor requires one-past-the-end token indexing,
                     * requiring e > s. Hence the complicated setting of endIndex/e below:
                     * the lookup index is clamped to the last array slot, and if the value
                     * found there does not lie after s, e falls back to s + 1.
                     */
                    int endIndex = Math.min(tokenoffset + 1, tokens.length - 1);
                    int e = tokenindices[endIndex];
                    if (e <= s)
                        e = s + 1;
                    // Every span is added with a fixed score of 1.
                    nerView.addSpanLabel(s, e, label, 1d);
                    open = false;
                }
            }
        }
    }
    // NOTE(review): the view is constructed with getViewName() but registered under the
    // viewName field - presumably the same value; verify.
    ta.addView(viewName, nerView);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) Data(edu.illinois.cs.cogcomp.ner.LbjTagger.Data) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) IOException(java.io.IOException) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord) Sentence(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)

Example 8 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class ReferenceUtils method createNerDataStructuresForText.

/**
 * Converts the sentences of a {@link TextAnnotation} into the word/sentence structures
 * consumed by the NER package. No annotation is performed here; the returned data is
 * only the unlabeled input.
 *
 * <p>Each sentence becomes one {@code LinkedVector} whose words are created via
 * {@code NEWord.addTokenToSentence} with the label {@code "unlabeled"}. Sentences without
 * any tokens are omitted.
 *
 * @param ta the text annotation whose sentences and tokens are converted
 * @param params the parameters handed to {@code NEWord.addTokenToSentence}
 * @return a {@code Data} wrapping a single {@code NERDocument} named {@code "input"}
 * @throws IllegalStateException if any token has zero length
 */
public Data createNerDataStructuresForText(TextAnnotation ta, ParametersForLbjCode params) {
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        LinkedVector words = new LinkedVector();
        for (String w : ta.getSentence(i).getTokens()) {
            if (w.length() == 0) {
                throw new IllegalStateException("Bad (zero length) token.");
            }
            NEWord.addTokenToSentence(words, w, "unlabeled", params);
        }
        if (words.size() > 0) {
            sentences.add(words);
        }
    }
    // Wrap the converted sentences in a single-document data set.
    return new Data(new NERDocument(sentences, "input"));
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) Data(edu.illinois.cs.cogcomp.ner.LbjTagger.Data) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Sentence(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)

Example 9 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class TaggedDataReader method readFolder.

/**
 * Reads every regular file directly inside {@code path} (except {@code .DS_Store}) as one
 * {@link NERDocument} via {@code readFile}.
 *
 * <p>When {@code cp.sortLexicallyFilesInFolders} is set, the directory entries are sorted
 * first so documents come back in a deterministic order. When
 * {@code cp.treatAllFilesInFolderAsOneBigDocument} is set, the last word of each document
 * is linked to the first word of the following document through the
 * {@code nextIgnoreSentenceBoundary} / {@code previousIgnoreSentenceBoundary} fields.
 *
 * @param path the folder to read
 * @param format the file format flag, passed through to {@code readFile}
 * @param cp the parameters controlling sorting, document linking and file reading
 * @return the documents read, in the order the files were processed
 * @throws IllegalArgumentException if the entries of {@code path} cannot be listed
 * @throws Exception if reading any individual file fails
 */
public static Vector<NERDocument> readFolder(String path, String format, ParametersForLbjCode cp) throws Exception {
    Vector<NERDocument> res = new Vector<>();
    String[] files = (new File(path)).list();
    // File.list() yields null when the path is not a directory or an I/O error occurs;
    // fail with a clear message instead of a NullPointerException further down.
    if (files == null) {
        throw new IllegalArgumentException("Cannot list files in folder (missing, not a directory, or unreadable): " + path);
    }
    // sort the files so we can get deterministic order.
    if (cp.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }
    for (String file1 : files) {
        String file = path + "/" + file1;
        if ((new File(file)).isFile() && (!file1.equals(".DS_Store"))) {
            res.addElement(readFile(file, format, file1, cp));
        }
    }
    if (cp.treatAllFilesInFolderAsOneBigDocument) {
        // connecting sentence boundaries
        for (int i = 0; i < res.size() - 1; i++) {
            ArrayList<LinkedVector> ss1 = res.elementAt(i).sentences;
            ArrayList<LinkedVector> ss2 = res.elementAt(i + 1).sentences;
            // Only link when both the last sentence of this document and the first
            // sentence of the next one actually contain a word.
            if (ss1.size() > 0 && ss1.get(ss1.size() - 1).size() > 0 && ss2.size() > 0 && ss2.get(0).size() > 0) {
                LinkedVector lastSentence1 = ss1.get(ss1.size() - 1);
                NEWord lastWord1 = (NEWord) lastSentence1.get(lastSentence1.size() - 1);
                NEWord firstWord2 = (NEWord) ss2.get(0).get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }
    // Report the documents actually read, not the raw number of directory entries
    // (which also counts sub-directories and skipped files).
    logger.info("Read " + res.size() + " files from " + path);
    return res;
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Vector(java.util.Vector) File(java.io.File) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 10 with NERDocument

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.

the class TaggedDataReader method readFile.

/**
 * Reads a single file into a {@link NERDocument} using the reader selected by
 * {@code format}, then calls {@code connectSentenceBoundaries} on its sentences.
 *
 * <p>Recognized format flags:
 * <ul>
 * <li>{@code "-c"} - read with {@code ColumnFileReader}</li>
 * <li>{@code "-r"} - read with {@code BracketFileReader}</li>
 * <li>{@code "-json"} - deserialize a {@code TextAnnotation} and convert it with
 * {@code TextAnnotationConverter}</li>
 * </ul>
 *
 * @param path the file to read
 * @param format one of the format flags listed above
 * @param documentName the name passed to the column and bracket readers
 * @param cp the parameters passed to the selected reader
 * @return the document that was read
 * @throws IllegalArgumentException if {@code format} is not a recognized flag (including
 *         {@code null})
 * @throws Exception if the selected reader fails
 */
public static NERDocument readFile(String path, String format, String documentName, ParametersForLbjCode cp) throws Exception {
    NERDocument res;
    // Literal-first equals so a null format falls through to the error branch below.
    if ("-c".equals(format)) {
        res = (new ColumnFileReader(path, cp)).read(documentName);
    } else if ("-r".equals(format)) {
        res = BracketFileReader.read(path, documentName, cp);
    } else if ("-json".equals(format)) {
        TextAnnotation ta = SerializationHelper.deserializeTextAnnotationFromFile(path, true);
        res = TextAnnotationConverter.getNerDocument(ta, cp);
    } else {
        // Report the problem to the caller instead of terminating the JVM with a
        // success exit status.
        throw new IllegalArgumentException("Fatal error: unrecognized file format: " + format);
    }
    connectSentenceBoundaries(res.sentences);
    return res;
}
Also used : NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Aggregations

NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)10 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)7 ArrayList (java.util.ArrayList)5 Vector (java.util.Vector)4 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)3 Data (edu.illinois.cs.cogcomp.ner.LbjTagger.Data)3 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)3 File (java.io.File)2 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1