Search in sources :

Example 16 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class TaggedDataWriter method toBracketsFormat.

/**
 * Renders every sentence of {@code data} as one line of text in which each labelled entity span
 * is wrapped in brackets: {@code "[TYPE "} is emitted before the first word of a span,
 * {@code " ] "} after its last word, and every word is followed by a single space. Each sentence
 * is terminated by a newline.
 *
 * <p>Note: the only reason this function is public is because we want to be able to use it in
 * the demo and insert html tags into the string.
 *
 * @param data the documents whose sentences are rendered; each sentence element is cast to
 *        {@link NEWord}
 * @param labelType which label of each word to read: {@code NEWord.LabelToLookAt.GoldLabel},
 *        {@code NEWord.LabelToLookAt.PredictionLevel2Tagger} or
 *        {@code NEWord.LabelToLookAt.PredictionLevel1Tagger}
 * @return the bracketed text, one line per sentence
 */
public static String toBracketsFormat(Data data, NEWord.LabelToLookAt labelType) {
    StringBuilder res = new StringBuilder(data.documents.size() * 1000);
    for (int did = 0; did < data.documents.size(); did++) {
        for (LinkedVector vector : data.documents.get(did).sentences) {
            int size = vector.size();
            String[] predictions = new String[size];
            String[] words = new String[size];
            for (int j = 0; j < size; j++) {
                NEWord word = (NEWord) vector.get(j);
                if (labelType == NEWord.LabelToLookAt.PredictionLevel2Tagger) {
                    predictions[j] = word.neTypeLevel2;
                } else if (labelType == NEWord.LabelToLookAt.PredictionLevel1Tagger) {
                    predictions[j] = word.neTypeLevel1;
                } else if (labelType == NEWord.LabelToLookAt.GoldLabel) {
                    predictions[j] = word.neLabel;
                }
                words[j] = word.form;
            }
            // Whether a bracket is currently open; never carried across sentences.
            boolean open = false;
            for (int j = 0; j < size; j++) {
                String label = predictions[j];
                // A span starts on "B-X", or on an "I-X" that does not continue the previous
                // token's type. An "I-X" on the very first token has no previous token to
                // continue, so it starts a span as well (previously it was silently left
                // unbracketed).
                // NOTE(review): the type comparison is suffix-based (endsWith), so a type that
                // is a suffix of another type is treated as the same type.
                boolean startsSpan = label.startsWith("B-")
                        || (label.startsWith("I-")
                                && (j == 0 || !predictions[j - 1].endsWith(label.substring(2))));
                if (startsSpan) {
                    res.append("[").append(label.substring(2)).append(" ");
                    open = true;
                }
                res.append(words[j]).append(" ");
                if (open) {
                    boolean close;
                    if (j == size - 1) {
                        // End of sentence always closes an open span.
                        close = true;
                    } else {
                        String next = predictions[j + 1];
                        // The span ends when the next token begins a new span, is outside any
                        // entity, or carries a different type than the current token.
                        close = next.startsWith("B-")
                                || next.equals("O")
                                || (next.indexOf('-') > -1 && !label.endsWith(next.substring(2)));
                    }
                    if (close) {
                        res.append(" ] ");
                        open = false;
                    }
                }
            }
            res.append("\n");
        }
    }
    return res.toString();
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 17 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class TaggedDataReader method readFolder.

/**
 * Reads every regular file directly inside {@code path} (except {@code .DS_Store}) with
 * {@code readFile} and returns the resulting documents in listing order.
 *
 * <p>When {@code cp.sortLexicallyFilesInFolders} is set, the file names are sorted first so the
 * order is deterministic. When {@code cp.treatAllFilesInFolderAsOneBigDocument} is set, the last
 * word of each document is linked to the first word of the following document through the
 * {@code nextIgnoreSentenceBoundary} / {@code previousIgnoreSentenceBoundary} fields; a pair of
 * neighbouring documents is only linked when both boundary sentences are non-empty.
 *
 * @param path the folder to read
 * @param format passed through unchanged to {@code readFile}
 * @param cp the parameters controlling sorting and cross-document linking; also passed to
 *        {@code readFile}
 * @return one document per file that was read
 * @throws Exception if {@code path} cannot be listed (it is not a directory, or an I/O error
 *         occurred), or if {@code readFile} fails
 */
public static Vector<NERDocument> readFolder(String path, String format, ParametersForLbjCode cp) throws Exception {
    String[] files = (new File(path)).list();
    // File.list() returns null instead of throwing when the path is not a directory or cannot
    // be read; fail with a clear message rather than a NullPointerException further down.
    if (files == null) {
        throw new java.io.IOException("Cannot list files in folder (not a directory or not readable): " + path);
    }
    Vector<NERDocument> res = new Vector<>();
    // sort the files so we can get deterministic order.
    if (cp.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }
    for (String file1 : files) {
        String file = path + "/" + file1;
        // Skip sub-directories and the macOS folder-metadata file.
        if ((new File(file)).isFile() && (!file1.equals(".DS_Store"))) {
            res.addElement(readFile(file, format, file1, cp));
        }
    }
    if (cp.treatAllFilesInFolderAsOneBigDocument) {
        // connecting sentence boundaries
        for (int i = 0; i < res.size() - 1; i++) {
            ArrayList<LinkedVector> ss1 = res.elementAt(i).sentences;
            ArrayList<LinkedVector> ss2 = res.elementAt(i + 1).sentences;
            if (ss1.size() > 0 && ss1.get(ss1.size() - 1).size() > 0 && ss2.size() > 0 && ss2.get(0).size() > 0) {
                LinkedVector lastSentence1 = ss1.get(ss1.size() - 1);
                NEWord lastWord1 = (NEWord) lastSentence1.get(lastSentence1.size() - 1);
                NEWord firstWord2 = (NEWord) ss2.get(0).get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }
    // Note: this counts every directory entry, including entries that were skipped above.
    logger.info("Read " + files.length + " files from " + path);
    return res;
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Vector(java.util.Vector) File(java.io.File) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 18 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class ColumnFileReader method next.

/**
 * Reads the next sentence from the column-formatted input.
 *
 * <p>Leading empty lines and lines whose fifth column is {@code "-X-"} are skipped. The
 * following non-empty lines, up to the next empty line or the end of input, form one sentence.
 * Two layouts are accepted (CoNLL 2002 or CoNLL 2003, per the original comments):
 * <ul>
 * <li>exactly 2 columns: token, label;</li>
 * <li>more than 5 columns: label in column 0, part of speech in column 4, token in column 5.</li>
 * </ul>
 * Any other line is reported on standard output and skipped. This now also applies to the first
 * line of a sentence, which previously was parsed without a length check and could throw an
 * {@code ArrayIndexOutOfBoundsException}.
 *
 * @return a {@code LinkedVector} with the tokens of the sentence, or {@code null} when no more
 *         lines are available or no token was added for this sentence
 */
public Object next() {
    // pos is kept across lines, as before: a 2-column line leaves the previous value in place.
    String pos = null;
    linec++;
    // Skip to start of next line, skip unnecessary blank lines, headers and so on.
    String[] line = (String[]) super.next();
    while (line != null && (line.length == 0 || (line.length > 4 && line[4].equals("-X-")))) {
        line = (String[]) super.next();
        linec++;
    }
    if (line == null)
        return null;
    LinkedVector res = new LinkedVector();
    // The line held on entry was already counted above; later lines are counted as they are
    // consumed, which keeps linec exactly as the previous implementation maintained it.
    boolean alreadyCounted = true;
    for (; line != null && line.length > 0; line = (String[]) super.next()) {
        if (alreadyCounted) {
            alreadyCounted = false;
        } else {
            linec++;
        }
        String token;
        String label;
        // parse the data, CoNLL 2002 or CoNLL 2003.
        if (line.length == 2) {
            token = line[0];
            label = line[1];
        } else if (line.length > 5) {
            token = line[5];
            label = line[0];
            pos = line[4];
        } else {
            System.out.println("Line " + linec + " in " + filename + " is wrong with " + line.length);
            for (String a : line) System.out.print(":" + a);
            System.out.println();
            continue;
        }
        NEWord w = new NEWord(new Word(token, pos), null, label);
        NEWord.addTokenToSentence(res, w.form, w.neLabel, params);
    }
    if (res.size() == 0)
        return null;
    return res;
}
Also used : NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord) Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Aggregations

NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)18 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)12 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 Vector (java.util.Vector)3 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)2 CharacteristicWords (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords)2 File (java.io.File)2 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 Data (edu.illinois.cs.cogcomp.ner.LbjTagger.Data)1 NamedEntity (edu.illinois.cs.cogcomp.ner.LbjTagger.NamedEntity)1 ParametersForLbjCode (edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode)1 MyString (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.MyString)1 OccurrenceCounter (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter)1 THashMap (gnu.trove.map.hash.THashMap)1 IOException (java.io.IOException)1 Hashtable (java.util.Hashtable)1