use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.
the class TaggedDataWriter method toBracketsFormat.
/**
 * Renders the labelled sentences of {@code data} in brackets format: one output line per
 * sentence, words separated by single spaces, and every entity span wrapped as
 * {@code [TYPE word ... word  ] }.
 *
 * <p>Labels are read as {@code B-TYPE} / {@code I-TYPE} / {@code O}. A span is opened by a
 * {@code B-} label, or by an {@code I-} label that does not continue the previous token's
 * type. That includes an {@code I-} label on the first token of a sentence, which has no
 * previous token to continue.
 *
 * <p>Note: the only reason this function is public is because we want to be able to use it in
 * the demo and insert html tags into the string.
 *
 * @param data the documents whose sentences are written out
 * @param labelType which label of each word to use: {@code NEWord.LabelToLookAt.GoldLabel},
 *        {@code PredictionLevel1Tagger} or {@code PredictionLevel2Tagger}. With any other
 *        value the labels stay {@code null} and a non-empty sentence fails with a
 *        {@link NullPointerException}; the selected label of every word must be non-null too.
 * @return the bracketed text, each sentence terminated by {@code "\n"}
 */
public static String toBracketsFormat(Data data, NEWord.LabelToLookAt labelType) {
    StringBuilder res = new StringBuilder(data.documents.size() * 1000);
    for (int did = 0; did < data.documents.size(); did++) {
        for (LinkedVector vector : data.documents.get(did).sentences) {
            int length = vector.size();
            String[] predictions = new String[length];
            String[] words = new String[length];
            for (int j = 0; j < length; j++) {
                NEWord word = (NEWord) vector.get(j);
                if (labelType == NEWord.LabelToLookAt.PredictionLevel2Tagger) {
                    predictions[j] = word.neTypeLevel2;
                } else if (labelType == NEWord.LabelToLookAt.PredictionLevel1Tagger) {
                    predictions[j] = word.neTypeLevel1;
                } else if (labelType == NEWord.LabelToLookAt.GoldLabel) {
                    predictions[j] = word.neLabel;
                }
                words[j] = word.form;
            }

            boolean open = false;
            for (int j = 0; j < length; j++) {
                String label = predictions[j];
                // An I- label starts a new span when there is nothing of the same type to
                // continue: either it is the first token of the sentence, or the previous
                // label carries a different type (or is "O").
                boolean startsSpan = label.startsWith("B-")
                        || (label.startsWith("I-")
                                && (j == 0 || !predictions[j - 1].endsWith(label.substring(2))));
                if (startsSpan) {
                    res.append("[").append(label.substring(2)).append(" ");
                    open = true;
                }
                res.append(words[j]).append(" ");

                if (open) {
                    boolean close;
                    if (j == length - 1) {
                        // The sentence ends here, so the span must end as well.
                        close = true;
                    } else {
                        String next = predictions[j + 1];
                        // The span ends when the next token begins a new entity, is outside
                        // any entity, or carries a label of a different type.
                        close = next.startsWith("B-")
                                || next.equals("O")
                                || (next.indexOf('-') > -1 && !label.endsWith(next.substring(2)));
                    }
                    if (close) {
                        res.append(" ] ");
                        open = false;
                    }
                }
            }
            res.append("\n");
        }
    }
    return res.toString();
}
use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.
the class TaggedDataReader method readFolder.
/**
 * Reads every regular file directly inside {@code path} (except {@code .DS_Store}) with
 * {@code readFile} and returns the resulting documents in the order the files were visited.
 *
 * <p>When {@code cp.sortLexicallyFilesInFolders} is set, the file names are sorted first so the
 * order is deterministic. When {@code cp.treatAllFilesInFolderAsOneBigDocument} is set, the
 * last word of each document is linked to the first word of the following document through
 * {@code nextIgnoreSentenceBoundary} / {@code previousIgnoreSentenceBoundary}. A pair is
 * linked only if both of those boundary sentences exist and are non-empty.
 *
 * @param path folder to read
 * @param format file format, passed through to {@code readFile}
 * @param cp reader configuration
 * @return the documents read, one per file
 * @throws IllegalArgumentException if {@code path} cannot be listed as a directory
 * @throws Exception if reading one of the files fails
 */
public static Vector<NERDocument> readFolder(String path, String format, ParametersForLbjCode cp) throws Exception {
    String[] files = (new File(path)).list();
    if (files == null) {
        // File.list() returns null when the path is not a directory or an I/O error occurs;
        // fail with the offending path instead of a bare NullPointerException further down.
        throw new IllegalArgumentException("Cannot list files in folder: " + path);
    }
    // sort the files so we can get deterministic order.
    if (cp.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }

    Vector<NERDocument> res = new Vector<>();
    for (String name : files) {
        String file = path + "/" + name;
        if ((new File(file)).isFile() && !name.equals(".DS_Store")) {
            res.addElement(readFile(file, format, name, cp));
        }
    }

    if (cp.treatAllFilesInFolderAsOneBigDocument) {
        // connecting sentence boundaries
        for (int i = 0; i + 1 < res.size(); i++) {
            ArrayList<LinkedVector> current = res.elementAt(i).sentences;
            ArrayList<LinkedVector> following = res.elementAt(i + 1).sentences;
            if (current.isEmpty() || following.isEmpty()) {
                continue;
            }
            LinkedVector lastSentence = current.get(current.size() - 1);
            LinkedVector firstSentence = following.get(0);
            if (lastSentence.size() > 0 && firstSentence.size() > 0) {
                NEWord lastWord1 = (NEWord) lastSentence.get(lastSentence.size() - 1);
                NEWord firstWord2 = (NEWord) firstSentence.get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }

    // Report the files actually read; the raw listing also contains sub-directories and
    // .DS_Store entries that were skipped above.
    logger.info("Read " + res.size() + " files from " + path);
    return res;
}
use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.
the class ColumnFileReader method next.
/**
 * Reads the next sentence from the column file.
 *
 * <p>Rows come from {@code super.next()} as arrays of column values. Blank rows and header
 * rows (those with {@code "-X-"} in the fifth column) that appear before the sentence starts
 * are skipped; once a token has been read, the next blank row or the end of input finishes
 * the sentence. Two row layouts are accepted (CoNLL 2002 or CoNLL 2003, per the original
 * comments):
 * <ul>
 * <li>two columns: token, label;</li>
 * <li>more than five columns: label in column 0, part of speech in column 4, token in
 * column 5.</li>
 * </ul>
 * Any other row is reported on standard output and ignored, whether it appears at the start
 * of a sentence or inside it.
 *
 * <p>{@code linec} is incremented once for every row requested from {@code super.next()}, so
 * it tracks the position in the file for the diagnostics.
 *
 * @return a {@code LinkedVector} holding the sentence's words, or {@code null} when the input
 *         is exhausted or no word was added
 */
public Object next() {
    LinkedVector res = new LinkedVector();
    String pos = null;
    boolean sentenceStarted = false;

    while (true) {
        String[] line = (String[]) super.next();
        linec++;
        if (line == null) {
            // end of input
            break;
        }
        if (line.length == 0) {
            if (sentenceStarted) {
                // a blank row terminates the current sentence
                break;
            }
            // unnecessary blank rows before the sentence
            continue;
        }
        if (!sentenceStarted && line.length > 4 && line[4].equals("-X-")) {
            // header row before the sentence
            continue;
        }

        // parse the data, CoNLL 2002 or CoNLL 2003.
        String token;
        String label;
        if (line.length == 2) {
            token = line[0];
            label = line[1];
        } else if (line.length > 5) {
            token = line[5];
            label = line[0];
            pos = line[4];
        } else {
            // Neither layout fits: report the row and move on rather than indexing past
            // the end of the array.
            System.out.println("Line " + linec + " in " + filename + " is wrong with " + line.length);
            for (String a : line) {
                System.out.print(":" + a);
            }
            System.out.println();
            continue;
        }

        NEWord w = new NEWord(new Word(token, pos), null, label);
        NEWord.addTokenToSentence(res, w.form, w.neLabel, params);
        sentenceStarted = true;
    }

    if (res.size() == 0) {
        return null;
    }
    return res;
}
Aggregations