Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
The class WordEmbeddings, method printOovData:
public static void printOovData(Data data) {
    HashMap<String, Boolean> tokensHash = new HashMap<>();
    HashMap<String, Boolean> tokensHashIC = new HashMap<>();
    // Collect the unique token forms in the data, case-sensitive and lowercased.
    for (int docid = 0; docid < data.documents.size(); docid++) {
        NERDocument doc = data.documents.get(docid);
        for (int i = 0; i < doc.sentences.size(); i++)
            for (int j = 0; j < doc.sentences.get(i).size(); j++) {
                String form = ((NEWord) doc.sentences.get(i).get(j)).form;
                tokensHash.put(form, true);
                tokensHashIC.put(form.toLowerCase(), true);
            }
    }
    // logger.info("\t\t- Total unique tokens ignore case =" + tokensHashIC.size());
    // For each loaded embedding resource, collect the out-of-vocabulary tokens.
    for (int resourceId = 0; resourceId < resources.size(); resourceId++) {
        HashMap<String, double[]> embedding = embeddingByResource.elementAt(resourceId);
        HashMap<String, Boolean> oovCaseSensitiveHash = new HashMap<>();
        HashMap<String, Boolean> oovAfterLowercasingHash = new HashMap<>();
        for (int docid = 0; docid < data.documents.size(); docid++) {
            NERDocument doc = data.documents.get(docid);
            for (int i = 0; i < doc.sentences.size(); i++)
                for (int j = 0; j < doc.sentences.get(i).size(); j++) {
                    String form = ((NEWord) doc.sentences.get(i).get(j)).form;
                    if (!embedding.containsKey(form)) {
                        oovCaseSensitiveHash.put(form, true);
                    }
                    if ((!embedding.containsKey(form)) && (!embedding.containsKey(form.toLowerCase()))) {
                        oovAfterLowercasingHash.put(form.toLowerCase(), true);
                    }
                }
        }
        // logger.info("\t\t- OOV tokens, no repetitions, Case Sensitive =" + oovCaseSensitiveHash.size());
        // logger.info("\t\t- OOV tokens even after lowercasing, no repetition =" + oovAfterLowercasingHash.size());
    }
}
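A minimal driver sketch for the diagnostic above, assuming the embedding resources behind the class-level resources and embeddingByResource fields have already been loaded by the NER pipeline's usual initialization; the file path and the default-constructed ParametersForLbjCode are hypothetical:

// Read one column-format file into a single-document Data object and report
// out-of-vocabulary statistics against every loaded embedding resource.
ParametersForLbjCode params = new ParametersForLbjCode(); // assumed: default configuration
NERDocument doc = TaggedDataReader.readFile("data/train.col", "-c", "train.col", params);
Data data = new Data(doc);
WordEmbeddings.printOovData(data);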
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
The class NERAnnotator, method addView:
/**
 * Generate the view representing the list of extracted entities and add it to the
 * {@link TextAnnotation}.
 */
@Override
public void addView(TextAnnotation ta) {
    // Convert this data structure into one the NER package can deal with.
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    String[] tokens = ta.getTokens();
    int[] tokenindices = new int[tokens.length];
    int tokenIndex = 0;
    int neWordIndex = 0;
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        String[] wtoks = sentence.getTokens();
        LinkedVector words = new LinkedVector();
        for (String w : wtoks) {
            if (w.length() > 0) {
                NEWord.addTokenToSentence(words, w, "unlabeled", this.params);
                tokenindices[neWordIndex] = tokenIndex;
                neWordIndex++;
            } else {
                logger.error("Bad (zero length) token.");
            }
            tokenIndex++;
        }
        if (words.size() > 0)
            sentences.add(words);
    }
    // Do the annotation.
    Data data = new Data(new NERDocument(sentences, "input"));
    try {
        ExpressiveFeaturesAnnotator.annotate(data, this.params);
        Decoder.annotateDataBIO(data, params);
    } catch (Exception e) {
        logger.error("Cannot annotate the text, the exception was: ", e);
        return;
    }
    // Now we have the predicted entities; construct the view object. The data always
    // has a single document, and each LinkedVector corresponds to a sentence.
    ArrayList<LinkedVector> nerSentences = data.documents.get(0).sentences;
    SpanLabelView nerView = new SpanLabelView(getViewName(), ta);
    int tokenoffset = 0;
    for (LinkedVector vector : nerSentences) {
        boolean open = false;
        // There should be a 1:1 mapping between sentence tokens in the record and
        // words/predictions from NER.
        int startIndex = -1;
        String label = null;
        for (int j = 0; j < vector.size(); j++, tokenoffset++) {
            NEWord neWord = (NEWord) (vector.get(j));
            String prediction = neWord.neTypeLevel2;
            // Inefficient: enums or nominalized indexes would be better for this sort of thing.
            if (prediction.startsWith("B-")) {
                // An explicit beginning-of-entity tag opens a span.
                startIndex = tokenoffset;
                label = prediction.substring(2);
                open = true;
            } else if (j > 0) {
                // An I- tag whose type differs from the previous prediction also opens a span.
                String previous_prediction = ((NEWord) vector.get(j - 1)).neTypeLevel2;
                if (prediction.startsWith("I-")
                        && (!previous_prediction.endsWith(prediction.substring(2)))) {
                    startIndex = tokenoffset;
                    label = prediction.substring(2);
                    open = true;
                }
            }
            if (open) {
                // Close the span at the end of the sentence, or when the next prediction
                // starts a new entity (B-), is outside (O), or switches entity type.
                boolean close = false;
                if (j == vector.size() - 1) {
                    close = true;
                } else {
                    String next_prediction = ((NEWord) vector.get(j + 1)).neTypeLevel2;
                    if (next_prediction.startsWith("B-"))
                        close = true;
                    if (next_prediction.equals("O"))
                        close = true;
                    if (next_prediction.indexOf('-') > -1
                            && (!prediction.endsWith(next_prediction.substring(2))))
                        close = true;
                }
                if (close) {
                    int s = tokenindices[startIndex];
                    /*
                     * MS: fixed bug. Originally, e was set using tokenindices[tokenoffset],
                     * but tokenoffset can reach tokens.length, which exceeds the array
                     * bounds. The Constituent constructor requires one-past-the-end token
                     * indexing, i.e. e > s; hence the somewhat involved computation of
                     * endIndex and e below.
                     */
                    int endIndex = Math.min(tokenoffset + 1, tokens.length - 1);
                    int e = tokenindices[endIndex];
                    if (e <= s)
                        e = s + 1;
                    nerView.addSpanLabel(s, e, label, 1d);
                    open = false;
                }
            }
        }
    }
    ta.addView(viewName, nerView);
}
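The method walks the level-2 BIO predictions (B-TYPE opens a span, I-TYPE continues it, O is outside) and emits one constituent per closed span. A hedged usage sketch, assuming NERAnnotator's single-argument constructor and a pre-built, sentence-split and tokenized TextAnnotation ta:

// Run NER over an existing TextAnnotation and print the extracted entity spans.
NERAnnotator ner = new NERAnnotator(ViewNames.NER_CONLL);
ner.addView(ta); // populates the SpanLabelView constructed above
SpanLabelView nerView = (SpanLabelView) ta.getView(ViewNames.NER_CONLL);
for (Constituent c : nerView.getConstituents())
    System.out.println(c.getSurfaceForm() + " [" + c.getLabel() + "]");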
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
The class ReferenceUtils, method createNerDataStructuresForText:
public Data createNerDataStructuresForText(TextAnnotation ta, ParametersForLbjCode params) {
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    String[] tokens = ta.getTokens();
    int[] tokenindices = new int[tokens.length];
    int tokenIndex = 0;
    int neWordIndex = 0;
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        String[] wtoks = sentence.getTokens();
        LinkedVector words = new LinkedVector();
        for (String w : wtoks) {
            if (w.length() > 0) {
                NEWord.addTokenToSentence(words, w, "unlabeled", params);
                tokenindices[neWordIndex] = tokenIndex;
                neWordIndex++;
            } else {
                throw new IllegalStateException("Bad (zero length) token.");
            }
            tokenIndex++;
        }
        if (words.size() > 0)
            sentences.add(words);
    }
    // Wrap the converted sentences in a single-document Data object.
    Data data = new Data(new NERDocument(sentences, "input"));
    return data;
}
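This helper performs the same TextAnnotation-to-Data conversion as NERAnnotator.addView above, but stops before annotation, which makes it convenient for tests. A sketch of a plausible round trip, reusing the annotate and decode calls shown earlier; ta, params, and the no-argument ReferenceUtils constructor are assumed:

// Convert, annotate, and inspect the token-level predictions.
ReferenceUtils utils = new ReferenceUtils();
Data data = utils.createNerDataStructuresForText(ta, params);
ExpressiveFeaturesAnnotator.annotate(data, params);
Decoder.annotateDataBIO(data, params);
for (LinkedVector sentence : data.documents.get(0).sentences)
    for (int j = 0; j < sentence.size(); j++) {
        NEWord w = (NEWord) sentence.get(j);
        System.out.println(w.form + "\t" + w.neTypeLevel2);
    }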
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
The class TaggedDataReader, method readFolder:
public static Vector<NERDocument> readFolder(String path, String format, ParametersForLbjCode cp) throws Exception {
    Vector<NERDocument> res = new Vector<>();
    String[] files = (new File(path)).list();
    // Sort the files so we get a deterministic order.
    if (cp.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }
    for (String file1 : files) {
        String file = path + "/" + file1;
        if ((new File(file)).isFile() && (!file1.equals(".DS_Store"))) {
            res.addElement(readFile(file, format, file1, cp));
        }
    }
    if (cp.treatAllFilesInFolderAsOneBigDocument) {
        // Connect sentence boundaries across documents: link the last word of each
        // document to the first word of the next.
        for (int i = 0; i < res.size() - 1; i++) {
            ArrayList<LinkedVector> ss1 = res.elementAt(i).sentences;
            ArrayList<LinkedVector> ss2 = res.elementAt(i + 1).sentences;
            if (ss1.size() > 0 && ss1.get(ss1.size() - 1).size() > 0 && ss2.size() > 0 && ss2.get(0).size() > 0) {
                NEWord lastWord1 = (NEWord) ss1.get(ss1.size() - 1).get(ss1.get(ss1.size() - 1).size() - 1);
                NEWord firstWord2 = (NEWord) ss2.get(0).get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }
    logger.info("Read " + files.length + " files from " + path);
    return res;
}
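A brief sketch of calling the reader, assuming a folder of column-format ("-c") files and a default-constructed ParametersForLbjCode; the folder path is hypothetical:

// Read every file in the folder and count sentences and tokens.
ParametersForLbjCode cp = new ParametersForLbjCode(); // assumed: default configuration
Vector<NERDocument> docs = TaggedDataReader.readFolder("data/train", "-c", cp);
int sentenceCount = 0, tokenCount = 0;
for (NERDocument doc : docs)
    for (LinkedVector sentence : doc.sentences) {
        sentenceCount++;
        tokenCount += sentence.size();
    }
System.out.println(docs.size() + " documents, " + sentenceCount + " sentences, " + tokenCount + " tokens");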
Use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
The class TaggedDataReader, method readFile:
public static NERDocument readFile(String path, String format, String documentName, ParametersForLbjCode cp) throws Exception {
    NERDocument res = null;
    if (format.equals("-c")) {
        // Column format.
        res = (new ColumnFileReader(path, cp)).read(documentName);
    } else if (format.equals("-r")) {
        // Bracket format.
        res = BracketFileReader.read(path, documentName, cp);
    } else if (format.equals("-json")) {
        // Serialized TextAnnotation.
        TextAnnotation ta = SerializationHelper.deserializeTextAnnotationFromFile(path, true);
        res = TextAnnotationConverter.getNerDocument(ta, cp);
    } else {
        System.err.println("Fatal error: unrecognized file format: " + format);
        System.exit(0);
    }
    connectSentenceBoundaries(res.sentences);
    return res;
}
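The format flag selects the reader: "-c" for column format, "-r" for bracket format, and "-json" for a serialized TextAnnotation. Because connectSentenceBoundaries links each sentence's last token to the next sentence's first, the returned document can be walked token by token across sentence breaks. A small sketch, with a hypothetical file path and the cp object from above:

NERDocument doc = TaggedDataReader.readFile("data/train.col", "-c", "train.col", cp);
// Walk the whole document, ignoring sentence boundaries (assumes a non-empty document).
NEWord w = (NEWord) doc.sentences.get(0).get(0);
while (w != null) {
    System.out.println(w.form);
    w = w.nextIgnoreSentenceBoundary;
}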