use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
the class BracketFileReader method parseTextWithBrackets.
/**
 * Builds an {@link NERDocument} from text that carries bracket annotations.
 *
 * <p>The bracket tokens and their tags are extracted with
 * {@code parseBracketsAnnotatedText}, the tokens are joined with single spaces and
 * re-tokenized with {@code PlainTextReader.sentenceSplitAndTokenizeText}, and the two token
 * streams are then aligned so that every tokenizer token receives the tag of the bracket
 * token it was cut from. When one bracket token is split into several tokenizer tokens, the
 * first piece keeps the original tag and the following pieces get the same tag with a leading
 * {@code "B-"} replaced by {@code "I-"}.
 *
 * @param annotatedText the bracket-annotated text; may contain newlines
 * @param docname the name given to the resulting document
 * @return the document, one {@code LinkedVector} per tokenizer sentence; a document without
 *         sentences when the text consists only of spaces, newlines and tabs
 * @throws Exception if the bracket tokens cannot be aligned with the tokenizer output,
 *         including the case where the tokenizer output ends before the bracket tokens do
 */
public static NERDocument parseTextWithBrackets(String annotatedText, String docname) throws Exception {
    if (annotatedText.replace(" ", "").replace("\n", "").replace("\t", "").length() == 0) {
        return new NERDocument(new ArrayList<LinkedVector>(), docname);
    }

    // The bracket tokens may include "\n" as stand-alone tokens.
    Vector<String> bracketTokens = new Vector<>();
    Vector<String> bracketTokensTags = new Vector<>();
    parseBracketsAnnotatedText(annotatedText, bracketTokensTags, bracketTokens);

    StringBuilder buff = new StringBuilder(bracketTokens.size() * 20);
    for (String bracketToken : bracketTokens) {
        buff.append(bracketToken).append(" ");
    }
    Vector<Vector<String>> parsedTokens = PlainTextReader.sentenceSplitAndTokenizeText(buff.toString());

    // Flatten the sentences so the tokenizer output can be walked with a single cursor.
    Vector<String> parsedTokensFlat = new Vector<>();
    for (Vector<String> sentenceTokens : parsedTokens) {
        parsedTokensFlat.addAll(sentenceTokens);
    }

    // Filled during alignment: one tag per entry of parsedTokensFlat.
    Vector<String> parsedTokensTagsFlat = new Vector<>();
    // Running transcripts of both streams; only used for the error message.
    StringBuilder bracketTokensText = new StringBuilder(bracketTokens.size() * 20);
    StringBuilder parsedTokensText = new StringBuilder(parsedTokensFlat.size() * 20);
    int parsedTokensPos = 0;

    for (int bracketsTokensPos = 0; bracketsTokensPos < bracketTokens.size(); bracketsTokensPos++) {
        String bracketToken = bracketTokens.elementAt(bracketsTokensPos);
        if (bracketToken.equals("\n")) {
            // Newline tokens are skipped; they are not matched against tokenizer tokens.
            continue;
        }
        bracketTokensText.append(" ").append(bracketToken);
        String currentLabel = bracketTokensTags.elementAt(bracketsTokensPos);

        // Both transcripts are identical before this token, so comparing the token with the
        // concatenation of its tokenizer fragments is equivalent to comparing the complete
        // transcripts, without rebuilding two ever-growing strings on every step.
        StringBuilder fragments = new StringBuilder(bracketToken.length());
        boolean aligned = false;
        if (parsedTokensPos < parsedTokensFlat.size()) {
            String firstFragment = parsedTokensFlat.elementAt(parsedTokensPos);
            parsedTokensPos++;
            parsedTokensTagsFlat.addElement(currentLabel);
            parsedTokensText.append(" ").append(firstFragment);
            fragments.append(firstFragment);
            aligned = bracketToken.contentEquals(fragments);

            String continuationLabel =
                    currentLabel.startsWith("B-") ? "I-" + currentLabel.substring(2) : currentLabel;
            while (!aligned && parsedTokensPos < parsedTokensFlat.size()) {
                String fragment = parsedTokensFlat.elementAt(parsedTokensPos);
                parsedTokensPos++;
                parsedTokensTagsFlat.addElement(continuationLabel);
                parsedTokensText.append(fragment);
                fragments.append(fragment);
                aligned = bracketToken.contentEquals(fragments);
            }
        }
        // Also reached when the tokenizer output is already exhausted, which previously
        // surfaced as an ArrayIndexOutOfBoundsException.
        if (!aligned) {
            throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens\nBrackets token text till now:\n" + bracketTokensText + "\nTokenized text till now:\n" + parsedTokensText);
        }
    }

    // Alignment is done; rebuild the sentence structure with the tags attached.
    ArrayList<LinkedVector> res = new ArrayList<>();
    int flatPos = 0;
    for (Vector<String> sentenceTokens : parsedTokens) {
        LinkedVector sentence = new LinkedVector();
        for (int j = 0; j < sentenceTokens.size(); j++) {
            NEWord.addTokenToSentence(sentence, parsedTokensFlat.elementAt(flatPos), parsedTokensTagsFlat.elementAt(flatPos));
            flatPos++;
        }
        res.add(sentence);
    }
    return new NERDocument(res, docname);
}
use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
the class TaggedDataReader method readFolder.
/**
 * Reads every regular file directly inside a folder as an {@link NERDocument}.
 *
 * <p>Entries named {@code .DS_Store} and entries that are not regular files are skipped.
 * When {@code sortLexicallyFilesInFolders} is set, file names are sorted first so the
 * documents come back in a deterministic order. When
 * {@code treatAllFilesInFolderAsOneBigDocument} is set, the last word of each document is
 * linked to the first word of the next one through the {@code nextIgnoreSentenceBoundary} /
 * {@code previousIgnoreSentenceBoundary} fields.
 *
 * @param path the folder to read
 * @param format the file format flag, passed through to {@code readFile}
 * @return the documents that were read, in listing (or sorted) order
 * @throws java.io.IOException if the entries of {@code path} cannot be listed
 * @throws Exception if reading one of the files fails
 */
public static Vector<NERDocument> readFolder(String path, String format) throws Exception {
    String[] files = (new File(path)).list();
    if (files == null) {
        // File.list() yields null instead of throwing when the path is not a directory or
        // cannot be read; fail with a clear message rather than a NullPointerException.
        throw new java.io.IOException("Cannot list files in folder: " + path);
    }
    // sort the files so we can get deterministic order.
    if (ParametersForLbjCode.currentParameters.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }

    Vector<NERDocument> res = new Vector<>();
    for (String fileName : files) {
        String file = path + "/" + fileName;
        if ((new File(file)).isFile() && !fileName.equals(".DS_Store")) {
            res.addElement(readFile(file, format, fileName));
        }
    }

    if (ParametersForLbjCode.currentParameters.treatAllFilesInFolderAsOneBigDocument) {
        // connecting sentence boundaries between each pair of consecutive documents
        for (int i = 0; i + 1 < res.size(); i++) {
            ArrayList<LinkedVector> current = res.elementAt(i).sentences;
            ArrayList<LinkedVector> following = res.elementAt(i + 1).sentences;
            if (current.size() == 0 || following.size() == 0) {
                continue;
            }
            LinkedVector lastSentence = current.get(current.size() - 1);
            LinkedVector firstSentence = following.get(0);
            if (lastSentence.size() > 0 && firstSentence.size() > 0) {
                NEWord lastWord1 = (NEWord) lastSentence.get(lastSentence.size() - 1);
                NEWord firstWord2 = (NEWord) firstSentence.get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }
    return res;
}
use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
the class ReferenceUtils method createNerDataStructuresForText.
/**
 * Converts a {@code TextAnnotation} into the NER {@code Data} structure.
 *
 * <p>Each sentence of {@code ta} becomes one {@code LinkedVector} whose words are added via
 * {@code NEWord.addTokenToSentence} with the label {@code "unlabeled"}. Sentences that
 * yield no words are left out. The resulting document is named {@code "input"}.
 *
 * @param ta the text annotation supplying sentences and tokens
 * @return a {@code Data} wrapping a single {@link NERDocument} built from {@code ta}
 * @throws IllegalStateException if any token has zero length
 */
public Data createNerDataStructuresForText(TextAnnotation ta) {
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        LinkedVector words = new LinkedVector();
        for (String w : sentence.getTokens()) {
            if (w.length() == 0) {
                throw new IllegalStateException("Bad (zero length) token.");
            }
            NEWord.addTokenToSentence(words, w, "unlabeled");
        }
        if (words.size() > 0) {
            sentences.add(words);
        }
    }
    return new Data(new NERDocument(sentences, "input"));
}
use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
the class TaggedDataReader method readFile.
/**
 * Reads a single tagged file into an {@link NERDocument} and connects its sentence
 * boundaries via {@code connectSentenceBoundaries}.
 *
 * @param path the file to read
 * @param format {@code "-c"} to read with {@code ColumnFileReader}, {@code "-r"} to read
 *        with {@code BracketFileReader}
 * @param documentName the name given to the resulting document
 * @return the document that was read
 * @throws IllegalArgumentException if {@code format} is neither {@code "-c"} nor
 *         {@code "-r"} (including {@code null}); this replaces the former
 *         {@code System.exit(0)}, which terminated the JVM with a success status
 * @throws Exception if the underlying reader fails
 */
public static NERDocument readFile(String path, String format, String documentName) throws Exception {
    NERDocument res;
    // Constant-first comparisons so a null format is reported as unrecognized, not as an NPE.
    if ("-c".equals(format)) {
        res = (new ColumnFileReader(path)).read(documentName);
    } else if ("-r".equals(format)) {
        res = BracketFileReader.read(path, documentName);
    } else {
        throw new IllegalArgumentException("Fatal error: unrecognized file format: " + format);
    }
    connectSentenceBoundaries(res.sentences);
    return res;
}
use of edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument in project cogcomp-nlp by CogComp.
the class BracketFileReader method parseTextWithBrackets.
/**
 * Builds an {@link NERDocument} from text that carries bracket annotations, using the
 * supplied parameter object for parsing, tokenization and word construction.
 *
 * <p>The bracket tokens and their tags are extracted with
 * {@code parseBracketsAnnotatedText}, the tokens are joined with single spaces and
 * re-tokenized with {@code PlainTextReader.sentenceSplitAndTokenizeText}, and the two token
 * streams are then aligned so that every tokenizer token receives the tag of the bracket
 * token it was cut from. When one bracket token is split into several tokenizer tokens, the
 * first piece keeps the original tag and the following pieces get the same tag with a leading
 * {@code "B-"} replaced by {@code "I-"}.
 *
 * @param annotatedText the bracket-annotated text; may contain newlines
 * @param docname the name given to the resulting document
 * @param cp the parameters forwarded to {@code parseBracketsAnnotatedText},
 *        {@code sentenceSplitAndTokenizeText} and {@code NEWord.addTokenToSentence}
 * @return the document, one {@code LinkedVector} per tokenizer sentence; a document without
 *         sentences when the text consists only of spaces, newlines and tabs
 * @throws Exception if the bracket tokens cannot be aligned with the tokenizer output,
 *         including the case where the tokenizer output ends before the bracket tokens do
 */
public static NERDocument parseTextWithBrackets(String annotatedText, String docname, ParametersForLbjCode cp) throws Exception {
    if (annotatedText.replace(" ", "").replace("\n", "").replace("\t", "").length() == 0) {
        return new NERDocument(new ArrayList<LinkedVector>(), docname);
    }

    // The bracket tokens may include "\n" as stand-alone tokens.
    Vector<String> bracketTokens = new Vector<>();
    Vector<String> bracketTokensTags = new Vector<>();
    parseBracketsAnnotatedText(annotatedText, bracketTokensTags, bracketTokens, cp);

    StringBuilder buff = new StringBuilder(bracketTokens.size() * 20);
    for (String bracketToken : bracketTokens) {
        buff.append(bracketToken).append(" ");
    }
    Vector<Vector<String>> parsedTokens = PlainTextReader.sentenceSplitAndTokenizeText(buff.toString(), cp);

    // Flatten the sentences so the tokenizer output can be walked with a single cursor.
    Vector<String> parsedTokensFlat = new Vector<>();
    for (Vector<String> sentenceTokens : parsedTokens) {
        parsedTokensFlat.addAll(sentenceTokens);
    }

    // Filled during alignment: one tag per entry of parsedTokensFlat.
    Vector<String> parsedTokensTagsFlat = new Vector<>();
    // Running transcripts of both streams; only used for the error message.
    StringBuilder bracketTokensText = new StringBuilder(bracketTokens.size() * 20);
    StringBuilder parsedTokensText = new StringBuilder(parsedTokensFlat.size() * 20);
    int parsedTokensPos = 0;

    for (int bracketsTokensPos = 0; bracketsTokensPos < bracketTokens.size(); bracketsTokensPos++) {
        String bracketToken = bracketTokens.elementAt(bracketsTokensPos);
        if (bracketToken.equals("\n")) {
            // Newline tokens are skipped; they are not matched against tokenizer tokens.
            continue;
        }
        bracketTokensText.append(" ").append(bracketToken);
        String currentLabel = bracketTokensTags.elementAt(bracketsTokensPos);

        // Both transcripts are identical before this token, so comparing the token with the
        // concatenation of its tokenizer fragments is equivalent to comparing the complete
        // transcripts, without rebuilding two ever-growing strings on every step.
        StringBuilder fragments = new StringBuilder(bracketToken.length());
        boolean aligned = false;
        if (parsedTokensPos < parsedTokensFlat.size()) {
            String firstFragment = parsedTokensFlat.elementAt(parsedTokensPos);
            parsedTokensPos++;
            parsedTokensTagsFlat.addElement(currentLabel);
            parsedTokensText.append(" ").append(firstFragment);
            fragments.append(firstFragment);
            aligned = bracketToken.contentEquals(fragments);

            String continuationLabel =
                    currentLabel.startsWith("B-") ? "I-" + currentLabel.substring(2) : currentLabel;
            while (!aligned && parsedTokensPos < parsedTokensFlat.size()) {
                String fragment = parsedTokensFlat.elementAt(parsedTokensPos);
                parsedTokensPos++;
                parsedTokensTagsFlat.addElement(continuationLabel);
                parsedTokensText.append(fragment);
                fragments.append(fragment);
                aligned = bracketToken.contentEquals(fragments);
            }
        }
        // Also reached when the tokenizer output is already exhausted, which previously
        // surfaced as an ArrayIndexOutOfBoundsException.
        if (!aligned) {
            throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens\nBrackets token text till now:\n" + bracketTokensText + "\nTokenized text till now:\n" + parsedTokensText);
        }
    }

    // Alignment is done; rebuild the sentence structure with the tags attached.
    ArrayList<LinkedVector> res = new ArrayList<>();
    int flatPos = 0;
    for (Vector<String> sentenceTokens : parsedTokens) {
        LinkedVector sentence = new LinkedVector();
        for (int j = 0; j < sentenceTokens.size(); j++) {
            NEWord.addTokenToSentence(sentence, parsedTokensFlat.elementAt(flatPos), parsedTokensTagsFlat.elementAt(flatPos), cp);
            flatPos++;
        }
        res.add(sentence);
    }
    return new NERDocument(res, docname);
}
Aggregations