Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp:
class WordTopicAndLayoutFeatures, method addDatasets.
/*
 * Note- this assumes that the data is split by documents. So if we choose to ignore the
 * document boundaries, we're in trouble!!!
 */
/**
 * Classifies each document in {@code sentences} with the topic classifier and records the
 * predicted topic id for every word of that document in {@code wordToTopicIdMap}.
 *
 * @param sentences sentences for one or more documents; document boundaries are detected via
 *        {@code nextIgnoreSentenceBoundary == null} on the last word of a sentence
 * @param lowercaseData if true, the document text is lowercased before classification
 * @param confidenceThreshold minimum confidence passed to the naive-Bayes classifier
 * @throws Exception if the topic classifier has not been initialized
 */
public static void addDatasets(Vector<LinkedVector> sentences, boolean lowercaseData, double confidenceThreshold) throws Exception {
    if (nb == null || map == null)
        throw new Exception("Topic classifier not initialized!!!");
    // Accumulate document text in a StringBuilder: the previous String += in a nested
    // loop was O(n^2) in document length.
    StringBuilder documentText = new StringBuilder();
    Vector<NEWord> docWords = new Vector<>();
    for (int sid = 0; sid < sentences.size(); sid++) {
        LinkedVector s = sentences.elementAt(sid);
        for (int i = 0; i < s.size(); i++) {
            documentText.append(' ').append(((NEWord) s.get(i)).originalForm).append(' ');
            docWords.addElement((NEWord) s.get(i));
        }
        // A null next-pointer (ignoring sentence boundaries) marks the last sentence of a document.
        if (((NEWord) s.get(s.size() - 1)).nextIgnoreSentenceBoundary == null) {
            // this is the last sentence in the document- move on!
            String text = documentText.toString();
            if (lowercaseData)
                text = text.toLowerCase();
            Document doc = new Document(InFile.tokenize(text, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"), -1);
            int label = nb.classify(doc, confidenceThreshold);
            logger.info("*********************\n" + labelnames[label + 1] + "\n*********************\n" + text.substring(0, Math.min(text.length(), 400)));
            // Tag every word of the just-finished document with the predicted topic id.
            for (int i = 0; i < docWords.size(); i++)
                wordToTopicIdMap.put(docWords.elementAt(i), label);
            documentText.setLength(0);
            docWords = new Vector<>();
        }
    }
}
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp:
class ReferenceUtils, method createNerDataStructuresForText.
/**
 * Builds the NER {@link Data} structure for an already-tokenized {@link TextAnnotation}:
 * each sentence becomes a {@link LinkedVector} of NEWords labeled "unlabeled", wrapped in a
 * single {@link NERDocument} named "input".
 *
 * @param ta the tokenized text annotation to convert
 * @return a Data instance ready for NER annotation
 * @throws IllegalStateException if any token has zero length
 */
public Data createNerDataStructuresForText(TextAnnotation ta) {
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    // NOTE(review): the original version also built a tokenindices[] array (with
    // tokenIndex/neWordIndex counters) that was never read; that dead bookkeeping
    // has been removed.
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        String[] wtoks = sentence.getTokens();
        LinkedVector words = new LinkedVector();
        for (String w : wtoks) {
            if (w.length() == 0)
                throw new IllegalStateException("Bad (zero length) token.");
            NEWord.addTokenToSentence(words, w, "unlabeled");
        }
        // Skip empty sentences so downstream code can assume non-empty vectors.
        if (words.size() > 0)
            sentences.add(words);
    }
    // Do the annotation.
    return new Data(new NERDocument(sentences, "input"));
}
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp:
class TaggedDataWriter, method toColumnsFormat.
/**
 * Serializes every sentence of every document in {@code data} to the tab-separated
 * CoNLL-style column format, emitting a -DOCSTART- record at each document boundary.
 *
 * @param data the annotated documents to serialize
 * @param labelType which stored prediction to emit in the first column
 * @return the column-formatted text for all documents
 */
private static String toColumnsFormat(Data data, NEWord.LabelToLookAt labelType) {
    StringBuilder out = new StringBuilder(data.documents.size() * 1000);
    for (int docIdx = 0; docIdx < data.documents.size(); docIdx++) {
        int sentenceCount = data.documents.get(docIdx).sentences.size();
        for (int sentIdx = 0; sentIdx < sentenceCount; sentIdx++) {
            LinkedVector sentence = data.documents.get(docIdx).sentences.get(sentIdx);
            // A first word with no predecessor (ignoring sentence boundaries) starts a
            // new document, so emit the -DOCSTART- marker record.
            NEWord firstWord = (NEWord) sentence.get(0);
            if (firstWord.previousIgnoreSentenceBoundary == null)
                out.append("O 0 0 O -X- -DOCSTART- x x 0\n\n");
            for (int pos = 0; pos < sentence.size(); pos++) {
                NEWord word = (NEWord) sentence.get(pos);
                out.append(word.getPrediction(labelType))
                        .append("\t0\t")
                        .append(pos)
                        .append("\tO\tO\t")
                        .append(word.form)
                        .append("\tx\tx\t0\n");
            }
            // Blank line separates sentences.
            out.append("\n");
        }
    }
    return out.toString();
}
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp:
class StatefullTokenizerTest, method testWhitespaceBehavior.
/**
 * Test Splitter behavior on text with leading/trailing whitespace. Example is use case where
 * xml markup has been replaced with whitespace of equal span.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Blank out every xml tag with whitespace of equal length, so character offsets
    // into the cleaned text line up with the original text.
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i) cleanTextBldr.append(" ");
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();
    // Collect the expected character spans of every token containing "Sun".
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    // check token offsets in tokens returned by SentenceSplitter
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        // BUGFIX: the loop previously fetched words.get(0) on every iteration, so only
        // the first word of the sentence was ever checked against the expected spans.
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue(sunSpans.contains(tokenCharOffsets));
        }
    }
    // Repeat the offset check with the StatefulTokenizer's own tokenization.
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            if (!sunSpans.contains(tokCharOffsets)) {
                // Print a diagnostic before the assertion fires, showing what the
                // offsets actually point at.
                String origTextSubstring = cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }
    // Finally, the offsets/tokens must be consistent enough to build a TextAnnotation.
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp:
class NETagPlain, method tagData.
/**
 * Runs the level-1 and level-2 NER taggers over {@code data} (annotating it in place) and
 * renders the result as plain text with bracketed entities, e.g.
 * {@code "[PER John Smith ] visited [LOC Paris ] ."}.
 *
 * @param data the documents to tag
 * @param tagger1 first-level tagger
 * @param tagger2 second-level tagger (refines level-1 predictions)
 * @return the bracket-annotated text of all documents
 * @throws Exception if feature annotation or decoding fails
 */
public static String tagData(Data data, NETaggerLevel1 tagger1, NETaggerLevel2 tagger2) throws Exception {
    ExpressiveFeaturesAnnotator.annotate(data);
    Decoder.annotateDataBIO(data, tagger1, tagger2);
    // StringBuilder instead of StringBuffer: no concurrent access here, and the old
    // toString().trim() + buffer rebuild on every entity close was O(n^2) in output size.
    StringBuilder res = new StringBuilder();
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector vector : sentences) {
            boolean open = false;
            String[] predictions = new String[vector.size()];
            String[] words = new String[vector.size()];
            for (int j = 0; j < vector.size(); j++) {
                predictions[j] = ((NEWord) vector.get(j)).neTypeLevel2;
                words[j] = ((NEWord) vector.get(j)).form;
            }
            for (int j = 0; j < vector.size(); j++) {
                // Open a bracket on B-XXX, or on an I-XXX whose type does not continue
                // the previous token's entity (robustness against decoder glitches).
                if (predictions[j].startsWith("B-") || (j > 0 && predictions[j].startsWith("I-") && (!predictions[j - 1].endsWith(predictions[j].substring(2))))) {
                    res.append("[").append(predictions[j].substring(2)).append(" ");
                    open = true;
                }
                res.append(words[j]).append(" ");
                if (open) {
                    // Close the bracket at end of sentence, or when the next token
                    // starts a new entity or is outside any entity.
                    boolean close = false;
                    if (j == vector.size() - 1) {
                        close = true;
                    } else {
                        if (predictions[j + 1].startsWith("B-"))
                            close = true;
                        if (predictions[j + 1].equals("O"))
                            close = true;
                        if (predictions[j + 1].indexOf('-') > -1 && (!predictions[j].endsWith(predictions[j + 1].substring(2))))
                            close = true;
                    }
                    if (close) {
                        // SWM: makes the output a little cleaner — strip trailing
                        // whitespace before "]" in place (same chars String.trim()
                        // would drop, i.e. code points <= ' ').
                        int end = res.length();
                        while (end > 0 && res.charAt(end - 1) <= ' ')
                            end--;
                        res.setLength(end);
                        res.append("] ");
                        open = false;
                    }
                }
            }
        }
    }
    return res.toString();
}
Aggregations