Use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in the cogcomp-nlp project by CogComp:
class LBJavaUtils, method recordToLBJTokens.
/**
 * Converts a record into LBJ Tokens for use with LBJ classifiers. If part of speech is present
 * in record, it is added to the LBJ tokens.
 *
 * @param record the annotated text whose TOKENS (and optionally POS) views are converted
 * @return a list of LBJ {@code Token} objects, linked in sentence order
 */
public static List<Token> recordToLBJTokens(TextAnnotation record) {
    List<Token> lbjTokens = new LinkedList<>();
    List<List<String>> sentences =
            tokensAsStrings(record.getView(ViewNames.TOKENS).getConstituents(),
                    record.getView(ViewNames.SENTENCE).getConstituents(), record.getText());
    // POS tags are optional; when present they run parallel to the token stream.
    List<Constituent> posTags = null;
    if (record.hasView(ViewNames.POS))
        posTags = record.getView(ViewNames.POS).getConstituents();
    int posIndex = 0;
    for (List<String> sentence : sentences) {
        // Double quotes alternate between opening (``) and closing ('') per sentence.
        boolean nextQuoteOpens = true;
        Word prevWord = null;
        Token prevToken = null;
        for (String raw : sentence) {
            String token = raw;
            // Normalize brackets and quotes to Penn Treebank token conventions.
            switch (token) {
                case "\"":
                    token = nextQuoteOpens ? "``" : "''";
                    nextQuoteOpens = !nextQuoteOpens;
                    break;
                case "(":
                    token = "-LRB-";
                    break;
                case ")":
                    token = "-RRB-";
                    break;
                case "{":
                    token = "-LCB-";
                    break;
                case "}":
                    token = "-RCB-";
                    break;
                case "[":
                    token = "-LSB-";
                    break;
                case "]":
                    token = "-RSB-";
                    break;
                default:
                    break;
            }
            Word currentWord = new Word(token, prevWord);
            if (null != posTags && !posTags.isEmpty())
                currentWord.partOfSpeech = posTags.get(posIndex++).getLabel();
            Token currentToken = new Token(currentWord, prevToken, "");
            lbjTokens.add(currentToken);
            // Maintain the doubly-linked token chain expected by LBJ.
            if (prevToken != null)
                prevToken.next = currentToken;
            prevWord = currentWord;
            prevToken = currentToken;
        }
    }
    return lbjTokens;
}
Use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in the cogcomp-nlp project by CogComp:
class MikheevLearner, method learn.
/**
 * Trains the learning algorithm given an object as an example.
 *
 * @param example An example of the desired learned classifier's behavior.
 **/
public void learn(Object example) {
    String form = extractor.discreteValue(example);
    String label = labeler.discreteValue(example);
    // Only words of length >= 5 contribute suffix statistics.
    if (form.length() < 5)
        return;
    // The last three characters must all be letters for the suffix to count.
    boolean allLetters = true;
    for (int i = form.length() - 3; allLetters && i < form.length(); ++i)
        allLetters = Character.isLetter(form.charAt(i));
    if (!allLetters)
        return;
    Word w = (Word) example;
    // Select the count table based on capitalization and sentence position.
    HashMap<String, TreeMap<String, Integer>> counts;
    if (w.capitalized) {
        counts = (w.previous == null) ? firstCapitalized : notFirstCapitalized;
    } else {
        // Hyphenated lowercase forms are skipped entirely.
        if (form.contains("-"))
            return;
        counts = table;
    }
    form = form.toLowerCase();
    // Record the 3-character suffix, and the 4-character suffix when it is long
    // enough and the fourth-from-last character is also a letter.
    increment(counts, form.substring(form.length() - 3), label);
    if (form.length() >= 6 && Character.isLetter(form.charAt(form.length() - 4)))
        increment(counts, form.substring(form.length() - 4), label);
}
Use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in the cogcomp-nlp project by CogComp:
class StatefullTokenizerTest, method testWhitespaceBehavior.
/**
 * Test Splitter behavior on text with leading/trailing whitespace. Example is use case where
 * xml markup has been replaced with whitespace of equal span.
 *
 * Verifies that both SentenceSplitter/wordSplit and StatefulTokenizer report character
 * offsets that line up with the original (whitespace-substituted) text.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Replace each xml tag with an equal-length run of spaces so character
    // offsets into the cleaned text match offsets into the original.
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i) cleanTextBldr.append(" ");
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();
    // Collect the gold character spans of every token containing "Sun".
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        // BUG FIX: previously indexed words.get(0), so only the first word of the
        // sentence was ever examined; the loop must inspect the i-th word.
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue(sunSpans.contains(tokenCharOffsets));
        }
    }
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            // Print a diagnostic before the assertion so failures show the mismatch.
            if (!sunSpans.contains(tokCharOffsets)) {
                String origTextSubstring = cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }
    // Finally, the offsets must be consistent enough to build a TextAnnotation.
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in the cogcomp-nlp project by CogComp:
class NETesterMultiDataset, method reportPredictions.
/**
 * Accumulates evaluation statistics for NER predictions at several granularities:
 * token-level and phrase-level for both tagger levels, plus BILOU-tag and
 * segmentation (bracket-only) accuracy. The input data is first cloned so that
 * ignored labels can be dropped and anonymized labels rewritten to "ENTITY"
 * without mutating the caller's data.
 */
public static void reportPredictions(Data dataSet, TestDiscrete resultsTokenLevel1, TestDiscrete resultsTokenLevel2, TestDiscrete resultsPhraseLevel1, TestDiscrete resultsPhraseLevel2, TestDiscrete resultsByBILOU, TestDiscrete resultsSegmentation) {
    NELabel labeler = new NELabel();
    Data dataCloneWithanonymizedLabels = new Data();
    // Pass 1: deep-copy every sentence, normalizing gold labels and both
    // prediction levels (ignore -> "O", anonymize -> "<prefix>-ENTITY").
    for (int docid = 0; docid < dataSet.documents.size(); docid++) {
        ArrayList<LinkedVector> originalSentences = dataSet.documents.get(docid).sentences;
        ArrayList<LinkedVector> clonedSentences = new ArrayList<>();
        for (LinkedVector originalSentence : originalSentences) {
            LinkedVector sentence = new LinkedVector();
            for (int j = 0; j < originalSentence.size(); j++) {
                NEWord originalW = (NEWord) originalSentence.get(j);
                NEWord w = new NEWord(new Word(originalW.form), null, null);
                w.neLabel = originalW.neLabel;
                // Labels look like "B-TYPE"/"I-TYPE"; substring(2) extracts TYPE.
                if (w.neLabel.indexOf('-') > -1 && dataSet.labelsToIgnoreForEvaluation.contains(w.neLabel.substring(2)))
                    w.neLabel = "O";
                w.neTypeLevel1 = originalW.neTypeLevel1;
                if (w.neLabel.indexOf('-') > -1 && dataSet.labelsToAnonymizeForEvaluation.contains(w.neLabel.substring(2))) {
                    w.neLabel = w.neLabel.substring(0, 2) + "ENTITY";
                    // logger.info("replace!!!");
                }
                // NOTE(review): this re-assignment duplicates the one a few lines
                // above; it appears redundant but is kept byte-identical here.
                w.neTypeLevel1 = originalW.neTypeLevel1;
                if (w.neTypeLevel1.indexOf('-') > -1 && dataSet.labelsToIgnoreForEvaluation.contains(w.neTypeLevel1.substring(2)))
                    w.neTypeLevel1 = "O";
                if (w.neTypeLevel1.indexOf('-') > -1 && dataSet.labelsToAnonymizeForEvaluation.contains(w.neTypeLevel1.substring(2)))
                    w.neTypeLevel1 = w.neTypeLevel1.substring(0, 2) + "ENTITY";
                w.neTypeLevel2 = originalW.neTypeLevel2;
                if (w.neTypeLevel2.indexOf('-') > -1 && dataSet.labelsToIgnoreForEvaluation.contains(w.neTypeLevel2.substring(2)))
                    w.neTypeLevel2 = "O";
                if (w.neTypeLevel2.indexOf('-') > -1 && dataSet.labelsToAnonymizeForEvaluation.contains(w.neTypeLevel2.substring(2)))
                    w.neTypeLevel2 = w.neTypeLevel2.substring(0, 2) + "ENTITY";
                sentence.add(w);
            }
            clonedSentences.add(sentence);
        }
        NERDocument clonedDoc = new NERDocument(clonedSentences, "fake" + docid);
        dataCloneWithanonymizedLabels.documents.add(clonedDoc);
    }
    // Pass 2: token-level and phrase-level scoring on the normalized clone.
    for (int docid = 0; docid < dataCloneWithanonymizedLabels.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = dataCloneWithanonymizedLabels.documents.get(docid).sentences;
        for (LinkedVector vector : sentences) {
            int N = vector.size();
            String[] predictionsLevel1 = new String[N], predictionsLevel2 = new String[N], labels = new String[N];
            // Token-level: compare bare entity types with the B-/I- prefix stripped.
            for (int i = 0; i < N; ++i) {
                predictionsLevel1[i] = ((NEWord) vector.get(i)).neTypeLevel1;
                predictionsLevel2[i] = ((NEWord) vector.get(i)).neTypeLevel2;
                labels[i] = labeler.discreteValue(vector.get(i));
                String pLevel1 = predictionsLevel1[i];
                String pLevel2 = predictionsLevel2[i];
                if (pLevel1.indexOf('-') > -1)
                    pLevel1 = pLevel1.substring(2);
                if (pLevel2.indexOf('-') > -1)
                    pLevel2 = pLevel2.substring(2);
                String l = labels[i];
                if (l.indexOf('-') > -1)
                    l = l.substring(2);
                resultsTokenLevel1.reportPrediction(pLevel1, l);
                resultsTokenLevel2.reportPrediction(pLevel2, l);
            }
            // getting phrase level accuracy level1
            for (int i = 0; i < N; ++i) {
                String p = "O", l = "O";
                int pEnd = -1, lEnd = -1;
                // A predicted phrase starts at a B- tag, or at an I- tag that does
                // not continue the previous token's type (&& binds tighter than ||).
                if (predictionsLevel1[i].startsWith("B-") || predictionsLevel1[i].startsWith("I-") && (i == 0 || !predictionsLevel1[i - 1].endsWith(predictionsLevel1[i].substring(2)))) {
                    p = predictionsLevel1[i].substring(2);
                    pEnd = i;
                    // Extend the predicted phrase through consecutive I- tags of the same type.
                    while (pEnd + 1 < N && predictionsLevel1[pEnd + 1].equals("I-" + p)) ++pEnd;
                }
                if (labels[i].startsWith("B-")) {
                    l = labels[i].substring(2);
                    lEnd = i;
                    while (lEnd + 1 < N && labels[lEnd + 1].equals("I-" + l)) ++lEnd;
                }
                // Matching end positions count as one aligned prediction; otherwise
                // the prediction and the gold phrase are each scored against "O".
                if (!p.equals("O") || !l.equals("O")) {
                    if (pEnd == lEnd)
                        resultsPhraseLevel1.reportPrediction(p, l);
                    else {
                        if (!p.equals("O"))
                            resultsPhraseLevel1.reportPrediction(p, "O");
                        if (!l.equals("O"))
                            resultsPhraseLevel1.reportPrediction("O", l);
                    }
                }
            }
            // getting phrase level accuracy level2
            for (int i = 0; i < N; ++i) {
                String p = "O", l = "O";
                int pEnd = -1, lEnd = -1;
                if (predictionsLevel2[i].startsWith("B-") || predictionsLevel2[i].startsWith("I-") && (i == 0 || !predictionsLevel2[i - 1].endsWith(predictionsLevel2[i].substring(2)))) {
                    p = predictionsLevel2[i].substring(2);
                    pEnd = i;
                    while (pEnd + 1 < N && predictionsLevel2[pEnd + 1].equals("I-" + p)) ++pEnd;
                }
                if (labels[i].startsWith("B-")) {
                    l = labels[i].substring(2);
                    lEnd = i;
                    while (lEnd + 1 < N && labels[lEnd + 1].equals("I-" + l)) ++lEnd;
                }
                if (!p.equals("O") || !l.equals("O")) {
                    if (pEnd == lEnd)
                        resultsPhraseLevel2.reportPrediction(p, l);
                    else {
                        if (!p.equals("O"))
                            resultsPhraseLevel2.reportPrediction(p, "O");
                        if (!l.equals("O"))
                            resultsPhraseLevel2.reportPrediction("O", l);
                    }
                }
            }
        }
    }
    // Pass 3: convert the clone from BIO to BILOU in place (gold labels and
    // level-2 predictions), then score full BILOU tags and bracket types.
    TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, dataCloneWithanonymizedLabels, NEWord.LabelToLookAt.GoldLabel);
    TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, dataCloneWithanonymizedLabels, NEWord.LabelToLookAt.PredictionLevel2Tagger);
    for (int docid = 0; docid < dataCloneWithanonymizedLabels.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = dataCloneWithanonymizedLabels.documents.get(docid).sentences;
        for (LinkedVector sentence : sentences) for (int j = 0; j < sentence.size(); j++) {
            NEWord w = (NEWord) sentence.get(j);
            // Segmentation score uses only the BILOU position letter (B/I/L/U),
            // discarding the entity type after the dash.
            String bracketTypePrediction = w.neTypeLevel2;
            if (bracketTypePrediction.indexOf('-') > 0)
                bracketTypePrediction = bracketTypePrediction.substring(0, 1);
            String bracketTypeLabel = w.neLabel;
            if (bracketTypeLabel.indexOf('-') > 0)
                bracketTypeLabel = bracketTypeLabel.substring(0, 1);
            resultsByBILOU.reportPrediction(w.neTypeLevel2, w.neLabel);
            resultsSegmentation.reportPrediction(bracketTypePrediction, bracketTypeLabel);
        }
    }
}
Use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in the cogcomp-nlp project by CogComp:
class NEWord, method splitWord.
/*
 * Used for some tokenization schemes.
 *
 * Re-tokenizes a single NEWord's surface form; if it splits into several
 * pieces, the first keeps the original label and subsequent pieces of a
 * "B-" entity are relabeled "I-". Returns the original word unchanged when
 * the splitter produces nothing.
 */
private static Vector<NEWord> splitWord(NEWord word) {
    Vector<NEWord> result = new Vector<>();
    // Trailing space ensures the splitter treats the form as a complete token run.
    Parser splitter = new WordSplitter(new SentenceSplitter(new String[] { word.form + " " }));
    LinkedVector pieces = (LinkedVector) splitter.next();
    if (pieces == null) {
        result.add(word);
        return result;
    }
    String pieceLabel = word.neLabel;
    for (int i = 0; i < pieces.size(); i++) {
        // Only the first piece may carry the B- prefix; later pieces become I-.
        if (i > 0 && pieceLabel.contains("B-"))
            pieceLabel = "I-" + pieceLabel.substring(2);
        result.addElement(new NEWord(new Word(((Word) pieces.get(i)).form), null, pieceLabel));
    }
    return result;
}
Aggregations