use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.
the class LBJavaUtils method recordToLBJTokens.
/**
 * Builds the flat list of LBJ {@link Token}s for a record so it can be fed to LBJ classifiers.
 *
 * <p>Token strings are obtained per sentence from {@code tokensAsStrings}. Within each sentence
 * the tokens are doubly linked (each {@code Token} gets its predecessor at construction and its
 * {@code next} field is set when the successor is created); links never cross a sentence
 * boundary. A few punctuation tokens are rewritten before wrapping: a double quote becomes
 * {@code ``} or {@code ''} (alternating, starting with {@code ``} in every sentence), and the
 * bracket characters become {@code -LRB-}, {@code -RRB-}, {@code -LCB-}, {@code -RCB-},
 * {@code -LSB-}, {@code -RSB-}.
 *
 * <p>If the record has a non-empty POS view, the label of the i-th POS constituent is copied
 * onto the i-th token's word (positional pairing; assumes one POS constituent per token in
 * token order -- TODO confirm against callers).
 *
 * @param record the annotation to convert; its TOKENS and SENTENCE views are read
 * @return all tokens of the record, in order, across all sentences
 */
public static List<Token> recordToLBJTokens(TextAnnotation record) {
    List<Token> result = new LinkedList<>();
    List<List<String>> sentences = tokensAsStrings(
            record.getView(ViewNames.TOKENS).getConstituents(),
            record.getView(ViewNames.SENTENCE).getConstituents(),
            record.getText());

    List<Constituent> tags = null;
    if (record.hasView(ViewNames.POS)) {
        tags = record.getView(ViewNames.POS).getConstituents();
    }
    final boolean hasTags = tags != null && !tags.isEmpty();

    // Runs across sentence boundaries: tags are consumed one per token over the whole record.
    int tagIndex = 0;
    for (List<String> sentence : sentences) {
        // Quote direction and the previous-element links restart with every sentence.
        boolean nextQuoteOpens = true;
        Word previousWord = null;
        Token previousToken = null;

        for (String raw : sentence) {
            final String form;
            switch (raw) {
                case "\"":
                    form = nextQuoteOpens ? "``" : "''";
                    nextQuoteOpens = !nextQuoteOpens;
                    break;
                case "(":
                    form = "-LRB-";
                    break;
                case ")":
                    form = "-RRB-";
                    break;
                case "{":
                    form = "-LCB-";
                    break;
                case "}":
                    form = "-RCB-";
                    break;
                case "[":
                    form = "-LSB-";
                    break;
                case "]":
                    form = "-RSB-";
                    break;
                default:
                    form = raw;
                    break;
            }

            Word word = new Word(form, previousWord);
            if (hasTags) {
                word.partOfSpeech = tags.get(tagIndex++).getLabel();
            }

            Token current = new Token(word, previousToken, "");
            result.add(current);
            if (previousToken != null) {
                previousToken.next = current;
            }

            previousWord = word;
            previousToken = current;
        }
    }
    return result;
}
use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.
the class TestPOSModels method testAccuracy.
/**
 * Runs the tagger over every token read from the labeled test file, counts how often the
 * predicted tag equals the token's gold label, and prints the resulting accuracy percentage to
 * standard output. Nothing is asserted; the figure is only reported.
 */
public void testAccuracy() {
    // Not referenced below; kept as in the original because the constructor may have side
    // effects -- NOTE(review): confirm before removing.
    WordForm __wordForm = new WordForm();
    Parser goldParser = new POSBracketToToken(labeledTestFile);

    int total = 0;
    int correct = 0;

    // The parser signals end of input by returning null.
    Token gold;
    while ((gold = (Token) goldParser.next()) != null) {
        String expected = gold.label;
        String predicted = tagger.discreteValue(gold);
        if (expected.equals(predicted)) {
            correct++;
        }
        total++;
    }

    String percent = String.format("%.2f", 100.0 * (double) correct / (double) total);
    System.out.println("Total accuracy over " + total + " items: " + percent + "%");
}
use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.
the class POSTaggerUnknown method cachedFeatureValue.
/**
 * Returns this classifier's feature for a token, using the token's {@code partOfSpeech} field
 * as a cache.
 *
 * <p>When {@code partOfSpeech} is {@code null}, the value is computed with {@code valueOf},
 * restricted to the tags that {@code __MikheevTable.allowableTags} returns for the token, and
 * its string value is written back to {@code partOfSpeech}. Otherwise a feature is built
 * directly from the stored string.
 *
 * @param __example the example to classify; cast to {@link Token} without a prior type check
 * @return the computed or cache-derived feature
 */
private Feature cachedFeatureValue(Object __example) {
    Token token = (Token) __example;
    String cachedTag = token.partOfSpeech;

    if (cachedTag == null) {
        // Cache miss: classify, then remember the decision on the token itself.
        Feature computed = valueOf(token, __MikheevTable.allowableTags(token));
        token.partOfSpeech = computed.getStringValue();
        return computed;
    }

    // Cache hit: rebuild the feature from the stored tag string.
    return new DiscretePrimitiveStringFeature(
            containingPackage,
            name,
            "",
            cachedTag,
            valueIndexOf(cachedTag),
            (short) allowableValues().length);
}
use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.
the class POSWindow method classify.
/**
 * Produces one feature per token in a window around the given token, each carrying the value
 * {@code __POSTagger} assigns to that token.
 *
 * <p>The window reaches back at most two tokens (stopping early when {@code previous} is
 * {@code null}) and forward at most two tokens (stopping early when {@code next} runs out).
 * Each feature's identifier is the token's offset relative to the input token, so identifiers
 * range over a subset of {@code -2 .. 2}.
 *
 * <p>If the argument is not a {@link Token} (including {@code null}), a message and a stack
 * trace are written to standard error and the JVM is terminated with exit status 1.
 *
 * @param __example the token at the centre of the window
 * @return a feature vector with one discrete string feature per token in the window
 */
public FeatureVector classify(Object __example) {
    if (!(__example instanceof Token)) {
        String type = __example == null ? "null" : __example.getClass().getName();
        System.err.println("Classifier 'POSWindow(Token)' defined on line 31 of POS.lbj received '" + type + "' as input.");
        new Exception().printStackTrace();
        System.exit(1);
    }

    final Token center = (Token) __example;
    FeatureVector features = new FeatureVector();

    // Exclusive right edge: the token three steps ahead, or null if the chain ends sooner.
    Token end = center;
    int stepsForward = 0;
    while (stepsForward <= 2 && end != null) {
        end = (Token) end.next;
        stepsForward++;
    }

    // Left edge: walk back up to two tokens; 'offset' ends as the start token's relative position.
    Token cursor = center;
    int offset = 0;
    while (offset > -2 && cursor.previous != null) {
        cursor = (Token) cursor.previous;
        offset--;
    }

    // Emit one feature per token from the left edge up to (not including) the right edge.
    while (cursor != end) {
        String id = "" + (offset++);
        String value = "" + (__POSTagger.discreteValue(cursor));
        features.addFeature(new DiscretePrimitiveStringFeature(
                this.containingPackage, this.name, id, value, valueIndexOf(value), (short) 0));
        cursor = (Token) cursor.next;
    }
    return features;
}
use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.
the class POSAnnotator method addView.
/**
 * Annotates a TextAnnotation with a POS view and adds that view to the TextAnnotation.
 *
 * <p>The record is converted to LBJ tokens, each token is tagged once, and a POS constituent
 * with the predicted label is created over the same span as the corresponding constituent of
 * the TOKENS view (paired by position).
 *
 * @param record TextAnnotation to annotate
 * @throws AnnotatorException if the record is missing the tokens view or the sentences view
 */
@Override
public void addView(TextAnnotation record) throws AnnotatorException {
    // Both views are needed below (token conversion reads TOKENS and SENTENCE), so reject the
    // record when EITHER is absent. The previous '&&' only rejected records missing both.
    // NOTE(review): assumes tokensfield/sentencesfield name the TOKENS/SENTENCE views -- confirm.
    if (!record.hasView(tokensfield) || !record.hasView(sentencesfield)) {
        throw new AnnotatorException("Record must be tokenized and sentence split first");
    }

    long startTime = System.currentTimeMillis();

    List<Token> input = LBJavaUtils.recordToLBJTokens(record);
    List<Constituent> tokens = record.getView(ViewNames.TOKENS).getConstituents();
    TokenLabelView posView = new TokenLabelView(ViewNames.POS, getAnnotatorName(), record, 1.0);

    int tcounter = 0;
    for (Token lbjtoken : input) {
        // Tag once and reuse the result; the original invoked the tagger twice per token and
        // discarded the first result.
        String tag = tagger.discreteValue(lbjtoken);
        Constituent token = tokens.get(tcounter);
        Constituent label = new Constituent(tag, ViewNames.POS, record, token.getStartSpan(), token.getEndSpan());
        posView.addConstituent(label);
        tcounter++;
    }

    long endTime = System.currentTimeMillis();
    logger.debug("Tagged input in {}ms", endTime - startTime);

    record.addView(ViewNames.POS, posView);
}
Aggregations