Use of edu.illinois.cs.cogcomp.nlp.corpusreaders.PennTreebankPOSReader in project cogcomp-nlp by CogComp.
The class POSBaseLineCounter, method buildTableHelper.
/**
 * Fills the count table from one source corpus file. The file is read with a
 * {@link PennTreebankPOSReader}; for every token position of every text annotation the reader
 * produced, the token string and the label stored at that position in the POS view are handed
 * to {@code count}.
 *
 * @param fileName file name of the source corpus
 **/
private void buildTableHelper(String fileName) {
    PennTreebankPOSReader corpusReader = new PennTreebankPOSReader(this.corpusName);
    corpusReader.readFile(fileName);

    for (TextAnnotation annotation : corpusReader.getTextAnnotations()) {
        int position = 0;
        while (position < annotation.size()) {
            // Token first, then the POS view lookup, so the calls happen in the same order
            // for every position.
            String surfaceForm = annotation.getToken(position);
            SpanLabelView posView = (SpanLabelView) annotation.getView(ViewNames.POS);
            String posLabel = posView.getLabel(position);

            count(surfaceForm, posLabel);
            position++;
        }
    }
}
Use of edu.illinois.cs.cogcomp.nlp.corpusreaders.PennTreebankPOSReader in project cogcomp-nlp by CogComp.
The class POSMikheevCounter, method buildTableHelper.
/**
 * A table is built from a given source corpus file by counting the number of times that each
 * suffix-POS association appears in the source corpus.
 *
 * <p>
 * Only tokens of at least five characters whose last three characters are all letters are
 * counted. Each such token contributes its lowercased three-letter suffix and, when the token has
 * at least six characters and the fourth character from the end is also a letter, its
 * four-letter suffix. Counts go into one of three tables: {@code firstCapitalized} for a
 * capitalized token at the start of its sentence, {@code notFirstCapitalized} for any other
 * capitalized token, and {@code table} for non-capitalized tokens. Non-capitalized tokens
 * containing a hyphen are skipped.
 *
 * @param fileName file name of the source corpus
 * @throws Exception propagated from the calls made in this method; which call can raise it is
 *         not visible from this method alone
 **/
private void buildTableHelper(String fileName) throws Exception {
    PennTreebankPOSReader reader = new PennTreebankPOSReader(this.corpusName);
    reader.readFile(fileName);
    List<TextAnnotation> tas = reader.getTextAnnotations();
    for (TextAnnotation ta : tas) {
        for (int tokenId = 0; tokenId < ta.size(); tokenId++) {
            String form = ta.getToken(tokenId);
            String tag = ((SpanLabelView) ta.getView(ViewNames.POS)).getLabel(tokenId);
            if (form.length() >= 5) {
                // The three-letter suffix must consist of letters only.
                boolean allLetters = true;
                for (int i = form.length() - 3; i < form.length() && allLetters; ++i) {
                    allLetters = Character.isLetter(form.charAt(i));
                }
                if (allLetters) {
                    HashMap<String, TreeMap<String, Integer>> t = null;
                    if (WordHelpers.isCapitalized(ta, tokenId)) {
                        int headOfSentence =
                                ta.getSentence(ta.getSentenceId(tokenId)).getStartSpan();
                        if (tokenId == headOfSentence) {
                            t = firstCapitalized;
                        } else {
                            t = notFirstCapitalized;
                        }
                    } else {
                        if (form.contains("-")) {
                            // Skip only this token. A `return` here would abandon every
                            // remaining token and text annotation in the file as soon as the
                            // first hyphenated lowercase word is seen.
                            continue;
                        }
                        t = table;
                    }
                    form = form.toLowerCase();
                    count(t, form.substring(form.length() - 3), tag);
                    if (form.length() >= 6 && Character.isLetter(form.charAt(form.length() - 4))) {
                        count(t, form.substring(form.length() - 4), tag);
                    }
                }
            }
        }
    }
}
Aggregations