Example usage of edu.stanford.nlp.ling.TaggedWord in the CoreNLP project by stanfordnlp:
the method countTaggings of the class Treebanks.
/**
 * Tabulates, for every word form in the treebank, how often it occurs with
 * each POS tag, and writes one line per word to {@code pw} in the format:
 * {@code word TAB (tag TAB count TAB)*}.
 *
 * @param tb treebank whose trees are tallied
 * @param pw destination for the tab-separated report
 */
private static void countTaggings(Treebank tb, final PrintWriter pw) {
  final TwoDimensionalCounter<String, String> wordTagCounts = new TwoDimensionalCounter<>();
  tb.apply(tree -> {
    for (TaggedWord tw : tree.taggedYield()) {
      wordTagCounts.incrementCount(tw.word(), tw.tag());
    }
  });
  for (String word : wordTagCounts.firstKeySet()) {
    pw.print(word);
    pw.print('\t');
    Counter<String> tagCounts = wordTagCounts.getCounter(word);
    for (String tag : tagCounts.keySet()) {
      // Note: each entry is followed by a tab, including the last one.
      pw.print(tag + '\t' + tagCounts.getCount(tag) + '\t');
    }
    pw.println();
  }
}
Example usage of edu.stanford.nlp.ling.TaggedWord in the CoreNLP project by stanfordnlp:
the method mergeLeavesIntoCollocatedString of the class CollocationFinder.
/**
 * Joins the words of the tree's leaves with underscores, e.g. a tree over
 * "kick the bucket" becomes {@code "kick_the_bucket"}.
 *
 * @param t tree whose leaf words are merged
 * @return the underscore-joined leaf words, or {@code ""} if the tree
 *         yields no words
 */
private static String mergeLeavesIntoCollocatedString(Tree t) {
  List<TaggedWord> sent = t.taggedYield();
  StringBuilder sb = new StringBuilder(160);
  for (TaggedWord tw : sent) {
    sb.append(tw.word()).append('_');
  }
  // Guard the empty yield: substring(0, -1) would throw
  // StringIndexOutOfBoundsException on a wordless tree.
  return sb.length() == 0 ? "" : sb.substring(0, sb.length() - 1);
}
Example usage of edu.stanford.nlp.ling.TaggedWord in the CoreNLP project by stanfordnlp:
the method lemmatize of the class ParserGrammar.
/**
 * Tags (or parses) the given tokens and returns one CoreLabel per token,
 * carrying word, POS tag, and lemma.
 *
 * <p>Only works on English, as it is hard coded for using the
 * Morphology class, which is English-only.
 *
 * @param tokens the sentence to lemmatize
 * @return labels with word, tag, and lemma set
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
  // Obtain POS tags either directly from the tagger (preTag mode) or by
  // reading them off a parse of the sentence.
  final List<TaggedWord> taggedWords;
  if (getOp().testOptions.preTag) {
    taggedWords = loadTagger().apply(tokens);
  } else {
    taggedWords = parse(tokens).taggedYield();
  }
  // A fresh Morphology per call; NOTE(review): presumably because
  // Morphology is not shareable across calls -- confirm.
  final Morphology morphology = new Morphology();
  final List<CoreLabel> lemmas = Generics.newArrayList();
  for (TaggedWord tw : taggedWords) {
    CoreLabel label = new CoreLabel();
    label.setWord(tw.word());
    label.setTag(tw.tag());
    morphology.stem(label);  // stores the lemma into the label
    lemmas.add(label);
  }
  return lemmas;
}
Example usage of edu.stanford.nlp.ling.TaggedWord in the CoreNLP project by stanfordnlp:
the method printStats of the class ChineseCharacterBasedLexiconTraining.
/**
 * Prints corpus statistics over the given trees: singleton-character and
 * singleton-word rates, the POS distribution of singleton words, the radical
 * distribution of singleton characters, and the word-length distribution.
 *
 * @param trees parsed trees to analyze
 * @param pw destination for the human-readable report
 */
public static void printStats(Collection<Tree> trees, PrintWriter pw) {
  // Tally word lengths, (word, tag) pairs, and canonical characters.
  ClassicCounter<Integer> lengthCounts = new ClassicCounter<>();
  ClassicCounter<TaggedWord> wordCounts = new ClassicCounter<>();
  ClassicCounter<Symbol> charCounts = new ClassicCounter<>();
  int numTrees = 0;
  for (Tree tree : trees) {
    numTrees++;
    for (TaggedWord tw : tree.taggedYield()) {
      String word = tw.word();
      if (word.equals(Lexicon.BOUNDARY)) {
        continue;  // skip the artificial sentence-boundary token
      }
      wordCounts.incrementCount(tw);
      lengthCounts.incrementCount(word.length());
      for (int i = 0; i < word.length(); i++) {
        charCounts.incrementCount(Symbol.cannonicalSymbol(word.charAt(i)));
      }
      charCounts.incrementCount(Symbol.END_WORD);
    }
  }

  // Counts are integral, so "below 1.5" selects items seen exactly once.
  Set<Symbol> singletonChars = Counters.keysBelow(charCounts, 1.5);
  Set<TaggedWord> singletonWords = Counters.keysBelow(wordCounts, 1.5);

  ClassicCounter<String> singletonWordPOSes = new ClassicCounter<>();
  for (TaggedWord tw : singletonWords) {
    singletonWordPOSes.incrementCount(tw.tag());
  }
  Distribution<String> singletonWordPOSDist = Distribution.getDistribution(singletonWordPOSes);

  ClassicCounter<Character> singletonCharRads = new ClassicCounter<>();
  for (Symbol sym : singletonChars) {
    singletonCharRads.incrementCount(RadicalMap.getRadical(sym.getCh()));
  }
  Distribution<Character> singletonCharRadDist = Distribution.getDistribution(singletonCharRads);
  Distribution<Integer> wordLengthDist = Distribution.getDistribution(lengthCounts);

  NumberFormat percent = new DecimalFormat("##.##%");
  pw.println("There are " + singletonChars.size() + " singleton chars out of "
      + (int) charCounts.totalCount() + " tokens and " + charCounts.size()
      + " types found in " + numTrees + " trees.");
  pw.println("Thus singletonChars comprise "
      + percent.format(singletonChars.size() / charCounts.totalCount())
      + " of tokens and " + percent.format((double) singletonChars.size() / charCounts.size())
      + " of types.");
  pw.println();
  pw.println("There are " + singletonWords.size() + " singleton words out of "
      + (int) wordCounts.totalCount() + " tokens and " + wordCounts.size() + " types.");
  pw.println("Thus singletonWords comprise "
      + percent.format(singletonWords.size() / wordCounts.totalCount())
      + " of tokens and " + percent.format((double) singletonWords.size() / wordCounts.size())
      + " of types.");
  pw.println();
  pw.println("Distribution over singleton word POS:");
  pw.println(singletonWordPOSDist.toString());
  pw.println();
  pw.println("Distribution over singleton char radicals:");
  pw.println(singletonCharRadDist.toString());
  pw.println();
  pw.println("Distribution over word length:");
  pw.println(wordLengthDist);
}
Example usage of edu.stanford.nlp.ling.TaggedWord in the CoreNLP project by stanfordnlp:
the method finishTraining of the class ChineseMaxentLexicon.
/**
 * Finalizes training: converts the accumulated (word, tag) counts into a
 * weighted dataset, builds a Laplace-smoothed tag distribution, and trains
 * the maxent classifier stored in {@code scorer}.
 */
@Override
public void finishTraining() {
  IntCounter<String> tagCounter = new IntCounter<>();
  WeightedDataset data = new WeightedDataset(datumCounter.size());
  for (TaggedWord taggedWord : datumCounter.keySet()) {
    int weight = datumCounter.getIntCount(taggedWord);
    // Skip frequent items when training only on rare ones, and always
    // skip words covered by the function-word tag table.
    boolean tooFrequent = trainOnLowCount && weight > trainCountThreshold;
    if (tooFrequent || functionWordTags.containsKey(taggedWord.word())) {
      continue;
    }
    tagCounter.incrementCount(taggedWord.tag());
    if (trainByType) {
      weight = 1;  // count each word type once, not per token
    }
    data.add(new BasicDatum(featExtractor.makeFeatures(taggedWord.word()), taggedWord.tag()), weight);
  }
  // Release the raw counts; only the smoothed distribution is retained.
  datumCounter = null;
  tagDist = Distribution.laplaceSmoothedDistribution(tagCounter, tagCounter.size(), 0.5);
  tagCounter = null;
  applyThresholds(data);

  verbose("Making classifier...");
  //new ResultStoringMonitor(5, "weights"));
  QNMinimizer minimizer = new QNMinimizer();
  // minimizer.shutUp();
  LinearClassifierFactory factory = new LinearClassifierFactory(minimizer);
  factory.setTol(tol);
  factory.setSigma(sigma);
  if (tuneSigma) {
    factory.setTuneSigmaHeldOut();
  }
  scorer = factory.trainClassifier(data);
  verbose("Done training.");
}
Aggregations