Use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.
The class GrammarCompactor, method convertGraphsToGrammar.
/**
 * @param graphs a Set of TransducerGraph objects (one per String category)
 * @param unaryRules a Set of UnaryRule objects that we need to add
 * @param binaryRules a Set of BinaryRule objects that we need to add
 * @return a new Pair of UnaryGrammar, BinaryGrammar
 */
protected Pair<UnaryGrammar, BinaryGrammar> convertGraphsToGrammar(Set<TransducerGraph> graphs, Set<UnaryRule> unaryRules, Set<BinaryRule> binaryRules) {
  // first go through all the existing rules and number them with new numberer
  newStateIndex = new HashIndex<>();
  for (UnaryRule rule : unaryRules) {
    String parent = stateIndex.get(rule.parent);
    rule.parent = newStateIndex.addToIndex(parent);
    String child = stateIndex.get(rule.child);
    rule.child = newStateIndex.addToIndex(child);
  }
  for (BinaryRule rule : binaryRules) {
    String parent = stateIndex.get(rule.parent);
    rule.parent = newStateIndex.addToIndex(parent);
    String leftChild = stateIndex.get(rule.leftChild);
    rule.leftChild = newStateIndex.addToIndex(leftChild);
    String rightChild = stateIndex.get(rule.rightChild);
    rule.rightChild = newStateIndex.addToIndex(rightChild);
  }
  // now go through the graphs and add the rules
  for (TransducerGraph graph : graphs) {
    Object startNode = graph.getStartNode();
    for (Arc arc : graph.getArcs()) {
      // TODO: make sure these are the strings we're looking for
      String source = arc.getSourceNode().toString();
      String target = arc.getTargetNode().toString();
      Object input = arc.getInput();
      String inputString = input.toString();
      double output = ((Double) arc.getOutput()).doubleValue();
      if (source.equals(startNode)) {
        // make a UnaryRule
        UnaryRule ur = new UnaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(inputString), smartNegate(output));
        unaryRules.add(ur);
      } else if (inputString.equals(END) || inputString.equals(EPSILON)) {
        // make a UnaryRule
        UnaryRule ur = new UnaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(source), smartNegate(output));
        unaryRules.add(ur);
      } else {
        // make a BinaryRule
        // figure out whether the input was generated on the left or right
        int length = inputString.length();
        char leftOrRight = inputString.charAt(length - 1);
        inputString = inputString.substring(0, length - 1);
        BinaryRule br;
        if (leftOrRight == '<' || leftOrRight == '[') {
          br = new BinaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(inputString), newStateIndex.addToIndex(source), smartNegate(output));
        } else if (leftOrRight == '>' || leftOrRight == ']') {
          br = new BinaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(source), newStateIndex.addToIndex(inputString), smartNegate(output));
        } else {
          throw new RuntimeException("Arc input is in unexpected format: " + arc);
        }
        binaryRules.add(br);
      }
    }
  }
  // by now, the unaryRules and binaryRules Sets have old untouched and new rules with scores
  ClassicCounter<String> symbolCounter = new ClassicCounter<>();
  if (outputType == RAW_COUNTS) {
    // so we count parent symbol occurrences
    for (UnaryRule rule : unaryRules) {
      symbolCounter.incrementCount(newStateIndex.get(rule.parent), rule.score);
    }
    for (BinaryRule rule : binaryRules) {
      symbolCounter.incrementCount(newStateIndex.get(rule.parent), rule.score);
    }
  }
  // now we put the rules in the grammars
  // this should be smaller than last one
  int numStates = newStateIndex.size();
  int numRules = 0;
  UnaryGrammar ug = new UnaryGrammar(newStateIndex);
  BinaryGrammar bg = new BinaryGrammar(newStateIndex);
  for (UnaryRule rule : unaryRules) {
    if (outputType == RAW_COUNTS) {
      double count = symbolCounter.getCount(newStateIndex.get(rule.parent));
      rule.score = (float) Math.log(rule.score / count);
    }
    ug.addRule(rule);
    numRules++;
  }
  for (BinaryRule rule : binaryRules) {
    if (outputType == RAW_COUNTS) {
      double count = symbolCounter.getCount(newStateIndex.get(rule.parent));
      rule.score = (float) Math.log((rule.score - op.trainOptions.ruleDiscount) / count);
    }
    bg.addRule(rule);
    numRules++;
  }
  if (verbose) {
    System.out.println("Number of minimized rules: " + numRules);
    System.out.println("Number of minimized states: " + newStateIndex.size());
  }
  ug.purgeRules();
  bg.splitRules();
  return new Pair<>(ug, bg);
}
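
ClassicCounter's role here: when outputType == RAW_COUNTS, symbolCounter accumulates the total count mass under each parent symbol so that every rule's raw count can be renormalized into a log relative frequency. A minimal standalone sketch of that idiom (the parents/counts arrays are made-up data; only ClassicCounter is the real CoreNLP class):

import edu.stanford.nlp.stats.ClassicCounter;

public class RuleScoreSketch {
  public static void main(String[] args) {
    // made-up raw rule counts, grouped by parent symbol
    String[] parents = { "NP", "NP", "VP" };
    double[] counts = { 3.0, 1.0, 2.0 };
    // accumulate per-parent totals, as symbolCounter does above
    ClassicCounter<String> symbolCounter = new ClassicCounter<>();
    for (int i = 0; i < parents.length; i++) {
      symbolCounter.incrementCount(parents[i], counts[i]);
    }
    // turn each raw count into log(count / parentTotal), as the
    // RAW_COUNTS branches do for rule.score
    for (int i = 0; i < parents.length; i++) {
      double logProb = Math.log(counts[i] / symbolCounter.getCount(parents[i]));
      System.out.println(parents[i] + " rule " + i + ": " + logProb);
    }
  }
}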
Use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.
The class ChineseCorefBenchmarkSlowITest, method getCorefResults.
private static Counter<String> getCorefResults(String resultsString) throws IOException {
  Counter<String> results = new ClassicCounter<String>();
  BufferedReader r = new BufferedReader(new StringReader(resultsString));
  for (String line; (line = r.readLine()) != null; ) {
    Matcher m1 = MENTION_PATTERN.matcher(line);
    if (m1.matches()) {
      results.setCount(MENTION_TP, Double.parseDouble(m1.group(1)));
      results.setCount(MENTION_F1, Double.parseDouble(m1.group(2)));
    }
    Matcher m2 = MUC_PATTERN.matcher(line);
    if (m2.matches()) {
      results.setCount(MUC_TP, Double.parseDouble(m2.group(1)));
      results.setCount(MUC_F1, Double.parseDouble(m2.group(2)));
    }
    Matcher m3 = BCUBED_PATTERN.matcher(line);
    if (m3.matches()) {
      results.setCount(BCUBED_TP, Double.parseDouble(m3.group(1)));
      results.setCount(BCUBED_F1, Double.parseDouble(m3.group(2)));
    }
    Matcher m4 = CEAFM_PATTERN.matcher(line);
    if (m4.matches()) {
      results.setCount(CEAFM_TP, Double.parseDouble(m4.group(1)));
      results.setCount(CEAFM_F1, Double.parseDouble(m4.group(2)));
    }
    Matcher m5 = CEAFE_PATTERN.matcher(line);
    if (m5.matches()) {
      results.setCount(CEAFE_TP, Double.parseDouble(m5.group(1)));
      results.setCount(CEAFE_F1, Double.parseDouble(m5.group(2)));
    }
    Matcher m6 = BLANC_PATTERN.matcher(line);
    if (m6.matches()) {
      results.setCount(BLANC_F1, Double.parseDouble(m6.group(1)));
    }
    Matcher m7 = CONLL_PATTERN.matcher(line);
    if (m7.matches()) {
      results.setCount(CONLL_SCORE, Double.parseDouble(m7.group(1)));
    }
  }
  return results;
}
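
getCorefResults relies only on each PATTERN constant matching a whole line of CoNLL-scorer output and capturing the numbers it parses. The constants are defined elsewhere in the test class; a hypothetical sketch of that contract (these regexes are illustrative assumptions, not the real definitions):

// java.util.regex.Pattern fields; group(1)/group(2) must capture the
// numbers passed to Double.parseDouble above.
private static final Pattern MENTION_PATTERN =
    Pattern.compile(".*Identification of Mentions.*R: \\(.*\\) (\\d+(?:\\.\\d+)?)%.*F1: (\\d+(?:\\.\\d+)?)%.*");
private static final Pattern CONLL_PATTERN =
    Pattern.compile(".*Final conll score.*: (\\d+(?:\\.\\d+)?).*");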
Use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.
The class DcorefBenchmarkSlowITest, method getCorefResults.
public static Counter<String> getCorefResults(String resultsString) throws IOException {
  Counter<String> results = new ClassicCounter<>();
  BufferedReader r = new BufferedReader(new StringReader(resultsString));
  for (String line; (line = r.readLine()) != null; ) {
    Matcher m1 = MENTION_PATTERN.matcher(line);
    if (m1.matches()) {
      results.setCount(MENTION_TP, Double.parseDouble(m1.group(1)));
      results.setCount(MENTION_F1, Double.parseDouble(m1.group(2)));
    }
    Matcher m2 = MUC_PATTERN.matcher(line);
    if (m2.matches()) {
      results.setCount(MUC_TP, Double.parseDouble(m2.group(1)));
      results.setCount(MUC_F1, Double.parseDouble(m2.group(2)));
    }
    Matcher m3 = BCUBED_PATTERN.matcher(line);
    if (m3.matches()) {
      results.setCount(BCUBED_TP, Double.parseDouble(m3.group(1)));
      results.setCount(BCUBED_F1, Double.parseDouble(m3.group(2)));
    }
    Matcher m4 = CEAFM_PATTERN.matcher(line);
    if (m4.matches()) {
      results.setCount(CEAFM_TP, Double.parseDouble(m4.group(1)));
      results.setCount(CEAFM_F1, Double.parseDouble(m4.group(2)));
    }
    Matcher m5 = CEAFE_PATTERN.matcher(line);
    if (m5.matches()) {
      results.setCount(CEAFE_TP, Double.parseDouble(m5.group(1)));
      results.setCount(CEAFE_F1, Double.parseDouble(m5.group(2)));
    }
    Matcher m6 = BLANC_PATTERN.matcher(line);
    if (m6.matches()) {
      results.setCount(BLANC_F1, Double.parseDouble(m6.group(1)));
    }
    Matcher m7 = CONLL_PATTERN.matcher(line);
    if (m7.matches()) {
      results.setCount(CONLL_SCORE, Double.parseDouble(m7.group(1)));
    }
  }
  return results;
}
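
A hypothetical usage sketch for this public variant: pass the scorer's text output to getCorefResults and read the parsed metrics back out of the Counter. The key constants (CONLL_SCORE, MUC_F1) are Strings defined in the test class; scorerOutput is a made-up variable holding the scorer's stdout:

Counter<String> results = DcorefBenchmarkSlowITest.getCorefResults(scorerOutput);
double conllScore = results.getCount(CONLL_SCORE);  // 0.0 if no line matched
double mucF1 = results.getCount(MUC_F1);
System.out.println("CoNLL: " + conllScore + ", MUC F1: " + mucF1);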
Use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.
The class Treebank, method textualSummary.
/**
 * Return various statistics about the treebank (number of sentences,
 * words, tag set, etc.).
 *
 * @param tlp The TreebankLanguagePack used to determine punctuation and an
 *            appropriate character encoding
 * @return A big string for human consumption describing the treebank
 */
public String textualSummary(TreebankLanguagePack tlp) {
  int numTrees = 0;
  int numTreesLE40 = 0;
  int numNonUnaryRoots = 0;
  Tree nonUnaryEg = null;
  ClassicCounter<Tree> nonUnaries = new ClassicCounter<>();
  ClassicCounter<String> roots = new ClassicCounter<>();
  ClassicCounter<String> starts = new ClassicCounter<>();
  ClassicCounter<String> puncts = new ClassicCounter<>();
  int numUnenclosedLeaves = 0;
  int numLeaves = 0;
  int numNonPhrasal = 0;
  int numPreTerminalWithMultipleChildren = 0;
  int numWords = 0;
  int numTags = 0;
  int shortestSentence = Integer.MAX_VALUE;
  int longestSentence = 0;
  int numNullLabel = 0;
  Set<String> words = Generics.newHashSet();
  ClassicCounter<String> tags = new ClassicCounter<>();
  ClassicCounter<String> cats = new ClassicCounter<>();
  Tree leafEg = null;
  Tree preTerminalMultipleChildrenEg = null;
  Tree nullLabelEg = null;
  Tree rootRewritesAsTaggedWordEg = null;
  for (Tree t : this) {
    roots.incrementCount(t.value());
    numTrees++;
    int leng = t.yield().size();
    if (leng <= 40) {
      numTreesLE40++;
    }
    if (leng < shortestSentence) {
      shortestSentence = leng;
    }
    if (leng > longestSentence) {
      longestSentence = leng;
    }
    if (t.numChildren() > 1) {
      if (numNonUnaryRoots == 0) {
        nonUnaryEg = t;
      }
      if (numNonUnaryRoots < 100) {
        nonUnaries.incrementCount(t.localTree());
      }
      numNonUnaryRoots++;
    } else if (t.isLeaf()) {
      numUnenclosedLeaves++;
    } else {
      Tree t2 = t.firstChild();
      if (t2.isLeaf()) {
        numLeaves++;
        leafEg = t;
      } else if (t2.isPreTerminal()) {
        if (numNonPhrasal == 0) {
          rootRewritesAsTaggedWordEg = t;
        }
        numNonPhrasal++;
      }
      starts.incrementCount(t2.value());
    }
    for (Tree subtree : t) {
      Label lab = subtree.label();
      if (lab == null || lab.value() == null || "".equals(lab.value())) {
        if (numNullLabel == 0) {
          nullLabelEg = subtree;
        }
        numNullLabel++;
        if (lab == null) {
          subtree.setLabel(new StringLabel(""));
        } else if (lab.value() == null) {
          subtree.label().setValue("");
        }
      }
      if (subtree.isLeaf()) {
        numWords++;
        words.add(subtree.value());
      } else if (subtree.isPreTerminal()) {
        numTags++;
        tags.incrementCount(subtree.value());
        if (tlp != null && tlp.isPunctuationTag(subtree.value())) {
          puncts.incrementCount(subtree.firstChild().value());
        }
      } else if (subtree.isPhrasal()) {
        boolean hasLeafChild = false;
        for (Tree kt : subtree.children()) {
          if (kt.isLeaf()) {
            hasLeafChild = true;
          }
        }
        if (hasLeafChild) {
          numPreTerminalWithMultipleChildren++;
          if (preTerminalMultipleChildrenEg == null) {
            preTerminalMultipleChildrenEg = subtree;
          }
        }
        cats.incrementCount(subtree.value());
      } else {
        throw new IllegalStateException("Treebank: Bad tree in treebank!: " + subtree);
      }
    }
  }
  StringWriter sw = new StringWriter(2000);
  PrintWriter pw = new PrintWriter(sw);
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(0);
  pw.println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
  if (numTrees > 0) {
    if (numTags != numWords) {
      pw.println(" Warning! numTags differs and is " + numTags);
    }
    if (roots.size() == 1) {
      String root = (String) roots.keySet().toArray()[0];
      pw.println(" The root category is: " + root);
    } else {
      pw.println(" Warning! " + roots.size() + " different roots in treebank: " + Counters.toString(roots, nf));
    }
    if (numNonUnaryRoots > 0) {
      pw.print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. ");
      if (numNonUnaryRoots > 100) {
        pw.print("First 100 ");
      }
      pw.println("Rewrites: " + Counters.toString(nonUnaries, nf));
      pw.println(" Example: " + nonUnaryEg);
    }
    if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0) {
      pw.println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
      if (numLeaves > 0) {
        pw.println(" Example bad root rewrites as leaf: " + leafEg);
      }
      if (numNonPhrasal > 0) {
        pw.println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
      }
    }
    if (numNullLabel > 0) {
      pw.println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
      pw.println(" " + nullLabelEg);
    }
    if (numPreTerminalWithMultipleChildren > 0) {
      pw.println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
      pw.println(" Example: " + preTerminalMultipleChildrenEg);
    }
    pw.println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
    pw.println(" " + cats.size() + " phrasal category types, " + tags.size() + " tag types, and " + words.size() + " word types");
    String[] empties = { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
    // What a dopey choice using 0 as an empty element name!!
    // The problem with the below is that words aren't turned into a basic
    // category, but empties commonly are indexed.... Would need to look
    // for them with a suffix of -[0-9]+
    Set<String> knownEmpties = Generics.newHashSet(Arrays.asList(empties));
    Set<String> emptiesIntersection = Sets.intersection(words, knownEmpties);
    if (!emptiesIntersection.isEmpty()) {
      pw.println(" Caution! " + emptiesIntersection.size() + " word types are known empty elements: " + emptiesIntersection);
    }
    Set<String> joint = Sets.intersection(cats.keySet(), tags.keySet());
    if (!joint.isEmpty()) {
      pw.println(" Warning! " + joint.size() + " items are tags and categories: " + joint);
    }
    for (String cat : cats.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
        break;
      }
    }
    for (String cat : tags.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat);
        break;
      }
    }
    pw.println(" Cats: " + Counters.toString(cats, nf));
    pw.println(" Tags: " + Counters.toString(tags, nf));
    pw.println(" " + starts.size() + " start categories: " + Counters.toString(starts, nf));
    if (!puncts.isEmpty()) {
      pw.println(" Puncts: " + Counters.toString(puncts, nf));
    }
  }
  return sw.toString();
}
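
The counting-and-reporting idiom above, tallying String keys in ClassicCounters and rendering them with Counters.toString, also works standalone. A minimal sketch (the category data is made up):

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;
import java.text.NumberFormat;

public class CategoryCountSketch {
  public static void main(String[] args) {
    ClassicCounter<String> cats = new ClassicCounter<>();
    for (String cat : new String[] { "NP", "VP", "NP", "PP", "NP" }) {
      cats.incrementCount(cat);
    }
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(0);
    // prints the counter contents, e.g. {NP=3, PP=1, VP=1}
    System.out.println("Cats: " + Counters.toString(cats, nf));
  }
}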
Use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.
The class SimpleSentiment, method featurize.
/**
 * Featurize a given sentence.
 *
 * @param sentence The sentence to featurize.
 *
 * @return A counter encoding the featurized sentence.
 */
private static Counter<String> featurize(CoreMap sentence) {
  ClassicCounter<String> features = new ClassicCounter<>();
  String lastLemma = "^";
  for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
    String lemma = token.lemma().toLowerCase();
    if (number.matcher(lemma).matches()) {
      features.incrementCount("**num**");
    } else {
      features.incrementCount(lemma);
    }
    if (alpha.matcher(lemma).matches()) {
      features.incrementCount(lastLemma + "__" + lemma);
      lastLemma = lemma;
    }
  }
  features.incrementCount(lastLemma + "__$");
  return features;
}
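
The same featurization scheme, sketched without a CoreNLP pipeline: unigram counts with numbers collapsed to **num**, plus bigrams over alphabetic lemmas with ^ and $ as boundary markers. The NUMBER and ALPHA patterns below are assumptions standing in for the class's number and alpha fields:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import java.util.regex.Pattern;

public class FeaturizeSketch {
  // assumed stand-ins for SimpleSentiment's number/alpha fields
  private static final Pattern NUMBER = Pattern.compile("-?[0-9]+(\\.[0-9]+)?");
  private static final Pattern ALPHA = Pattern.compile("[a-zA-Z]+");

  static Counter<String> featurize(String[] lemmas) {
    ClassicCounter<String> features = new ClassicCounter<>();
    String lastLemma = "^";
    for (String lemma : lemmas) {
      lemma = lemma.toLowerCase();
      if (NUMBER.matcher(lemma).matches()) {
        features.incrementCount("**num**");  // collapse all numbers
      } else {
        features.incrementCount(lemma);  // unigram feature
      }
      if (ALPHA.matcher(lemma).matches()) {
        features.incrementCount(lastLemma + "__" + lemma);  // bigram feature
        lastLemma = lemma;
      }
    }
    features.incrementCount(lastLemma + "__$");  // end-of-sentence bigram
    return features;
  }

  public static void main(String[] args) {
    System.out.println(featurize(new String[] { "the", "movie", "was", "great" }));
  }
}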