Use of edu.stanford.nlp.stats.ClassicCounter in the CoreNLP project by stanfordnlp:
class DcorefBenchmarkSlowITest, method getCorefResults.
public static Counter<String> getCorefResults(String resultsString) throws IOException {
  Counter<String> results = new ClassicCounter<>();
  BufferedReader r = new BufferedReader(new StringReader(resultsString));
  for (String line; (line = r.readLine()) != null; ) {
    Matcher m1 = MENTION_PATTERN.matcher(line);
    if (m1.matches()) {
      results.setCount(MENTION_TP, Double.parseDouble(m1.group(1)));
      results.setCount(MENTION_F1, Double.parseDouble(m1.group(2)));
    }
    Matcher m2 = MUC_PATTERN.matcher(line);
    if (m2.matches()) {
      results.setCount(MUC_TP, Double.parseDouble(m2.group(1)));
      results.setCount(MUC_F1, Double.parseDouble(m2.group(2)));
    }
    Matcher m3 = BCUBED_PATTERN.matcher(line);
    if (m3.matches()) {
      results.setCount(BCUBED_TP, Double.parseDouble(m3.group(1)));
      results.setCount(BCUBED_F1, Double.parseDouble(m3.group(2)));
    }
    Matcher m4 = CEAFM_PATTERN.matcher(line);
    if (m4.matches()) {
      results.setCount(CEAFM_TP, Double.parseDouble(m4.group(1)));
      results.setCount(CEAFM_F1, Double.parseDouble(m4.group(2)));
    }
    Matcher m5 = CEAFE_PATTERN.matcher(line);
    if (m5.matches()) {
      results.setCount(CEAFE_TP, Double.parseDouble(m5.group(1)));
      results.setCount(CEAFE_F1, Double.parseDouble(m5.group(2)));
    }
    Matcher m6 = BLANC_PATTERN.matcher(line);
    if (m6.matches()) {
      results.setCount(BLANC_F1, Double.parseDouble(m6.group(1)));
    }
    Matcher m7 = CONLL_PATTERN.matcher(line);
    if (m7.matches()) {
      results.setCount(CONLL_SCORE, Double.parseDouble(m7.group(1)));
    }
  }
  return results;
}
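In this benchmark parser, the Counter serves as a typed map from metric names to parsed scores: setCount records exactly one value per key. Below is a minimal, self-contained sketch of the same pattern; the F1_LINE regex and the "MUC.f1"-style keys are illustrative stand-ins, not the benchmark's actual MENTION_PATTERN constants.

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class ScoreParsingSketch {
  // Hypothetical score line, e.g. "MUC F1: 60.25"; captures metric name and value.
  private static final Pattern F1_LINE = Pattern.compile("(\\w+) F1: ([0-9.]+)");

  public static void main(String[] args) {
    Counter<String> results = new ClassicCounter<>();
    for (String line : new String[] {"MUC F1: 60.25", "BCUBED F1: 70.50"}) {
      Matcher m = F1_LINE.matcher(line);
      if (m.matches()) {
        // setCount overwrites any previous value for the key.
        results.setCount(m.group(1) + ".f1", Double.parseDouble(m.group(2)));
      }
    }
    System.out.println(results.getCount("MUC.f1")); // 60.25
  }
}

Because getCount returns 0.0 for absent keys, callers can read metrics without null checks.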
Use of edu.stanford.nlp.stats.ClassicCounter in the CoreNLP project by stanfordnlp:
class Treebank, method textualSummary.
/**
 * Return various statistics about the treebank (number of sentences,
 * words, tag set, etc.).
 *
 * @param tlp The TreebankLanguagePack used to determine punctuation and an
 *            appropriate character encoding
 * @return A big string for human consumption describing the treebank
 */
public String textualSummary(TreebankLanguagePack tlp) {
  int numTrees = 0;
  int numTreesLE40 = 0;
  int numNonUnaryRoots = 0;
  Tree nonUnaryEg = null;
  ClassicCounter<Tree> nonUnaries = new ClassicCounter<>();
  ClassicCounter<String> roots = new ClassicCounter<>();
  ClassicCounter<String> starts = new ClassicCounter<>();
  ClassicCounter<String> puncts = new ClassicCounter<>();
  int numUnenclosedLeaves = 0;
  int numLeaves = 0;
  int numNonPhrasal = 0;
  int numPreTerminalWithMultipleChildren = 0;
  int numWords = 0;
  int numTags = 0;
  int shortestSentence = Integer.MAX_VALUE;
  int longestSentence = 0;
  int numNullLabel = 0;
  Set<String> words = Generics.newHashSet();
  ClassicCounter<String> tags = new ClassicCounter<>();
  ClassicCounter<String> cats = new ClassicCounter<>();
  Tree leafEg = null;
  Tree preTerminalMultipleChildrenEg = null;
  Tree nullLabelEg = null;
  Tree rootRewritesAsTaggedWordEg = null;
  for (Tree t : this) {
    roots.incrementCount(t.value());
    numTrees++;
    int leng = t.yield().size();
    if (leng <= 40) {
      numTreesLE40++;
    }
    if (leng < shortestSentence) {
      shortestSentence = leng;
    }
    if (leng > longestSentence) {
      longestSentence = leng;
    }
    if (t.numChildren() > 1) {
      if (numNonUnaryRoots == 0) {
        nonUnaryEg = t;
      }
      if (numNonUnaryRoots < 100) {
        nonUnaries.incrementCount(t.localTree());
      }
      numNonUnaryRoots++;
    } else if (t.isLeaf()) {
      numUnenclosedLeaves++;
    } else {
      Tree t2 = t.firstChild();
      if (t2.isLeaf()) {
        numLeaves++;
        leafEg = t;
      } else if (t2.isPreTerminal()) {
        if (numNonPhrasal == 0) {
          rootRewritesAsTaggedWordEg = t;
        }
        numNonPhrasal++;
      }
      starts.incrementCount(t2.value());
    }
    for (Tree subtree : t) {
      Label lab = subtree.label();
      if (lab == null || lab.value() == null || "".equals(lab.value())) {
        if (numNullLabel == 0) {
          nullLabelEg = subtree;
        }
        numNullLabel++;
        if (lab == null) {
          subtree.setLabel(new StringLabel(""));
        } else if (lab.value() == null) {
          subtree.label().setValue("");
        }
      }
      if (subtree.isLeaf()) {
        numWords++;
        words.add(subtree.value());
      } else if (subtree.isPreTerminal()) {
        numTags++;
        tags.incrementCount(subtree.value());
        if (tlp != null && tlp.isPunctuationTag(subtree.value())) {
          puncts.incrementCount(subtree.firstChild().value());
        }
      } else if (subtree.isPhrasal()) {
        boolean hasLeafChild = false;
        for (Tree kt : subtree.children()) {
          if (kt.isLeaf()) {
            hasLeafChild = true;
          }
        }
        if (hasLeafChild) {
          numPreTerminalWithMultipleChildren++;
          if (preTerminalMultipleChildrenEg == null) {
            preTerminalMultipleChildrenEg = subtree;
          }
        }
        cats.incrementCount(subtree.value());
      } else {
        throw new IllegalStateException("Treebank: Bad tree in treebank!: " + subtree);
      }
    }
  }
  StringWriter sw = new StringWriter(2000);
  PrintWriter pw = new PrintWriter(sw);
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(0);
  pw.println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
  if (numTrees > 0) {
    if (numTags != numWords) {
      pw.println(" Warning! numTags differs and is " + numTags);
    }
    if (roots.size() == 1) {
      String root = (String) roots.keySet().toArray()[0];
      pw.println(" The root category is: " + root);
    } else {
      pw.println(" Warning! " + roots.size() + " different roots in treebank: " + Counters.toString(roots, nf));
    }
    if (numNonUnaryRoots > 0) {
      pw.print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. ");
      if (numNonUnaryRoots > 100) {
        pw.print("First 100 ");
      }
      pw.println("Rewrites: " + Counters.toString(nonUnaries, nf));
      pw.println(" Example: " + nonUnaryEg);
    }
    if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0) {
      pw.println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
      if (numLeaves > 0) {
        pw.println(" Example bad root rewrites as leaf: " + leafEg);
      }
      if (numNonPhrasal > 0) {
        pw.println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
      }
    }
    if (numNullLabel > 0) {
      pw.println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
      pw.println(" " + nullLabelEg);
    }
    if (numPreTerminalWithMultipleChildren > 0) {
      pw.println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
      pw.println(" Example: " + preTerminalMultipleChildrenEg);
    }
    pw.println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
    pw.println(" " + cats.size() + " phrasal category types, " + tags.size() + " tag types, and " + words.size() + " word types");
    String[] empties = { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
    // What a dopey choice using 0 as an empty element name!!
    // The problem with the below is that words aren't turned into a basic
    // category, but empties commonly are indexed.... Would need to look
    // for them with a suffix of -[0-9]+
    Set<String> knownEmpties = Generics.newHashSet(Arrays.asList(empties));
    Set<String> emptiesIntersection = Sets.intersection(words, knownEmpties);
    if (!emptiesIntersection.isEmpty()) {
      pw.println(" Caution! " + emptiesIntersection.size() + " word types are known empty elements: " + emptiesIntersection);
    }
    Set<String> joint = Sets.intersection(cats.keySet(), tags.keySet());
    if (!joint.isEmpty()) {
      pw.println(" Warning! " + joint.size() + " items are tags and categories: " + joint);
    }
    for (String cat : cats.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
        break;
      }
    }
    for (String cat : tags.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat);
        break;
      }
    }
    pw.println(" Cats: " + Counters.toString(cats, nf));
    pw.println(" Tags: " + Counters.toString(tags, nf));
    pw.println(" " + starts.size() + " start categories: " + Counters.toString(starts, nf));
    if (!puncts.isEmpty()) {
      pw.println(" Puncts: " + Counters.toString(puncts, nf));
    }
  }
  return sw.toString();
}
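Here ClassicCounter is used as a frequency histogram: incrementCount tallies occurrences, size() and keySet() report the number of distinct types, and Counters.toString pretty-prints the counts. A small self-contained sketch of that tallying pattern (the example tag sequence is made up):

import java.text.NumberFormat;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;

public class TagHistogramSketch {
  public static void main(String[] args) {
    ClassicCounter<String> tags = new ClassicCounter<>();
    for (String tag : new String[] {"NN", "DT", "NN", "VBZ", "NN"}) {
      // incrementCount adds 1.0, starting from an implicit 0.0 for unseen keys.
      tags.incrementCount(tag);
    }
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(0);
    System.out.println(tags.size() + " tag types"); // 3 tag types
    System.out.println("Tags: " + Counters.toString(tags, nf)); // e.g. {NN=3, DT=1, VBZ=1}
  }
}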
Use of edu.stanford.nlp.stats.ClassicCounter in the CoreNLP project by stanfordnlp:
class OpenIETest, method clauses.
protected Set<String> clauses(String conll) {
  List<CoreLabel> sentence = new ArrayList<>();
  SemanticGraph tree = new SemanticGraph();
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int index = Integer.parseInt(fields[0]);
    String word = fields[1];
    CoreLabel label = mkWord(word, index);
    sentence.add(label);
    if (fields[2].equals("0")) {
      tree.addRoot(new IndexedWord(label));
    } else {
      tree.addVertex(new IndexedWord(label));
    }
    if (fields.length > 4) {
      label.setTag(fields[4]);
    }
    if (fields.length > 5) {
      label.setNER(fields[5]);
    }
    if (fields.length > 6) {
      label.setLemma(fields[6]);
    }
  }
  int i = 0;
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int parent = Integer.parseInt(fields[2]);
    String reln = fields[3];
    if (parent > 0) {
      tree.addEdge(new IndexedWord(sentence.get(parent - 1)), new IndexedWord(sentence.get(i)),
          new GrammaticalRelation(Language.English, reln, null, null), 1.0, false);
    }
    i += 1;
  }
  // Run extractor
  ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
  Set<String> clauses = new HashSet<>();
  problem.search(triple -> {
    clauses.add(triple.third.get().toString());
    return true;
  }, new LinearClassifier<>(new ClassicCounter<>()), ClauseSplitterSearchProblem.HARD_SPLITS,
      triple -> new ClassicCounter<String>() {{
        setCount("__undocumented_junit_no_classifier", 1.0);
      }}, 100000);
  return clauses;
}
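The test builds its per-state feature counter as an anonymous ClassicCounter subclass whose instance initializer sets a single dummy feature (the so-called double-brace idiom). A sketch contrasting that idiom with the plain equivalent; the "bias" feature name here is illustrative:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class FeatureCounterSketch {
  public static void main(String[] args) {
    // Double-brace idiom as in the test above: subclass + instance initializer.
    Counter<String> inline = new ClassicCounter<String>() {{
      setCount("bias", 1.0);
    }};

    // Equivalent plain form, usually preferable (no extra anonymous class):
    Counter<String> plain = new ClassicCounter<>();
    plain.setCount("bias", 1.0);

    System.out.println(inline.getCount("bias") == plain.getCount("bias")); // true
  }
}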
Use of edu.stanford.nlp.stats.ClassicCounter in the CoreNLP project by stanfordnlp:
class SieveCoreferenceSystem, method printLinkWithContext.
/** Prints coref link information, including context and parse tree. */
private static void printLinkWithContext(Logger logger, String header, IntTuple src, IntTuple dst, Document document, Semantics semantics) {
  List<List<Mention>> orderedMentionsBySentence = document.getOrderedMentions();
  List<List<Mention>> goldOrderedMentionsBySentence = document.goldOrderedMentionsBySentence;
  Mention srcMention = orderedMentionsBySentence.get(src.get(0)).get(src.get(1));
  Mention dstMention = orderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
  List<CoreLabel> srcSentence = srcMention.sentenceWords;
  List<CoreLabel> dstSentence = dstMention.sentenceWords;
  printLink(logger, header, src, dst, orderedMentionsBySentence);
  printList(logger, "Mention:" + srcMention.spanToString(),
      "Gender:" + srcMention.gender.toString(),
      "Number:" + srcMention.number.toString(),
      "Animacy:" + srcMention.animacy.toString(),
      "Person:" + srcMention.person.toString(),
      "NER:" + srcMention.nerString,
      "Head:" + srcMention.headString,
      "Type:" + srcMention.mentionType.toString(),
      "utter: " + srcMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
      "speakerID: " + srcMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class),
      "twinless:" + srcMention.twinless);
  logger.fine("Context:");
  String p = "";
  for (int i = 0; i < srcSentence.size(); i++) {
    if (i == srcMention.startIndex) {
      p += "[";
    }
    if (i == srcMention.endIndex) {
      p += "]";
    }
    p += srcSentence.get(i).word() + " ";
  }
  logger.fine(p);
  StringBuilder golds = new StringBuilder();
  golds.append("Gold mentions in the sentence:\n");
  Counter<Integer> mBegin = new ClassicCounter<>();
  Counter<Integer> mEnd = new ClassicCounter<>();
  for (Mention m : goldOrderedMentionsBySentence.get(src.get(0))) {
    mBegin.incrementCount(m.startIndex);
    mEnd.incrementCount(m.endIndex);
  }
  List<CoreLabel> l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(src.get(0)).get(CoreAnnotations.TokensAnnotation.class);
  for (int i = 0; i < l.size(); i++) {
    for (int j = 0; j < mEnd.getCount(i); j++) {
      golds.append("]");
    }
    for (int j = 0; j < mBegin.getCount(i); j++) {
      golds.append("[");
    }
    golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class));
    golds.append(" ");
  }
  logger.fine(golds.toString());
  printList(logger, "\nAntecedent:" + dstMention.spanToString(),
      "Gender:" + dstMention.gender.toString(),
      "Number:" + dstMention.number.toString(),
      "Animacy:" + dstMention.animacy.toString(),
      "Person:" + dstMention.person.toString(),
      "NER:" + dstMention.nerString,
      "Head:" + dstMention.headString,
      "Type:" + dstMention.mentionType.toString(),
      "utter: " + dstMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
      "speakerID: " + dstMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class),
      "twinless:" + dstMention.twinless);
  logger.fine("Context:");
  p = "";
  for (int i = 0; i < dstSentence.size(); i++) {
    if (i == dstMention.startIndex) {
      p += "[";
    }
    if (i == dstMention.endIndex) {
      p += "]";
    }
    p += dstSentence.get(i).word() + " ";
  }
  logger.fine(p);
  golds = new StringBuilder();
  golds.append("Gold mentions in the sentence:\n");
  mBegin = new ClassicCounter<>();
  mEnd = new ClassicCounter<>();
  for (Mention m : goldOrderedMentionsBySentence.get(dst.get(0))) {
    mBegin.incrementCount(m.startIndex);
    mEnd.incrementCount(m.endIndex);
  }
  l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(dst.get(0)).get(CoreAnnotations.TokensAnnotation.class);
  for (int i = 0; i < l.size(); i++) {
    for (int j = 0; j < mEnd.getCount(i); j++) {
      golds.append("]");
    }
    for (int j = 0; j < mBegin.getCount(i); j++) {
      golds.append("[");
    }
    golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class));
    golds.append(" ");
  }
  logger.fine(golds.toString());
  logger.finer("\nMention:: --------------------------------------------------------");
  try {
    logger.finer(srcMention.dependency.toString());
  } catch (Exception e) {
    // ignore (alternatively: throw new RuntimeException(e);)
  }
  logger.finer("Parse:");
  logger.finer(formatPennTree(srcMention.contextParseTree));
  logger.finer("\nAntecedent:: -----------------------------------------------------");
  try {
    logger.finer(dstMention.dependency.toString());
  } catch (Exception e) {
    // ignore (alternatively: throw new RuntimeException(e);)
  }
  logger.finer("Parse:");
  logger.finer(formatPennTree(dstMention.contextParseTree));
}
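Both halves of this method use the same trick: a ClassicCounter<Integer> records how many gold mentions begin or end at each token index, and the printing loop emits that many "[" or "]" markers, relying on getCount returning 0.0 for indices with no mentions. A standalone sketch of the bracketing logic with hypothetical mention spans:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class BracketSketch {
  public static void main(String[] args) {
    String[] tokens = {"The", "tall", "man", "left"};
    // Hypothetical mention spans as [start, end) token offsets: [0,3) and [1,3).
    int[][] spans = {{0, 3}, {1, 3}};
    Counter<Integer> begins = new ClassicCounter<>();
    Counter<Integer> ends = new ClassicCounter<>();
    for (int[] span : spans) {
      begins.incrementCount(span[0]);
      ends.incrementCount(span[1]);
    }
    StringBuilder out = new StringBuilder();
    for (int i = 0; i <= tokens.length; i++) {
      // getCount returns 0.0 for unseen keys, so these loops just skip them.
      for (int j = 0; j < ends.getCount(i); j++) out.append("] ");
      for (int j = 0; j < begins.getCount(i); j++) out.append("[ ");
      if (i < tokens.length) out.append(tokens[i]).append(' ');
    }
    System.out.println(out); // [ The [ tall man ] ] left
  }
}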
Use of edu.stanford.nlp.stats.ClassicCounter in the CoreNLP project by stanfordnlp:
class ClusteringCorefAlgorithm, method runCoref.
@Override
public void runCoref(Document document) {
  Map<Pair<Integer, Integer>, Boolean> mentionPairs = CorefUtils.getUnlabeledMentionPairs(document);
  if (mentionPairs.size() == 0) {
    return;
  }
  Compressor<String> compressor = new Compressor<>();
  DocumentExamples examples = extractor.extract(0, document, mentionPairs, compressor);
  Counter<Pair<Integer, Integer>> classificationScores = new ClassicCounter<>();
  Counter<Pair<Integer, Integer>> rankingScores = new ClassicCounter<>();
  Counter<Integer> anaphoricityScores = new ClassicCounter<>();
  for (Example example : examples.examples) {
    CorefUtils.checkForInterrupt();
    Pair<Integer, Integer> mentionPair = new Pair<>(example.mentionId1, example.mentionId2);
    classificationScores.incrementCount(mentionPair, classificationModel.predict(example, examples.mentionFeatures, compressor));
    rankingScores.incrementCount(mentionPair, rankingModel.predict(example, examples.mentionFeatures, compressor));
    if (!anaphoricityScores.containsKey(example.mentionId2)) {
      anaphoricityScores.incrementCount(example.mentionId2, anaphoricityModel.predict(new Example(example, false), examples.mentionFeatures, compressor));
    }
  }
  ClustererDoc doc = new ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null,
      document.predictedMentionsByID.entrySet().stream()
          .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().mentionType.toString())));
  for (Pair<Integer, Integer> mentionPair : clusterer.getClusterMerges(doc)) {
    CorefUtils.mergeCoreferenceClusters(mentionPair, document);
  }
}
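In this algorithm the counters map composite keys (Pair<Integer, Integer> mention IDs) to model scores; since each pair is scored exactly once, incrementCount effectively sets the value, and containsKey prevents re-scoring the same anaphor. A sketch of the pattern, with a hypothetical fakeScore standing in for the trained models' predict calls:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.Pair;

public class PairScoreSketch {
  // Hypothetical stand-in for a trained model's predict() call.
  static double fakeScore(int id1, int id2) {
    return 1.0 / (1 + Math.abs(id1 - id2));
  }

  public static void main(String[] args) {
    Counter<Pair<Integer, Integer>> scores = new ClassicCounter<>();
    Counter<Integer> anaphoricity = new ClassicCounter<>();
    int[][] pairs = {{1, 2}, {1, 3}, {2, 3}};
    for (int[] pr : pairs) {
      Pair<Integer, Integer> key = new Pair<>(pr[0], pr[1]);
      // Each pair is scored once, so incrementCount effectively sets the value.
      scores.incrementCount(key, fakeScore(pr[0], pr[1]));
      // containsKey guards against scoring the same second mention twice.
      if (!anaphoricity.containsKey(pr[1])) {
        anaphoricity.incrementCount(pr[1], 0.5);
      }
    }
    // Pair implements equals/hashCode, so a fresh Pair retrieves the stored score.
    System.out.println(scores.getCount(new Pair<>(1, 2))); // 0.5
  }
}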