Search in sources :

Example 36 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

The following example is from the class DcorefBenchmarkSlowITest, method getCorefResults.

/**
 * Parses the textual output of the coreference scorer into a counter of
 * named metrics. Each line is matched against the known metric patterns;
 * when a pattern matches, the corresponding TP/F1 (or single score) values
 * are stored under the metric's key constant.
 *
 * @param resultsString the raw scorer output, one metric per line
 * @return a counter mapping metric keys (e.g. MUC_F1, CONLL_SCORE) to scores;
 *         metrics whose pattern never matches are simply absent (count 0)
 * @throws IOException if reading fails (cannot happen for a StringReader,
 *         but the signature is kept for caller compatibility)
 */
public static Counter<String> getCorefResults(String resultsString) throws IOException {
    Counter<String> results = new ClassicCounter<>();
    // try-with-resources guarantees the reader is closed even if parsing throws.
    try (BufferedReader r = new BufferedReader(new StringReader(resultsString))) {
        for (String line; (line = r.readLine()) != null; ) {
            Matcher m1 = MENTION_PATTERN.matcher(line);
            if (m1.matches()) {
                results.setCount(MENTION_TP, Double.parseDouble(m1.group(1)));
                results.setCount(MENTION_F1, Double.parseDouble(m1.group(2)));
            }
            Matcher m2 = MUC_PATTERN.matcher(line);
            if (m2.matches()) {
                results.setCount(MUC_TP, Double.parseDouble(m2.group(1)));
                results.setCount(MUC_F1, Double.parseDouble(m2.group(2)));
            }
            Matcher m3 = BCUBED_PATTERN.matcher(line);
            if (m3.matches()) {
                results.setCount(BCUBED_TP, Double.parseDouble(m3.group(1)));
                results.setCount(BCUBED_F1, Double.parseDouble(m3.group(2)));
            }
            Matcher m4 = CEAFM_PATTERN.matcher(line);
            if (m4.matches()) {
                results.setCount(CEAFM_TP, Double.parseDouble(m4.group(1)));
                results.setCount(CEAFM_F1, Double.parseDouble(m4.group(2)));
            }
            Matcher m5 = CEAFE_PATTERN.matcher(line);
            if (m5.matches()) {
                results.setCount(CEAFE_TP, Double.parseDouble(m5.group(1)));
                results.setCount(CEAFE_F1, Double.parseDouble(m5.group(2)));
            }
            // BLANC and CoNLL lines carry only a single score, no TP value.
            Matcher m6 = BLANC_PATTERN.matcher(line);
            if (m6.matches()) {
                results.setCount(BLANC_F1, Double.parseDouble(m6.group(1)));
            }
            Matcher m7 = CONLL_PATTERN.matcher(line);
            if (m7.matches()) {
                results.setCount(CONLL_SCORE, Double.parseDouble(m7.group(1)));
            }
        }
    }
    return results;
}
Also used : Matcher(java.util.regex.Matcher) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader)

Example 37 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

The following example is from the class Treebank, method textualSummary.

/**
   * Return various statistics about the treebank (number of sentences,
   * words, tag set, etc.).
   *
   * @param tlp The TreebankLanguagePack used to determine punctuation and an
   *            appropriate character encoding; may be null, in which case
   *            punctuation statistics are skipped
   * @return A big string for human consumption describing the treebank
   */
public String textualSummary(TreebankLanguagePack tlp) {
    int numTrees = 0;
    int numTreesLE40 = 0;
    int numNonUnaryRoots = 0;
    Tree nonUnaryEg = null;
    // Distributions gathered over the whole treebank: local trees at non-unary
    // roots, root labels, first-child ("start") labels, and punctuation words.
    ClassicCounter<Tree> nonUnaries = new ClassicCounter<>();
    ClassicCounter<String> roots = new ClassicCounter<>();
    ClassicCounter<String> starts = new ClassicCounter<>();
    ClassicCounter<String> puncts = new ClassicCounter<>();
    int numUnenclosedLeaves = 0;
    int numLeaves = 0;
    int numNonPhrasal = 0;
    int numPreTerminalWithMultipleChildren = 0;
    int numWords = 0;
    int numTags = 0;
    int shortestSentence = Integer.MAX_VALUE;
    int longestSentence = 0;
    int numNullLabel = 0;
    Set<String> words = Generics.newHashSet();
    ClassicCounter<String> tags = new ClassicCounter<>();
    ClassicCounter<String> cats = new ClassicCounter<>();
    // Example trees/nodes saved for inclusion in warning messages below.
    Tree leafEg = null;
    Tree preTerminalMultipleChildrenEg = null;
    Tree nullLabelEg = null;
    Tree rootRewritesAsTaggedWordEg = null;
    for (Tree t : this) {
        roots.incrementCount(t.value());
        numTrees++;
        // Sentence length in tokens (size of the tree's yield).
        int leng = t.yield().size();
        if (leng <= 40) {
            numTreesLE40++;
        }
        if (leng < shortestSentence) {
            shortestSentence = leng;
        }
        if (leng > longestSentence) {
            longestSentence = leng;
        }
        // Root with more than one child: the tree lacks the conventional
        // unary root rewrite. Keep an example and up to 100 local trees.
        if (t.numChildren() > 1) {
            if (numNonUnaryRoots == 0) {
                nonUnaryEg = t;
            }
            if (numNonUnaryRoots < 100) {
                nonUnaries.incrementCount(t.localTree());
            }
            numNonUnaryRoots++;
        } else if (t.isLeaf()) {
            // The whole "tree" is a single bare leaf with no enclosing node.
            numUnenclosedLeaves++;
        } else {
            // Root has exactly one child; classify by what that child is.
            Tree t2 = t.firstChild();
            if (t2.isLeaf()) {
                numLeaves++;
                leafEg = t;
            } else if (t2.isPreTerminal()) {
                // Root rewrites directly as a tagged word (non-phrasal tree).
                if (numNonPhrasal == 0) {
                    rootRewritesAsTaggedWordEg = t;
                }
                numNonPhrasal++;
            }
            starts.incrementCount(t2.value());
        }
        // Walk every node of the tree to collect word/tag/category statistics.
        for (Tree subtree : t) {
            Label lab = subtree.label();
            // Repair null/empty labels in place (and record one example node).
            if (lab == null || lab.value() == null || "".equals(lab.value())) {
                if (numNullLabel == 0) {
                    nullLabelEg = subtree;
                }
                numNullLabel++;
                if (lab == null) {
                    subtree.setLabel(new StringLabel(""));
                } else if (lab.value() == null) {
                    subtree.label().setValue("");
                }
            }
            if (subtree.isLeaf()) {
                numWords++;
                words.add(subtree.value());
            } else if (subtree.isPreTerminal()) {
                numTags++;
                tags.incrementCount(subtree.value());
                if (tlp != null && tlp.isPunctuationTag(subtree.value())) {
                    puncts.incrementCount(subtree.firstChild().value());
                }
            } else if (subtree.isPhrasal()) {
                // A phrasal node with a bare leaf child is reported below as a
                // "preterminal node with multiple children" (a malformed tree).
                boolean hasLeafChild = false;
                for (Tree kt : subtree.children()) {
                    if (kt.isLeaf()) {
                        hasLeafChild = true;
                    }
                }
                if (hasLeafChild) {
                    numPreTerminalWithMultipleChildren++;
                    if (preTerminalMultipleChildrenEg == null) {
                        preTerminalMultipleChildrenEg = subtree;
                    }
                }
                cats.incrementCount(subtree.value());
            } else {
                throw new IllegalStateException("Treebank: Bad tree in treebank!: " + subtree);
            }
        }
    }
    // Second phase: assemble the human-readable report from the tallies.
    StringWriter sw = new StringWriter(2000);
    PrintWriter pw = new PrintWriter(sw);
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(0);
    pw.println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
    if (numTrees > 0) {
        if (numTags != numWords) {
            pw.println("  Warning! numTags differs and is " + numTags);
        }
        if (roots.size() == 1) {
            String root = (String) roots.keySet().toArray()[0];
            pw.println("  The root category is: " + root);
        } else {
            pw.println("  Warning! " + roots.size() + " different roots in treebank: " + Counters.toString(roots, nf));
        }
        if (numNonUnaryRoots > 0) {
            pw.print("  Warning! " + numNonUnaryRoots + " trees without unary initial rewrite.  ");
            if (numNonUnaryRoots > 100) {
                pw.print("First 100 ");
            }
            pw.println("Rewrites: " + Counters.toString(nonUnaries, nf));
            pw.println("    Example: " + nonUnaryEg);
        }
        if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0) {
            pw.println("  Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
            if (numLeaves > 0) {
                pw.println("  Example bad root rewrites as leaf: " + leafEg);
            }
            if (numNonPhrasal > 0) {
                pw.println("  Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
            }
        }
        if (numNullLabel > 0) {
            pw.println("  Warning!  " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
            pw.println("    " + nullLabelEg);
        }
        if (numPreTerminalWithMultipleChildren > 0) {
            pw.println("  Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
            pw.println("    Example: " + preTerminalMultipleChildrenEg);
        }
        // Integer arithmetic deliberately truncates the average to two decimals.
        pw.println("  Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
        pw.println("  " + cats.size() + " phrasal category types, " + tags.size() + " tag types, and " + words.size() + " word types");
        String[] empties = { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
        // What a dopey choice using 0 as an empty element name!!
        // The problem with the below is that words aren't turned into a basic
        // category, but empties commonly are indexed....  Would need to look
        // for them with a suffix of -[0-9]+
        Set<String> knownEmpties = Generics.newHashSet(Arrays.asList(empties));
        Set<String> emptiesIntersection = Sets.intersection(words, knownEmpties);
        if (!emptiesIntersection.isEmpty()) {
            pw.println("  Caution! " + emptiesIntersection.size() + " word types are known empty elements: " + emptiesIntersection);
        }
        // Symbols that appear both as a phrasal category and as a POS tag
        // usually indicate annotation problems.
        Set<String> joint = Sets.intersection(cats.keySet(), tags.keySet());
        if (!joint.isEmpty()) {
            pw.println("  Warning! " + joint.size() + " items are tags and categories: " + joint);
        }
        for (String cat : cats.keySet()) {
            if (cat != null && cat.contains("@")) {
                pw.println("  Warning!!  Stanford Parser does not work with categories containing '@' like: " + cat);
                break;
            }
        }
        for (String cat : tags.keySet()) {
            if (cat != null && cat.contains("@")) {
                pw.println("  Warning!!  Stanford Parser does not work with tags containing '@' like: " + cat);
                break;
            }
        }
        pw.println("    Cats: " + Counters.toString(cats, nf));
        pw.println("    Tags: " + Counters.toString(tags, nf));
        pw.println("    " + starts.size() + " start categories: " + Counters.toString(starts, nf));
        if (!puncts.isEmpty()) {
            pw.println("    Puncts: " + Counters.toString(puncts, nf));
        }
    }
    return sw.toString();
}
Also used : StringLabel(edu.stanford.nlp.ling.StringLabel) Label(edu.stanford.nlp.ling.Label) StringLabel(edu.stanford.nlp.ling.StringLabel) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) NumberFormat(java.text.NumberFormat)

Example 38 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

The following example is from the class OpenIETest, method clauses.

/**
 * Builds a dependency graph from a CoNLL-format string and runs the clause
 * splitter over it, returning the string form of every clause found.
 * Expected columns per row: index, word, head index (0 = root), relation,
 * then optionally POS tag, NER tag, and lemma.
 */
protected Set<String> clauses(String conll) {
    List<CoreLabel> tokens = new ArrayList<>();
    SemanticGraph graph = new SemanticGraph();
    // First pass: create one CoreLabel per CoNLL row and register each
    // vertex (roots via addRoot, everything else via addVertex).
    for (String row : conll.split("\n")) {
        String trimmed = row.trim();
        if (trimmed.equals("")) {
            continue;
        }
        String[] cols = trimmed.split("\\s+");
        CoreLabel token = mkWord(cols[1], Integer.parseInt(cols[0]));
        tokens.add(token);
        if (cols[2].equals("0")) {
            graph.addRoot(new IndexedWord(token));
        } else {
            graph.addVertex(new IndexedWord(token));
        }
        // Optional trailing columns: POS tag, NER tag, lemma.
        if (cols.length > 4) {
            token.setTag(cols[4]);
        }
        if (cols.length > 5) {
            token.setNER(cols[5]);
        }
        if (cols.length > 6) {
            token.setLemma(cols[6]);
        }
    }
    // Second pass: now that all vertices exist, attach dependency edges.
    // tokenIdx counts only non-empty rows, matching the tokens list.
    int tokenIdx = 0;
    for (String row : conll.split("\n")) {
        String trimmed = row.trim();
        if (trimmed.equals("")) {
            continue;
        }
        String[] cols = trimmed.split("\\s+");
        int head = Integer.parseInt(cols[2]);
        if (head > 0) {
            graph.addEdge(new IndexedWord(tokens.get(head - 1)), new IndexedWord(tokens.get(tokenIdx)), new GrammaticalRelation(Language.English, cols[3], null, null), 1.0, false);
        }
        tokenIdx += 1;
    }
    // Run the clause splitter, collecting the string form of each clause.
    // The trivial classifier/featurizer force the hard-coded split rules.
    ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(graph, true);
    Set<String> clauses = new HashSet<>();
    problem.search(triple -> {
        clauses.add(triple.third.get().toString());
        return true;
    }, new LinearClassifier<>(new ClassicCounter<>()), ClauseSplitterSearchProblem.HARD_SPLITS, triple -> {
        ClassicCounter<String> feats = new ClassicCounter<>();
        feats.setCount("__undocumented_junit_no_classifier", 1.0);
        return feats;
    }, 100000);
    return clauses;
}
Also used : ArrayList(java.util.ArrayList) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) GrammaticalRelation(edu.stanford.nlp.trees.GrammaticalRelation) IndexedWord(edu.stanford.nlp.ling.IndexedWord) HashSet(java.util.HashSet)

Example 39 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

The following example is from the class SieveCoreferenceSystem, method printLinkWithContext.

/**
 * Prints a coref link's information to the logger, including each mention's
 * attributes, its sentence context with the mention bracketed, the gold
 * mentions in the same sentence, and the dependency graph and parse tree.
 *
 * @param logger    destination for all output (fine/finer levels)
 * @param header    header string passed through to printLink
 * @param src       (sentence, mention) index of the anaphor
 * @param dst       (sentence, mention) index of the antecedent
 * @param document  document holding predicted and gold mentions
 * @param semantics unused; retained for call-site compatibility
 */
private static void printLinkWithContext(Logger logger, String header, IntTuple src, IntTuple dst, Document document, Semantics semantics) {
    List<List<Mention>> orderedMentionsBySentence = document.getOrderedMentions();
    List<List<Mention>> goldOrderedMentionsBySentence = document.goldOrderedMentionsBySentence;
    Mention srcMention = orderedMentionsBySentence.get(src.get(0)).get(src.get(1));
    Mention dstMention = orderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
    printLink(logger, header, src, dst, orderedMentionsBySentence);
    // Anaphor: attributes, bracketed sentence context, gold mentions.
    printMentionAttributes(logger, "Mention:", srcMention);
    printMentionContext(logger, srcMention);
    printGoldMentionsInSentence(logger, document, goldOrderedMentionsBySentence.get(src.get(0)), src.get(0));
    // Antecedent: the same three views.
    printMentionAttributes(logger, "\nAntecedent:", dstMention);
    printMentionContext(logger, dstMention);
    printGoldMentionsInSentence(logger, document, goldOrderedMentionsBySentence.get(dst.get(0)), dst.get(0));
    logger.finer("\nMention:: --------------------------------------------------------");
    printDependencyAndParse(logger, srcMention);
    logger.finer("\nAntecedent:: -----------------------------------------------------");
    printDependencyAndParse(logger, dstMention);
}

/** Logs one mention's grammatical and semantic attributes, prefixed by {@code label}. */
private static void printMentionAttributes(Logger logger, String label, Mention m) {
    printList(logger, label + m.spanToString(), "Gender:" + m.gender.toString(), "Number:" + m.number.toString(), "Animacy:" + m.animacy.toString(), "Person:" + m.person.toString(), "NER:" + m.nerString, "Head:" + m.headString, "Type:" + m.mentionType.toString(), "utter: " + m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), "speakerID: " + m.headWord.get(CoreAnnotations.SpeakerAnnotation.class), "twinless:" + m.twinless);
}

/** Logs the mention's sentence with '[' / ']' inserted at the mention's start/end indices. */
private static void printMentionContext(Logger logger, Mention mention) {
    logger.fine("Context:");
    List<CoreLabel> sentence = mention.sentenceWords;
    StringBuilder context = new StringBuilder();
    for (int i = 0; i < sentence.size(); i++) {
        if (i == mention.startIndex) {
            context.append("[");
        }
        if (i == mention.endIndex) {
            context.append("]");
        }
        context.append(sentence.get(i).word()).append(" ");
    }
    logger.fine(context.toString());
}

/** Logs the given sentence's tokens with every gold mention bracketed. */
private static void printGoldMentionsInSentence(Logger logger, Document document, List<Mention> goldMentions, int sentenceIndex) {
    StringBuilder golds = new StringBuilder();
    golds.append("Gold mentions in the sentence:\n");
    // Count how many gold mentions open ('[') and close (']') at each token index.
    Counter<Integer> mBegin = new ClassicCounter<>();
    Counter<Integer> mEnd = new ClassicCounter<>();
    for (Mention m : goldMentions) {
        mBegin.incrementCount(m.startIndex);
        mEnd.incrementCount(m.endIndex);
    }
    List<CoreLabel> tokens = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceIndex).get(CoreAnnotations.TokensAnnotation.class);
    for (int i = 0; i < tokens.size(); i++) {
        // Closing brackets are emitted before opening ones at the same index.
        for (int j = 0; j < mEnd.getCount(i); j++) {
            golds.append("]");
        }
        for (int j = 0; j < mBegin.getCount(i); j++) {
            golds.append("[");
        }
        golds.append(tokens.get(i).get(CoreAnnotations.TextAnnotation.class));
        golds.append(" ");
    }
    logger.fine(golds.toString());
}

/** Logs the mention's dependency graph (best-effort) and its context parse tree. */
private static void printDependencyAndParse(Logger logger, Mention mention) {
    try {
        logger.finer(mention.dependency.toString());
    } catch (Exception ignored) {
        // Best-effort: the dependency may be absent; skip it rather than fail.
    }
    logger.finer("Parse:");
    logger.finer(formatPennTree(mention.contextParseTree));
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CorefMention(edu.stanford.nlp.dcoref.CorefChain.CorefMention) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) ArrayList(java.util.ArrayList) List(java.util.List)

Example 40 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

The following example is from the class ClusteringCorefAlgorithm, method runCoref.

@Override
public void runCoref(Document document) {
    // Candidate mention pairs to score; nothing to do if the document has none.
    Map<Pair<Integer, Integer>, Boolean> mentionPairs = CorefUtils.getUnlabeledMentionPairs(document);
    if (mentionPairs.isEmpty()) {
        return;
    }
    Compressor<String> compressor = new Compressor<>();
    DocumentExamples examples = extractor.extract(0, document, mentionPairs, compressor);
    // Score each example with the pairwise classification and ranking models,
    // and score each second mention once with the anaphoricity model.
    Counter<Pair<Integer, Integer>> classificationScores = new ClassicCounter<>();
    Counter<Pair<Integer, Integer>> rankingScores = new ClassicCounter<>();
    Counter<Integer> anaphoricityScores = new ClassicCounter<>();
    for (Example ex : examples.examples) {
        CorefUtils.checkForInterrupt();
        Pair<Integer, Integer> pair = new Pair<>(ex.mentionId1, ex.mentionId2);
        classificationScores.incrementCount(pair, classificationModel.predict(ex, examples.mentionFeatures, compressor));
        rankingScores.incrementCount(pair, rankingModel.predict(ex, examples.mentionFeatures, compressor));
        if (!anaphoricityScores.containsKey(ex.mentionId2)) {
            anaphoricityScores.incrementCount(ex.mentionId2, anaphoricityModel.predict(new Example(ex, false), examples.mentionFeatures, compressor));
        }
    }
    // Build the clusterer's view of the document and merge every cluster
    // pair the clusterer decides to link.
    Map<Integer, String> mentionTypes = document.predictedMentionsByID.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().mentionType.toString()));
    ClustererDoc doc = new ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null, mentionTypes);
    for (Pair<Integer, Integer> merge : clusterer.getClusterMerges(doc)) {
        CorefUtils.mergeCoreferenceClusters(merge, document);
    }
}
Also used : ClustererDoc(edu.stanford.nlp.coref.statistical.ClustererDataLoader.ClustererDoc) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Pair(edu.stanford.nlp.util.Pair)

Aggregations

ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)69 CoreLabel (edu.stanford.nlp.ling.CoreLabel)27 ArrayList (java.util.ArrayList)21 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)18 Tree (edu.stanford.nlp.trees.Tree)13 Pair (edu.stanford.nlp.util.Pair)11 Counter (edu.stanford.nlp.stats.Counter)10 List (java.util.List)10 Mention (edu.stanford.nlp.coref.data.Mention)8 Language (edu.stanford.nlp.international.Language)7 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)7 CoreMap (edu.stanford.nlp.util.CoreMap)7 IOUtils (edu.stanford.nlp.io.IOUtils)6 Label (edu.stanford.nlp.ling.Label)6 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)6 PrintWriter (java.io.PrintWriter)6 java.util (java.util)6 HashSet (java.util.HashSet)6 RVFDatum (edu.stanford.nlp.ling.RVFDatum)5 DiskTreebank (edu.stanford.nlp.trees.DiskTreebank)5