Search in sources :

Example 6 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

From the class GrammarCompactor, the method convertGraphsToGrammar.

/**
   * @param graphs      a Map from String categories to TransducerGraph objects
   * @param unaryRules  is a Set of UnaryRule objects that we need to add
   * @param binaryRules is a Set of BinaryRule objects that we need to add
   * @return a new Pair of UnaryGrammar, BinaryGrammar
   */
protected Pair<UnaryGrammar, BinaryGrammar> convertGraphsToGrammar(Set<TransducerGraph> graphs, Set<UnaryRule> unaryRules, Set<BinaryRule> binaryRules) {
    // Builds a (UnaryGrammar, BinaryGrammar) pair from minimized transducer graphs plus
    // the pre-existing rule sets.  NOTE: this method MUTATES its arguments — the incoming
    // UnaryRule/BinaryRule objects are re-numbered in place, and new rules are added to
    // the unaryRules/binaryRules sets.
    // first go through all the existing rules and number them with new numberer
    newStateIndex = new HashIndex<>();
    for (UnaryRule rule : unaryRules) {
        // translate old state ids -> state names -> new compact ids
        String parent = stateIndex.get(rule.parent);
        rule.parent = newStateIndex.addToIndex(parent);
        String child = stateIndex.get(rule.child);
        rule.child = newStateIndex.addToIndex(child);
    }
    for (BinaryRule rule : binaryRules) {
        String parent = stateIndex.get(rule.parent);
        rule.parent = newStateIndex.addToIndex(parent);
        String leftChild = stateIndex.get(rule.leftChild);
        rule.leftChild = newStateIndex.addToIndex(leftChild);
        String rightChild = stateIndex.get(rule.rightChild);
        rule.rightChild = newStateIndex.addToIndex(rightChild);
    }
    // now go through the graphs and add the rules
    for (TransducerGraph graph : graphs) {
        Object startNode = graph.getStartNode();
        for (Arc arc : graph.getArcs()) {
            // TODO: make sure these are the strings we're looking for
            String source = arc.getSourceNode().toString();
            String target = arc.getTargetNode().toString();
            Object input = arc.getInput();
            String inputString = input.toString();
            double output = ((Double) arc.getOutput()).doubleValue();
            // NOTE(review): startNode is declared Object while source is a String; this
            // equals() only succeeds if the start node actually IS a String — confirm,
            // otherwise arcs out of the start node are never recognized here.
            if (source.equals(startNode)) {
                // make a UnaryRule
                UnaryRule ur = new UnaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(inputString), smartNegate(output));
                unaryRules.add(ur);
            } else if (inputString.equals(END) || inputString.equals(EPSILON)) {
                // make a UnaryRule
                UnaryRule ur = new UnaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(source), smartNegate(output));
                unaryRules.add(ur);
            } else {
                // make a BinaryRule
                // figure out whether the input was generated on the left or right
                // (the last character of the input string encodes the side; the rest is the symbol)
                int length = inputString.length();
                char leftOrRight = inputString.charAt(length - 1);
                inputString = inputString.substring(0, length - 1);
                BinaryRule br;
                if (leftOrRight == '<' || leftOrRight == '[') {
                    br = new BinaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(inputString), newStateIndex.addToIndex(source), smartNegate(output));
                } else if (leftOrRight == '>' || leftOrRight == ']') {
                    br = new BinaryRule(newStateIndex.addToIndex(target), newStateIndex.addToIndex(source), newStateIndex.addToIndex(inputString), smartNegate(output));
                } else {
                    throw new RuntimeException("Arc input is in unexpected format: " + arc);
                }
                binaryRules.add(br);
            }
        }
    }
    // by now, the unaryRules and binaryRules Sets have old untouched and new rules with scores
    ClassicCounter<String> symbolCounter = new ClassicCounter<>();
    if (outputType == RAW_COUNTS) {
        // so we count parent symbol occurrences
        // (used below to normalize raw counts into log-probabilities)
        for (UnaryRule rule : unaryRules) {
            symbolCounter.incrementCount(newStateIndex.get(rule.parent), rule.score);
        }
        for (BinaryRule rule : binaryRules) {
            symbolCounter.incrementCount(newStateIndex.get(rule.parent), rule.score);
        }
    }
    // now we put the rules in the grammars
    // this should be smaller than last one
    int numStates = newStateIndex.size();
    int numRules = 0;
    UnaryGrammar ug = new UnaryGrammar(newStateIndex);
    BinaryGrammar bg = new BinaryGrammar(newStateIndex);
    for (UnaryRule rule : unaryRules) {
        if (outputType == RAW_COUNTS) {
            // convert raw count to log relative frequency given the parent
            double count = symbolCounter.getCount(newStateIndex.get(rule.parent));
            rule.score = (float) Math.log(rule.score / count);
        }
        ug.addRule(rule);
        numRules++;
    }
    for (BinaryRule rule : binaryRules) {
        if (outputType == RAW_COUNTS) {
            // binary rules additionally get a discount subtracted before normalizing
            double count = symbolCounter.getCount(newStateIndex.get(rule.parent));
            rule.score = (float) Math.log((rule.score - op.trainOptions.ruleDiscount) / count);
        }
        bg.addRule(rule);
        numRules++;
    }
    if (verbose) {
        System.out.println("Number of minimized rules: " + numRules);
        System.out.println("Number of minimized states: " + newStateIndex.size());
    }
    ug.purgeRules();
    bg.splitRules();
    return new Pair<>(ug, bg);
}
Also used : Arc(edu.stanford.nlp.fsm.TransducerGraph.Arc) TransducerGraph(edu.stanford.nlp.fsm.TransducerGraph) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Pair(edu.stanford.nlp.util.Pair)

Example 7 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

From the class ChineseCorefBenchmarkSlowITest, the method getCorefResults.

/**
 * Parses a coref-scorer results dump into a Counter keyed by metric-name constants.
 * Lines that match none of the known metric patterns are skipped silently.
 *
 * @param resultsString the raw text output of the coref scorer
 * @return a Counter mapping metric keys (MENTION_TP, MUC_F1, ...) to parsed scores
 * @throws IOException propagated from readLine (will not occur for a StringReader,
 *         but kept for signature compatibility with callers)
 */
private static Counter<String> getCorefResults(String resultsString) throws IOException {
    Counter<String> results = new ClassicCounter<String>();
    // try-with-resources: the original leaked the BufferedReader (never closed)
    try (BufferedReader r = new BufferedReader(new StringReader(resultsString))) {
        for (String line; (line = r.readLine()) != null; ) {
            Matcher m1 = MENTION_PATTERN.matcher(line);
            if (m1.matches()) {
                results.setCount(MENTION_TP, Double.parseDouble(m1.group(1)));
                results.setCount(MENTION_F1, Double.parseDouble(m1.group(2)));
            }
            Matcher m2 = MUC_PATTERN.matcher(line);
            if (m2.matches()) {
                results.setCount(MUC_TP, Double.parseDouble(m2.group(1)));
                results.setCount(MUC_F1, Double.parseDouble(m2.group(2)));
            }
            Matcher m3 = BCUBED_PATTERN.matcher(line);
            if (m3.matches()) {
                results.setCount(BCUBED_TP, Double.parseDouble(m3.group(1)));
                results.setCount(BCUBED_F1, Double.parseDouble(m3.group(2)));
            }
            Matcher m4 = CEAFM_PATTERN.matcher(line);
            if (m4.matches()) {
                results.setCount(CEAFM_TP, Double.parseDouble(m4.group(1)));
                results.setCount(CEAFM_F1, Double.parseDouble(m4.group(2)));
            }
            Matcher m5 = CEAFE_PATTERN.matcher(line);
            if (m5.matches()) {
                results.setCount(CEAFE_TP, Double.parseDouble(m5.group(1)));
                results.setCount(CEAFE_F1, Double.parseDouble(m5.group(2)));
            }
            Matcher m6 = BLANC_PATTERN.matcher(line);
            if (m6.matches()) {
                results.setCount(BLANC_F1, Double.parseDouble(m6.group(1)));
            }
            Matcher m7 = CONLL_PATTERN.matcher(line);
            if (m7.matches()) {
                results.setCount(CONLL_SCORE, Double.parseDouble(m7.group(1)));
            }
        }
    }
    return results;
}
Also used : Matcher(java.util.regex.Matcher) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader)

Example 8 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

From the class DcorefBenchmarkSlowITest, the method getCorefResults.

/**
 * Parses a coref-scorer results dump into a Counter keyed by metric-name constants.
 * Lines that match none of the known metric patterns are skipped silently.
 *
 * @param resultsString the raw text output of the coref scorer
 * @return a Counter mapping metric keys (MENTION_TP, MUC_F1, ...) to parsed scores
 * @throws IOException propagated from readLine (will not occur for a StringReader,
 *         but kept for signature compatibility with callers)
 */
public static Counter<String> getCorefResults(String resultsString) throws IOException {
    Counter<String> results = new ClassicCounter<>();
    // try-with-resources: the original leaked the BufferedReader (never closed)
    try (BufferedReader r = new BufferedReader(new StringReader(resultsString))) {
        for (String line; (line = r.readLine()) != null; ) {
            Matcher m1 = MENTION_PATTERN.matcher(line);
            if (m1.matches()) {
                results.setCount(MENTION_TP, Double.parseDouble(m1.group(1)));
                results.setCount(MENTION_F1, Double.parseDouble(m1.group(2)));
            }
            Matcher m2 = MUC_PATTERN.matcher(line);
            if (m2.matches()) {
                results.setCount(MUC_TP, Double.parseDouble(m2.group(1)));
                results.setCount(MUC_F1, Double.parseDouble(m2.group(2)));
            }
            Matcher m3 = BCUBED_PATTERN.matcher(line);
            if (m3.matches()) {
                results.setCount(BCUBED_TP, Double.parseDouble(m3.group(1)));
                results.setCount(BCUBED_F1, Double.parseDouble(m3.group(2)));
            }
            Matcher m4 = CEAFM_PATTERN.matcher(line);
            if (m4.matches()) {
                results.setCount(CEAFM_TP, Double.parseDouble(m4.group(1)));
                results.setCount(CEAFM_F1, Double.parseDouble(m4.group(2)));
            }
            Matcher m5 = CEAFE_PATTERN.matcher(line);
            if (m5.matches()) {
                results.setCount(CEAFE_TP, Double.parseDouble(m5.group(1)));
                results.setCount(CEAFE_F1, Double.parseDouble(m5.group(2)));
            }
            Matcher m6 = BLANC_PATTERN.matcher(line);
            if (m6.matches()) {
                results.setCount(BLANC_F1, Double.parseDouble(m6.group(1)));
            }
            Matcher m7 = CONLL_PATTERN.matcher(line);
            if (m7.matches()) {
                results.setCount(CONLL_SCORE, Double.parseDouble(m7.group(1)));
            }
        }
    }
    return results;
}
Also used : Matcher(java.util.regex.Matcher) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader)

Example 9 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

From the class Treebank, the method textualSummary.

/**
   * Return various statistics about the treebank (number of sentences,
   * words, tag set, etc.).
   *
   * @param tlp The TreebankLanguagePack used to determine punctuation and an
   *            appropriate character encoding
   * @return A big string for human consumption describing the treebank
   */
public String textualSummary(TreebankLanguagePack tlp) {
    // Two passes: (1) walk every tree and accumulate counts/examples;
    // (2) render a human-readable report into a StringWriter.
    int numTrees = 0;
    int numTreesLE40 = 0;
    int numNonUnaryRoots = 0;
    Tree nonUnaryEg = null;
    ClassicCounter<Tree> nonUnaries = new ClassicCounter<>();
    ClassicCounter<String> roots = new ClassicCounter<>();
    ClassicCounter<String> starts = new ClassicCounter<>();
    ClassicCounter<String> puncts = new ClassicCounter<>();
    int numUnenclosedLeaves = 0;
    int numLeaves = 0;
    int numNonPhrasal = 0;
    int numPreTerminalWithMultipleChildren = 0;
    int numWords = 0;
    int numTags = 0;
    int shortestSentence = Integer.MAX_VALUE;
    int longestSentence = 0;
    int numNullLabel = 0;
    Set<String> words = Generics.newHashSet();
    ClassicCounter<String> tags = new ClassicCounter<>();
    ClassicCounter<String> cats = new ClassicCounter<>();
    // first-encountered examples of each anomaly, for the report
    Tree leafEg = null;
    Tree preTerminalMultipleChildrenEg = null;
    Tree nullLabelEg = null;
    Tree rootRewritesAsTaggedWordEg = null;
    // Pass 1: per-tree statistics
    for (Tree t : this) {
        roots.incrementCount(t.value());
        numTrees++;
        int leng = t.yield().size();
        // 40-word cutoff — presumably the conventional parser-evaluation limit; confirm
        if (leng <= 40) {
            numTreesLE40++;
        }
        if (leng < shortestSentence) {
            shortestSentence = leng;
        }
        if (leng > longestSentence) {
            longestSentence = leng;
        }
        if (t.numChildren() > 1) {
            if (numNonUnaryRoots == 0) {
                nonUnaryEg = t;
            }
            // only record the first 100 offending local trees to bound the report
            if (numNonUnaryRoots < 100) {
                nonUnaries.incrementCount(t.localTree());
            }
            numNonUnaryRoots++;
        } else if (t.isLeaf()) {
            numUnenclosedLeaves++;
        } else {
            Tree t2 = t.firstChild();
            if (t2.isLeaf()) {
                numLeaves++;
                leafEg = t;
            } else if (t2.isPreTerminal()) {
                if (numNonPhrasal == 0) {
                    rootRewritesAsTaggedWordEg = t;
                }
                numNonPhrasal++;
            }
            starts.incrementCount(t2.value());
        }
        // per-node statistics; note this REPAIRS null/empty labels in place
        for (Tree subtree : t) {
            Label lab = subtree.label();
            if (lab == null || lab.value() == null || "".equals(lab.value())) {
                if (numNullLabel == 0) {
                    nullLabelEg = subtree;
                }
                numNullLabel++;
                if (lab == null) {
                    subtree.setLabel(new StringLabel(""));
                } else if (lab.value() == null) {
                    subtree.label().setValue("");
                }
            }
            if (subtree.isLeaf()) {
                numWords++;
                words.add(subtree.value());
            } else if (subtree.isPreTerminal()) {
                numTags++;
                tags.incrementCount(subtree.value());
                if (tlp != null && tlp.isPunctuationTag(subtree.value())) {
                    puncts.incrementCount(subtree.firstChild().value());
                }
            } else if (subtree.isPhrasal()) {
                // a phrasal node with a bare-leaf child is a malformed preterminal
                boolean hasLeafChild = false;
                for (Tree kt : subtree.children()) {
                    if (kt.isLeaf()) {
                        hasLeafChild = true;
                    }
                }
                if (hasLeafChild) {
                    numPreTerminalWithMultipleChildren++;
                    if (preTerminalMultipleChildrenEg == null) {
                        preTerminalMultipleChildrenEg = subtree;
                    }
                }
                cats.incrementCount(subtree.value());
            } else {
                throw new IllegalStateException("Treebank: Bad tree in treebank!: " + subtree);
            }
        }
    }
    // Pass 2: format the report
    StringWriter sw = new StringWriter(2000);
    PrintWriter pw = new PrintWriter(sw);
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(0);
    pw.println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
    if (numTrees > 0) {
        if (numTags != numWords) {
            pw.println("  Warning! numTags differs and is " + numTags);
        }
        if (roots.size() == 1) {
            String root = (String) roots.keySet().toArray()[0];
            pw.println("  The root category is: " + root);
        } else {
            pw.println("  Warning! " + roots.size() + " different roots in treebank: " + Counters.toString(roots, nf));
        }
        if (numNonUnaryRoots > 0) {
            pw.print("  Warning! " + numNonUnaryRoots + " trees without unary initial rewrite.  ");
            if (numNonUnaryRoots > 100) {
                pw.print("First 100 ");
            }
            pw.println("Rewrites: " + Counters.toString(nonUnaries, nf));
            pw.println("    Example: " + nonUnaryEg);
        }
        if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0) {
            pw.println("  Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
            if (numLeaves > 0) {
                pw.println("  Example bad root rewrites as leaf: " + leafEg);
            }
            if (numNonPhrasal > 0) {
                pw.println("  Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
            }
        }
        if (numNullLabel > 0) {
            pw.println("  Warning!  " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
            pw.println("    " + nullLabelEg);
        }
        if (numPreTerminalWithMultipleChildren > 0) {
            pw.println("  Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
            pw.println("    Example: " + preTerminalMultipleChildrenEg);
        }
        // integer arithmetic deliberately truncates to 2 decimal places
        pw.println("  Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
        pw.println("  " + cats.size() + " phrasal category types, " + tags.size() + " tag types, and " + words.size() + " word types");
        String[] empties = { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
        // What a dopey choice using 0 as an empty element name!!
        // The problem with the below is that words aren't turned into a basic
        // category, but empties commonly are indexed....  Would need to look
        // for them with a suffix of -[0-9]+
        Set<String> knownEmpties = Generics.newHashSet(Arrays.asList(empties));
        Set<String> emptiesIntersection = Sets.intersection(words, knownEmpties);
        if (!emptiesIntersection.isEmpty()) {
            pw.println("  Caution! " + emptiesIntersection.size() + " word types are known empty elements: " + emptiesIntersection);
        }
        Set<String> joint = Sets.intersection(cats.keySet(), tags.keySet());
        if (!joint.isEmpty()) {
            pw.println("  Warning! " + joint.size() + " items are tags and categories: " + joint);
        }
        // '@' marks internal binarization states in the Stanford Parser, so reject it in data
        for (String cat : cats.keySet()) {
            if (cat != null && cat.contains("@")) {
                pw.println("  Warning!!  Stanford Parser does not work with categories containing '@' like: " + cat);
                break;
            }
        }
        for (String cat : tags.keySet()) {
            if (cat != null && cat.contains("@")) {
                pw.println("  Warning!!  Stanford Parser does not work with tags containing '@' like: " + cat);
                break;
            }
        }
        pw.println("    Cats: " + Counters.toString(cats, nf));
        pw.println("    Tags: " + Counters.toString(tags, nf));
        pw.println("    " + starts.size() + " start categories: " + Counters.toString(starts, nf));
        if (!puncts.isEmpty()) {
            pw.println("    Puncts: " + Counters.toString(puncts, nf));
        }
    }
    return sw.toString();
}
Also used : StringLabel(edu.stanford.nlp.ling.StringLabel) Label(edu.stanford.nlp.ling.Label) StringLabel(edu.stanford.nlp.ling.StringLabel) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) NumberFormat(java.text.NumberFormat)

Example 10 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

From the class SimpleSentiment, the method featurize.

/**
   * Featurize a given sentence.
   *
   * @param sentence The sentence to featurize.
   *
   * @return A counter encoding the featurized sentence.
   */
/**
 * Featurize a given sentence into unigram and bigram lemma counts.
 * Numeric lemmas collapse to the "**num**" bucket; bigrams are only
 * formed over alphabetic lemmas, bracketed by "^" (start) and "$" (end).
 *
 * @param sentence The sentence to featurize.
 * @return A counter encoding the featurized sentence.
 */
private static Counter<String> featurize(CoreMap sentence) {
    ClassicCounter<String> counts = new ClassicCounter<>();
    String prev = "^";  // sentinel for the sentence-start bigram
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        String lemma = token.lemma().toLowerCase();
        // unigram feature: numbers share one bucket, everything else keeps its lemma
        counts.incrementCount(number.matcher(lemma).matches() ? "**num**" : lemma);
        // bigram feature over alphabetic lemmas only
        if (alpha.matcher(lemma).matches()) {
            counts.incrementCount(prev + "__" + lemma);
            prev = lemma;
        }
    }
    // closing bigram against the sentence-end sentinel
    counts.incrementCount(prev + "__$");
    return counts;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)

Aggregations

ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)69 CoreLabel (edu.stanford.nlp.ling.CoreLabel)27 ArrayList (java.util.ArrayList)21 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)18 Tree (edu.stanford.nlp.trees.Tree)13 Pair (edu.stanford.nlp.util.Pair)11 Counter (edu.stanford.nlp.stats.Counter)10 List (java.util.List)10 Mention (edu.stanford.nlp.coref.data.Mention)8 Language (edu.stanford.nlp.international.Language)7 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)7 CoreMap (edu.stanford.nlp.util.CoreMap)7 IOUtils (edu.stanford.nlp.io.IOUtils)6 Label (edu.stanford.nlp.ling.Label)6 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)6 PrintWriter (java.io.PrintWriter)6 java.util (java.util)6 HashSet (java.util.HashSet)6 RVFDatum (edu.stanford.nlp.ling.RVFDatum)5 DiskTreebank (edu.stanford.nlp.trees.DiskTreebank)5