Search in sources :

Example 1 with EnglishTreebankParserParams

use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.

the class TsarfatyEval method main.

/**
   * Run the scoring metric on guess/gold input. This method performs "Collinization." 
   * The default language is English.
   * 
   * @param args
   */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    int maxGuessYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    boolean skipGuess = false;
    boolean tagMode = false;
    String guessFile = null;
    String goldFile = null;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-y":
                    maxGoldYield = Integer.parseInt(args[++i].trim());
                    break;
                case "-t":
                    tagMode = true;
                    break;
                case "-v":
                    VERBOSE = true;
                    break;
                case "-g":
                    maxGuessYield = Integer.parseInt(args[++i].trim());
                    skipGuess = true;
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            //Required parameters
            goldFile = args[i++];
            guessFile = args[i];
            break;
        }
    }
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
    final TreeTransformer tc = tlpp.collinizer();
    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    for (final Tree guess : guessTreebank) {
        final Tree evalGuess = tc.transformTree(guess);
        final ArrayList<Label> guessSent = guess.yield();
        final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
        if (guessSent.size() > maxGuessYield) {
            skippedGuessTrees++;
            continue;
        }
        boolean doneEval = false;
        while (goldItr.hasNext() && !doneEval) {
            final Tree gold = goldItr.next();
            final Tree evalGold = tc.transformTree(gold);
            goldLineId++;
            final ArrayList<Label> goldSent = gold.yield();
            final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
            if (goldSent.size() > maxGoldYield) {
                continue;
            } else if (goldChars.length() != guessChars.length()) {
                pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
                skippedGuessTrees++;
                //Default evalb behavior -- skip this guess tree
                break;
            }
            eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
            //Move to the next guess parse
            doneEval = true;
        }
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    eval.display(true, pwOut);
    pwOut.println();
    pwOut.close();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 2 with EnglishTreebankParserParams

use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.

the class PunctFrequencyDist method main.

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    String puncTag = null;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            puncTag = args[i++];
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    Counter<String> puncTypes = new ClassicCounter<>();
    for (Tree t : tb) {
        List<CoreLabel> yield = t.taggedLabeledYield();
        for (CoreLabel word : yield) if (word.tag().equals(puncTag))
            puncTypes.incrementCount(word.word());
    }
    List<String> biggestKeys = new ArrayList<>(puncTypes.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(puncTypes));
    PrintWriter pw = tlpp.pw();
    for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) puncTypes.getCount(wordType));
    pw.close();
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Example 3 with EnglishTreebankParserParams

use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.

the class UNKPrinter method main.

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    Language lang = Language.English;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    PrintWriter pw = tlpp.pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    if (lang == Language.French) {
        lexOptions.useUnknownWordSignatures = 1;
        lexOptions.smartMutation = false;
        lexOptions.unknownSuffixSize = 2;
        lexOptions.unknownPrefixSize = 1;
    } else if (lang == Language.Arabic) {
        lexOptions.smartMutation = false;
        lexOptions.useUnknownWordSignatures = 9;
        lexOptions.unknownPrefixSize = 1;
        lexOptions.unknownSuffixSize = 1;
    }
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
    int computeAfter = (int) (0.50 * tb.size());
    Counter<String> vocab = new ClassicCounter<>();
    Counter<String> unkCounter = new ClassicCounter<>();
    int treeId = 0;
    for (Tree t : tb) {
        List<Label> yield = t.yield();
        int posId = 0;
        for (Label word : yield) {
            vocab.incrementCount(word.value());
            if (treeId > computeAfter && vocab.getCount(word.value()) < 2.0)
                //          if(lex.getUnknownWordModel().getSignature(word.value(), posId++).equals("UNK"))
                //            pw.println(word.value());
                unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(word.value(), posId++));
        }
        treeId++;
    }
    List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));
    for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
    pw.close();
    pw.close();
}
Also used : Options(edu.stanford.nlp.parser.lexparser.Options) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Lexicon(edu.stanford.nlp.parser.lexparser.Lexicon) Label(edu.stanford.nlp.ling.Label) ArrayList(java.util.ArrayList) HashIndex(edu.stanford.nlp.util.HashIndex) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Example 4 with EnglishTreebankParserParams

use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.

the class VocabFrequency method main.

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    Counter<String> vocab = new ClassicCounter<>();
    for (Tree t : tb) {
        List<Label> yield = t.yield();
        for (Label word : yield) vocab.incrementCount(word.value());
    }
    List<String> biggestKeys = new ArrayList<>(vocab.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(vocab));
    PrintWriter pw = tlpp.pw();
    for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) vocab.getCount(wordType));
    pw.close();
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Label(edu.stanford.nlp.ling.Label) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Example 5 with EnglishTreebankParserParams

use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.

the class RHSFrequency method main.

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            rootMatch = TregexPattern.compile("@" + args[i++]);
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i++]);
        }
    }
    Counter<String> rhsCounter = new ClassicCounter<>();
    for (Tree t : tb) {
        TregexMatcher m = rootMatch.matcher(t);
        while (m.findNextMatchingNode()) {
            Tree match = m.getMatch();
            StringBuilder sb = new StringBuilder();
            for (Tree kid : match.children()) sb.append(kid.value()).append(" ");
            rhsCounter.incrementCount(sb.toString().trim());
        }
    }
    List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));
    PrintWriter pw = tlpp.pw();
    for (String rhs : biggestKeys) pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
    pw.close();
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) PrintWriter(java.io.PrintWriter)

Aggregations

EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams)8 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)8 Tree (edu.stanford.nlp.trees.Tree)8 Language (edu.stanford.nlp.international.Language)7 PrintWriter (java.io.PrintWriter)7 Label (edu.stanford.nlp.ling.Label)5 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)4 DiskTreebank (edu.stanford.nlp.trees.DiskTreebank)4 ArrayList (java.util.ArrayList)4 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)3 Treebank (edu.stanford.nlp.trees.Treebank)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 Map (java.util.Map)2 Lexicon (edu.stanford.nlp.parser.lexparser.Lexicon)1 Options (edu.stanford.nlp.parser.lexparser.Options)1 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)1 SemgrexMatcher (edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher)1 SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern)1 GrammaticalStructure (edu.stanford.nlp.trees.GrammaticalStructure)1 GrammaticalStructureFactory (edu.stanford.nlp.trees.GrammaticalStructureFactory)1