
Example 1 with Language

Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.

The class TsarfatyEval, method main.

/**
   * Run the scoring metric on guess/gold input. This method performs "Collinization." 
   * The default language is English.
   * 
   * @param args
   */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    int maxGuessYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    boolean skipGuess = false;
    boolean tagMode = false;
    String guessFile = null;
    String goldFile = null;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-y":
                    maxGoldYield = Integer.parseInt(args[++i].trim());
                    break;
                case "-t":
                    tagMode = true;
                    break;
                case "-v":
                    VERBOSE = true;
                    break;
                case "-g":
                    maxGuessYield = Integer.parseInt(args[++i].trim());
                    skipGuess = true;
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            //Required parameters
            goldFile = args[i++];
            guessFile = args[i];
            break;
        }
    }
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
    final TreeTransformer tc = tlpp.collinizer();
    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    for (final Tree guess : guessTreebank) {
        final Tree evalGuess = tc.transformTree(guess);
        final ArrayList<Label> guessSent = guess.yield();
        final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
        if (guessSent.size() > maxGuessYield) {
            skippedGuessTrees++;
            continue;
        }
        boolean doneEval = false;
        while (goldItr.hasNext() && !doneEval) {
            final Tree gold = goldItr.next();
            final Tree evalGold = tc.transformTree(gold);
            goldLineId++;
            final ArrayList<Label> goldSent = gold.yield();
            final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
            if (goldSent.size() > maxGoldYield) {
                continue;
            } else if (goldChars.length() != guessChars.length()) {
                pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
                skippedGuessTrees++;
                //Default evalb behavior -- skip this guess tree
                break;
            }
            eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
            //Move to the next guess parse
            doneEval = true;
        }
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    eval.display(true, pwOut);
    pwOut.println();
    pwOut.close();
}
Also used: Treebank (edu.stanford.nlp.trees.Treebank), Label (edu.stanford.nlp.ling.Label), TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams), EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams), Language (edu.stanford.nlp.international.Language), Tree (edu.stanford.nlp.trees.Tree), TreeTransformer (edu.stanford.nlp.trees.TreeTransformer), PrintWriter (java.io.PrintWriter)
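The pattern that recurs across these examples is mapping a Language enum value to its TreebankLangParserParams and running the resulting collinizer over trees before comparison. Below is a minimal, self-contained sketch of that pattern; the language name and the treebank path ("trees/gold") are placeholders for illustration, not values taken from the example above.

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.trees.TreeTransformer;

public class CollinizeDemo {
    public static void main(String[] args) {
        // Map a language name to its parser parameters, as the "-l" option does above.
        Language lang = Language.valueOf("Arabic");
        TreebankLangParserParams tlpp = lang.params;
        // Load a treebank from disk; "trees/gold" is a placeholder path.
        Treebank treebank = tlpp.diskTreebank();
        treebank.loadPath("trees/gold");
        // Collinize each tree (strip punctuation and functional annotation) before scoring.
        TreeTransformer collinizer = tlpp.collinizer();
        for (Tree tree : treebank) {
            Tree evalTree = collinizer.transformTree(tree);
            System.out.println(evalTree);
        }
    }
}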

Example 2 with Language

Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.

The class CollinsDepEval, method main.

/**
   * 
   * @param args
   */
public static void main(String[] args) {
    if (args.length < MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
    Language LANGUAGE = PropertiesUtils.get(options, "l", Language.English, Language.class);
    int MAX_GOLD_YIELD = PropertiesUtils.getInt(options, "g", Integer.MAX_VALUE);
    int MAX_GUESS_YIELD = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    File goldFile = new File(parsedArgs[0]);
    File guessFile = new File(parsedArgs[1]);
    final TreebankLangParserParams tlpp = LANGUAGE.params;
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final CollinsDepEval depEval = new CollinsDepEval("CollinsDep", true, tlpp.headFinder(), tlpp.treebankLanguagePack().startSymbol());
    final TreeTransformer tc = tlpp.collinizer();
    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    for (final Tree guess : guessTreebank) {
        final Tree evalGuess = tc.transformTree(guess);
        if (guess.yield().size() > MAX_GUESS_YIELD) {
            skippedGuessTrees++;
            continue;
        }
        boolean doneEval = false;
        while (goldItr.hasNext() && !doneEval) {
            final Tree gold = goldItr.next();
            final Tree evalGold = tc.transformTree(gold);
            goldLineId++;
            if (gold.yield().size() > MAX_GOLD_YIELD) {
                continue;
            } else if (evalGold.yield().size() != evalGuess.yield().size()) {
                pwOut.println("Yield mismatch at gold line " + goldLineId);
                skippedGuessTrees++;
                //Default evalb behavior -- skip this guess tree
                break;
            }
            depEval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
            //Move to the next guess parse
            doneEval = true;
        }
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", ((MAX_GUESS_YIELD < Integer.MAX_VALUE) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    depEval.display(true, pwOut);
    pwOut.close();
}
Also used: Treebank (edu.stanford.nlp.trees.Treebank), TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams), Properties (java.util.Properties), Language (edu.stanford.nlp.international.Language), Tree (edu.stanford.nlp.trees.Tree), File (java.io.File), TreeTransformer (edu.stanford.nlp.trees.TreeTransformer), PrintWriter (java.io.PrintWriter)
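Unlike the first example, CollinsDepEval parses its flags through StringUtils.argsToProperties plus PropertiesUtils rather than a hand-rolled loop. A minimal sketch of that option-parsing idiom follows; the flag-to-arity map stands in for whatever optionArgDefs() returns in the real class, and the flag names here are illustrative only.

import java.util.Map;
import java.util.Properties;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

public class OptionParsingDemo {
    public static void main(String[] args) {
        // Flag name -> number of arguments the flag consumes (hypothetical definitions).
        Map<String, Integer> argDefs = Generics.newHashMap();
        argDefs.put("v", 0);
        argDefs.put("l", 1);
        Properties options = StringUtils.argsToProperties(args, argDefs);
        boolean verbose = PropertiesUtils.getBool(options, "v", false);
        // Typed enum lookup with a default, as in the "-l" option above.
        Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
        // Positional arguments are collected under the empty-string property.
        String[] positional = options.getProperty("", "").split("\\s+");
        System.out.printf("language=%s verbose=%b positional=%d%n", language, verbose, positional.length);
    }
}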

Example 3 with Language

Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.

The class TreebankFactoredLexiconStats, method main.

//  private static String stripTag(String tag) {
//    if (tag.startsWith("DT")) {
//      String newTag = tag.substring(2, tag.length());
//      return newTag.length() > 0 ? newTag : tag;
//    }
//    return tag;
//  }
/**
   * @param args
   */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.printf("Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName());
        System.exit(-1);
    }
    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = language.params;
    if (language.equals(Language.Arabic)) {
        String[] options = { "-arabicFactored" };
        tlpp.setOptionFlag(options, 0);
    } else {
        String[] options = { "-frenchFactored" };
        tlpp.setOptionFlag(options, 0);
    }
    Treebank tb = tlpp.diskTreebank();
    tb.loadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    String[] features = args[2].trim().split(",");
    for (String feature : features) {
        morphoSpec.activate(MorphoFeatureType.valueOf(feature));
    }
    // Counters
    Counter<String> wordTagCounter = new ClassicCounter<>(30000);
    Counter<String> morphTagCounter = new ClassicCounter<>(500);
    //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
    Counter<String> morphCounter = new ClassicCounter<>(500);
    Counter<String> wordCounter = new ClassicCounter<>(30000);
    Counter<String> tagCounter = new ClassicCounter<>(300);
    Counter<String> lemmaCounter = new ClassicCounter<>(25000);
    Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);
    Counter<String> richTagCounter = new ClassicCounter<>(1000);
    Counter<String> reducedTagCounter = new ClassicCounter<>(500);
    Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);
    Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();
    TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000);
    TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500);
    TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300);
    int numTrees = 0;
    for (Tree tree : tb) {
        for (Tree subTree : tree) {
            if (!subTree.isLeaf()) {
                tlpp.transformTree(subTree, tree);
            }
        }
        List<Label> pretermList = tree.preTerminalYield();
        List<Label> yield = tree.yield();
        assert yield.size() == pretermList.size();
        int yieldLen = yield.size();
        for (int i = 0; i < yieldLen; ++i) {
            String tag = pretermList.get(i).value();
            String word = yield.get(i).value();
            String morph = ((CoreLabel) yield.get(i)).originalText();
            // Note: if there is no lemma, then we use the surface form.
            Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
            String lemma = lemmaTag.first();
            String richTag = lemmaTag.second();
            // WSGDEBUG
            if (tag.contains("MW"))
                lemma += "-MWE";
            lemmaCounter.incrementCount(lemma);
            lemmaTagCounter.incrementCount(lemma + tag);
            richTagCounter.incrementCount(richTag);
            String reducedTag = morphoSpec.strToFeatures(richTag).toString();
            reducedTagCounter.incrementCount(reducedTag);
            reducedTagLemmaCounter.incrementCount(reducedTag + lemma);
            wordTagCounter.incrementCount(word + tag);
            morphTagCounter.incrementCount(morph + tag);
            morphCounter.incrementCount(morph);
            wordCounter.incrementCount(word);
            tagCounter.incrementCount(tag);
            reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            if (wordLemmaMap.containsKey(word)) {
                wordLemmaMap.get(word).add(lemma);
            } else {
                Set<String> lemmas = Generics.newHashSet(1);
                // Record the first lemma observed for this word.
                lemmas.add(lemma);
                wordLemmaMap.put(word, lemmas);
            }
            lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
            reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.incrementCount(tag, reducedTag);
        }
        ++numTrees;
    }
    // Barf...
    System.out.println("Language: " + language.toString());
    System.out.printf("#trees:\t%d%n", numTrees);
    System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
    System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
    System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
    System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
    System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
    System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
    System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
    System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
    System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
    System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
    System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());
    // Extra
    System.out.println("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
        String word = wordLemmas.getKey();
        Set<String> lemmas = wordLemmas.getValue();
        if (lemmas.size() == 0) {
            sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.size() > 1) {
            sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
            continue;
        }
        String lemma = lemmas.iterator().next();
        Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
        if (reducedTags.size() > 1) {
            System.out.printf("%s --> %s%n", word, lemma);
            for (String reducedTag : reducedTags) {
                int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
                String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
                System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.out.println();
        }
    }
    System.out.println("==================");
    System.out.println(sbNoLemma.toString());
    System.out.println(sbMultLemmas.toString());
    System.out.println("==================");
    List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
    Collections.sort(tags);
    for (String tag : tags) {
        System.out.println(tag);
        Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
        for (String reducedTag : reducedTags) {
            int count = tagReducedTagCounter.getCount(tag, reducedTag);
            //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.out.printf("\t%s\t%d%n", reducedTag, count);
        }
        System.out.println();
    }
    System.out.println("==================");
}
Also used: Set (java.util.Set), Treebank (edu.stanford.nlp.trees.Treebank), CoreLabel (edu.stanford.nlp.ling.CoreLabel), Label (edu.stanford.nlp.ling.Label), ArrayList (java.util.ArrayList), TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams), Language (edu.stanford.nlp.international.Language), Tree (edu.stanford.nlp.trees.Tree), MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification), ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification), FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification), TwoDimensionalIntCounter (edu.stanford.nlp.stats.TwoDimensionalIntCounter), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), Map (java.util.Map)
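The morphological bookkeeping above hinges on MorphoFeatureSpecification: activate the feature types of interest, split each word/morph pair into a lemma and a rich tag, then reduce the rich tag to only the activated features. A small sketch of that flow follows; the feature names ("GEN", "NUM") and the sample word/morph pair are assumptions for illustration, not values taken from an actual treebank.

import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.util.Pair;

public class MorphoSpecDemo {
    public static void main(String[] args) {
        MorphoFeatureSpecification spec = new FrenchMorphoFeatureSpecification();
        // Activate the features to keep in the reduced tag, as the "features" argument does above.
        // "GEN" and "NUM" are assumed to be valid MorphoFeatureType names.
        for (String feature : "GEN,NUM".split(",")) {
            spec.activate(MorphoFeatureType.valueOf(feature));
        }
        // Split a (word, morph annotation) pair into lemma and rich tag;
        // if the annotation carries no lemma, the surface form is used.
        String word = "chats";
        // Hypothetical morph annotation; real treebanks supply this per token.
        String morph = "chat";
        Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
        String lemma = lemmaTag.first();
        String richTag = lemmaTag.second();
        // Project the rich tag onto the activated features only.
        String reducedTag = spec.strToFeatures(richTag).toString();
        System.out.printf("word=%s lemma=%s richTag=%s reducedTag=%s%n", word, lemma, richTag, reducedTag);
    }
}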

Example 4 with Language

Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.

The class TreebankStats, method main.

/**
   *
   * @param args
   */
public static void main(String[] args) {
    if (args.length < MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, optArgDefs());
    String splitPrefix = options.getProperty("s", null);
    boolean SHOW_WORDS = PropertiesUtils.getBool(options, "w", false);
    boolean pathsAreFiles = PropertiesUtils.getBool(options, "f", false);
    boolean SHOW_OOV = PropertiesUtils.getBool(options, "o", false);
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    Language language = Language.valueOf(parsedArgs[0]);
    List<String> corpusPaths = new ArrayList<>(parsedArgs.length - 1);
    for (int i = 1; i < parsedArgs.length; ++i) {
        corpusPaths.add(parsedArgs[i]);
    }
    TreebankLangParserParams tlpp = language.params;
    TreebankStats cs = new TreebankStats(language, corpusPaths, tlpp);
    if (splitPrefix != null) {
        if (!cs.useSplit(splitPrefix))
            log.info("Could not load split!");
    }
    cs.run(pathsAreFiles, SHOW_WORDS, SHOW_OOV);
}
Also used: Language (edu.stanford.nlp.international.Language), ArrayList (java.util.ArrayList), TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams), Properties (java.util.Properties)
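Since all of these tools start from a language name supplied on the command line, it can help to see which names the Language enum actually accepts and which TreebankLangParserParams each maps to. The sketch below simply enumerates them; the null guard is defensive, on the assumption that some constants might not be wired to a params instance.

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;

public class ListLanguagesDemo {
    public static void main(String[] args) {
        // Print each Language constant and the parser-parameter class it is wired to.
        for (Language lang : Language.values()) {
            TreebankLangParserParams params = lang.params;
            String paramsName = (params == null) ? "(no params)" : params.getClass().getName();
            System.out.printf("%-12s -> %s%n", lang, paramsName);
        }
    }
}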

Example 5 with Language

Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.

The class RuleBranchingFactor, method main.

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage);
        System.exit(-1);
    }
    // Process command-line options
    Properties options = StringUtils.argsToProperties(args, optionArgDefinitions);
    String fileName = options.getProperty("");
    if (fileName == null || fileName.equals("")) {
        System.out.println(usage);
        System.exit(-1);
    }
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);
    // Statistics
    Counter<String> binaryRuleTypes = new ClassicCounter<>(20000);
    List<Integer> branchingFactors = new ArrayList<>(20000);
    int nTrees = 0;
    int nUnaryRules = 0;
    int nBinaryRules = 0;
    int binaryBranchingFactors = 0;
    // Read the treebank
    PrintWriter pw = tlpp.pw();
    for (Tree tree : tb) {
        if (tree.value().equals("ROOT")) {
            tree = tree.firstChild();
        }
        ++nTrees;
        for (Tree subTree : tree) {
            if (subTree.isPhrasal()) {
                if (subTree.numChildren() > 1) {
                    ++nBinaryRules;
                    branchingFactors.add(subTree.numChildren());
                    binaryBranchingFactors += subTree.numChildren();
                    binaryRuleTypes.incrementCount(treeToRuleString(subTree));
                } else {
                    ++nUnaryRules;
                }
            }
        }
    }
    double mean = (double) binaryBranchingFactors / (double) nBinaryRules;
    System.out.printf("#trees:\t%d%n", nTrees);
    System.out.printf("#binary:\t%d%n", nBinaryRules);
    System.out.printf("#binary types:\t%d%n", binaryRuleTypes.keySet().size());
    System.out.printf("mean branching:\t%.4f%n", mean);
    System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branchingFactors, mean));
    System.out.printf("rule entropy:\t%.5f%n", Counters.entropy(binaryRuleTypes));
    System.out.printf("#unaries:\t%d%n", nUnaryRules);
}
Also used: DiskTreebank (edu.stanford.nlp.trees.DiskTreebank), ArrayList (java.util.ArrayList), TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams), Properties (java.util.Properties), Language (edu.stanford.nlp.international.Language), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), Tree (edu.stanford.nlp.trees.Tree), PrintWriter (java.io.PrintWriter)
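The summary statistics at the end rely on a standardDeviation helper that is elided from this snippet. A plausible, hypothetical version is sketched below, computing the population standard deviation of the branching factors around the supplied mean; the real helper in RuleBranchingFactor may differ (for example, by using the sample variance).

import java.util.Arrays;
import java.util.List;

public class BranchingStatsDemo {
    // Hypothetical reimplementation of the elided standardDeviation(List<Integer>, double) helper.
    private static double standardDeviation(List<Integer> values, double mean) {
        double sumSq = 0.0;
        for (int v : values) {
            double diff = v - mean;
            sumSq += diff * diff;
        }
        return Math.sqrt(sumSq / values.size());
    }

    public static void main(String[] args) {
        // Toy branching factors; real values come from counting children of phrasal nodes.
        List<Integer> branchingFactors = Arrays.asList(2, 2, 3, 2, 4);
        double mean = branchingFactors.stream().mapToInt(Integer::intValue).sum() / (double) branchingFactors.size();
        System.out.printf("mean branching:\t%.4f%n", mean);
        System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branchingFactors, mean));
    }
}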

Aggregations

Language (edu.stanford.nlp.international.Language): 15
TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams): 14
Tree (edu.stanford.nlp.trees.Tree): 14
PrintWriter (java.io.PrintWriter): 12
ArrayList (java.util.ArrayList): 8
Label (edu.stanford.nlp.ling.Label): 7
EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams): 7
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 7
DiskTreebank (edu.stanford.nlp.trees.DiskTreebank): 7
Treebank (edu.stanford.nlp.trees.Treebank): 7
Properties (java.util.Properties): 6
TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 5
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 4
Map (java.util.Map): 3
ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification): 2
FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification): 2
MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification): 2
HashIndex (edu.stanford.nlp.util.HashIndex): 2
Lexicon (edu.stanford.nlp.parser.lexparser.Lexicon): 1
Options (edu.stanford.nlp.parser.lexparser.Options): 1