Search in sources :

Example 11 with Language

use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp.

the class RHSFrequency method main.

/**
 * Counts the right-hand sides (sequences of child labels) of every node matching a root label
 * in a treebank, and prints them ordered by {@code Counters.toComparatorDescending}.
 *
 * <p>Options: {@code -l <Language>} selects the language parameters (default: English) and
 * {@code -e <encoding>} sets the treebank input/output encoding (default: UTF-8). Positional
 * arguments come in pairs: a root label followed by a treebank path. The encoding is applied
 * when the first pair is seen, so options should precede the positional arguments.
 *
 * <p>Prints the usage message and exits with status -1 when an option or root label is missing
 * its value, when an unknown option is given, or when no root label / treebank pair is supplied.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
    for (int i = 0; i < args.length; i++) {
        // Every option and every root label consumes the argument that follows it, so a
        // dangling last argument is a usage error rather than an out-of-bounds access.
        if (i + 1 >= args.length) {
            System.out.println(usage.toString());
            System.exit(-1);
            return;
        }
        if (args[i].startsWith("-")) {
            switch (args[i]) {
                case "-l":
                    tlpp = Language.valueOf(args[++i].trim()).params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
                    return;
            }
        } else {
            rootMatch = TregexPattern.compile("@" + args[i]);
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                    return;
                }
                tlpp.setInputEncoding(encoding);
                tlpp.setOutputEncoding(encoding);
                tb = tlpp.diskTreebank();
            }
            // Step onto the path only; the loop header then moves past it. Incrementing
            // again here would silently skip the argument that follows the path.
            tb.loadPath(args[++i]);
        }
    }
    // Without a root label and a treebank there is nothing to count.
    if (tb == null || rootMatch == null || tlpp == null) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
    }
    Counter<String> rhsCounter = new ClassicCounter<>();
    for (Tree t : tb) {
        TregexMatcher m = rootMatch.matcher(t);
        while (m.findNextMatchingNode()) {
            Tree match = m.getMatch();
            StringBuilder sb = new StringBuilder();
            for (Tree kid : match.children()) {
                sb.append(kid.value()).append(" ");
            }
            rhsCounter.incrementCount(sb.toString().trim());
        }
    }
    List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));
    // try-with-resources closes the writer even if printing fails part-way.
    try (PrintWriter pw = tlpp.pw()) {
        for (String rhs : biggestKeys) {
            pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
        }
    }
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) PrintWriter(java.io.PrintWriter)

Example 12 with Language

use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp.

the class TaggingEval method main.

/**
 * Run the scoring metric on guess/gold input. This method performs "Collinization."
 * The default language is English.
 *
 * <p>Recognized options: {@code -l} (language), {@code -y} (maximum gold yield; longer gold
 * trees are skipped), {@code -v} (verbose), {@code -c} (enables
 * {@code TaggingEval.doCatLevelEval}) and {@code -e} (input encoding, default UTF-8). The
 * non-option arguments are the gold treebank path followed by the guess treebank path.
 * Any other option, or too few non-option arguments, logs the usage message and exits
 * with status -1.
 *
 * @param args command-line options followed by the gold and guess treebank paths
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    boolean verbose = false;
    String encoding = "UTF-8";
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);
    for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
        // The null key holds the non-option arguments; they are handled after the loop.
        if (opt.getKey() == null)
            continue;
        if (opt.getKey().equals("-l")) {
            Language lang = Language.valueOf(opt.getValue()[0].trim());
            tlpp = lang.params;
        } else if (opt.getKey().equals("-y")) {
            maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
        } else if (opt.getKey().equals("-v")) {
            verbose = true;
        } else if (opt.getKey().equals("-c")) {
            TaggingEval.doCatLevelEval = true;
        } else if (opt.getKey().equals("-e")) {
            encoding = opt.getValue()[0];
        } else {
            log.info(usage.toString());
            System.exit(-1);
        }
    }
    // Non-option arguments are located at key null. This must run outside the option loop:
    // when no options are given the loop body never gets past the null-key check, which
    // previously left both file names null.
    String[] rest = argsMap.get(null);
    if (rest == null || rest.length < minArgs) {
        log.info(usage.toString());
        System.exit(-1);
        return;
    }
    final String goldFile = rest[0];
    final String guessFile = rest[1];
    tlpp.setInputEncoding(encoding);
    // try-with-resources flushes and closes the report writer even if evaluation throws.
    try (PrintWriter pwOut = tlpp.pw()) {
        final Treebank guessTreebank = tlpp.diskTreebank();
        guessTreebank.loadPath(guessFile);
        pwOut.println("GUESS TREEBANK:");
        pwOut.println(guessTreebank.textualSummary());
        final Treebank goldTreebank = tlpp.diskTreebank();
        goldTreebank.loadPath(goldFile);
        pwOut.println("GOLD TREEBANK:");
        pwOut.println(goldTreebank.textualSummary());
        final TaggingEval metric = new TaggingEval("Tagging LP/LR");
        final TreeTransformer tc = tlpp.collinizer();
        //The evalb ref implementation assigns status for each tree pair as follows:
        //
        //   0 - Ok (yields match)
        //   1 - length mismatch
        //   2 - null parse e.g. (()).
        //
        //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
        final Iterator<Tree> goldItr = goldTreebank.iterator();
        final Iterator<Tree> guessItr = guessTreebank.iterator();
        int goldLineId = 0;
        int guessLineId = 0;
        int skippedGuessTrees = 0;
        while (guessItr.hasNext() && goldItr.hasNext()) {
            Tree guessTree = guessItr.next();
            List<Label> guessYield = guessTree.yield();
            guessLineId++;
            Tree goldTree = goldItr.next();
            List<Label> goldYield = goldTree.yield();
            goldLineId++;
            // Check that we should evaluate this tree
            if (goldYield.size() > maxGoldYield) {
                skippedGuessTrees++;
                continue;
            }
            // Only trees with equal yields can be evaluated
            if (goldYield.size() != guessYield.size()) {
                pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
                skippedGuessTrees++;
                continue;
            }
            final Tree evalGuess = tc.transformTree(guessTree);
            final Tree evalGold = tc.transformTree(goldTree);
            metric.evaluate(evalGuess, evalGold, (verbose ? pwOut : null));
        }
        // One iterator still has trees left, so the two files differ in length.
        if (guessItr.hasNext() || goldItr.hasNext()) {
            System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
        }
        pwOut.println("================================================================================");
        if (skippedGuessTrees != 0)
            pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
        metric.display(true, pwOut);
        pwOut.println();
    }
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeMap(java.util.TreeMap) Map(java.util.Map) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 13 with Language

use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp.

the class UnlabeledAttachmentEval method main.

/**
 * Run the unlabeled attachment ("UAS LP/LR") scoring metric on guess/gold input.
 * The default language is English.
 *
 * <p>Recognized options: {@code -l} (language), {@code -y} (maximum gold yield; longer gold
 * trees are skipped), {@code -v} (verbose) and {@code -e} (input encoding, default UTF-8).
 * The non-option arguments are the gold treebank path followed by the guess treebank path.
 * Any other option, or too few non-option arguments, logs the usage message and exits
 * with status -1.
 *
 * @param args command-line options followed by the gold and guess treebank paths
 */
public static void main(String[] args) {
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    boolean verbose = false;
    String encoding = "UTF-8";
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);
    for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
        // The null key holds the non-option arguments; they are handled after the loop.
        if (opt.getKey() == null)
            continue;
        if (opt.getKey().equals("-l")) {
            Language lang = Language.valueOf(opt.getValue()[0].trim());
            tlpp = lang.params;
        } else if (opt.getKey().equals("-y")) {
            maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
        } else if (opt.getKey().equals("-v")) {
            verbose = true;
        } else if (opt.getKey().equals("-e")) {
            encoding = opt.getValue()[0];
        } else {
            log.info(usage.toString());
            System.exit(-1);
        }
    }
    // Non-option arguments are located at key null. This must run outside the option loop:
    // when no options are given the loop body never gets past the null-key check, which
    // previously left both file names null.
    String[] rest = argsMap.get(null);
    if (rest == null || rest.length < minArgs) {
        log.info(usage.toString());
        System.exit(-1);
        return;
    }
    final String goldFile = rest[0];
    final String guessFile = rest[1];
    tlpp.setInputEncoding(encoding);
    // try-with-resources flushes and closes the report writer even if evaluation throws.
    try (PrintWriter pwOut = tlpp.pw()) {
        final Treebank guessTreebank = tlpp.diskTreebank();
        guessTreebank.loadPath(guessFile);
        pwOut.println("GUESS TREEBANK:");
        pwOut.println(guessTreebank.textualSummary());
        final Treebank goldTreebank = tlpp.diskTreebank();
        goldTreebank.loadPath(goldFile);
        pwOut.println("GOLD TREEBANK:");
        pwOut.println(goldTreebank.textualSummary());
        final UnlabeledAttachmentEval metric = new UnlabeledAttachmentEval("UAS LP/LR", true, tlpp.headFinder());
        final TreeTransformer tc = tlpp.collinizer();
        //The evalb ref implementation assigns status for each tree pair as follows:
        //
        //   0 - Ok (yields match)
        //   1 - length mismatch
        //   2 - null parse e.g. (()).
        //
        //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
        final Iterator<Tree> goldItr = goldTreebank.iterator();
        final Iterator<Tree> guessItr = guessTreebank.iterator();
        int goldLineId = 0;
        int guessLineId = 0;
        int skippedGuessTrees = 0;
        while (guessItr.hasNext() && goldItr.hasNext()) {
            Tree guessTree = guessItr.next();
            List<Label> guessYield = guessTree.yield();
            guessLineId++;
            Tree goldTree = goldItr.next();
            List<Label> goldYield = goldTree.yield();
            goldLineId++;
            // Check that we should evaluate this tree
            if (goldYield.size() > maxGoldYield) {
                skippedGuessTrees++;
                continue;
            }
            // Only trees with equal yields can be evaluated
            if (goldYield.size() != guessYield.size()) {
                pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
                skippedGuessTrees++;
                continue;
            }
            // Both transformed trees have their leaves indexed before being scored.
            final Tree evalGuess = tc.transformTree(guessTree);
            evalGuess.indexLeaves(true);
            final Tree evalGold = tc.transformTree(goldTree);
            evalGold.indexLeaves(true);
            metric.evaluate(evalGuess, evalGold, (verbose ? pwOut : null));
        }
        // One iterator still has trees left, so the two files differ in length.
        if (guessItr.hasNext() || goldItr.hasNext()) {
            System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
        }
        pwOut.println("================================================================================");
        if (skippedGuessTrees != 0)
            pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
        metric.display(true, pwOut);
        pwOut.println();
    }
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) Map(java.util.Map) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 14 with Language

use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp.

the class FactoredLexicon method main.

/**
 * Trains a factored lexicon on a training treebank, then tags the events of a development
 * treebank with it and reports accuracy plus the share of errors per gold tag on stderr.
 *
 * <p>Exactly four arguments are required: the language name, a comma-separated list of
 * morphological feature types to activate, the training treebank path and the development
 * treebank path. Only Arabic and French are handled; any other language raises
 * {@link UnsupportedOperationException}.
 *
 * @param args language, features, train_file, dev_file
 */
public static void main(String[] args) {
    if (args.length != 4) {
        System.err.printf("Usage: java %s language features train_file dev_file%n", FactoredLexicon.class.getName());
        System.exit(-1);
    }

    // Command line options
    final Language language = Language.valueOf(args[0]);
    final TreebankLangParserParams tlpp = language.params;
    final Treebank trainTreebank = tlpp.diskTreebank();
    trainTreebank.loadPath(args[2]);
    final Treebank devTreebank = tlpp.diskTreebank();
    devTreebank.loadPath(args[3]);

    final Options options = getOptions(language);
    final MorphoFeatureSpecification morphoSpec;
    if (Language.Arabic.equals(language)) {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        tlpp.setOptionFlag(new String[] { "-arabicFactored" }, 0);
    } else if (Language.French.equals(language)) {
        morphoSpec = new FrenchMorphoFeatureSpecification();
        tlpp.setOptionFlag(new String[] { "-frenchFactored" }, 0);
    } else {
        throw new UnsupportedOperationException();
    }

    // Activate each requested morphological feature.
    for (String featureName : args[1].trim().split(",")) {
        morphoSpec.activate(MorphoFeatureType.valueOf(featureName));
    }
    System.out.println("Language: " + language.toString());
    System.out.println("Features: " + args[1]);

    // Create word and tag indices.
    // The trees are kept in a list because the training interface takes a collection.
    System.out.print("Loading training trees...");
    List<Tree> trainTrees = new ArrayList<>(19000);
    final Index<String> wordIndex = new HashIndex<>();
    final Index<String> tagIndex = new HashIndex<>();
    for (Tree tree : trainTreebank) {
        for (Tree node : tree) {
            if (node.isLeaf()) {
                continue;
            }
            tlpp.transformTree(node, tree);
        }
        trainTrees.add(tree);
    }
    System.out.printf("Done! (%d trees)%n", trainTrees.size());

    // Set up and train the lexicon.
    System.out.print("Collecting sufficient statistics for lexicon...");
    final FactoredLexicon lexicon = new FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.initializeTraining(trainTrees.size());
    lexicon.train(trainTrees, null);
    lexicon.finishTraining();
    System.out.println("Done!");
    // Drop the reference to the training trees; they are not used again.
    trainTrees = null;

    // Load the tuning set
    System.out.print("Loading tuning set...");
    final List<FactoredLexiconEvent> tuningSet = getTuningSet(devTreebank, lexicon, tlpp);
    System.out.printf("...Done! (%d events)%n", tuningSet.size());

    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    final Counter<String> errors = new ClassicCounter<>();
    for (FactoredLexiconEvent event : tuningSet) {
        final Iterator<IntTaggedWord> candidates = lexicon.ruleIteratorByWord(event.word(), event.getLoc(), event.featureStr());
        final Counter<Integer> logScores = new ClassicCounter<>();
        boolean sawRule = false;
        int goldTagId = -1;
        while (candidates.hasNext()) {
            sawRule = true;
            final IntTaggedWord candidate = candidates.next();
            if (candidate.tag() == event.tagId()) {
                log.info("GOLD-");
                goldTagId = candidate.tag();
            }
            final float score = lexicon.score(candidate, event.getLoc(), event.word(), event.featureStr());
            logScores.incrementCount(candidate.tag(), score);
        }
        if (sawRule) {
            // Score the tagging: the hypothesis is the tag with the highest accumulated score.
            final int hypTagId = Counters.argmax(logScores);
            if (hypTagId == goldTagId) {
                ++nCorrect;
            } else {
                // A negative id means the gold tag never appeared among the candidates.
                final String goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.get(goldTagId);
                errors.incrementCount(goldTag);
            }
        } else {
            System.err.printf("NO TAGGINGS: %s %s%n", event.word(), event.featureStr());
        }
        log.info();
    }

    // Output accuracy
    final double acc = (double) nCorrect / (double) tuningSet.size();
    System.err.printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.info("% of errors by type:");
    final List<String> biggestKeys = new ArrayList<>(errors.keySet());
    Collections.sort(biggestKeys, Counters.toComparator(errors, false, true));
    // The counts are normalized only after sorting; each is printed as a percentage.
    Counters.normalize(errors);
    for (String key : biggestKeys) {
        System.err.printf("%s\t%.2f%n", key, errors.getCount(key) * 100.0);
    }
}
Also used : FrenchMorphoFeatureSpecification(edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification) Treebank(edu.stanford.nlp.trees.Treebank) ArrayList(java.util.ArrayList) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) MorphoFeatureSpecification(edu.stanford.nlp.international.morph.MorphoFeatureSpecification) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) FrenchMorphoFeatureSpecification(edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification) HashIndex(edu.stanford.nlp.util.HashIndex) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter)

Example 15 with Language

use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp.

the class Evalb method main.

/**
 * Run the Evalb scoring metric on guess/gold input. The default language is English.
 *
 * <p>Options are read as properties: {@code l} (language), {@code y} (maximum gold yield;
 * longer gold trees are skipped), {@code v} (verbose), {@code s} (store tree pairs by F1 and
 * emit them; its integer value is passed to {@code emitSortedTrees}), {@code c} (also run
 * {@code EvalbByCat}), {@code f} (label regex handed to {@code EvalbByCat}) and {@code e}
 * (input encoding, default UTF-8). The remaining arguments must be exactly the gold treebank
 * path followed by the guess treebank path; otherwise the usage message is logged and the
 * program exits with status -1. The remaining arguments are split on whitespace.
 *
 * @param args command-line options followed by the gold and guess treebank paths
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        log.info(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    final TreebankLangParserParams tlpp = language.params;
    final int maxGoldYield = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
    final boolean verbose = PropertiesUtils.getBool(options, "v", false);
    final boolean sortByF1 = PropertiesUtils.hasProperty(options, "s");
    int worstKTreesToEmit = PropertiesUtils.getInt(options, "s", 0);
    // The queue is only allocated when sorting by F1 was requested.
    PriorityQueue<Triple<Double, Tree, Tree>> queue = sortByF1 ? new PriorityQueue<>(2000, new F1Comparator()) : null;
    boolean doCatLevel = PropertiesUtils.getBool(options, "c", false);
    String labelRegex = options.getProperty("f", null);
    String encoding = options.getProperty("e", "UTF-8");
    // Non-option arguments are stored under the empty key.
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != minArgs) {
        log.info(usage());
        System.exit(-1);
    }
    String goldFile = parsedArgs[0];
    String guessFile = parsedArgs[1];
    // Command-line has been parsed. Configure the metric for evaluation.
    tlpp.setInputEncoding(encoding);
    // try-with-resources flushes and closes the report writer even if evaluation throws.
    try (PrintWriter pwOut = tlpp.pw()) {
        final Treebank guessTreebank = tlpp.diskTreebank();
        guessTreebank.loadPath(guessFile);
        pwOut.println("GUESS TREEBANK:");
        pwOut.println(guessTreebank.textualSummary());
        final Treebank goldTreebank = tlpp.diskTreebank();
        goldTreebank.loadPath(goldFile);
        pwOut.println("GOLD TREEBANK:");
        pwOut.println(goldTreebank.textualSummary());
        final Evalb metric = new Evalb("Evalb LP/LR", true);
        final EvalbByCat evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
        final TreeTransformer tc = tlpp.collinizer();
        //The evalb ref implementation assigns status for each tree pair as follows:
        //
        //   0 - Ok (yields match)
        //   1 - length mismatch
        //   2 - null parse e.g. (()).
        //
        //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
        final Iterator<Tree> goldItr = goldTreebank.iterator();
        final Iterator<Tree> guessItr = guessTreebank.iterator();
        int goldLineId = 0;
        int guessLineId = 0;
        int skippedGuessTrees = 0;
        while (guessItr.hasNext() && goldItr.hasNext()) {
            Tree guessTree = guessItr.next();
            List<Label> guessYield = guessTree.yield();
            guessLineId++;
            Tree goldTree = goldItr.next();
            List<Label> goldYield = goldTree.yield();
            goldLineId++;
            // Check that we should evaluate this tree
            if (goldYield.size() > maxGoldYield) {
                skippedGuessTrees++;
                continue;
            }
            // Only trees with equal yields can be evaluated
            if (goldYield.size() != guessYield.size()) {
                pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
                skippedGuessTrees++;
                continue;
            }
            final Tree evalGuess = tc.transformTree(guessTree);
            final Tree evalGold = tc.transformTree(goldTree);
            metric.evaluate(evalGuess, evalGold, (verbose ? pwOut : null));
            if (doCatLevel)
                evalbCat.evaluate(evalGuess, evalGold, (verbose ? pwOut : null));
            // The untransformed trees are stored, keyed by the F1 of the pair just scored.
            if (sortByF1)
                storeTrees(queue, guessTree, goldTree, metric.getLastF1());
        }
        // One iterator still has trees left, so the two files differ in length.
        if (guessItr.hasNext() || goldItr.hasNext()) {
            System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
        }
        pwOut.println("================================================================================");
        if (skippedGuessTrees != 0)
            pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
        metric.display(true, pwOut);
        pwOut.println();
        if (doCatLevel) {
            evalbCat.display(true, pwOut);
            pwOut.println();
        }
        if (sortByF1)
            emitSortedTrees(queue, worstKTreesToEmit, guessFile);
    }
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Triple(edu.stanford.nlp.util.Triple) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Aggregations

Language (edu.stanford.nlp.international.Language)15 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)14 Tree (edu.stanford.nlp.trees.Tree)14 PrintWriter (java.io.PrintWriter)12 ArrayList (java.util.ArrayList)8 Label (edu.stanford.nlp.ling.Label)7 EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams)7 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)7 DiskTreebank (edu.stanford.nlp.trees.DiskTreebank)7 Treebank (edu.stanford.nlp.trees.Treebank)7 Properties (java.util.Properties)6 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)5 CoreLabel (edu.stanford.nlp.ling.CoreLabel)4 Map (java.util.Map)3 ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification)2 FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification)2 MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification)2 HashIndex (edu.stanford.nlp.util.HashIndex)2 Lexicon (edu.stanford.nlp.parser.lexparser.Lexicon)1 Options (edu.stanford.nlp.parser.lexparser.Options)1