Search in sources :

Example 1 with Treebank

use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

In class TsarfatyEval, method main.

/**
   * Run the scoring metric on guess/gold input. This method performs "Collinization."
   * The default language is English.
   * <p>
   * Options must precede the two file arguments:
   * <ul>
   *   <li>{@code -l lang}: use the parser params of the named {@code Language}</li>
   *   <li>{@code -y num}: skip gold trees whose yield is longer than {@code num}</li>
   *   <li>{@code -g num}: skip guess trees whose yield is longer than {@code num}</li>
   *   <li>{@code -t}: tag mode (the metric is named "TsarfatyTAG" instead of "TsarfatySEG")</li>
   *   <li>{@code -v}: pass the output writer to each evaluate call</li>
   * </ul>
   * The first argument that does not start with "-" is the gold file and the argument
   * right after it is the guess file; anything after those two is ignored. Usage is
   * printed and the JVM exits with status -1 when an option is unknown, when an option
   * is missing its value, or when either file argument is missing.
   *
   * @param args options followed by the gold and guess treebank paths
   */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    int maxGuessYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    boolean skipGuess = false;
    boolean tagMode = false;
    String guessFile = null;
    String goldFile = null;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            // Options that consume the following argument must not be the last argument.
            final boolean takesValue = args[i].equals("-l") || args[i].equals("-y") || args[i].equals("-g");
            if (takesValue && i + 1 >= args.length) {
                System.out.println(usage.toString());
                System.exit(-1);
            }
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-y":
                    maxGoldYield = Integer.parseInt(args[++i].trim());
                    break;
                case "-t":
                    tagMode = true;
                    break;
                case "-v":
                    VERBOSE = true;
                    break;
                case "-g":
                    maxGuessYield = Integer.parseInt(args[++i].trim());
                    skipGuess = true;
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            //Required parameters: both the gold and the guess file must be present
            if (i + 1 >= args.length) {
                System.out.println(usage.toString());
                System.exit(-1);
            }
            goldFile = args[i++];
            guessFile = args[i];
            break;
        }
    }
    // Only options were supplied (e.g. "-v -t"): there is nothing to evaluate.
    if (goldFile == null || guessFile == null) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    // try-with-resources so the writer is flushed and closed even if evaluation throws
    try (PrintWriter pwOut = tlpp.pw()) {
        final Treebank guessTreebank = tlpp.diskTreebank();
        guessTreebank.loadPath(guessFile);
        pwOut.println("GUESS TREEBANK:");
        pwOut.println(guessTreebank.textualSummary());
        final Treebank goldTreebank = tlpp.diskTreebank();
        goldTreebank.loadPath(goldFile);
        pwOut.println("GOLD TREEBANK:");
        pwOut.println(goldTreebank.textualSummary());
        final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
        final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
        final TreeTransformer tc = tlpp.collinizer();
        //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
        //don't match, we need to keep looking for the next gold tree that matches.
        //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
        //status as follows:
        //
        //   0 - Ok (yields match)
        //   1 - length mismatch
        //   2 - null parse e.g. (()).
        //
        //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
        final Iterator<Tree> goldItr = goldTreebank.iterator();
        int goldLineId = 0;
        int skippedGuessTrees = 0;
        for (final Tree guess : guessTreebank) {
            final Tree evalGuess = tc.transformTree(guess);
            final ArrayList<Label> guessSent = guess.yield();
            // Compare at the character level: whitespace is stripped from the joined yield
            final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
            if (guessSent.size() > maxGuessYield) {
                skippedGuessTrees++;
                continue;
            }
            boolean doneEval = false;
            // Consume gold trees until one is evaluated against this guess or a mismatch is hit
            while (goldItr.hasNext() && !doneEval) {
                final Tree gold = goldItr.next();
                final Tree evalGold = tc.transformTree(gold);
                goldLineId++;
                final ArrayList<Label> goldSent = gold.yield();
                final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
                if (goldSent.size() > maxGoldYield) {
                    // Over-long gold tree: drop it and try the next gold tree for the same guess
                    continue;
                } else if (goldChars.length() != guessChars.length()) {
                    pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)%n", goldLineId, guessChars.length(), goldChars.length());
                    skippedGuessTrees++;
                    //Default evalb behavior -- skip this guess tree
                    break;
                }
                eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
                //Move to the next guess parse
                doneEval = true;
            }
        }
        pwOut.println("================================================================================");
        if (skippedGuessTrees != 0) {
            pwOut.printf("%s %d guess trees%n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
        }
        eval.display(true, pwOut);
        pwOut.println();
    }
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 2 with Treebank

use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

In class CollinsDepEval, method main.

/**
   * Command-line entry point: loads a guess and a gold treebank, applies the
   * language's collinizer to each tree and scores the pairs with a
   * {@code CollinsDepEval} metric, printing the results to the language's writer.
   * <p>
   * Options read from the parsed properties: {@code v} (boolean, verbose),
   * {@code l} (a {@code Language}, default English), {@code g} and {@code y}
   * (integer yield limits). The two positional values are taken as the gold file
   * followed by the guess file. Usage is logged and the JVM exits with status -1
   * when fewer than {@code MIN_ARGS} arguments are given or when the number of
   * positional values is not exactly {@code MIN_ARGS}.
   *
   * @param args command-line options followed by the gold and guess treebank paths
   */
public static void main(String[] args) {
    if (args.length < MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
    Language LANGUAGE = PropertiesUtils.get(options, "l", Language.English, Language.class);
    // NOTE(review): "g" feeds the gold limit and "y" the guess limit here, which is the
    // reverse of TsarfatyEval.main ("-y" gold, "-g" guess) -- confirm against usage().
    int MAX_GOLD_YIELD = PropertiesUtils.getInt(options, "g", Integer.MAX_VALUE);
    int MAX_GUESS_YIELD = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
    // Positional arguments are read from the "" property and split on whitespace
    // (presumably argsToProperties stores non-option arguments under that key -- verify).
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    File goldFile = new File(parsedArgs[0]);
    File guessFile = new File(parsedArgs[1]);
    final TreebankLangParserParams tlpp = LANGUAGE.params;
    // try-with-resources so the writer is flushed and closed even if evaluation throws
    try (PrintWriter pwOut = tlpp.pw()) {
        final Treebank guessTreebank = tlpp.diskTreebank();
        guessTreebank.loadPath(guessFile);
        pwOut.println("GUESS TREEBANK:");
        pwOut.println(guessTreebank.textualSummary());
        final Treebank goldTreebank = tlpp.diskTreebank();
        goldTreebank.loadPath(goldFile);
        pwOut.println("GOLD TREEBANK:");
        pwOut.println(goldTreebank.textualSummary());
        final CollinsDepEval depEval = new CollinsDepEval("CollinsDep", true, tlpp.headFinder(), tlpp.treebankLanguagePack().startSymbol());
        final TreeTransformer tc = tlpp.collinizer();
        //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
        //don't match, we need to keep looking for the next gold tree that matches.
        //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
        //status as follows:
        //
        //   0 - Ok (yields match)
        //   1 - length mismatch
        //   2 - null parse e.g. (()).
        //
        //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
        final Iterator<Tree> goldItr = goldTreebank.iterator();
        int goldLineId = 0;
        int skippedGuessTrees = 0;
        for (final Tree guess : guessTreebank) {
            final Tree evalGuess = tc.transformTree(guess);
            if (guess.yield().size() > MAX_GUESS_YIELD) {
                skippedGuessTrees++;
                continue;
            }
            boolean doneEval = false;
            // Consume gold trees until one is evaluated against this guess or a mismatch is hit
            while (goldItr.hasNext() && !doneEval) {
                final Tree gold = goldItr.next();
                final Tree evalGold = tc.transformTree(gold);
                goldLineId++;
                if (gold.yield().size() > MAX_GOLD_YIELD) {
                    // Over-long gold tree: drop it and try the next gold tree for the same guess
                    continue;
                } else if (evalGold.yield().size() != evalGuess.yield().size()) {
                    pwOut.println("Yield mismatch at gold line " + goldLineId);
                    skippedGuessTrees++;
                    //Default evalb behavior -- skip this guess tree
                    break;
                }
                depEval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
                //Move to the next guess parse
                doneEval = true;
            }
        }
        pwOut.println("================================================================================");
        if (skippedGuessTrees != 0) {
            pwOut.printf("%s %d guess trees%n", ((MAX_GUESS_YIELD < Integer.MAX_VALUE) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
        }
        depEval.display(true, pwOut);
    }
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) File(java.io.File) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 3 with Treebank

use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

In class LeafAncestorEval, method main.

/**
   * Execute with no arguments for usage.
   * <p>
   * Loads the guess and gold treebanks, walks them in lock step (one guess tree
   * paired with one gold tree per iteration) and feeds every pair whose yields have
   * the same number of tokens to a {@code LeafAncestorEval} metric after applying the
   * language's collinizer. Pairs whose gold yield exceeds {@code MAX_GOLD_YIELD} or
   * whose yield sizes differ are counted as unevaluated. Logs usage and exits with
   * status -1 when {@code validateCommandLine} rejects the arguments.
   *
   * @param args command-line arguments, checked by {@code validateCommandLine}
   */
public static void main(String[] args) {
    if (!validateCommandLine(args)) {
        log.info(USAGE);
        System.exit(-1);
    }
    final TreebankLangParserParams tlpp = LANGUAGE.params;
    // try-with-resources so the writer is flushed and closed even if evaluation throws
    try (PrintWriter pwOut = tlpp.pw()) {
        final Treebank guessTreebank = tlpp.diskTreebank();
        guessTreebank.loadPath(guessFile);
        pwOut.println("GUESS TREEBANK:");
        pwOut.println(guessTreebank.textualSummary());
        final Treebank goldTreebank = tlpp.diskTreebank();
        goldTreebank.loadPath(goldFile);
        pwOut.println("GOLD TREEBANK:");
        pwOut.println(goldTreebank.textualSummary());
        final LeafAncestorEval metric = new LeafAncestorEval("LeafAncestor");
        final TreeTransformer tc = tlpp.collinizer();
        //The evalb ref implementation assigns status for each tree pair as follows:
        //
        //   0 - Ok (yields match)
        //   1 - length mismatch
        //   2 - null parse e.g. (()).
        //
        //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
        final Iterator<Tree> goldItr = goldTreebank.iterator();
        final Iterator<Tree> guessItr = guessTreebank.iterator();
        int goldLineId = 0;
        int guessLineId = 0;
        int skippedGuessTrees = 0;
        while (guessItr.hasNext() && goldItr.hasNext()) {
            Tree guessTree = guessItr.next();
            List<Label> guessYield = guessTree.yield();
            guessLineId++;
            Tree goldTree = goldItr.next();
            List<Label> goldYield = goldTree.yield();
            goldLineId++;
            // Check that we should evaluate this tree
            if (goldYield.size() > MAX_GOLD_YIELD) {
                skippedGuessTrees++;
                continue;
            }
            // Only trees with equal yields can be evaluated
            if (goldYield.size() != guessYield.size()) {
                pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
                skippedGuessTrees++;
                continue;
            }
            final Tree evalGuess = tc.transformTree(guessTree);
            final Tree evalGold = tc.transformTree(goldTree);
            metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
        }
        // One iterator ran out before the other: the files hold different numbers of trees
        if (guessItr.hasNext() || goldItr.hasNext()) {
            // The sentence-ending period belongs before the line separator, not after it
            System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
        }
        pwOut.println("================================================================================");
        if (skippedGuessTrees != 0) {
            pwOut.printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees);
        }
        metric.display(true, pwOut);
    }
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) Tree(edu.stanford.nlp.trees.Tree) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 4 with Treebank

use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

In class UniversalDependenciesFeatureAnnotator, method treebankIterator.

/**
 * Loads the treebank found at {@code path} into memory and returns an iterator
 * over its trees.
 *
 * @param path location passed to {@code Treebank.loadPath}
 * @return iterator over the loaded trees
 */
private static Iterator<Tree> treebankIterator(String path) {
    // The normalizer removes empty nodes and strips indices from internal
    // nodes, but keeps functional tags.
    final NPTmpRetainingTreeNormalizer normalizer = new NPTmpRetainingTreeNormalizer(0, false, 1, false);
    final Treebank treebank = new MemoryTreebank(normalizer);
    treebank.loadPath(path);
    return treebank.iterator();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) MemoryTreebank(edu.stanford.nlp.trees.MemoryTreebank) MemoryTreebank(edu.stanford.nlp.trees.MemoryTreebank) NPTmpRetainingTreeNormalizer(edu.stanford.nlp.trees.NPTmpRetainingTreeNormalizer)

Example 5 with Treebank

use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

In class NoPunctuationHeadFinder, method main.

/**
 * Simple testing code: loads the treebank at {@code args[0]}, percolates heads
 * through every tree with a {@code NoPunctuationHeadFinder} and prints each tree
 * followed by a blank line. Prints a usage line to standard error and exits with
 * status -1 when no path is supplied.
 *
 * @param args a single argument, the treebank path to load
 */
public static void main(String[] args) {
    // Without this guard a missing argument fails with ArrayIndexOutOfBoundsException
    if (args.length < 1) {
        System.err.printf("Usage: java %s treebankPath%n", NoPunctuationHeadFinder.class.getName());
        System.exit(-1);
    }
    Treebank treebank = new DiskTreebank();
    CategoryWordTag.suppressTerminalDetails = true;
    treebank.loadPath(args[0]);
    final HeadFinder chf = new NoPunctuationHeadFinder();
    treebank.apply(pt -> {
        pt.percolateHeads(chf);
        pt.pennPrint();
        System.out.println();
    });
}
Also used : ModCollinsHeadFinder(edu.stanford.nlp.trees.ModCollinsHeadFinder) HeadFinder(edu.stanford.nlp.trees.HeadFinder) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Treebank(edu.stanford.nlp.trees.Treebank)

Aggregations

Treebank (edu.stanford.nlp.trees.Treebank): 27; Tree (edu.stanford.nlp.trees.Tree): 16; TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 10; ArrayList (java.util.ArrayList): 8; Language (edu.stanford.nlp.international.Language): 7; EvaluateTreebank (edu.stanford.nlp.parser.lexparser.EvaluateTreebank): 7; TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams): 7; Pair (edu.stanford.nlp.util.Pair): 7; PrintWriter (java.io.PrintWriter): 7; Label (edu.stanford.nlp.ling.Label): 6; LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser): 6; FileFilter (java.io.FileFilter): 6; Map (java.util.Map): 4; CoreLabel (edu.stanford.nlp.ling.CoreLabel): 3; EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams): 3; DiskTreebank (edu.stanford.nlp.trees.DiskTreebank): 3; MemoryTreebank (edu.stanford.nlp.trees.MemoryTreebank): 3; ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification): 2; FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification): 2; MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification): 2