Search in sources:

Example 11 with TreeTransformer

use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.

The class CacheParseHypotheses, method main.

/**
   * Command-line entry point. Loads a parser model, reads and transforms the
   * requested treebanks, runs every tree through a {@code CacheProcessor}
   * (optionally on several threads) and serializes the resulting list of
   * (tree, byte[]) pairs to the output file.
   * <br>
   * Recognized arguments (flag names are matched case-insensitively):
   * <ul>
   * <li>{@code -model} / {@code -parser} &lt;path&gt;: parser model to load (required)</li>
   * <li>{@code -output} &lt;path&gt;: file the cache is written to (required)</li>
   * <li>{@code -treebank} &lt;path&gt; [sub-arguments]: treebank to read; may be repeated (at least one required)</li>
   * <li>{@code -dvKBest} &lt;int&gt;: number of hypothesis trees per tree (default 200)</li>
   * <li>{@code -numThreads} &lt;int&gt;: number of worker threads (default 1)</li>
   * </ul>
   * An example of a command line is
   * <br>
   * java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
   * <br>
   * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
   * <br>
   * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
   *
   * @param args command-line arguments as described above
   * @throws IOException declared for I/O failures in the calls made here (e.g. writing the output file)
   * @throws IllegalArgumentException if an argument is unknown, a flag is missing
   *         its value, or a required argument was not supplied
   */
public static void main(String[] args) throws IOException {
    String parserModel = null;
    String output = null;
    List<Pair<String, FileFilter>> treebanks = Generics.newArrayList();
    int dvKBest = 200;
    int numThreads = 1;
    for (int argIndex = 0; argIndex < args.length; ) {
        final String flag = args[argIndex];
        // -treebank consumes a variable number of sub-arguments, so it is
        // handled separately from the simple "flag value" options below.
        if (flag.equalsIgnoreCase("-treebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-treebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebanks.add(treebankDescription);
            continue;
        }
        final boolean isDvKBest = flag.equalsIgnoreCase("-dvKBest");
        final boolean isModel = flag.equalsIgnoreCase("-parser") || flag.equalsIgnoreCase("-model");
        final boolean isOutput = flag.equalsIgnoreCase("-output");
        final boolean isNumThreads = flag.equalsIgnoreCase("-numThreads");
        if (!(isDvKBest || isModel || isOutput || isNumThreads)) {
            throw new IllegalArgumentException("Unknown argument " + flag);
        }
        // Every remaining flag takes exactly one value; fail with a clear
        // message instead of an ArrayIndexOutOfBoundsException when it is absent.
        if (argIndex + 1 >= args.length) {
            throw new IllegalArgumentException("Missing value for argument " + flag);
        }
        final String value = args[argIndex + 1];
        if (isDvKBest) {
            dvKBest = Integer.parseInt(value);
        } else if (isModel) {
            parserModel = value;
        } else if (isOutput) {
            output = value;
        } else {
            numThreads = Integer.parseInt(value);
        }
        argIndex += 2;
    }
    if (parserModel == null) {
        throw new IllegalArgumentException("Need to supply a parser model with -model");
    }
    if (output == null) {
        throw new IllegalArgumentException("Need to supply an output filename with -output");
    }
    if (treebanks.isEmpty()) {
        throw new IllegalArgumentException("Need to supply a treebank with -treebank");
    }
    log.info("Writing output to " + output);
    log.info("Loading parser model " + parserModel);
    log.info("Writing " + dvKBest + " hypothesis trees for each tree");
    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel, "-dvKBest", Integer.toString(dvKBest));
    CacheParseHypotheses cacher = new CacheParseHypotheses(parser);
    TreeTransformer transformer = DVParser.buildTrainTransformer(parser.getOp());
    // Load every requested treebank, apply the training transformer and
    // collect all trees into a single list.
    List<Tree> sentences = new ArrayList<>();
    for (Pair<String, FileFilter> description : treebanks) {
        log.info("Reading trees from " + description.first);
        Treebank treebank = parser.getOp().tlpParams.memoryTreebank();
        treebank.loadPath(description.first, description.second);
        treebank = treebank.transform(transformer);
        sentences.addAll(treebank);
    }
    log.info("Processing " + sentences.size() + " trees");
    List<Pair<Tree, byte[]>> cache = Generics.newArrayList();
    // The same transformer instance is handed to the multi-threaded
    // processor, so it is wrapped in a synchronized decorator first.
    transformer = new SynchronizedTreeTransformer(transformer);
    MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper = new MulticoreWrapper<>(numThreads, new CacheProcessor(cacher, parser, dvKBest, transformer));
    for (Tree tree : sentences) {
        wrapper.put(tree);
        // Collect whatever results are already available while feeding input.
        while (wrapper.peek()) {
            cache.add(wrapper.poll());
            if (cache.size() % 10 == 0) {
                System.out.println("Processed " + cache.size() + " trees");
            }
        }
    }
    // Wait for outstanding work, then drain the remaining results.
    wrapper.join();
    while (wrapper.peek()) {
        cache.add(wrapper.poll());
        if (cache.size() % 10 == 0) {
            System.out.println("Processed " + cache.size() + " trees");
        }
    }
    System.out.println("Finished processing " + cache.size() + " trees");
    IOUtils.writeObjectToFile(cache, output);
}
Also used : MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) Treebank(edu.stanford.nlp.trees.Treebank) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) ArrayList(java.util.ArrayList) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) SynchronizedTreeTransformer(edu.stanford.nlp.trees.SynchronizedTreeTransformer) BasicCategoryTreeTransformer(edu.stanford.nlp.trees.BasicCategoryTreeTransformer) Pair(edu.stanford.nlp.util.Pair) SynchronizedTreeTransformer(edu.stanford.nlp.trees.SynchronizedTreeTransformer)

Example 12 with TreeTransformer

use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.

The class TaggingEval, method main.

/**
   * Run the scoring metric on guess/gold input. This method performs "Collinization."
   * The default language is English.
   * <br>
   * Options handled here: {@code -l} language, {@code -y} maximum gold yield,
   * {@code -v} verbose output, {@code -c} category-level evaluation and
   * {@code -e} input encoding. The first two non-option arguments are the
   * gold file and the guess file, in that order.
   * <br>
   * The process exits with status -1 after printing the usage message when
   * too few arguments are given or an unknown option is encountered.
   *
   * @param args command-line options followed by the gold and guess file paths
   */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    String encoding = "UTF-8";
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);
    for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
        // The null key holds the non-option arguments; they are read after the loop.
        if (opt.getKey() == null)
            continue;
        if (opt.getKey().equals("-l")) {
            Language lang = Language.valueOf(opt.getValue()[0].trim());
            tlpp = lang.params;
        } else if (opt.getKey().equals("-y")) {
            maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
        } else if (opt.getKey().equals("-v")) {
            VERBOSE = true;
        } else if (opt.getKey().equals("-c")) {
            TaggingEval.doCatLevelEval = true;
        } else if (opt.getKey().equals("-e")) {
            encoding = opt.getValue()[0];
        } else {
            log.info(usage.toString());
            System.exit(-1);
        }
    }
    // Non-option arguments are located at key null. This must happen outside
    // the option loop: inside it, the file names were only picked up when at
    // least one option flag was present, leaving both paths null otherwise.
    String[] rest = argsMap.get(null);
    if (rest == null || rest.length < minArgs) {
        log.info(usage.toString());
        System.exit(-1);
    }
    final String goldFile = rest[0];
    final String guessFile = rest[1];
    tlpp.setInputEncoding(encoding);
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final TaggingEval metric = new TaggingEval("Tagging LP/LR");
    final TreeTransformer tc = tlpp.collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    final Iterator<Tree> guessItr = guessTreebank.iterator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // Walk both treebanks in lock-step; stop as soon as either runs out.
    while (guessItr.hasNext() && goldItr.hasNext()) {
        Tree guessTree = guessItr.next();
        List<Label> guessYield = guessTree.yield();
        guessLineId++;
        Tree goldTree = goldItr.next();
        List<Label> goldYield = goldTree.yield();
        goldLineId++;
        // Check that we should evaluate this tree
        if (goldYield.size() > maxGoldYield) {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated
        if (goldYield.size() != guessYield.size()) {
            pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        final Tree evalGuess = tc.transformTree(guessTree);
        final Tree evalGold = tc.transformTree(goldTree);
        metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
    }
    if (guessItr.hasNext() || goldItr.hasNext()) {
        // The period belongs before the line terminator, not after it.
        System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    metric.display(true, pwOut);
    pwOut.println();
    pwOut.close();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeMap(java.util.TreeMap) Map(java.util.Map) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 13 with TreeTransformer

use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.

The class UnlabeledAttachmentEval, method main.

/**
   * Run the unlabeled attachment scoring metric on guess/gold input. The default language is English.
   * <br>
   * Options handled here: {@code -l} language, {@code -y} maximum gold yield,
   * {@code -v} verbose output and {@code -e} input encoding. The first two
   * non-option arguments are the gold file and the guess file, in that order.
   * <br>
   * The process exits with status -1 after logging the usage message when an
   * unknown option is encountered or the two file arguments are missing.
   *
   * @param args command-line options followed by the gold and guess file paths
   */
public static void main(String[] args) {
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    String encoding = "UTF-8";
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);
    for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
        // The null key holds the non-option arguments; they are read after the loop.
        if (opt.getKey() == null)
            continue;
        if (opt.getKey().equals("-l")) {
            Language lang = Language.valueOf(opt.getValue()[0].trim());
            tlpp = lang.params;
        } else if (opt.getKey().equals("-y")) {
            maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
        } else if (opt.getKey().equals("-v")) {
            VERBOSE = true;
        } else if (opt.getKey().equals("-e")) {
            encoding = opt.getValue()[0];
        } else {
            log.info(usage.toString());
            System.exit(-1);
        }
    }
    // Non-option arguments are located at key null. This must happen outside
    // the option loop: inside it, the file names were only picked up when at
    // least one option flag was present, leaving both paths null otherwise.
    String[] rest = argsMap.get(null);
    if (rest == null || rest.length < minArgs) {
        log.info(usage.toString());
        System.exit(-1);
    }
    final String goldFile = rest[0];
    final String guessFile = rest[1];
    tlpp.setInputEncoding(encoding);
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final UnlabeledAttachmentEval metric = new UnlabeledAttachmentEval("UAS LP/LR", true, tlpp.headFinder());
    final TreeTransformer tc = tlpp.collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    final Iterator<Tree> guessItr = guessTreebank.iterator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // Walk both treebanks in lock-step; stop as soon as either runs out.
    while (guessItr.hasNext() && goldItr.hasNext()) {
        Tree guessTree = guessItr.next();
        List<Label> guessYield = guessTree.yield();
        guessLineId++;
        Tree goldTree = goldItr.next();
        List<Label> goldYield = goldTree.yield();
        goldLineId++;
        // Check that we should evaluate this tree
        if (goldYield.size() > maxGoldYield) {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated
        if (goldYield.size() != guessYield.size()) {
            pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        // Both transformed trees get their leaves indexed before evaluation.
        final Tree evalGuess = tc.transformTree(guessTree);
        evalGuess.indexLeaves(true);
        final Tree evalGold = tc.transformTree(goldTree);
        evalGold.indexLeaves(true);
        metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
    }
    if (guessItr.hasNext() || goldItr.hasNext()) {
        // The period belongs before the line terminator, not after it.
        System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    metric.display(true, pwOut);
    pwOut.println();
    pwOut.close();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) Map(java.util.Map) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 14 with TreeTransformer

use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.

The class Evalb, method main.

/**
   * Run the Evalb scoring metric on guess/gold input. The default language is English.
   * <br>
   * Property keys read from the parsed command line: {@code l} (language),
   * {@code y} (maximum gold yield), {@code v} (verbose), {@code s} (sort trees
   * by F1; its integer value is passed on to {@code emitSortedTrees}),
   * {@code c} (category-level evaluation), {@code f} (label regex for the
   * category-level evaluation) and {@code e} (input encoding). Exactly two
   * non-option arguments are expected: the gold file, then the guess file.
   * <br>
   * The process exits with status -1 after logging the usage message when the
   * argument count is wrong.
   *
   * @param args command-line options followed by the gold and guess file paths
   */
public static void main(String[] args) {
    if (args.length < minArgs) {
        log.info(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    final TreebankLangParserParams tlpp = language.params;
    final int maxGoldYield = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
    final boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
    final boolean sortByF1 = PropertiesUtils.hasProperty(options, "s");
    int worstKTreesToEmit = PropertiesUtils.getInt(options, "s", 0);
    // The queue is only allocated when sorting by F1 was requested.
    PriorityQueue<Triple<Double, Tree, Tree>> queue = sortByF1 ? new PriorityQueue<>(2000, new F1Comparator()) : null;
    boolean doCatLevel = PropertiesUtils.getBool(options, "c", false);
    String labelRegex = options.getProperty("f", null);
    String encoding = options.getProperty("e", "UTF-8");
    // Non-option arguments are stored under the empty-string key, whitespace separated.
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != minArgs) {
        log.info(usage());
        System.exit(-1);
    }
    String goldFile = parsedArgs[0];
    String guessFile = parsedArgs[1];
    // Command-line has been parsed. Configure the metric for evaluation.
    tlpp.setInputEncoding(encoding);
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final Evalb metric = new Evalb("Evalb LP/LR", true);
    // evalbCat stays null unless category-level evaluation is enabled; every use is guarded by doCatLevel.
    final EvalbByCat evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
    final TreeTransformer tc = tlpp.collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    final Iterator<Tree> guessItr = guessTreebank.iterator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // Walk both treebanks in lock-step; stop as soon as either runs out.
    while (guessItr.hasNext() && goldItr.hasNext()) {
        Tree guessTree = guessItr.next();
        List<Label> guessYield = guessTree.yield();
        guessLineId++;
        Tree goldTree = goldItr.next();
        List<Label> goldYield = goldTree.yield();
        goldLineId++;
        // Check that we should evaluate this tree
        if (goldYield.size() > maxGoldYield) {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated
        if (goldYield.size() != guessYield.size()) {
            pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        final Tree evalGuess = tc.transformTree(guessTree);
        final Tree evalGold = tc.transformTree(goldTree);
        metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
        if (doCatLevel)
            evalbCat.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
        // The untransformed trees are stored, keyed by the F1 just computed.
        if (sortByF1)
            storeTrees(queue, guessTree, goldTree, metric.getLastF1());
    }
    if (guessItr.hasNext() || goldItr.hasNext()) {
        // The period belongs before the line terminator, not after it.
        System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    metric.display(true, pwOut);
    pwOut.println();
    if (doCatLevel) {
        evalbCat.display(true, pwOut);
        pwOut.println();
    }
    if (sortByF1)
        emitSortedTrees(queue, worstKTreesToEmit, guessFile);
    pwOut.close();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Triple(edu.stanford.nlp.util.Triple) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 15 with TreeTransformer

use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.

The class GrammarCoverageChecker, method testOnTreebank.

/**
 * Annotates every tree of the given treebank and, for each local tree with
 * at least two children that is neither a leaf nor a preterminal, prints the
 * corresponding rule followed by the score computed for it.
 *
 * @param pd parser handed to {@code computeLocalTreeScore}
 * @param tlpParams language parameters used to build the tree annotator
 * @param testTreebank trees to check
 * @param treebankRoot path passed to {@code ParentAnnotationStats.getEnglishSplitCategories}
 * @param stateIndex state index handed to {@code computeLocalTreeScore}
 */
private void testOnTreebank(LexicalizedParser pd, TreebankLangParserParams tlpParams, Treebank testTreebank, String treebankRoot, Index<String> stateIndex) {
    Timing.startTime();
    TreeTransformer annotator = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
    // CDM: Aug 2004: With new implementation of treebank split categories,
    // I've hardwired this to load English ones.  Otherwise need training data.
    op.trainOptions.splitters = ParentAnnotationStats.getEnglishSplitCategories(treebankRoot);
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    for (Tree rawTree : testTreebank) {
        final Tree annotatedTree = annotator.transformTree(rawTree);
        for (Tree subtree : annotatedTree) {
            // Only local trees with two or more children below a phrasal node are scored.
            final boolean scorable = !subtree.isLeaf() && !subtree.isPreTerminal() && subtree.children().length >= 2;
            if (scorable) {
                // now try to use the grammar to score this local tree
                System.out.println(localTreeToRule(subtree));
                final double ruleScore = computeLocalTreeScore(subtree, stateIndex, pd);
                System.out.println("score: " + ruleScore);
            }
        }
    }
}
Also used : Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Aggregations

TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)18 Tree (edu.stanford.nlp.trees.Tree)16 Treebank (edu.stanford.nlp.trees.Treebank)10 PrintWriter (java.io.PrintWriter)9 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)6 Language (edu.stanford.nlp.international.Language)5 Label (edu.stanford.nlp.ling.Label)5 ArrayList (java.util.ArrayList)4 EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams)3 TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory)3 Pair (edu.stanford.nlp.util.Pair)3 NumberRangesFileFilter (edu.stanford.nlp.io.NumberRangesFileFilter)2 CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 HasWord (edu.stanford.nlp.ling.HasWord)2 LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser)2 CompositeTreeTransformer (edu.stanford.nlp.trees.CompositeTreeTransformer)2 MemoryTreebank (edu.stanford.nlp.trees.MemoryTreebank)2 TreeReader (edu.stanford.nlp.trees.TreeReader)2 HashIndex (edu.stanford.nlp.util.HashIndex)2 File (java.io.File)2