Search in sources :

Example 1 with SynchronizedTreeTransformer

Use of edu.stanford.nlp.trees.SynchronizedTreeTransformer in the CoreNLP project by stanfordnlp.

Class CacheParseHypotheses, method main.

/**
 * Command-line entry point: loads a parser model, reads and transforms the
 * trees of one or more treebanks, runs a {@code CacheProcessor} over every
 * tree (optionally on several threads) and serializes the resulting list of
 * {@code Pair<Tree, byte[]>} to the output file.
 * <br>
 * Recognized arguments (flag names are case-insensitive):
 * <ul>
 *   <li>{@code -model <path>} / {@code -parser <path>}: parser model to load (required)</li>
 *   <li>{@code -output <path>}: file the cache is written to (required)</li>
 *   <li>{@code -treebank <path> [filter]}: treebank to read; at least one is required,
 *       and the flag may be repeated</li>
 *   <li>{@code -dvKBest <n>}: value passed to the parser as {@code -dvKBest} (default 200)</li>
 *   <li>{@code -numThreads <n>}: thread count handed to the {@code MulticoreWrapper} (default 1)</li>
 * </ul>
 * An example of a command line is
 * <br>
 * java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
 * <br>
 * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
 * <br>
 * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
 *
 * @param args command-line arguments as described above
 * @throws IOException if writing the output file fails
 * @throws IllegalArgumentException if an argument is unknown, a flag is missing its
 *         value, a numeric value is malformed, or a required argument is absent
 */
public static void main(String[] args) throws IOException {
    String parserModel = null;
    String output = null;
    List<Pair<String, FileFilter>> treebanks = Generics.newArrayList();
    int dvKBest = 200;
    int numThreads = 1;
    // The index is advanced inside the loop because flags consume a varying number of arguments.
    for (int argIndex = 0; argIndex < args.length; ) {
        String flag = args[argIndex];
        if (flag.equalsIgnoreCase("-dvKBest")) {
            dvKBest = Integer.parseInt(requireFlagValue(args, argIndex));
            argIndex += 2;
        } else if (flag.equalsIgnoreCase("-parser") || flag.equalsIgnoreCase("-model")) {
            parserModel = requireFlagValue(args, argIndex);
            argIndex += 2;
        } else if (flag.equalsIgnoreCase("-output")) {
            output = requireFlagValue(args, argIndex);
            argIndex += 2;
        } else if (flag.equalsIgnoreCase("-treebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-treebank");
            // Skip the flag itself plus however many sub-arguments it owns.
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebanks.add(treebankDescription);
        } else if (flag.equalsIgnoreCase("-numThreads")) {
            numThreads = Integer.parseInt(requireFlagValue(args, argIndex));
            argIndex += 2;
        } else {
            throw new IllegalArgumentException("Unknown argument " + flag);
        }
    }
    if (parserModel == null) {
        throw new IllegalArgumentException("Need to supply a parser model with -model");
    }
    if (output == null) {
        throw new IllegalArgumentException("Need to supply an output filename with -output");
    }
    if (treebanks.isEmpty()) {
        throw new IllegalArgumentException("Need to supply a treebank with -treebank");
    }
    log.info("Writing output to " + output);
    log.info("Loading parser model " + parserModel);
    log.info("Writing " + dvKBest + " hypothesis trees for each tree");
    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel, "-dvKBest", Integer.toString(dvKBest));
    CacheParseHypotheses cacher = new CacheParseHypotheses(parser);
    TreeTransformer transformer = DVParser.buildTrainTransformer(parser.getOp());
    List<Tree> sentences = new ArrayList<>();
    for (Pair<String, FileFilter> description : treebanks) {
        log.info("Reading trees from " + description.first);
        Treebank treebank = parser.getOp().tlpParams.memoryTreebank();
        treebank.loadPath(description.first, description.second);
        treebank = treebank.transform(transformer);
        sentences.addAll(treebank);
    }
    log.info("Processing " + sentences.size() + " trees");
    List<Pair<Tree, byte[]>> cache = Generics.newArrayList();
    // The same transformer instance is handed to the processor used by the
    // multi-threaded wrapper, so it is wrapped in a synchronized decorator first.
    transformer = new SynchronizedTreeTransformer(transformer);
    MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper = new MulticoreWrapper<>(numThreads, new CacheProcessor(cacher, parser, dvKBest, transformer));
    for (Tree tree : sentences) {
        wrapper.put(tree);
        // Collect whatever has finished so far before submitting the next tree.
        drainResults(wrapper, cache);
    }
    wrapper.join();
    // After join() pick up the results that were still pending.
    drainResults(wrapper, cache);
    System.out.println("Finished processing " + cache.size() + " trees");
    IOUtils.writeObjectToFile(cache, output);
}

/**
 * Returns the value following the flag at {@code flagIndex}.
 *
 * @param args the full argument array
 * @param flagIndex index of the flag whose value is requested
 * @return {@code args[flagIndex + 1]}
 * @throws IllegalArgumentException if the flag is the last argument and so has no value
 */
private static String requireFlagValue(String[] args, int flagIndex) {
    if (flagIndex + 1 >= args.length) {
        throw new IllegalArgumentException("Missing value for argument " + args[flagIndex]);
    }
    return args[flagIndex + 1];
}

/**
 * Moves every result currently available from {@code wrapper} into {@code cache},
 * printing a progress line each time the cache size reaches a multiple of 10.
 *
 * @param wrapper the wrapper to poll while {@code peek()} reports a result
 * @param cache the list that accumulates the polled results
 */
private static void drainResults(MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper, List<Pair<Tree, byte[]>> cache) {
    while (wrapper.peek()) {
        cache.add(wrapper.poll());
        if (cache.size() % 10 == 0) {
            System.out.println("Processed " + cache.size() + " trees");
        }
    }
}
Also used : MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) Treebank(edu.stanford.nlp.trees.Treebank) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) ArrayList(java.util.ArrayList) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) SynchronizedTreeTransformer(edu.stanford.nlp.trees.SynchronizedTreeTransformer) BasicCategoryTreeTransformer(edu.stanford.nlp.trees.BasicCategoryTreeTransformer) Pair(edu.stanford.nlp.util.Pair) SynchronizedTreeTransformer(edu.stanford.nlp.trees.SynchronizedTreeTransformer)

Aggregations

LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser)1 BasicCategoryTreeTransformer (edu.stanford.nlp.trees.BasicCategoryTreeTransformer)1 SynchronizedTreeTransformer (edu.stanford.nlp.trees.SynchronizedTreeTransformer)1 Tree (edu.stanford.nlp.trees.Tree)1 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)1 Treebank (edu.stanford.nlp.trees.Treebank)1 Pair (edu.stanford.nlp.util.Pair)1 MulticoreWrapper (edu.stanford.nlp.util.concurrent.MulticoreWrapper)1 ArrayList (java.util.ArrayList)1