Example 16 with Treebank

Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

The class JointParsingModel, method run.

public boolean run(File trainTreebankFile, File testTreebankFile, InputStream inputStream) {
    op = new Options();
    op.tlpParams = new ArabicTreebankParserParams();
    op.setOptions("-arabicFactored");
    op.testOptions.maxLength = maxSentLen;
    // 500000 is the default for Arabic, but we have substantially more edges now
    op.testOptions.MAX_ITEMS = 5000000;
    op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
    // WSG: Just set this to some high value so that extractBestParse()
    // actually calls the lattice reader (e.g., this says that we can't have
    // a word longer than 80 characters...which seems sensible for Arabic).
    op.testOptions.maxSpanForTags = 80;
    treePrint = op.testOptions.treePrint(op.tlpParams);
    debinarizer = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
    subcategoryStripper = op.tlpParams.subcategoryStripper();
    Timing.startTime();
    final Treebank trainTreebank = op.tlpParams.diskTreebank();
    trainTreebank.loadPath(trainTreebankFile);
    lp = getParserDataFromTreebank(trainTreebank);
    makeParsers();
    if (VERBOSE) {
        op.display();
        String lexNumRules = (pparser != null) ? Integer.toString(lp.lex.numRules()) : "";
        log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
        log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t' + (pparser != null ? lp.ug.numRules() : "") + '\t' + (pparser != null ? lp.bg.numRules() : "") + '\t' + lexNumRules);
        log.info("ParserPack is " + op.tlpParams.getClass().getName());
        log.info("Lexicon is " + lp.lex.getClass().getName());
    }
    return parse(inputStream);
}
Also used : CategoryWordTagFactory(edu.stanford.nlp.ling.CategoryWordTagFactory) Treebank(edu.stanford.nlp.trees.Treebank)
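
The pattern above (build an Options, load a DiskTreebank, then train) is also reachable through the public LexicalizedParser API. The following is a minimal sketch, assuming a hypothetical treebank path; it is not the JointParsingModel pipeline itself, which additionally wires up lattice parsing.

    // A minimal sketch, assuming a hypothetical treebank path.
    // imports: edu.stanford.nlp.parser.lexparser.Options, .ArabicTreebankParserParams,
    //          .LexicalizedParser, edu.stanford.nlp.trees.Treebank, java.io.File
    Options op = new Options();
    op.tlpParams = new ArabicTreebankParserParams();
    op.setOptions("-arabicFactored");
    Treebank train = op.tlpParams.diskTreebank();
    train.loadPath(new File("/path/to/arabic/treebank"));  // hypothetical path
    LexicalizedParser parser = LexicalizedParser.trainFromTreebank(train, op);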

Example 17 with Treebank

Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

The class CacheParseHypotheses, method main.

/**
   * An example of a command line is
   * <br>
   * java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
   * <br>
   * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
   * <br>
   * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
   */
public static void main(String[] args) throws IOException {
    String parserModel = null;
    String output = null;
    List<Pair<String, FileFilter>> treebanks = Generics.newArrayList();
    int dvKBest = 200;
    int numThreads = 1;
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-dvKBest")) {
            dvKBest = Integer.valueOf(args[argIndex + 1]);
            argIndex += 2;
            continue;
        }
        if (args[argIndex].equalsIgnoreCase("-parser") || args[argIndex].equals("-model")) {
            parserModel = args[argIndex + 1];
            argIndex += 2;
            continue;
        }
        if (args[argIndex].equalsIgnoreCase("-output")) {
            output = args[argIndex + 1];
            argIndex += 2;
            continue;
        }
        if (args[argIndex].equalsIgnoreCase("-treebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-treebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebanks.add(treebankDescription);
            continue;
        }
        if (args[argIndex].equalsIgnoreCase("-numThreads")) {
            numThreads = Integer.valueOf(args[argIndex + 1]);
            argIndex += 2;
            continue;
        }
        throw new IllegalArgumentException("Unknown argument " + args[argIndex]);
    }
    if (parserModel == null) {
        throw new IllegalArgumentException("Need to supply a parser model with -model");
    }
    if (output == null) {
        throw new IllegalArgumentException("Need to supply an output filename with -output");
    }
    if (treebanks.size() == 0) {
        throw new IllegalArgumentException("Need to supply a treebank with -treebank");
    }
    log.info("Writing output to " + output);
    log.info("Loading parser model " + parserModel);
    log.info("Writing " + dvKBest + " hypothesis trees for each tree");
    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel, "-dvKBest", Integer.toString(dvKBest));
    CacheParseHypotheses cacher = new CacheParseHypotheses(parser);
    TreeTransformer transformer = DVParser.buildTrainTransformer(parser.getOp());
    List<Tree> sentences = new ArrayList<>();
    for (Pair<String, FileFilter> description : treebanks) {
        log.info("Reading trees from " + description.first);
        Treebank treebank = parser.getOp().tlpParams.memoryTreebank();
        treebank.loadPath(description.first, description.second);
        treebank = treebank.transform(transformer);
        sentences.addAll(treebank);
    }
    log.info("Processing " + sentences.size() + " trees");
    List<Pair<Tree, byte[]>> cache = Generics.newArrayList();
    transformer = new SynchronizedTreeTransformer(transformer);
    MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper = new MulticoreWrapper<>(numThreads, new CacheProcessor(cacher, parser, dvKBest, transformer));
    for (Tree tree : sentences) {
        wrapper.put(tree);
        while (wrapper.peek()) {
            cache.add(wrapper.poll());
            if (cache.size() % 10 == 0) {
                System.out.println("Processed " + cache.size() + " trees");
            }
        }
    }
    wrapper.join();
    while (wrapper.peek()) {
        cache.add(wrapper.poll());
        if (cache.size() % 10 == 0) {
            System.out.println("Processed " + cache.size() + " trees");
        }
    }
    System.out.println("Finished processing " + cache.size() + " trees");
    IOUtils.writeObjectToFile(cache, output);
}
Also used : MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) Treebank(edu.stanford.nlp.trees.Treebank) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) ArrayList(java.util.ArrayList) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) SynchronizedTreeTransformer(edu.stanford.nlp.trees.SynchronizedTreeTransformer) BasicCategoryTreeTransformer(edu.stanford.nlp.trees.BasicCategoryTreeTransformer) Pair(edu.stanford.nlp.util.Pair)
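
The put/peek/poll/join idiom in the loop above is the general way to drive a MulticoreWrapper: drain completed results opportunistically while feeding inputs, then flush the remainder after join(). A minimal sketch with a hypothetical upper-casing ThreadsafeProcessor:

    // Minimal sketch; the upper-casing processor is hypothetical.
    // imports: edu.stanford.nlp.util.concurrent.MulticoreWrapper,
    //          edu.stanford.nlp.util.concurrent.ThreadsafeProcessor,
    //          java.util.ArrayList, java.util.Arrays, java.util.List
    List<String> results = new ArrayList<>();
    MulticoreWrapper<String, String> wrapper = new MulticoreWrapper<>(4,
        new ThreadsafeProcessor<String, String>() {
            public String process(String in) { return in.toUpperCase(); }
            public ThreadsafeProcessor<String, String> newInstance() { return this; }  // stateless, so sharable
        });
    for (String s : Arrays.asList("alpha", "beta", "gamma")) {
        wrapper.put(s);
        // drain any results that are already done
        while (wrapper.peek()) { results.add(wrapper.poll()); }
    }
    wrapper.join();
    // flush whatever finished after the last put
    while (wrapper.peek()) { results.add(wrapper.poll()); }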

Example 18 with Treebank

Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

The class CombineDVModels, method main.

public static void main(String[] args) throws IOException, ClassNotFoundException {
    String modelPath = null;
    List<String> baseModelPaths = null;
    String testTreebankPath = null;
    FileFilter testTreebankFilter = null;
    List<String> unusedArgs = new ArrayList<>();
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-model")) {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.first();
            testTreebankFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-baseModels")) {
            argIndex++;
            baseModelPaths = new ArrayList<>();
            while (argIndex < args.length && args[argIndex].charAt(0) != '-') {
                baseModelPaths.add(args[argIndex++]);
            }
            if (baseModelPaths.size() == 0) {
                throw new IllegalArgumentException("Found an argument -baseModels with no actual models named");
            }
        } else {
            unusedArgs.add(args[argIndex++]);
        }
    }
    String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
    LexicalizedParser underlyingParser = null;
    Options options = null;
    LexicalizedParser combinedParser = null;
    if (baseModelPaths != null) {
        List<DVModel> dvparsers = new ArrayList<>();
        for (String baseModelPath : baseModelPaths) {
            log.info("Loading serialized DVParser from " + baseModelPath);
            LexicalizedParser dvparser = LexicalizedParser.loadModel(baseModelPath);
            Reranker reranker = dvparser.reranker;
            if (!(reranker instanceof DVModelReranker)) {
                throw new IllegalArgumentException("Expected parsers with DVModel embedded");
            }
            dvparsers.add(((DVModelReranker) reranker).getModel());
            if (underlyingParser == null) {
                underlyingParser = dvparser;
                options = underlyingParser.getOp();
                // TODO: other parser's options?
                options.setOptions(newArgs);
            }
            log.info("... done");
        }
        combinedParser = LexicalizedParser.copyLexicalizedParser(underlyingParser);
        CombinedDVModelReranker reranker = new CombinedDVModelReranker(options, dvparsers);
        combinedParser.reranker = reranker;
        combinedParser.saveParserToSerialized(modelPath);
    } else {
        throw new IllegalArgumentException("Need to specify -model to load an already prepared CombinedParser");
    }
    Treebank testTreebank = null;
    if (testTreebankPath != null) {
        log.info("Reading in trees from " + testTreebankPath);
        if (testTreebankFilter != null) {
            log.info("Filtering on " + testTreebankFilter);
        }
        testTreebank = combinedParser.getOp().tlpParams.memoryTreebank();
        testTreebank.loadPath(testTreebankPath, testTreebankFilter);
        log.info("Read in " + testTreebank.size() + " trees for testing");
        EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.getOp(), null, combinedParser);
        evaluator.testOnTreebank(testTreebank);
    }
}
Also used : Options(edu.stanford.nlp.parser.lexparser.Options) Reranker(edu.stanford.nlp.parser.lexparser.Reranker) EvaluateTreebank(edu.stanford.nlp.parser.lexparser.EvaluateTreebank) Treebank(edu.stanford.nlp.trees.Treebank) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) ArrayList(java.util.ArrayList) FileFilter(java.io.FileFilter) Pair(edu.stanford.nlp.util.Pair)
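
The final evaluation step generalizes to any LexicalizedParser, not just the combined model. A minimal sketch, assuming hypothetical model and treebank paths, reusing the same three-argument EvaluateTreebank constructor seen above:

    // Paths are hypothetical; the constructor call mirrors the one above.
    // imports: edu.stanford.nlp.parser.lexparser.LexicalizedParser, .EvaluateTreebank,
    //          edu.stanford.nlp.trees.Treebank
    LexicalizedParser parser = LexicalizedParser.loadModel("/path/to/model.ser.gz");
    Treebank test = parser.getOp().tlpParams.memoryTreebank();
    test.loadPath("/path/to/test/trees");
    EvaluateTreebank evaluator = new EvaluateTreebank(parser.getOp(), null, parser);
    evaluator.testOnTreebank(test);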

Example 19 with Treebank

Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

The class BaseLexicon, method main.

/** Provides some testing and opportunities for exploration of the
   *  probabilities of a BaseLexicon.  What's here currently probably
   *  only works for the English Penn Treebank, as it uses default
   *  constructors.  Of the words given to test on,
   *  the first is treated as sentence initial, and the rest as not
   *  sentence initial.
   *
   *  @param args The command line arguments:
   *     java BaseLexicon treebankPath fileRange unknownWordModel words*
   */
public static void main(String[] args) {
    if (args.length < 3) {
        log.info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
        return;
    }
    System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
    Treebank tb = new DiskTreebank();
    tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
    // TODO: change this interface so the lexicon creates its own indices?
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    Options op = new Options();
    op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
    BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
    lex.initializeTraining(tb.size());
    lex.train(tb);
    lex.finishTraining();
    System.out.println("done.");
    System.out.println();
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(4);
    List<String> impos = new ArrayList<>();
    for (int i = 3; i < args.length; i++) {
        if (lex.isKnown(args[i])) {
            System.out.println(args[i] + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
            for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.addToIndex(args[i]), i - 3, null); it.hasNext(); ) {
                IntTaggedWord iTW = it.next();
                System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word), null)));
            }
        } else {
            String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
            System.out.println(args[i] + " is an unknown word.  Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init" : "non-init") + " is: " + sig);
            impos.clear();
            List<String> lis = new ArrayList<>(tagIndex.objectsList());
            Collections.sort(lis);
            for (String tStr : lis) {
                IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
                double score = lex.score(iTW, 1, args[i], null);
                if (score == Float.NEGATIVE_INFINITY) {
                    impos.add(tStr);
                } else {
                    System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
                }
            }
            if (impos.size() > 0) {
                System.out.println(args[i] + " impossible tags: " + impos);
            }
        }
        System.out.println();
    }
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) Treebank(edu.stanford.nlp.trees.Treebank) NumberFormat(java.text.NumberFormat)
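
Per the javadoc, the expected arguments are a treebank path, a NumberRangesFileFilter range, an unknown-word-model level, and the words to test. A programmatic equivalent of the command line, with hypothetical path, range, and words:

    // All arguments here are hypothetical illustrations.
    BaseLexicon.main(new String[] {
        "/path/to/wsj/mrg",   // treebankPath
        "200-270",            // fileRange for NumberRangesFileFilter
        "5",                  // useUnknownWordSignatures level
        "the",                // first word: treated as sentence-initial
        "flibbertigibbet"     // treated as non-initial, likely unknown
    });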

Example 20 with Treebank

Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.

The class ShiftReduceParser, method train.

private void train(List<Pair<String, FileFilter>> trainTreebankPath, Pair<String, FileFilter> devTreebankPath, String serializedPath) {
    log.info("Training method: " + op.trainOptions().trainingMethod);
    List<Tree> binarizedTrees = Generics.newArrayList();
    for (Pair<String, FileFilter> treebank : trainTreebankPath) {
        binarizedTrees.addAll(readBinarizedTreebank(treebank.first(), treebank.second()));
    }
    int nThreads = op.trainOptions.trainingThreads;
    nThreads = nThreads <= 0 ? Runtime.getRuntime().availableProcessors() : nThreads;
    Tagger tagger = null;
    if (op.testOptions.preTag) {
        Timing retagTimer = new Timing();
        tagger = Tagger.loadModel(op.testOptions.taggerSerializedFile);
        redoTags(binarizedTrees, tagger, nThreads);
        retagTimer.done("Retagging");
    }
    Set<String> knownStates = findKnownStates(binarizedTrees);
    Set<String> rootStates = findRootStates(binarizedTrees);
    Set<String> rootOnlyStates = findRootOnlyStates(binarizedTrees, rootStates);
    log.info("Known states: " + knownStates);
    log.info("States which occur at the root: " + rootStates);
    log.info("States which only occur at the root: " + rootStates);
    Timing transitionTimer = new Timing();
    List<List<Transition>> transitionLists = CreateTransitionSequence.createTransitionSequences(binarizedTrees, op.compoundUnaries, rootStates, rootOnlyStates);
    Index<Transition> transitionIndex = new HashIndex<>();
    for (List<Transition> transitions : transitionLists) {
        transitionIndex.addAll(transitions);
    }
    transitionTimer.done("Converting trees into transition lists");
    log.info("Number of transitions: " + transitionIndex.size());
    Random random = new Random(op.trainOptions.randomSeed);
    Treebank devTreebank = null;
    if (devTreebankPath != null) {
        devTreebank = readTreebank(devTreebankPath.first(), devTreebankPath.second());
    }
    PerceptronModel newModel = new PerceptronModel(this.op, transitionIndex, knownStates, rootStates, rootOnlyStates);
    newModel.trainModel(serializedPath, tagger, random, binarizedTrees, transitionLists, devTreebank, nThreads);
    this.model = newModel;
}
Also used : Tagger(edu.stanford.nlp.tagger.common.Tagger) Treebank(edu.stanford.nlp.trees.Treebank) EvaluateTreebank(edu.stanford.nlp.parser.lexparser.EvaluateTreebank) HashIndex(edu.stanford.nlp.util.HashIndex) Random(java.util.Random) Tree(edu.stanford.nlp.trees.Tree) List(java.util.List) Timing(edu.stanford.nlp.util.Timing) FileFilter(java.io.FileFilter)
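
Once trained and serialized, the model is typically loaded and fed pre-tagged sentences, since the shift-reduce parser relies on an external tagger unless -preTag is configured as above. A minimal sketch, assuming hypothetical model and tagger paths:

    // Model and tagger paths are hypothetical.
    // imports: edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser,
    //          edu.stanford.nlp.tagger.maxent.MaxentTagger,
    //          edu.stanford.nlp.process.DocumentPreprocessor,
    //          edu.stanford.nlp.ling.HasWord, edu.stanford.nlp.ling.TaggedWord,
    //          edu.stanford.nlp.trees.Tree, java.io.StringReader, java.util.List
    ShiftReduceParser model = ShiftReduceParser.loadModel("/path/to/englishSR.ser.gz");
    MaxentTagger tagger = new MaxentTagger("/path/to/english-left3words-distsim.tagger");
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader("My dog likes to shake hands."));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);  // tag first
        Tree tree = model.apply(tagged);                         // then parse
        tree.pennPrint();
    }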

Aggregations

Related classes and usage counts:

Treebank (edu.stanford.nlp.trees.Treebank): 27
Tree (edu.stanford.nlp.trees.Tree): 16
TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 10
ArrayList (java.util.ArrayList): 8
Language (edu.stanford.nlp.international.Language): 7
EvaluateTreebank (edu.stanford.nlp.parser.lexparser.EvaluateTreebank): 7
TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams): 7
Pair (edu.stanford.nlp.util.Pair): 7
PrintWriter (java.io.PrintWriter): 7
Label (edu.stanford.nlp.ling.Label): 6
LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser): 6
FileFilter (java.io.FileFilter): 6
Map (java.util.Map): 4
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 3
EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams): 3
DiskTreebank (edu.stanford.nlp.trees.DiskTreebank): 3
MemoryTreebank (edu.stanford.nlp.trees.MemoryTreebank): 3
ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification): 2
FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification): 2
MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification): 2