Example 6 with EvaluateTreebank

Use of edu.stanford.nlp.parser.metrics.EvaluateTreebank in project CoreNLP by stanfordnlp.

From the class ShiftReduceParser, method main.

public static void main(String[] args) {
    List<String> remainingArgs = Generics.newArrayList();
    List<Pair<String, FileFilter>> trainTreebankPath = null;
    Pair<String, FileFilter> testTreebankPath = null;
    Pair<String, FileFilter> devTreebankPath = null;
    String serializedPath = null;
    String tlppClass = null;
    String continueTraining = null;
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            if (trainTreebankPath == null) {
                trainTreebankPath = Generics.newArrayList();
            }
            trainTreebankPath.add(ArgUtils.getTreebankDescription(args, argIndex, "-trainTreebank"));
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            testTreebankPath = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        } else if (args[argIndex].equalsIgnoreCase("-devTreebank")) {
            devTreebankPath = ArgUtils.getTreebankDescription(args, argIndex, "-devTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        } else if (args[argIndex].equalsIgnoreCase("-serializedPath") || args[argIndex].equalsIgnoreCase("-model")) {
            serializedPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tlpp")) {
            tlppClass = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-continueTraining")) {
            continueTraining = args[argIndex + 1];
            argIndex += 2;
        } else {
            remainingArgs.add(args[argIndex]);
            ++argIndex;
        }
    }
    String[] newArgs = new String[remainingArgs.size()];
    newArgs = remainingArgs.toArray(newArgs);
    if (trainTreebankPath == null && serializedPath == null) {
        throw new IllegalArgumentException("Must specify a treebank to train from with -trainTreebank or a parser to load with -serializedPath");
    }
    ShiftReduceParser parser = null;
    if (trainTreebankPath != null) {
        log.info("Training ShiftReduceParser");
        log.info("Initial arguments:");
        log.info("   " + StringUtils.join(args));
        if (continueTraining != null) {
            parser = ShiftReduceParser.loadModel(continueTraining, ArrayUtils.concatenate(BASIC_TRAINING_OPTIONS, newArgs));
        } else {
            ShiftReduceOptions op = buildTrainingOptions(tlppClass, newArgs);
            parser = new ShiftReduceParser(op);
        }
        Timing trainingTimer = new Timing();
        parser.train(trainTreebankPath, devTreebankPath, serializedPath);
        trainingTimer.done("Overall training process");
        if (serializedPath != null) {
            parser.saveModel(serializedPath);
        }
    }
    if (serializedPath != null && parser == null) {
        parser = ShiftReduceParser.loadModel(serializedPath, ArrayUtils.concatenate(FORCE_TAGS, newArgs));
    }
    if (testTreebankPath != null) {
        log.info("Loading test trees from " + testTreebankPath.first());
        Treebank testTreebank = parser.op.tlpParams.memoryTreebank();
        testTreebank.loadPath(testTreebankPath.first(), testTreebankPath.second());
        log.info("Loaded " + testTreebank.size() + " trees");
        EvaluateTreebank evaluator = new EvaluateTreebank(parser.op, null, parser);
        evaluator.testOnTreebank(testTreebank);
    // log.info("Input tree: " + tree);
    // log.info("Debinarized tree: " + query.getBestParse());
    // log.info("Parsed binarized tree: " + query.getBestBinarizedParse());
    // log.info("Predicted transition sequence: " + query.getBestTransitionSequence());
    }
}
Also used: Treebank(edu.stanford.nlp.trees.Treebank) EvaluateTreebank(edu.stanford.nlp.parser.metrics.EvaluateTreebank) Timing(edu.stanford.nlp.util.Timing) FileFilter(java.io.FileFilter) Pair(edu.stanford.nlp.util.Pair)
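
For readers who want the evaluation step in isolation, here is a minimal sketch that loads a serialized ShiftReduceParser and scores it with EvaluateTreebank, mirroring the tail of the main method above. The model and treebank paths are placeholders, and the sketch assumes it lives in the edu.stanford.nlp.parser.shiftreduce package so that the package-visible op field resolves the same way it does in the original code.

package edu.stanford.nlp.parser.shiftreduce;

import edu.stanford.nlp.parser.metrics.EvaluateTreebank;
import edu.stanford.nlp.trees.Treebank;

public class EvaluateShiftReduceSketch {
    public static void main(String[] args) {
        // Placeholder paths; substitute your own model and test treebank.
        String modelPath = "/path/to/model.ser.gz";
        String treebankPath = "/path/to/test-treebank";
        // Load the serialized parser, as in the example above.
        ShiftReduceParser parser = ShiftReduceParser.loadModel(modelPath);
        // Read the test trees using the language pack's treebank reader.
        Treebank testTreebank = parser.op.tlpParams.memoryTreebank();
        testTreebank.loadPath(treebankPath);
        // Run the evalb-style evaluation, exactly as the main method does.
        EvaluateTreebank evaluator = new EvaluateTreebank(parser.op, null, parser);
        evaluator.testOnTreebank(testTreebank);
    }
}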

Example 7 with EvaluateTreebank

Use of edu.stanford.nlp.parser.metrics.EvaluateTreebank in project CoreNLP by stanfordnlp.

From the class LexicalizedParser, method main.

/**
 * A main program for using the parser with various options.
 * This program can be used for building and serializing
 * a parser from treebank data, for parsing sentences from a file
 * or URL using a serialized or text grammar parser,
 * and (mainly for parser quality testing)
 * for training and testing a parser on a treebank all in one go.
 *
 * <p>
 * Sample Usages:
 * <ul>
 *   <li> <b>Train a parser (saved to <i>serializedGrammarFilename</i>)
 *      from a directory of trees (<i>trainFilesPath</i>, with an optional <i>fileRange</i>, e.g., 0-1000):</b>
 *    {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -saveToSerializedFile serializedGrammarFilename}
 *   </li>
 *
 *   <li> <b>Train a parser (not saved) from a directory of trees, and test it (reporting scores) on a directory of trees</b>
 *    {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -testTreebank testFilePath [fileRange] }
 *   </li>
 *
 *   <li> <b>Parse one or more files, given a serialized grammar and a list of files</b>
 *    {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] serializedGrammarPath filename [filename]*}
 *   </li>
 *
 *   <li> <b>Test and report scores for a serialized grammar on trees in an output directory</b>
 *    {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -loadFromSerializedFile serializedGrammarPath -testTreebank testFilePath [fileRange]}
 *   </li>
 * </ul>
 *
 *<p>
 * If the {@code serializedGrammarPath} ends in {@code .gz},
 * then the grammar is written and read as a compressed file (GZip).
 * If the {@code serializedGrammarPath} is a URL, starting with
 * {@code http://}, then the parser is read from the URL.
 * A fileRange specifies a numeric value that must be included within a
 * filename for it to be used in training or testing (this works well with
 * most current treebanks).  It can be specified like a range of pages to be
 * printed, for instance as {@code 200-2199} or
 * {@code 1-300,500-725,9000} or just as {@code 1} (if all your
 * trees are in a single file, either omit this parameter or just give a dummy
 * argument such as {@code 0}).
 * If the filename to parse is "-" then the parser parses from stdin.
 * If no files are supplied to parse, then a hardwired sentence
 * is parsed.
 *
 * <p>
 * The parser can write a grammar as either a serialized Java object file
 * or in a text format (or as both), specified with the following options:
 * <blockquote>{@code
 * java edu.stanford.nlp.parser.lexparser.LexicalizedParser
 * [-v] -train
 * trainFilesPath [fileRange] [-saveToSerializedFile grammarPath]
 * [-saveToTextFile grammarPath]
 * }</blockquote>
 *
 * <p>
 * In the same position as the verbose flag ({@code -v}), many other
 * options can be specified.  The most useful to an end user are:
 * <ul>
 * <LI>{@code -tLPP class} Specify a different
 * TreebankLangParserParams, for when using a different language or
 * treebank (the default is English Penn Treebank). <i>This option MUST occur
 * before any other language-specific options that are used (or else they
 * are ignored!).</i>
 * (It's usually a good idea to specify this option even when loading a
 * serialized grammar; it is necessary if the language pack specifies a
 * needed character encoding or you wish to specify language-specific
 * options on the command line.)</LI>
 * <LI>{@code -encoding charset} Specify the character encoding of the
 * input and output files.  This will override the value in the
 * {@code TreebankLangParserParams}, provided this option appears
 * <i>after</i> any {@code -tLPP} option.</LI>
 * <LI>{@code -tokenized} Says that the input is already separated
 * into whitespace-delimited tokens.  If this option is specified, any
 * tokenizer specified for the language is ignored, and a universal (Unicode)
 * tokenizer, which divides only on whitespace, is used.
 * Unless you also specify
 * {@code -escaper}, the tokens <i>must</i> all be correctly
 * tokenized tokens of the appropriate treebank for the parser to work
 * well (for instance, if using the Penn English Treebank, you must have
 * coded "(" as "-LRB-", etc.). (Note: we do not use the backslash escaping
 * in front of / and * that appeared in Penn Treebank releases through 1999.)</li>
 * <li>{@code -escaper class} Specify a class of type
 * {@link Function}&lt;List&lt;HasWord&gt;,List&lt;HasWord&gt;&gt; to do
 * customized escaping of tokenized text.  This class will be run over the
 * tokenized text and can fix the representation of tokens. For instance,
 * it could change "(" to "-LRB-" for the Penn English Treebank.  A
 * provided escaper that does such things for the Penn English Treebank is
 * {@code edu.stanford.nlp.process.PTBEscapingProcessor}.</li>
 * <li>{@code -tokenizerFactory class} Specifies a
 * TokenizerFactory class to be used for tokenization</li>
 * <li>{@code -tokenizerOptions options} Specifies options to a
 * TokenizerFactory class to be used for tokenization.   A comma-separated
 * list. For PTBTokenizer, options of interest include
 * {@code americanize=false} and {@code quotes=ascii} (for German).
 * Note that any choice of tokenizer options that conflicts with the
 * tokenization used in the parser training data will likely degrade parser
 * performance. </li>
 * <li>{@code -sentences token } Specifies a token that marks sentence
 * boundaries.  A value of {@code newline} causes sentence breaking on
 * newlines.  A value of {@code onePerElement} causes each element
 * (using the XML {@code -parseInside} option) to be treated as a
 * sentence. All other tokens will be interpreted literally, and must be
 * exactly the same as tokens returned by the tokenizer.  For example,
 * you might specify "|||" and put that symbol sequence as a token between
 * sentences.
 * If no explicit sentence breaking option is chosen, sentence breaking
 * is done based on a set of language-particular sentence-ending patterns.
 * </li>
 * <LI>{@code -parseInside element} Specifies that parsing should only
 * be done for tokens inside the indicated XML-style
 * elements (done as simple pattern matching, rather than XML parsing).
 * For example, if this is specified as {@code sentence}, then
 * the text inside the {@code sentence} element
 * would be parsed.
 * Using "-parseInside s" gives you support for the input format of
 * Charniak's parser. Sentences cannot span elements. Whether the
 * contents of the element are treated as one sentence or potentially
 * multiple sentences is controlled by the {@code -sentences} flag.
 * The default is potentially multiple sentences.
 * This option gives support for extracting and parsing
 * text from very simple SGML and XML documents, and is provided as a
 * user convenience for that purpose. If you want to really parse XML
 * documents before NLP parsing them, you should use an XML parser, and then
 * call to a LexicalizedParser on appropriate CDATA.
 * <LI>{@code -tagSeparator char} Specifies to look for tags on words
 * following the word and separated from it by a special character
 * {@code char}.  For instance, many tagged corpora have the
 * representation "house/NN" and you would use {@code -tagSeparator /}.
 * Notes: This option requires that the input be pretokenized.
 * The separator has to be only a single character, and there is no
 * escaping mechanism. However, splitting is done on the <i>last</i>
 * instance of the character in the token, so that cases like
 * "3\/4/CD" are handled correctly.  The parser will in all normal
 * circumstances use the tag you provide, but will override it in the
 * case of very common words in cases where the tag that you provide
 * is not one that it regards as a possible tagging for the word.
 * The parser supports a format where only some of the words in a sentence
 * have a tag (if you are calling the parser programmatically, you indicate
 * them by having them implement the {@code HasTag} interface).
 * You can do this at the command-line by only having tags after some words,
 * but you are limited by the fact that there is no way to escape the
 * tagSeparator character.</LI>
 * <LI>{@code -maxLength leng} Specify the longest sentence that
 * will be parsed (and hence indirectly the amount of memory
 * needed for the parser). If this is not specified, the parser will
 * try to dynamically grow its parse chart when long sentences are
 * encountered, but may run out of memory trying to do so.</LI>
 * <LI>{@code -outputFormat styles} Choose the style(s) of output
 * sentences: {@code penn} for prettyprinting as in the Penn
 * treebank files, or {@code oneline} for printing sentences one
 * per line, {@code words}, {@code wordsAndTags},
 * {@code dependencies}, {@code typedDependencies},
 * or {@code typedDependenciesCollapsed}.
 * Multiple options may be specified as a comma-separated
 * list.  See TreePrint class for further documentation.</LI>
 * <LI>{@code -outputFormatOptions} Provide options that control the
 * behavior of various {@code -outputFormat} choices, such as
 * {@code lexicalize}, {@code stem}, {@code markHeadNodes},
 * or {@code xml}; see {@link edu.stanford.nlp.trees.TreePrint}.
 * Options are specified as a comma-separated list.</LI>
 * <LI>{@code -writeOutputFiles} Write output files corresponding
 * to the input files, with the same name but a {@code ".stp"}
 * file extension.  The format of these files depends on the
 * {@code outputFormat} option.  (If not specified, output is sent
 * to stdout.)</LI>
 * <LI>{@code -outputFilesExtension} The extension that is appended to
 * the filename that is being parsed to produce an output file name (with the
 * -writeOutputFiles option). The default is {@code stp}.  Don't
 * include the period.
 * <LI>{@code -outputFilesDirectory} The directory in which output
 * files are written (when the -writeOutputFiles option is specified).
 * If not specified, output files are written in the same directory as the
 * input files.
 * <LI>{@code -nthreads} Parsing files and testing on treebanks
 * can use multiple threads.  This option tells the parser how many
 * threads to use.  A negative number indicates to use as many
 * threads as the machine has cores.
 * </ul>
 * See also the package documentation for more details and examples of use.
 *
 * @param args Command line arguments, as above
 */
public static void main(String[] args) {
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank testTreebank = null;
    Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilter = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;
    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    // whether or not the input file has already been tokenized
    boolean tokenized = false;
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
        log.info("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
        return;
    }
    Options op = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encoding = null;
    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-' && !args[argIndex].equals("-")) {
        // single - represents parse from stdin
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            train = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPath = treebankDescription.first();
            trainFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-train2")) {
            // train = true;     // cdm july 2005: should require -train for this
            Triple<String, FileFilter, Double> treebankDescription = ArgUtils.getWeightedTreebankDescription(args, argIndex, "-train2");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            secondaryTreebankPath = treebankDescription.first();
            secondaryTrainFilter = treebankDescription.second();
            secondaryTreebankWeight = treebankDescription.third();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            try {
                op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).getDeclaredConstructor().newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (NoSuchMethodException e) {
                log.info("Method not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException | InvocationTargetException e) {
                log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("Illegal access" + e);
                throw new RuntimeException(e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets the encoding for TreebankLangParserParams;
            // re-applied later to override any encoding read in from a serialized parser
            encoding = args[argIndex + 1];
            op.tlpParams.setInputEncoding(encoding);
            op.tlpParams.setOutputEncoding(encoding);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
            tokenized = true;
            argIndex += 1;
        } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
            try {
                escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
            } catch (Exception e) {
                log.info("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
            tokenizerOptions = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
            tokenizerFactoryClass = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
            tokenizerMethod = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
            sentenceDelimiter = args[argIndex + 1];
            if (sentenceDelimiter.equalsIgnoreCase("newline")) {
                sentenceDelimiter = "\n";
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
            elementDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
            tagDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") || args[argIndex].equalsIgnoreCase("-model")) {
            // load the parser from a binary serialized file
            // the next argument must be the path to the parser file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
            // load the parser from declarative text file
            // the next argument must be the path to the parser file
            textInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
            saveToSerializedFile = true;
            if (ArgUtils.numSubArgs(args, argIndex) < 1) {
                log.info("Missing path: -saveToSerialized filename");
            } else {
                serializedOutputFileOrUrl = args[argIndex + 1];
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
            // save the parser to declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
            // save the training trees to a binary file
            op.trainOptions.trainTreeFile = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPath = treebankDescription.first();
            testFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tune")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            tunePath = treebankDescription.first();
            tuneFilter = treebankDescription.second();
        } else {
            int oldIndex = argIndex;
            argIndex = op.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }
    }
    if (tuneFilter != null || tunePath != null) {
        if (tunePath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No tune treebank path specified...");
            } else {
                log.info("No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');
                tunePath = treebankPath;
            }
        }
        tuneTreebank = op.tlpParams.testMemoryTreebank();
        tuneTreebank.loadPath(tunePath, tuneFilter);
    }
    if (!train && op.testOptions.verbose) {
        StringUtils.logInvocationString(log, args);
    }
    // always initialized in next if-then-else block
    LexicalizedParser lp;
    if (train) {
        StringUtils.logInvocationString(log, args);
        // so we train a parser using the treebank
        GrammarCompactor compactor = null;
        if (op.trainOptions.compactGrammar() == 3) {
            compactor = new ExactGrammarCompactor(op, false, false);
        }
        Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
        Treebank secondaryTrainTreebank = null;
        if (secondaryTreebankPath != null) {
            secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
        }
        List<List<TaggedWord>> extraTaggedWords = null;
        if (op.trainOptions.taggedFiles != null) {
            extraTaggedWords = new ArrayList<>();
            List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
            for (TaggedFileRecord record : fileRecords) {
                for (List<TaggedWord> sentence : record.reader()) {
                    extraTaggedWords.add(sentence);
                }
            }
        }
        lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords);
    } else if (textInputFileOrUrl != null) {
        // so we load the parser from a text grammar file
        lp = getParserFromTextFile(textInputFileOrUrl, op);
    } else {
        // so we load a serialized parser
        if (serializedInputFileOrUrl == null && argIndex < args.length) {
            // the next argument must be the path to the serialized parser
            serializedInputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedInputFileOrUrl == null) {
            log.info("No grammar specified, exiting...");
            return;
        }
        String[] extraArgs = new String[optionArgs.size()];
        extraArgs = optionArgs.toArray(extraArgs);
        try {
            lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
            op = lp.op;
        } catch (IllegalArgumentException e) {
            log.info("Error loading parser, exiting...");
            throw e;
        }
    }
    // set up tokenizerFactory with options if provided
    if (tokenizerFactoryClass != null || tokenizerOptions != null) {
        try {
            if (tokenizerFactoryClass != null) {
                Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
                Method factoryMethod;
                if (tokenizerOptions != null) {
                    factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
                } else {
                    factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
                }
            } else {
                // have options but no tokenizer factory.  use the parser
                // langpack's factory and set its options
                tokenizerFactory = lp.op.langpack().getTokenizerFactory();
                tokenizerFactory.setOptions(tokenizerOptions);
            }
        } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
            log.info("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
            throw new RuntimeException(e);
        }
    }
    // re-apply any user-specified encoding so it is not overwritten by one stored in a serialized parser
    if (encoding != null) {
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
    }
    if (testFilter != null || testPath != null) {
        if (testPath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPath + '\"');
                testPath = treebankPath;
            }
        }
        testTreebank = op.tlpParams.testMemoryTreebank();
        testTreebank.loadPath(testPath, testFilter);
    }
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    // Now what do we do with the parser we've made
    if (saveToTextFile) {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null) {
            lp.saveParserToTextFile(textOutputFileOrUrl);
        } else {
            log.info("Usage: must specify a text grammar output path");
        }
    }
    if (saveToSerializedFile) {
        if (serializedOutputFileOrUrl != null) {
            lp.saveParserToSerialized(serializedOutputFileOrUrl);
        } else if (textOutputFileOrUrl == null && testTreebank == null) {
            // no saving/parsing request has been specified
            log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
        }
    }
    if (op.testOptions.verbose || train) {
        // Tell the user a little or a lot about what we have made
        // get lexicon size separately as it may have its own prints in it....
        String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";
        log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
        log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t' + (lp.ug != null ? lp.ug.numRules() : "") + '\t' + (lp.bg != null ? lp.bg.numRules() : "") + '\t' + lexNumRules);
        log.info("ParserPack is " + op.tlpParams.getClass().getName());
        log.info("Lexicon is " + lp.lex.getClass().getName());
        if (op.testOptions.verbose) {
            log.info("Tags are: " + lp.tagIndex);
        // log.info("States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out by the below printOptions command if the flag -printStates is given (at training time)!
        }
        printOptions(false, op);
    }
    if (testTreebank != null) {
        // test parser on treebank
        EvaluateTreebank evaluator = new EvaluateTreebank(lp);
        evaluator.testOnTreebank(testTreebank);
    } else if (argIndex >= args.length) {
        // no more arguments, so we just parse our own test sentence
        PrintWriter pwOut = op.tlpParams.pw();
        PrintWriter pwErr = op.tlpParams.pw(System.err);
        ParserQuery pq = lp.parserQuery();
        if (pq.parse(op.tlpParams.defaultTestSentence())) {
            lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
        } else {
            pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
        }
    } else {
        // We parse filenames given by the remaining arguments
        ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp);
    }
}
Also used: EvaluateTreebank(edu.stanford.nlp.parser.metrics.EvaluateTreebank) TaggedFileRecord(edu.stanford.nlp.tagger.io.TaggedFileRecord) Pair(edu.stanford.nlp.util.Pair) HasWord(edu.stanford.nlp.ling.HasWord) TokenizerFactory(edu.stanford.nlp.process.TokenizerFactory) Method(java.lang.reflect.Method) InvocationTargetException(java.lang.reflect.InvocationTargetException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) Triple(edu.stanford.nlp.util.Triple) TaggedWord(edu.stanford.nlp.ling.TaggedWord) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery)
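
The command-line walkthrough above ends by parsing a hardwired sentence when no files are supplied; the same path can be taken programmatically. Below is a minimal sketch under the assumption that a serialized grammar is available at a placeholder path; every call used here (loadModel, getOp, parserQuery, getTreePrint) appears in the method above.

import java.io.PrintWriter;

import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

public class ParseDefaultSentenceSketch {
    public static void main(String[] args) {
        // Placeholder path; substitute a real serialized grammar.
        LexicalizedParser lp = LexicalizedParser.loadModel("/path/to/englishPCFG.ser.gz");
        PrintWriter pwOut = lp.getOp().tlpParams.pw();
        ParserQuery pq = lp.parserQuery();
        // parse() returns false when no parse could be found.
        if (pq.parse(lp.getOp().tlpParams.defaultTestSentence())) {
            lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
        } else {
            System.err.println("Could not parse the default test sentence");
        }
    }
}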

Example 8 with EvaluateTreebank

Use of edu.stanford.nlp.parser.metrics.EvaluateTreebank in project CoreNLP by stanfordnlp.

From the class CombineDVModels, method main.

public static void main(String[] args) throws IOException, ClassNotFoundException {
    String modelPath = null;
    List<String> baseModelPaths = null;
    String testTreebankPath = null;
    FileFilter testTreebankFilter = null;
    List<String> unusedArgs = new ArrayList<>();
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-model")) {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.first();
            testTreebankFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-baseModels")) {
            argIndex++;
            baseModelPaths = new ArrayList<>();
            while (argIndex < args.length && args[argIndex].charAt(0) != '-') {
                baseModelPaths.add(args[argIndex++]);
            }
            if (baseModelPaths.size() == 0) {
                throw new IllegalArgumentException("Found an argument -baseModels with no actual models named");
            }
        } else {
            unusedArgs.add(args[argIndex++]);
        }
    }
    String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
    LexicalizedParser underlyingParser = null;
    Options options = null;
    LexicalizedParser combinedParser = null;
    if (baseModelPaths != null) {
        List<DVModel> dvparsers = new ArrayList<>();
        for (String baseModelPath : baseModelPaths) {
            log.info("Loading serialized DVParser from " + baseModelPath);
            LexicalizedParser dvparser = LexicalizedParser.loadModel(baseModelPath);
            Reranker reranker = dvparser.reranker;
            if (!(reranker instanceof DVModelReranker)) {
                throw new IllegalArgumentException("Expected parsers with DVModel embedded");
            }
            dvparsers.add(((DVModelReranker) reranker).getModel());
            if (underlyingParser == null) {
                underlyingParser = dvparser;
                options = underlyingParser.getOp();
                // TODO: other parser's options?
                options.setOptions(newArgs);
            }
            log.info("... done");
        }
        combinedParser = LexicalizedParser.copyLexicalizedParser(underlyingParser);
        CombinedDVModelReranker reranker = new CombinedDVModelReranker(options, dvparsers);
        combinedParser.reranker = reranker;
        combinedParser.saveParserToSerialized(modelPath);
    } else {
        throw new IllegalArgumentException("Need to specify -model to load an already prepared CombinedParser");
    }
    Treebank testTreebank = null;
    if (testTreebankPath != null) {
        log.info("Reading in trees from " + testTreebankPath);
        if (testTreebankFilter != null) {
            log.info("Filtering on " + testTreebankFilter);
        }
        testTreebank = combinedParser.getOp().tlpParams.memoryTreebank();
        testTreebank.loadPath(testTreebankPath, testTreebankFilter);
        log.info("Read in " + testTreebank.size() + " trees for testing");
        EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.getOp(), null, combinedParser);
        evaluator.testOnTreebank(testTreebank);
    }
}
Also used: Options(edu.stanford.nlp.parser.lexparser.Options) Reranker(edu.stanford.nlp.parser.lexparser.Reranker) Treebank(edu.stanford.nlp.trees.Treebank) EvaluateTreebank(edu.stanford.nlp.parser.metrics.EvaluateTreebank) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) ArrayList(java.util.ArrayList) FileFilter(java.io.FileFilter) Pair(edu.stanford.nlp.util.Pair)
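
When scripting evaluations like the three above, it can be handy that testOnTreebank returns a score rather than only logging it; in the CoreNLP sources we have seen, the return value is the labeled F1 as a double, but treat that as an assumption to verify against your version. A short sketch with placeholder paths:

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.metrics.EvaluateTreebank;
import edu.stanford.nlp.trees.Treebank;

public class CaptureEvalScoreSketch {
    public static void main(String[] args) {
        // Placeholder paths; substitute your own model and test treebank.
        LexicalizedParser parser = LexicalizedParser.loadModel("/path/to/model.ser.gz");
        Treebank testTreebank = parser.getOp().tlpParams.memoryTreebank();
        testTreebank.loadPath("/path/to/test-treebank");
        // The single-argument constructor is the one used in Example 7.
        EvaluateTreebank evaluator = new EvaluateTreebank(parser);
        // Assumption: testOnTreebank returns the labeled F1 as a double.
        double f1 = evaluator.testOnTreebank(testTreebank);
        System.out.println("Labeled F1: " + f1);
    }
}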

Aggregations

EvaluateTreebank (edu.stanford.nlp.parser.metrics.EvaluateTreebank): 8
Pair (edu.stanford.nlp.util.Pair): 5
Treebank (edu.stanford.nlp.trees.Treebank): 4
FileFilter (java.io.FileFilter): 4
ArrayList (java.util.ArrayList): 4
LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser): 3
Tree (edu.stanford.nlp.trees.Tree): 2
Timing (edu.stanford.nlp.util.Timing): 2
NumberRangeFileFilter (edu.stanford.nlp.io.NumberRangeFileFilter): 1
RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException): 1
HasWord (edu.stanford.nlp.ling.HasWord): 1
TaggedWord (edu.stanford.nlp.ling.TaggedWord): 1
ParserQuery (edu.stanford.nlp.parser.common.ParserQuery): 1
Options (edu.stanford.nlp.parser.lexparser.Options): 1
Reranker (edu.stanford.nlp.parser.lexparser.Reranker): 1
TokenizerFactory (edu.stanford.nlp.process.TokenizerFactory): 1
TaggedFileRecord (edu.stanford.nlp.tagger.io.TaggedFileRecord): 1
CompositeTreeTransformer (edu.stanford.nlp.trees.CompositeTreeTransformer): 1
TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 1
Triple (edu.stanford.nlp.util.Triple): 1