Search in sources :

Example 6 with ParserQuery

use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.

the class ParseAndPrintMatrices method main.

/**
 * Parses every sentence of an input file and dumps the word vectors and tree
 * node matrices of the best deep tree, one output file per sentence.
 *
 * <p>Recognized flags (matched case-insensitively):
 * <ul>
 *   <li>{@code -model path} the parser model to load</li>
 *   <li>{@code -output path} directory to create; it must not already exist</li>
 *   <li>{@code -input path} text file whose sentences are parsed</li>
 *   <li>{@code -testTreebank ...} consumed but not otherwise used here</li>
 * </ul>
 * All other arguments are forwarded to {@code LexicalizedParser.loadModel}.
 *
 * <p>For sentence number {@code n} (counted from 1) the file
 * {@code sentence<n>.txt} inside the output directory receives the sentence
 * text, the parse tree, one matrix per word and then the tree matrices.
 *
 * @param args command line arguments, as described above
 * @throws IOException if the input cannot be read or an output file cannot be written
 * @throws IllegalArgumentException if {@code -output} is missing, or if the loaded
 *         model does not produce a reranking query backed by a DVModelReranker
 * @throws RuntimeException if a sentence cannot be parsed
 */
public static void main(String[] args) throws IOException {
    String modelPath = null;
    String outputPath = null;
    String inputPath = null;
    // The treebank description is parsed so its arguments are consumed, but
    // nothing below reads these two values.
    String testTreebankPath = null;
    FileFilter testTreebankFilter = null;
    List<String> unusedArgs = Generics.newArrayList();
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-model")) {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-output")) {
            outputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-input")) {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.first();
            testTreebankFilter = treebankDescription.second();
        } else {
            unusedArgs.add(args[argIndex++]);
        }
    }
    // Fail before the (expensive) model load with a readable message instead
    // of a bare NullPointerException from new File(null) afterwards.
    if (outputPath == null) {
        throw new IllegalArgumentException("An output directory must be specified with -output");
    }
    String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
    LexicalizedParser parser = LexicalizedParser.loadModel(modelPath, newArgs);
    DVModel model = DVParser.getModelFromLexicalizedParser(parser);
    File outputFile = new File(outputPath);
    FileSystem.checkNotExistsOrFail(outputFile);
    FileSystem.mkdirOrFail(outputFile);
    if (inputPath == null) {
        return;
    }
    int count = 0;
    // try-with-resources guarantees the input reader is released even when a
    // sentence fails to parse or an output file cannot be written.
    try (Reader input = new BufferedReader(new FileReader(inputPath))) {
        DocumentPreprocessor processor = new DocumentPreprocessor(input);
        for (List<HasWord> sentence : processor) {
            // index from 1
            count++;
            ParserQuery pq = parser.parserQuery();
            if (!(pq instanceof RerankingParserQuery)) {
                throw new IllegalArgumentException("Expected a RerankingParserQuery");
            }
            RerankingParserQuery rpq = (RerankingParserQuery) pq;
            if (!rpq.parse(sentence)) {
                throw new RuntimeException("Unparsable sentence: " + sentence);
            }
            RerankerQuery reranker = rpq.rerankerQuery();
            if (!(reranker instanceof DVModelReranker.Query)) {
                throw new IllegalArgumentException("Expected a DVModelReranker");
            }
            DeepTree deepTree = ((DVModelReranker.Query) reranker).getDeepTrees().get(0);
            IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.getVectors();
            for (Map.Entry<Tree, SimpleMatrix> entry : vectors.entrySet()) {
                log.info(entry.getKey() + "   " + entry.getValue());
            }
            String sentenceFile = outputPath + File.separator + "sentence" + count + ".txt";
            // Closing the BufferedWriter flushes it and closes the underlying
            // FileWriter, on both the normal and the exceptional path.
            try (BufferedWriter bout = new BufferedWriter(new FileWriter(sentenceFile))) {
                bout.write(SentenceUtils.listToString(sentence));
                bout.newLine();
                bout.write(deepTree.getTree().toString());
                bout.newLine();
                for (HasWord word : sentence) {
                    outputMatrix(bout, model.getWordVector(word.word()));
                }
                Tree rootTree = findRootTree(vectors);
                outputTreeMatrices(bout, rootTree, vectors);
            }
        }
    }
}
Also used : RerankerQuery(edu.stanford.nlp.parser.lexparser.RerankerQuery) RerankingParserQuery(edu.stanford.nlp.parser.lexparser.RerankingParserQuery) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) FileWriter(java.io.FileWriter) Reader(java.io.Reader) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) BufferedWriter(java.io.BufferedWriter) SimpleMatrix(org.ejml.simple.SimpleMatrix) DeepTree(edu.stanford.nlp.trees.DeepTree) Tree(edu.stanford.nlp.trees.Tree) FileReader(java.io.FileReader) DeepTree(edu.stanford.nlp.trees.DeepTree) FileFilter(java.io.FileFilter) RerankingParserQuery(edu.stanford.nlp.parser.lexparser.RerankingParserQuery) Pair(edu.stanford.nlp.util.Pair) HasWord(edu.stanford.nlp.ling.HasWord) RerankerQuery(edu.stanford.nlp.parser.lexparser.RerankerQuery) BufferedReader(java.io.BufferedReader) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) File(java.io.File) Map(java.util.Map) IdentityHashMap(java.util.IdentityHashMap) RerankingParserQuery(edu.stanford.nlp.parser.lexparser.RerankingParserQuery) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery)

Example 7 with ParserQuery

use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.

the class EvaluateTreebank method testOnTreebank.

/**
 * Test the parser on a treebank. Parses will be written to stdout, and
 * various other information will be written to stderr and stdout,
 * particularly if <code>op.testOptions.verbose</code> is true.
 *
 * <p>When <code>op.testOptions.quietEvaluation</code> is set, per-sentence
 * output goes to a {@link NullOutputStream}; the final summary is still
 * written to stderr. Sentences are parsed through a {@link MulticoreWrapper}
 * unless <code>op.testOptions.testingThreads</code> is exactly 1.
 *
 * @param testTreebank The treebank to parse
 * @return The labeled precision/recall F<sub>1</sub> (EVALB measure)
 *         of the parser on the treebank, taken from <code>factLB</code>;
 *         0.0 if <code>factLB</code> is null.
 */
public double testOnTreebank(Treebank testTreebank) {
    log.info("Testing on treebank");
    Timing treebankTotalTimer = new Timing();
    TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = op.langpack();
    // pwOut receives parses, pwErr receives progress and evaluation output.
    PrintWriter pwOut, pwErr;
    if (op.testOptions.quietEvaluation) {
        // Both writers discard their output while sentences are being parsed.
        NullOutputStream quiet = new NullOutputStream();
        pwOut = tlpParams.pw(quiet);
        pwErr = tlpParams.pw(quiet);
    } else {
        pwOut = tlpParams.pw();
        pwErr = tlpParams.pw(System.err);
    }
    if (op.testOptions.verbose) {
        pwErr.print("Testing ");
        pwErr.println(testTreebank.textualSummary(tlp));
    }
    if (op.testOptions.evalb) {
        EvalbFormatWriter.initEVALBfiles(tlpParams);
    }
    // Optional file for parser output; stays null if disabled or if the file cannot be opened.
    PrintWriter pwFileOut = null;
    if (op.testOptions.writeOutputFiles) {
        String fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
        try {
            pwFileOut = op.tlpParams.pw(new FileOutputStream(fname));
        } catch (IOException ioe) {
            // Best effort: the failure is reported and evaluation continues without this file.
            ioe.printStackTrace();
        }
    }
    // Optional file for k-best equivocation statistics; same best-effort handling as above.
    PrintWriter pwStats = null;
    if (op.testOptions.outputkBestEquivocation != null) {
        try {
            pwStats = op.tlpParams.pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
    if (op.testOptions.testingThreads != 1) {
        // Multithreaded path: sentences are handed to the wrapper and finished
        // queries are drained as they become available.
        MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
        // Gold trees are queued in submission order and polled once per result.
        // NOTE(review): this pairing assumes the wrapper returns results in
        // the order the sentences were put - confirm against MulticoreWrapper.
        LinkedList<Tree> goldTrees = new LinkedList<>();
        for (Tree goldTree : testTreebank) {
            List<? extends HasWord> sentence = getInputSentence(goldTree);
            goldTrees.add(goldTree);
            pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
            wrapper.put(sentence);
            while (wrapper.peek()) {
                ParserQuery pq = wrapper.poll();
                // Reuses the loop variable for the gold tree matching the polled result.
                goldTree = goldTrees.poll();
                processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
            }
        }
        // All sentences submitted: wait for the workers, then drain what is left.
        wrapper.join();
        while (wrapper.peek()) {
            ParserQuery pq = wrapper.poll();
            Tree goldTree = goldTrees.poll();
            processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
        }
    } else {
        // Single-threaded path: one ParserQuery is reused for every tree.
        ParserQuery pq = pqFactory.parserQuery();
        for (Tree goldTree : testTreebank) {
            final List<CoreLabel> sentence = getInputSentence(goldTree);
            pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
            pq.parseAndReport(sentence, pwErr);
            processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
        }
    }
    // Done parsing... print the results of the evaluations
    treebankTotalTimer.done("Testing on treebank");
    if (op.testOptions.quietEvaluation) {
        // Switch back to stderr so the summary below is visible even in quiet mode.
        pwErr = tlpParams.pw(System.err);
    }
    if (saidMemMessage) {
        ParserUtils.printOutOfMemory(pwErr);
    }
    if (op.testOptions.evalb) {
        EvalbFormatWriter.closeEVALBfiles();
    }
    if (numSkippedEvals != 0) {
        pwErr.printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
    }
    // only created here so we know what parser types are supported...
    ParserQuery pq = pqFactory.parserQuery();
    if (summary) {
        // Each evaluator is displayed only if it was created; the log-likelihood
        // evaluators additionally require the corresponding sub-parser to exist.
        if (pcfgLB != null)
            pcfgLB.display(false, pwErr);
        if (pcfgChildSpecific != null)
            pcfgChildSpecific.display(false, pwErr);
        if (pcfgLA != null)
            pcfgLA.display(false, pwErr);
        if (pcfgCB != null)
            pcfgCB.display(false, pwErr);
        if (pcfgDA != null)
            pcfgDA.display(false, pwErr);
        if (pcfgTA != null)
            pcfgTA.display(false, pwErr);
        if (pcfgLL != null && pq.getPCFGParser() != null)
            pcfgLL.display(false, pwErr);
        if (depDA != null)
            depDA.display(false, pwErr);
        if (depTA != null)
            depTA.display(false, pwErr);
        if (depLL != null && pq.getDependencyParser() != null)
            depLL.display(false, pwErr);
        if (factLB != null)
            factLB.display(false, pwErr);
        if (factChildSpecific != null)
            factChildSpecific.display(false, pwErr);
        if (factLA != null)
            factLA.display(false, pwErr);
        if (factCB != null)
            factCB.display(false, pwErr);
        if (factDA != null)
            factDA.display(false, pwErr);
        if (factTA != null)
            factTA.display(false, pwErr);
        if (factLL != null && pq.getFactoredParser() != null)
            factLL.display(false, pwErr);
        if (pcfgCatE != null)
            pcfgCatE.display(false, pwErr);
        for (Eval eval : evals) {
            eval.display(false, pwErr);
        }
        for (BestOfTopKEval eval : topKEvals) {
            eval.display(false, pwErr);
        }
    }
    // these ones only have a display mode, so display if turned on!!
    if (pcfgRUO != null)
        pcfgRUO.display(true, pwErr);
    if (pcfgCUO != null)
        pcfgCUO.display(true, pwErr);
    if (tsv) {
        // One header row and one data row, tab separated. A tab is always
        // printed after each cell, so an unavailable value leaves the cell empty.
        NumberFormat nf = new DecimalFormat("0.00");
        pwErr.println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
        if (factLB != null)
            pwErr.print(nf.format(factLB.getEvalbF1Percent()));
        pwErr.print("\t");
        if (pq.getDependencyParser() != null && factDA != null)
            pwErr.print(nf.format(factDA.getEvalbF1Percent()));
        pwErr.print("\t");
        if (factLB != null)
            pwErr.print(nf.format(factLB.getExactPercent()));
        pwErr.print("\t");
        if (pcfgLB != null)
            pwErr.print(nf.format(pcfgLB.getEvalbF1Percent()));
        pwErr.print("\t");
        if (pq.getDependencyParser() != null && depDA != null)
            pwErr.print(nf.format(depDA.getEvalbF1Percent()));
        pwErr.print("\t");
        if (pq.getPCFGParser() != null && factTA != null)
            pwErr.print(nf.format(factTA.getEvalbF1Percent()));
        pwErr.print("\t");
        if (factLB != null)
            pwErr.print(factLB.getNum());
        pwErr.println();
    }
    // The return value defaults to 0.0 when no factLB evaluator exists.
    double f1 = 0.0;
    if (factLB != null) {
        f1 = factLB.getEvalbF1();
    }
    // Close files (if necessary)
    if (pwFileOut != null)
        pwFileOut.close();
    if (pwStats != null)
        pwStats.close();
    if (parserQueryEvals != null) {
        for (ParserQueryEval parserQueryEval : parserQueryEvals) {
            parserQueryEval.display(false, pwErr);
        }
    }
    return f1;
}
Also used : DecimalFormat(java.text.DecimalFormat) TreePrint(edu.stanford.nlp.trees.TreePrint) Tree(edu.stanford.nlp.trees.Tree) TreebankLanguagePack(edu.stanford.nlp.trees.TreebankLanguagePack) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) LeafAncestorEval(edu.stanford.nlp.parser.metrics.LeafAncestorEval) AbstractEval(edu.stanford.nlp.parser.metrics.AbstractEval) TaggingEval(edu.stanford.nlp.parser.metrics.TaggingEval) TopMatchEval(edu.stanford.nlp.parser.metrics.TopMatchEval) FilteredEval(edu.stanford.nlp.parser.metrics.FilteredEval) Eval(edu.stanford.nlp.parser.metrics.Eval) UnlabeledAttachmentEval(edu.stanford.nlp.parser.metrics.UnlabeledAttachmentEval) BestOfTopKEval(edu.stanford.nlp.parser.metrics.BestOfTopKEval) ParserQueryEval(edu.stanford.nlp.parser.metrics.ParserQueryEval) PrintWriter(java.io.PrintWriter) MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) IOException(java.io.IOException) ParserQueryEval(edu.stanford.nlp.parser.metrics.ParserQueryEval) LinkedList(java.util.LinkedList) ParsingThreadsafeProcessor(edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor) FileOutputStream(java.io.FileOutputStream) Timing(edu.stanford.nlp.util.Timing) BestOfTopKEval(edu.stanford.nlp.parser.metrics.BestOfTopKEval) NullOutputStream(edu.stanford.nlp.io.NullOutputStream) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) NumberFormat(java.text.NumberFormat)

Example 8 with ParserQuery

use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.

the class LexicalizedParserITest method testConstraints.

/**
 * Checks that a constraint is honoured by the parser: requiring an INTJ
 * bracket over positions 0 to 2 of the sample sentence must produce a tree
 * that contains INTJ and differs from the original output.
 */
@Test
public void testConstraints() {
    // Whitespace-normalized form of the original output for the sample sentence.
    final String originalParse = "(ROOT (S (NP (PRP$ My) (NN dog)) (ADVP (RB also)) (VP (VBZ likes) (S (VP (VBG eating) (NP (NN sausage))))) (. .)))".replaceAll("\\s+", " ").trim();

    List<CoreLabel> words = sampleSausage();
    ParserQuery query = englishParser.parserQuery();
    List<ParserConstraint> forcedBrackets = new ArrayList<>();
    forcedBrackets.add(new ParserConstraint(0, 2, "INTJ"));
    query.setConstraints(forcedBrackets);
    query.parse(words);

    StringWriter buffer = new StringWriter();
    pennPrint.printTree(query.getBestParse(), new PrintWriter(buffer));
    String constrainedParse = buffer.toString().replaceAll("\\s+", " ").trim();

    // The exact constrained tree is not pinned down; it only has to change
    // and to carry the requested label.
    assertFalse("Tree should not match the original tree any more", originalParse.equals(constrainedParse));
    assertTrue("Tree should be forced to contain INTJ", constrainedParse.contains("INTJ"));
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) StringWriter(java.io.StringWriter) ArrayList(java.util.ArrayList) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) PrintWriter(java.io.PrintWriter) Test(org.junit.Test)

Example 9 with ParserQuery

use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.

the class ShiftReduceParserITest method testBasicConstraint.

/**
 * Parses a short tagged sentence without constraints and then once per
 * constraint in a small table, comparing each best parse with a fixed
 * expected bracketing.
 */
@Test
public void testBasicConstraint() {
    List<CoreLabel> words = SentenceUtils.toCoreLabelList("It", "was", "Carolina", "Reapers", ".");
    englishTagger.tagCoreLabels(words);

    // pretty much need to make the test rely on the parser being consistent
    final String unconstrained = "(ROOT (S (NP (PRP It)) (VP (VBD was) (NP (NNP Carolina) (NNPS Reapers))) (. .)))";
    assertEquals(unconstrained, englishParser.apply(words).toString());

    // Row i of each table describes one constrained parse. The first two
    // constraints leave the unconstrained tree unchanged. The last two
    // introduce brackets which don't exist, so the exact trees may get weird
    // as models change in the future; the important thing is that the ADJP
    // bracket appears in the third case and the VP bracket in the fourth.
    int[][] spans = { { 2, 4 }, { 2, 4 }, { 2, 4 }, { 1, 3 } };
    String[] labels = { ".*", "NP", "ADJP", "VP" };
    String[] expectedParses = {
        unconstrained,
        unconstrained,
        "(ROOT (S (NP (PRP It)) (VP (VBD was) (ADJP (NP (NNP Carolina) (NNPS Reapers)))) (. .)))",
        "(ROOT (S (NP (PRP It)) (VP (VBD was) (ADJP (NNP Carolina))) (NP (NNPS Reapers)) (. .)))"
    };

    for (int i = 0; i < labels.length; i++) {
        ParserConstraint bracket = new ParserConstraint(spans[i][0], spans[i][1], labels[i]);
        List<ParserConstraint> bracketList = Collections.singletonList(bracket);
        // A fresh query is requested for every constraint.
        ParserQuery query = englishParser.parserQuery();
        query.setConstraints(bracketList);
        assertTrue(query.parse(words));
        assertEquals(expectedParses[i], query.getBestParse().toString());
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) Tree(edu.stanford.nlp.trees.Tree) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) Test(org.junit.Test)

Example 10 with ParserQuery

use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.

the class LexicalizedParser method main.

/**
 * A main program for using the parser with various options.
 * This program can be used for building and serializing
 * a parser from treebank data, for parsing sentences from a file
 * or URL using a serialized or text grammar parser,
 * and (mainly for parser quality testing)
 * for training and testing a parser on a treebank all in one go.
 *
 * <p>
 * Sample Usages:
 * <ul>
 *   <li> <b>Train a parser (saved to <i>serializedGrammarFilename</i>)
 *      from a directory of trees (<i>trainFilesPath</i>, with an optional <i>fileRange</i>, e.g., 0-1000):</b>
 *    {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -saveToSerializedFile serializedGrammarFilename}
 *   </li>
 *
 *   <li> <b>Train a parser (not saved) from a directory of trees, and test it (reporting scores) on a directory of trees</b>
 *    {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -testTreebank testFilePath [fileRange] }
 *   </li>
 *
 *   <li> <b>Parse one or more files, given a serialized grammar and a list of files</b>
 *    {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] serializedGrammarPath filename [filename]*}
 *   </li>
 *
 *   <li> <b>Test and report scores for a serialized grammar on trees in an output directory</b>
 *    {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -loadFromSerializedFile serializedGrammarPath -testTreebank testFilePath [fileRange]}
 *   </li>
 * </ul>
 *
 *<p>
 * If the {@code serializedGrammarPath} ends in {@code .gz},
 * then the grammar is written and read as a compressed file (GZip).
 * If the {@code serializedGrammarPath} is a URL, starting with
 * {@code http://}, then the parser is read from the URL.
 * A fileRange specifies a numeric value that must be included within a
 * filename for it to be used in training or testing (this works well with
 * most current treebanks).  It can be specified like a range of pages to be
 * printed, for instance as {@code 200-2199} or
 * {@code 1-300,500-725,9000} or just as {@code 1} (if all your
 * trees are in a single file, either omit this parameter or just give a dummy
 * argument such as {@code 0}).
 * If the filename to parse is "-" then the parser parses from stdin.
 * If no files are supplied to parse, then a hardwired sentence
 * is parsed.
 *
 * <p>
 * The parser can write a grammar as either a serialized Java object file
 * or in a text format (or as both), specified with the following options:
 * <blockquote>{@code
 * java edu.stanford.nlp.parser.lexparser.LexicalizedParser
 * [-v] -train
 * trainFilesPath [fileRange] [-saveToSerializedFile grammarPath]
 * [-saveToTextFile grammarPath]
 * }</blockquote>
 *
 * <p>
 * In the same position as the verbose flag ({@code -v}), many other
 * options can be specified.  The most useful to an end user are:
 * <ul>
 * <LI>{@code -tLPP class} Specify a different
 * TreebankLangParserParams, for when using a different language or
 * treebank (the default is English Penn Treebank). <i>This option MUST occur
 * before any other language-specific options that are used (or else they
 * are ignored!).</i>
 * (It's usually a good idea to specify this option even when loading a
 * serialized grammar; it is necessary if the language pack specifies a
 * needed character encoding or you wish to specify language-specific
 * options on the command line.)</LI>
 * <LI>{@code -encoding charset} Specify the character encoding of the
 * input and output files.  This will override the value in the
 * {@code TreebankLangParserParams}, provided this option appears
 * <i>after</i> any {@code -tLPP} option.</LI>
 * <LI>{@code -tokenized} Says that the input is already separated
 * into whitespace-delimited tokens.  If this option is specified, any
 * tokenizer specified for the language is ignored, and a universal (Unicode)
 * tokenizer, which divides only on whitespace, is used.
 * Unless you also specify
 * {@code -escaper}, the tokens <i>must</i> all be correctly
 * tokenized tokens of the appropriate treebank for the parser to work
 * well (for instance, if using the Penn English Treebank, you must have
 * coded "(" as "-LRB-", etc.). (Note: we do not use the backslash escaping
 * in front of / and * that appeared in Penn Treebank releases through 1999.)</li>
 * <li>{@code -escaper class} Specify a class of type
 * {@link Function}&lt;List&lt;HasWord&gt;,List&lt;HasWord&gt;&gt; to do
 * customized escaping of tokenized text.  This class will be run over the
 * tokenized text and can fix the representation of tokens. For instance,
 * it could change "(" to "-LRB-" for the Penn English Treebank.  A
 * provided escaper that does such things for the Penn English Treebank is
 * {@code edu.stanford.nlp.process.PTBEscapingProcessor}.</li>
 * <li>{@code -tokenizerFactory class} Specifies a
 * TokenizerFactory class to be used for tokenization</li>
 * <li>{@code -tokenizerOptions options} Specifies options to a
 * TokenizerFactory class to be used for tokenization.   A comma-separated
 * list. For PTBTokenizer, options of interest include
 * {@code americanize=false} and {@code quotes=ascii} (for German).
 * Note that any choice of tokenizer options that conflicts with the
 * tokenization used in the parser training data will likely degrade parser
 * performance. </li>
 * <li>{@code -sentences token } Specifies a token that marks sentence
 * boundaries.  A value of {@code newline} causes sentence breaking on
 * newlines.  A value of {@code onePerElement} causes each element
 * (using the XML {@code -parseInside} option) to be treated as a
 * sentence. All other tokens will be interpreted literally, and must be
 * exactly the same as tokens returned by the tokenizer.  For example,
 * you might specify "|||" and put that symbol sequence as a token between
 * sentences.
 * If no explicit sentence breaking option is chosen, sentence breaking
 * is done based on a set of language-particular sentence-ending patterns.
 * </li>
 * <LI>{@code -parseInside element} Specifies that parsing should only
 * be done for tokens inside the indicated XML-style
 * elements (done as simple pattern matching, rather than XML parsing).
 * For example, if this is specified as {@code sentence}, then
 * the text inside the {@code sentence} element
 * would be parsed.
 * Using "-parseInside s" gives you support for the input format of
 * Charniak's parser. Sentences cannot span elements. Whether the
 * contents of the element are treated as one sentence or potentially
 * multiple sentences is controlled by the {@code -sentences} flag.
 * The default is potentially multiple sentences.
 * This option gives support for extracting and parsing
 * text from very simple SGML and XML documents, and is provided as a
 * user convenience for that purpose. If you want to really parse XML
 * documents before NLP parsing them, you should use an XML parser, and then
 * call to a LexicalizedParser on appropriate CDATA.
 * <LI>{@code -tagSeparator char} Specifies to look for tags on words
 * following the word and separated from it by a special character
 * {@code char}.  For instance, many tagged corpora have the
 * representation "house/NN" and you would use {@code -tagSeparator /}.
 * Notes: This option requires that the input be pretokenized.
 * The separator has to be only a single character, and there is no
 * escaping mechanism. However, splitting is done on the <i>last</i>
 * instance of the character in the token, so that cases like
 * "3\/4/CD" are handled correctly.  The parser will in all normal
 * circumstances use the tag you provide, but will override it in the
 * case of very common words in cases where the tag that you provide
 * is not one that it regards as a possible tagging for the word.
 * The parser supports a format where only some of the words in a sentence
 * have a tag (if you are calling the parser programmatically, you indicate
 * them by having them implement the {@code HasTag} interface).
 * You can do this at the command-line by only having tags after some words,
 * but you are limited by the fact that there is no way to escape the
 * tagSeparator character.</LI>
 * <LI>{@code -maxLength leng} Specify the longest sentence that
 * will be parsed (and hence indirectly the amount of memory
 * needed for the parser). If this is not specified, the parser will
 * try to dynamically grow its parse chart when long sentences are
 * encountered, but may run out of memory trying to do so.</LI>
 * <LI>{@code -outputFormat styles} Choose the style(s) of output
 * sentences: {@code penn} for prettyprinting as in the Penn
 * treebank files, or {@code oneline} for printing sentences one
 * per line, {@code words}, {@code wordsAndTags},
 * {@code dependencies}, {@code typedDependencies},
 * or {@code typedDependenciesCollapsed}.
 * Multiple options may be specified as a comma-separated
 * list.  See TreePrint class for further documentation.</LI>
 * <LI>{@code -outputFormatOptions} Provide options that control the
 * behavior of various {@code -outputFormat} choices, such as
 * {@code lexicalize}, {@code stem}, {@code markHeadNodes},
 * or {@code xml}.  {@link edu.stanford.nlp.trees.TreePrint}
 * Options are specified as a comma-separated list.</LI>
 * <LI>{@code -writeOutputFiles} Write output files corresponding
 * to the input files, with the same name but a {@code ".stp"}
 * file extension.  The format of these files depends on the
 * {@code outputFormat} option.  (If not specified, output is sent
 * to stdout.)</LI>
 * <LI>{@code -outputFilesExtension} The extension that is appended to
 * the filename that is being parsed to produce an output file name (with the
 * -writeOutputFiles option). The default is {@code stp}.  Don't
 * include the period.
 * <LI>{@code -outputFilesDirectory} The directory in which output
 * files are written (when the -writeOutputFiles option is specified).
 * If not specified, output files are written in the same directory as the
 * input files.
 * <LI>{@code -nthreads} Parsing files and testing on treebanks
 * can use multiple threads.  This option tells the parser how many
 * threads to use.  A negative number indicates to use as many
 * threads as the machine has cores.
 * </ul>
 * See also the package documentation for more details and examples of use.
 *
 * @param args Command line arguments, as above
 */
public static void main(String[] args) {
    // What to do with the parser: train it and/or save it in either format.
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    // Where the parser is loaded from / saved to.
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    // Train / test / tune treebank locations and their file filters.
    String treebankPath = null;
    Treebank testTreebank = null;
    Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilter = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;
    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    // whether or not the input file has already been tokenized
    boolean tokenized = false;
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
        log.info("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
        return;
    }
    Options op = new Options();
    // Options not handled explicitly below; they are replayed when a serialized model is loaded.
    List<String> optionArgs = new ArrayList<>();
    String encoding = null;
    // Consume the leading option arguments.  A single "-" is not an option (it
    // represents parse from stdin).  startsWith is used instead of charAt(0) so
    // that an empty argument ends option parsing rather than throwing.
    while (argIndex < args.length && args[argIndex].startsWith("-") && !args[argIndex].equals("-")) {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            train = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
            // skip the flag plus however many sub-arguments follow it
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPath = treebankDescription.first();
            trainFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-train2")) {
            // Deliberately does not set train: -train is required as well (cdm july 2005)
            Triple<String, FileFilter, Double> treebankDescription = ArgUtils.getWeightedTreebankDescription(args, argIndex, "-train2");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            secondaryTreebankPath = treebankDescription.first();
            secondaryTrainFilter = treebankDescription.second();
            secondaryTreebankWeight = treebankDescription.third();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            // instantiate the named TreebankLangParserParams class through its no-arg constructor
            try {
                op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).getDeclaredConstructor().newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (NoSuchMethodException e) {
                log.info("Method not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException | InvocationTargetException e) {
                log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("Illegal access" + e);
                throw new RuntimeException(e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            // redone later to override any serialized parser one read in
            encoding = args[argIndex + 1];
            op.tlpParams.setInputEncoding(encoding);
            op.tlpParams.setOutputEncoding(encoding);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
            tokenized = true;
            argIndex += 1;
        } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
            // best effort: a failure is logged and parsing continues without an escaper
            try {
                escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
            } catch (Exception e) {
                log.info("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
            tokenizerOptions = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
            tokenizerFactoryClass = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
            tokenizerMethod = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
            sentenceDelimiter = args[argIndex + 1];
            // the keyword "newline" stands for a literal line break
            if (sentenceDelimiter.equalsIgnoreCase("newline")) {
                sentenceDelimiter = "\n";
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
            elementDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
            tagDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") || args[argIndex].equalsIgnoreCase("-model")) {
            // load the parser from a binary serialized file
            // the next argument must be the path to the parser file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
            // load the parser from declarative text file
            // the next argument must be the path to the parser file
            textInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
            saveToSerializedFile = true;
            if (ArgUtils.numSubArgs(args, argIndex) < 1) {
                log.info("Missing path: -saveToSerialized filename");
                // No path follows the flag, so consume only the flag itself;
                // stepping over two arguments would swallow the next option.
                argIndex += 1;
            } else {
                serializedOutputFileOrUrl = args[argIndex + 1];
                argIndex += 2;
            }
        } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
            // save the parser to declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
            // save the training trees to a binary file
            op.trainOptions.trainTreeFile = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPath = treebankDescription.first();
            testFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tune")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            tunePath = treebankDescription.first();
            tuneFilter = treebankDescription.second();
        } else {
            // Anything else is handed to Options; remember exactly the arguments
            // it consumed so they can be passed on to loadModel below.
            int oldIndex = argIndex;
            argIndex = op.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }
    }
    // Load the tune treebank, falling back to the training path when only a filter was given.
    if (tuneFilter != null || tunePath != null) {
        if (tunePath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No tune treebank path specified...");
            } else {
                log.info("No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');
                tunePath = treebankPath;
            }
        }
        tuneTreebank = op.tlpParams.testMemoryTreebank();
        tuneTreebank.loadPath(tunePath, tuneFilter);
    }
    // When training, the invocation is always logged (see the train branch below).
    if (!train && op.testOptions.verbose) {
        StringUtils.logInvocationString(log, args);
    }
    // always initialized in next if-then-else block
    LexicalizedParser lp;
    if (train) {
        StringUtils.logInvocationString(log, args);
        // so we train a parser using the treebank
        GrammarCompactor compactor = null;
        if (op.trainOptions.compactGrammar() == 3) {
            compactor = new ExactGrammarCompactor(op, false, false);
        }
        Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
        Treebank secondaryTrainTreebank = null;
        if (secondaryTreebankPath != null) {
            secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
        }
        // Collect every sentence from the optional extra tagged files.
        List<List<TaggedWord>> extraTaggedWords = null;
        if (op.trainOptions.taggedFiles != null) {
            extraTaggedWords = new ArrayList<>();
            List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
            for (TaggedFileRecord record : fileRecords) {
                for (List<TaggedWord> sentence : record.reader()) {
                    extraTaggedWords.add(sentence);
                }
            }
        }
        lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords);
    } else if (textInputFileOrUrl != null) {
        // so we load the parser from a text grammar file
        lp = getParserFromTextFile(textInputFileOrUrl, op);
    } else {
        // so we load a serialized parser
        if (serializedInputFileOrUrl == null && argIndex < args.length) {
            // the next argument must be the path to the serialized parser
            serializedInputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedInputFileOrUrl == null) {
            log.info("No grammar specified, exiting...");
            return;
        }
        String[] extraArgs = optionArgs.toArray(new String[0]);
        try {
            lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
            // from here on, work with the options carried by the loaded parser
            op = lp.op;
        } catch (IllegalArgumentException e) {
            log.info("Error loading parser, exiting...");
            throw e;
        }
    }
    // set up tokenizerFactory with options if provided
    if (tokenizerFactoryClass != null || tokenizerOptions != null) {
        try {
            if (tokenizerFactoryClass != null) {
                Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
                Method factoryMethod;
                // The factory method is invoked statically (null receiver); which
                // overload is looked up depends on whether options were supplied.
                if (tokenizerOptions != null) {
                    factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
                } else {
                    factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
                }
            } else {
                // have options but no tokenizer factory.  use the parser
                // langpack's factory and set its options
                tokenizerFactory = lp.op.langpack().getTokenizerFactory();
                tokenizerFactory.setOptions(tokenizerOptions);
            }
        } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
            log.info("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
            throw new RuntimeException(e);
        }
    }
    // Re-apply the command-line encoding: op may have been replaced by the
    // options of a serialized parser above, which would otherwise win.
    if (encoding != null) {
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
    }
    // Load the test treebank, falling back to the training path when only a filter was given.
    if (testFilter != null || testPath != null) {
        if (testPath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPath + '\"');
                testPath = treebankPath;
            }
        }
        testTreebank = op.tlpParams.testMemoryTreebank();
        testTreebank.loadPath(testPath, testFilter);
    }
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    // Now what do we do with the parser we've made
    if (saveToTextFile) {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null) {
            lp.saveParserToTextFile(textOutputFileOrUrl);
        } else {
            log.info("Usage: must specify a text grammar output path");
        }
    }
    if (saveToSerializedFile) {
        if (serializedOutputFileOrUrl != null) {
            lp.saveParserToSerialized(serializedOutputFileOrUrl);
        } else if (textOutputFileOrUrl == null && testTreebank == null) {
            // no saving/parsing request has been specified
            log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
        }
    }
    if (op.testOptions.verbose || train) {
        // Tell the user a little or a lot about what we have made
        // get lexicon size separately as it may have its own prints in it....
        String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";
        log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
        log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t' + (lp.ug != null ? lp.ug.numRules() : "") + '\t' + (lp.bg != null ? lp.bg.numRules() : "") + '\t' + lexNumRules);
        log.info("ParserPack is " + op.tlpParams.getClass().getName());
        // The lexicon is treated as optional just above, so guard it here too
        // instead of dereferencing a possibly-null lp.lex.
        if (lp.lex != null) {
            log.info("Lexicon is " + lp.lex.getClass().getName());
        }
        if (op.testOptions.verbose) {
            log.info("Tags are: " + lp.tagIndex);
            // The states are not logged here: too verbose, and printOptions
            // already reports them when -printStates is given at training time.
        }
        printOptions(false, op);
    }
    if (testTreebank != null) {
        // test parser on treebank
        EvaluateTreebank evaluator = new EvaluateTreebank(lp);
        evaluator.testOnTreebank(testTreebank);
    } else if (argIndex >= args.length) {
        // no more arguments, so we just parse our own test sentence
        PrintWriter pwOut = op.tlpParams.pw();
        PrintWriter pwErr = op.tlpParams.pw(System.err);
        ParserQuery pq = lp.parserQuery();
        if (pq.parse(op.tlpParams.defaultTestSentence())) {
            lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
        } else {
            pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
        }
    } else {
        // We parse filenames given by the remaining arguments
        ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp);
    }
}
Also used : EvaluateTreebank(edu.stanford.nlp.parser.metrics.EvaluateTreebank) EvaluateTreebank(edu.stanford.nlp.parser.metrics.EvaluateTreebank) TaggedFileRecord(edu.stanford.nlp.tagger.io.TaggedFileRecord) Pair(edu.stanford.nlp.util.Pair) HasWord(edu.stanford.nlp.ling.HasWord) TokenizerFactory(edu.stanford.nlp.process.TokenizerFactory) Method(java.lang.reflect.Method) InvocationTargetException(java.lang.reflect.InvocationTargetException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) InvocationTargetException(java.lang.reflect.InvocationTargetException) Triple(edu.stanford.nlp.util.Triple) TaggedWord(edu.stanford.nlp.ling.TaggedWord) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery)

Aggregations

ParserQuery (edu.stanford.nlp.parser.common.ParserQuery)12 Tree (edu.stanford.nlp.trees.Tree)6 Pair (edu.stanford.nlp.util.Pair)4 PrintWriter (java.io.PrintWriter)4 ArrayList (java.util.ArrayList)4 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 HasWord (edu.stanford.nlp.ling.HasWord)3 Timing (edu.stanford.nlp.util.Timing)3 FileOutputStream (java.io.FileOutputStream)3 IOException (java.io.IOException)3 DecimalFormat (java.text.DecimalFormat)3 NumberFormat (java.text.NumberFormat)3 Test (org.junit.Test)3 NullOutputStream (edu.stanford.nlp.io.NullOutputStream)2 Word (edu.stanford.nlp.ling.Word)2 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)2 ParsingThreadsafeProcessor (edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor)2 LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser)2 RerankingParserQuery (edu.stanford.nlp.parser.lexparser.RerankingParserQuery)2