Use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.
From the class ParseAndPrintMatrices, method main:
public static void main(String[] args) throws IOException {
  String modelPath = null;
  String outputPath = null;
  String inputPath = null;
  // collected below but not used in the code shown here
  String testTreebankPath = null;
  FileFilter testTreebankFilter = null;
  List<String> unusedArgs = Generics.newArrayList();
  for (int argIndex = 0; argIndex < args.length; ) {
    if (args[argIndex].equalsIgnoreCase("-model")) {
      modelPath = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-output")) {
      outputPath = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-input")) {
      inputPath = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testTreebankPath = treebankDescription.first();
      testTreebankFilter = treebankDescription.second();
    } else {
      unusedArgs.add(args[argIndex++]);
    }
  }
  // pass any unrecognized flags through to the LexicalizedParser
  String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
  LexicalizedParser parser = LexicalizedParser.loadModel(modelPath, newArgs);
  DVModel model = DVParser.getModelFromLexicalizedParser(parser);
  File outputFile = new File(outputPath);
  FileSystem.checkNotExistsOrFail(outputFile);
  FileSystem.mkdirOrFail(outputFile);
  int count = 0;
  if (inputPath != null) {
    Reader input = new BufferedReader(new FileReader(inputPath));
    DocumentPreprocessor processor = new DocumentPreprocessor(input);
    for (List<HasWord> sentence : processor) {
      // index from 1
      count++;
      ParserQuery pq = parser.parserQuery();
      if (!(pq instanceof RerankingParserQuery)) {
        throw new IllegalArgumentException("Expected a RerankingParserQuery");
      }
      RerankingParserQuery rpq = (RerankingParserQuery) pq;
      if (!rpq.parse(sentence)) {
        throw new RuntimeException("Unparsable sentence: " + sentence);
      }
      RerankerQuery reranker = rpq.rerankerQuery();
      if (!(reranker instanceof DVModelReranker.Query)) {
        throw new IllegalArgumentException("Expected a DVModelReranker");
      }
      DeepTree deepTree = ((DVModelReranker.Query) reranker).getDeepTrees().get(0);
      IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.getVectors();
      for (Map.Entry<Tree, SimpleMatrix> entry : vectors.entrySet()) {
        log.info(entry.getKey() + " " + entry.getValue());
      }
      FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
      BufferedWriter bout = new BufferedWriter(fout);
      bout.write(SentenceUtils.listToString(sentence));
      bout.newLine();
      bout.write(deepTree.getTree().toString());
      bout.newLine();
      for (HasWord word : sentence) {
        outputMatrix(bout, model.getWordVector(word.word()));
      }
      Tree rootTree = findRootTree(vectors);
      outputTreeMatrices(bout, rootTree, vectors);
      // closing the buffered writer flushes it and closes the underlying FileWriter
      bout.close();
    }
    input.close();
  }
}
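For orientation: the flag handling above means the tool is driven by -model, -input, and -output arguments, and it refuses to run if the output directory already exists. A minimal invocation sketch follows; the paths are hypothetical placeholders, and the model must be one whose reranker is a DVModelReranker (otherwise the instanceof checks above throw):

// Minimal sketch only; all paths are hypothetical placeholders,
// not taken from the original source.
public class ParseAndPrintMatricesDemo {
  public static void main(String[] args) throws IOException {
    ParseAndPrintMatrices.main(new String[] {
        "-model", "/path/to/dvModel.ser.gz",  // must contain a DVModelReranker
        "-input", "/path/to/sentences.txt",   // plain text, sentence-split by DocumentPreprocessor
        "-output", "/path/to/matrixDump"      // output directory; must not already exist
    });
  }
}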
Use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.
From the class EvaluateTreebank, method testOnTreebank:
/** Test the parser on a treebank. Parses will be written to stdout, and
* various other information will be written to stderr and stdout,
* particularly if <code>op.testOptions.verbose</code> is true.
*
* @param testTreebank The treebank to parse
* @return The labeled precision/recall F<sub>1</sub> (EVALB measure)
* of the parser on the treebank.
*/
public double testOnTreebank(Treebank testTreebank) {
  log.info("Testing on treebank");
  Timing treebankTotalTimer = new Timing();
  TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
  TreebankLangParserParams tlpParams = op.tlpParams;
  TreebankLanguagePack tlp = op.langpack();
  PrintWriter pwOut, pwErr;
  if (op.testOptions.quietEvaluation) {
    NullOutputStream quiet = new NullOutputStream();
    pwOut = tlpParams.pw(quiet);
    pwErr = tlpParams.pw(quiet);
  } else {
    pwOut = tlpParams.pw();
    pwErr = tlpParams.pw(System.err);
  }
  if (op.testOptions.verbose) {
    pwErr.print("Testing ");
    pwErr.println(testTreebank.textualSummary(tlp));
  }
  if (op.testOptions.evalb) {
    EvalbFormatWriter.initEVALBfiles(tlpParams);
  }
  PrintWriter pwFileOut = null;
  if (op.testOptions.writeOutputFiles) {
    String fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
    try {
      pwFileOut = op.tlpParams.pw(new FileOutputStream(fname));
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  }
  PrintWriter pwStats = null;
  if (op.testOptions.outputkBestEquivocation != null) {
    try {
      pwStats = op.tlpParams.pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  }
  if (op.testOptions.testingThreads != 1) {
    MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
    LinkedList<Tree> goldTrees = new LinkedList<>();
    for (Tree goldTree : testTreebank) {
      List<? extends HasWord> sentence = getInputSentence(goldTree);
      goldTrees.add(goldTree);
      pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
      wrapper.put(sentence);
      while (wrapper.peek()) {
        ParserQuery pq = wrapper.poll();
        goldTree = goldTrees.poll();
        processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
      }
    } // for tree iterator
    // drain any results still pending after the last sentence is submitted
    wrapper.join();
    while (wrapper.peek()) {
      ParserQuery pq = wrapper.poll();
      Tree goldTree = goldTrees.poll();
      processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
    }
  } else {
    ParserQuery pq = pqFactory.parserQuery();
    for (Tree goldTree : testTreebank) {
      final List<CoreLabel> sentence = getInputSentence(goldTree);
      pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
      pq.parseAndReport(sentence, pwErr);
      processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
    } // for tree iterator
  }
  // Done parsing... print the results of the evaluations
  treebankTotalTimer.done("Testing on treebank");
  if (op.testOptions.quietEvaluation) {
    pwErr = tlpParams.pw(System.err);
  }
  if (saidMemMessage) {
    ParserUtils.printOutOfMemory(pwErr);
  }
  if (op.testOptions.evalb) {
    EvalbFormatWriter.closeEVALBfiles();
  }
  if (numSkippedEvals != 0) {
    pwErr.printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
  }
  // only created here so we know what parser types are supported...
  ParserQuery pq = pqFactory.parserQuery();
  if (summary) {
    if (pcfgLB != null) pcfgLB.display(false, pwErr);
    if (pcfgChildSpecific != null) pcfgChildSpecific.display(false, pwErr);
    if (pcfgLA != null) pcfgLA.display(false, pwErr);
    if (pcfgCB != null) pcfgCB.display(false, pwErr);
    if (pcfgDA != null) pcfgDA.display(false, pwErr);
    if (pcfgTA != null) pcfgTA.display(false, pwErr);
    if (pcfgLL != null && pq.getPCFGParser() != null) pcfgLL.display(false, pwErr);
    if (depDA != null) depDA.display(false, pwErr);
    if (depTA != null) depTA.display(false, pwErr);
    if (depLL != null && pq.getDependencyParser() != null) depLL.display(false, pwErr);
    if (factLB != null) factLB.display(false, pwErr);
    if (factChildSpecific != null) factChildSpecific.display(false, pwErr);
    if (factLA != null) factLA.display(false, pwErr);
    if (factCB != null) factCB.display(false, pwErr);
    if (factDA != null) factDA.display(false, pwErr);
    if (factTA != null) factTA.display(false, pwErr);
    if (factLL != null && pq.getFactoredParser() != null) factLL.display(false, pwErr);
    if (pcfgCatE != null) pcfgCatE.display(false, pwErr);
    for (Eval eval : evals) {
      eval.display(false, pwErr);
    }
    for (BestOfTopKEval eval : topKEvals) {
      eval.display(false, pwErr);
    }
  }
  // these ones only have a display mode, so display if turned on!!
  if (pcfgRUO != null) pcfgRUO.display(true, pwErr);
  if (pcfgCUO != null) pcfgCUO.display(true, pwErr);
  if (tsv) {
    NumberFormat nf = new DecimalFormat("0.00");
    pwErr.println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
    if (factLB != null) pwErr.print(nf.format(factLB.getEvalbF1Percent()));
    pwErr.print("\t");
    if (pq.getDependencyParser() != null && factDA != null) pwErr.print(nf.format(factDA.getEvalbF1Percent()));
    pwErr.print("\t");
    if (factLB != null) pwErr.print(nf.format(factLB.getExactPercent()));
    pwErr.print("\t");
    if (pcfgLB != null) pwErr.print(nf.format(pcfgLB.getEvalbF1Percent()));
    pwErr.print("\t");
    if (pq.getDependencyParser() != null && depDA != null) pwErr.print(nf.format(depDA.getEvalbF1Percent()));
    pwErr.print("\t");
    if (pq.getPCFGParser() != null && factTA != null) pwErr.print(nf.format(factTA.getEvalbF1Percent()));
    pwErr.print("\t");
    if (factLB != null) pwErr.print(factLB.getNum());
    pwErr.println();
  }
  double f1 = 0.0;
  if (factLB != null) {
    f1 = factLB.getEvalbF1();
  }
  // Close files (if necessary)
  if (pwFileOut != null) pwFileOut.close();
  if (pwStats != null) pwStats.close();
  if (parserQueryEvals != null) {
    for (ParserQueryEval parserQueryEval : parserQueryEvals) {
      parserQueryEval.display(false, pwErr);
    }
  }
  return f1;
}
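The LexicalizedParser.main listing at the end of this page drives this method in exactly this way; a minimal standalone sketch, assuming a serialized model and a directory of gold trees (both paths are hypothetical placeholders):

// Sketch: load a parser, load a test treebank, and report the EVALB F1.
LexicalizedParser lp = LexicalizedParser.loadModel("/path/to/parser.ser.gz");
Treebank testTreebank = lp.op.tlpParams.testMemoryTreebank();
testTreebank.loadPath("/path/to/goldTrees", null); // null filter accepts all files
EvaluateTreebank evaluator = new EvaluateTreebank(lp);
double f1 = evaluator.testOnTreebank(testTreebank); // labeled bracketing F1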
Use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.
From the class LexicalizedParserITest, method testConstraints:
/**
* Test what happens if you put a constraint on the parse
*/
@Test
public void testConstraints() {
  List<CoreLabel> sentence = sampleSausage();
  ParserQuery pq = englishParser.parserQuery();
  ParserConstraint constraint = new ParserConstraint(0, 2, "INTJ");
  List<ParserConstraint> constraints = new ArrayList<>();
  constraints.add(constraint);
  pq.setConstraints(constraints);
  pq.parse(sentence);
  StringWriter sw = new StringWriter();
  pennPrint.printTree(pq.getBestParse(), new PrintWriter(sw));
  String actualOutput = sw.toString().replaceAll("\\s+", " ").trim();
  String expectedOutput = "(ROOT (S (NP (PRP$ My) (NN dog)) (ADVP (RB also)) (VP (VBZ likes) (S (VP (VBG eating) (NP (NN sausage))))) (. .)))";
  expectedOutput = expectedOutput.replaceAll("\\s+", " ").trim();
  // Not exactly sure what should come back, but it shouldn't be the
  // original output any more
  assertFalse("Tree should not match the original tree any more", expectedOutput.equals(actualOutput));
  assertTrue("Tree should be forced to contain INTJ", actualOutput.contains("INTJ"));
  // System.out.println(pq.getBestParse());
}
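Judging from this test and the ShiftReduce tests below, a ParserConstraint(start, end, pattern) requests a constituent spanning the tokens [start, end) whose label matches the regex pattern. A minimal sketch with illustrative indices and label (not from the original test):

// Sketch: constrain tokens 1-2 ("dog also") to form a bracket whose label
// matches the given regex; indices and label are illustrative only.
ParserQuery pq2 = englishParser.parserQuery();
pq2.setConstraints(Collections.singletonList(new ParserConstraint(1, 3, "NP|ADJP")));
if (pq2.parse(sentence)) {
  Tree constrained = pq2.getBestParse();
}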
Use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.
From the class ShiftReduceParserITest, method testBasicConstraint:
@Test
public void testBasicConstraint() {
  List<CoreLabel> sentence = SentenceUtils.toCoreLabelList("It", "was", "Carolina", "Reapers", ".");
  englishTagger.tagCoreLabels(sentence);
  Tree result = englishParser.apply(sentence);
  // pretty much need to make the test rely on the parser being consistent
  assertEquals("(ROOT (S (NP (PRP It)) (VP (VBD was) (NP (NNP Carolina) (NNPS Reapers))) (. .)))", result.toString());
  ParserConstraint constraint = new ParserConstraint(2, 4, ".*");
  List<ParserConstraint> constraints = Collections.singletonList(constraint);
  ParserQuery pq = englishParser.parserQuery();
  pq.setConstraints(constraints);
  assertTrue(pq.parse(sentence));
  result = pq.getBestParse();
  assertEquals("(ROOT (S (NP (PRP It)) (VP (VBD was) (NP (NNP Carolina) (NNPS Reapers))) (. .)))", result.toString());
  constraint = new ParserConstraint(2, 4, "NP");
  constraints = Collections.singletonList(constraint);
  pq = englishParser.parserQuery();
  pq.setConstraints(constraints);
  assertTrue(pq.parse(sentence));
  result = pq.getBestParse();
  assertEquals("(ROOT (S (NP (PRP It)) (VP (VBD was) (NP (NNP Carolina) (NNPS Reapers))) (. .)))", result.toString());
  // Note that since the constraints are introducing brackets which
  // don't exist, we may get some weird parse results as models
  // change in the future. The important thing is that the ADJP
  // bracket appears for this test and the VP bracket appears for
  // the next test
  constraint = new ParserConstraint(2, 4, "ADJP");
  constraints = Collections.singletonList(constraint);
  pq = englishParser.parserQuery();
  pq.setConstraints(constraints);
  assertTrue(pq.parse(sentence));
  result = pq.getBestParse();
  assertEquals("(ROOT (S (NP (PRP It)) (VP (VBD was) (ADJP (NP (NNP Carolina) (NNPS Reapers)))) (. .)))", result.toString());
  constraint = new ParserConstraint(1, 3, "VP");
  constraints = Collections.singletonList(constraint);
  pq = englishParser.parserQuery();
  pq.setConstraints(constraints);
  assertTrue(pq.parse(sentence));
  result = pq.getBestParse();
  assertEquals("(ROOT (S (NP (PRP It)) (VP (VBD was) (ADJP (NNP Carolina))) (NP (NNPS Reapers)) (. .)))", result.toString());
}
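The test creates a fresh ParserQuery for each constraint set; that boilerplate folds naturally into a helper. A sketch (the helper is ours, not part of CoreNLP, and it assumes englishParser is the ShiftReduceParser used in the test):

// Hypothetical helper: parse a pre-tagged sentence under one constraint.
private static Tree parseWithConstraint(ShiftReduceParser parser, List<CoreLabel> sentence,
                                        int start, int end, String label) {
  ParserQuery pq = parser.parserQuery();
  pq.setConstraints(Collections.singletonList(new ParserConstraint(start, end, label)));
  if (!pq.parse(sentence)) {
    throw new AssertionError("Unparsable sentence: " + sentence);
  }
  return pq.getBestParse();
}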
Use of edu.stanford.nlp.parser.common.ParserQuery in project CoreNLP by stanfordnlp.
From the class LexicalizedParser, method main:
/**
* A main program for using the parser with various options.
* This program can be used for building and serializing
* a parser from treebank data, for parsing sentences from a file
* or URL using a serialized or text grammar parser,
* and (mainly for parser quality testing)
* for training and testing a parser on a treebank all in one go.
*
* <p>
* Sample Usages:
* <ul>
* <li> <b>Train a parser (saved to <i>serializedGrammarFilename</i>)
* from a directory of trees (<i>trainFilesPath</i>, with an optional <i>fileRange</i>, e.g., 0-1000):</b>
* {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -saveToSerializedFile serializedGrammarFilename}
* </li>
*
* <li> <b>Train a parser (not saved) from a directory of trees, and test it (reporting scores) on a directory of trees</b>
* {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -testTreebank testFilePath [fileRange] }
* </li>
*
* <li> <b>Parse one or more files, given a serialized grammar and a list of files</b>
* {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] serializedGrammarPath filename [filename]*}
* </li>
*
* <li> <b>Test and report scores for a serialized grammar on trees in an output directory</b>
* {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -loadFromSerializedFile serializedGrammarPath -testTreebank testFilePath [fileRange]}
* </li>
* </ul>
*
*<p>
* If the {@code serializedGrammarPath} ends in {@code .gz},
* then the grammar is written and read as a compressed file (GZip).
* If the {@code serializedGrammarPath} is a URL, starting with
* {@code http://}, then the parser is read from the URL.
* A fileRange specifies a numeric value that must be included within a
* filename for it to be used in training or testing (this works well with
* most current treebanks). It can be specified like a range of pages to be
* printed, for instance as {@code 200-2199} or
* {@code 1-300,500-725,9000} or just as {@code 1} (if all your
* trees are in a single file, either omit this parameter or just give a dummy
* argument such as {@code 0}).
* If the filename to parse is "-" then the parser parses from stdin.
* If no files are supplied to parse, then a hardwired sentence
* is parsed.
*
* <p>
* The parser can write a grammar as either a serialized Java object file
* or in a text format (or as both), specified with the following options:
* <blockquote>{@code
* java edu.stanford.nlp.parser.lexparser.LexicalizedParser
* [-v] -train
* trainFilesPath [fileRange] [-saveToSerializedFile grammarPath]
* [-saveToTextFile grammarPath]
* }</blockquote>
*
* <p>
* In the same position as the verbose flag ({@code -v}), many other
* options can be specified. The most useful to an end user are:
* <ul>
* <LI>{@code -tLPP class} Specify a different
* TreebankLangParserParams, for when using a different language or
* treebank (the default is English Penn Treebank). <i>This option MUST occur
* before any other language-specific options that are used (or else they
* are ignored!).</i>
* (It's usually a good idea to specify this option even when loading a
* serialized grammar; it is necessary if the language pack specifies a
* needed character encoding or you wish to specify language-specific
* options on the command line.)</LI>
* <LI>{@code -encoding charset} Specify the character encoding of the
* input and output files. This will override the value in the
* {@code TreebankLangParserParams}, provided this option appears
* <i>after</i> any {@code -tLPP} option.</LI>
* <LI>{@code -tokenized} Says that the input is already separated
* into whitespace-delimited tokens. If this option is specified, any
* tokenizer specified for the language is ignored, and a universal (Unicode)
* tokenizer, which divides only on whitespace, is used.
* Unless you also specify
* {@code -escaper}, the tokens <i>must</i> all be correctly
* tokenized tokens of the appropriate treebank for the parser to work
* well (for instance, if using the Penn English Treebank, you must have
* coded "(" as "-LRB-", etc.). (Note: we do not use the backslash escaping
* in front of / and * that appeared in Penn Treebank releases through 1999.)</li>
* <li>{@code -escaper class} Specify a class of type
* {@code Function<List<HasWord>,List<HasWord>>} to do
* customized escaping of tokenized text. This class will be run over the
* tokenized text and can fix the representation of tokens. For instance,
* it could change "(" to "-LRB-" for the Penn English Treebank. A
* provided escaper that does such things for the Penn English Treebank is
* {@code edu.stanford.nlp.process.PTBEscapingProcessor}.</li>
* <li>{@code -tokenizerFactory class} Specifies a
* TokenizerFactory class to be used for tokenization</li>
* <li>{@code -tokenizerOptions options} Specifies options to a
* TokenizerFactory class to be used for tokenization. A comma-separated
* list. For PTBTokenizer, options of interest include
* {@code americanize=false} and {@code quotes=ascii} (for German).
* Note that any choice of tokenizer options that conflicts with the
* tokenization used in the parser training data will likely degrade parser
* performance. </li>
* <li>{@code -sentences token } Specifies a token that marks sentence
* boundaries. A value of {@code newline} causes sentence breaking on
* newlines. A value of {@code onePerElement} causes each element
* (using the XML {@code -parseInside} option) to be treated as a
* sentence. All other tokens will be interpreted literally, and must be
* exactly the same as tokens returned by the tokenizer. For example,
* you might specify "|||" and put that symbol sequence as a token between
* sentences.
* If no explicit sentence breaking option is chosen, sentence breaking
* is done based on a set of language-particular sentence-ending patterns.
* </li>
* <LI>{@code -parseInside element} Specifies that parsing should only
* be done for tokens inside the indicated XML-style
* elements (done as simple pattern matching, rather than XML parsing).
* For example, if this is specified as {@code sentence}, then
* the text inside the {@code sentence} element
* would be parsed.
* Using "-parseInside s" gives you support for the input format of
* Charniak's parser. Sentences cannot span elements. Whether the
* contents of the element are treated as one sentence or potentially
* multiple sentences is controlled by the {@code -sentences} flag.
* The default is potentially multiple sentences.
* This option gives support for extracting and parsing
* text from very simple SGML and XML documents, and is provided as a
* user convenience for that purpose. If you want to really parse XML
* documents before NLP parsing them, you should use an XML parser, and then
* call a LexicalizedParser on appropriate CDATA.
* <LI>{@code -tagSeparator char} Specifies to look for tags on words
* following the word and separated from it by a special character
* {@code char}. For instance, many tagged corpora have the
* representation "house/NN" and you would use {@code -tagSeparator /}.
* Notes: This option requires that the input be pretokenized.
* The separator has to be only a single character, and there is no
* escaping mechanism. However, splitting is done on the <i>last</i>
* instance of the character in the token, so that cases like
* "3\/4/CD" are handled correctly. The parser will in all normal
* circumstances use the tag you provide, but will override it in the
* case of very common words in cases where the tag that you provide
* is not one that it regards as a possible tagging for the word.
* The parser supports a format where only some of the words in a sentence
* have a tag (if you are calling the parser programmatically, you indicate
* them by having them implement the {@code HasTag} interface).
* You can do this at the command-line by only having tags after some words,
* but you are limited by the fact that there is no way to escape the
* tagSeparator character.</LI>
* <LI>{@code -maxLength leng} Specify the longest sentence that
* will be parsed (and hence indirectly the amount of memory
* needed for the parser). If this is not specified, the parser will
* try to dynamically grow its parse chart when long sentences are
* encountered, but may run out of memory trying to do so.</LI>
* <LI>{@code -outputFormat styles} Choose the style(s) of output
* sentences: {@code penn} for prettyprinting as in the Penn
* treebank files, or {@code oneline} for printing sentences one
* per line, {@code words}, {@code wordsAndTags},
* {@code dependencies}, {@code typedDependencies},
* or {@code typedDependenciesCollapsed}.
* Multiple options may be specified as a comma-separated
* list. See TreePrint class for further documentation.</LI>
* <LI>{@code -outputFormatOptions} Provide options that control the
* behavior of various {@code -outputFormat} choices, such as
* {@code lexicalize}, {@code stem}, {@code markHeadNodes},
* or {@code xml}; see {@link edu.stanford.nlp.trees.TreePrint}.
* Options are specified as a comma-separated list.</LI>
* <LI>{@code -writeOutputFiles} Write output files corresponding
* to the input files, with the same name but a {@code ".stp"}
* file extension. The format of these files depends on the
* {@code outputFormat} option. (If not specified, output is sent
* to stdout.)</LI>
* <LI>{@code -outputFilesExtension} The extension that is appended to
* the filename that is being parsed to produce an output file name (with the
* -writeOutputFiles option). The default is {@code stp}. Don't
* include the period.
* <LI>{@code -outputFilesDirectory} The directory in which output
* files are written (when the -writeOutputFiles option is specified).
* If not specified, output files are written in the same directory as the
* input files.
* <LI>{@code -nthreads} Parsing files and testing on treebanks
* can use multiple threads. This option tells the parser how many
* threads to use. A negative number indicates to use as many
* threads as the machine has cores.
* </ul>
* See also the package documentation for more details and examples of use.
*
* @param args Command line arguments, as above
*/
public static void main(String[] args) {
  boolean train = false;
  boolean saveToSerializedFile = false;
  boolean saveToTextFile = false;
  String serializedInputFileOrUrl = null;
  String textInputFileOrUrl = null;
  String serializedOutputFileOrUrl = null;
  String textOutputFileOrUrl = null;
  String treebankPath = null;
  Treebank testTreebank = null;
  Treebank tuneTreebank = null;
  String testPath = null;
  FileFilter testFilter = null;
  String tunePath = null;
  FileFilter tuneFilter = null;
  FileFilter trainFilter = null;
  String secondaryTreebankPath = null;
  double secondaryTreebankWeight = 1.0;
  FileFilter secondaryTrainFilter = null;
  // variables needed to process the files to be parsed
  TokenizerFactory<? extends HasWord> tokenizerFactory = null;
  String tokenizerOptions = null;
  String tokenizerFactoryClass = null;
  String tokenizerMethod = null;
  // whether or not the input file has already been tokenized
  boolean tokenized = false;
  Function<List<HasWord>, List<HasWord>> escaper = null;
  String tagDelimiter = null;
  String sentenceDelimiter = null;
  String elementDelimiter = null;
  int argIndex = 0;
  if (args.length < 1) {
    log.info("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
    return;
  }
  Options op = new Options();
  List<String> optionArgs = new ArrayList<>();
  String encoding = null;
  // loop through the option arguments; a bare "-" means parse from stdin
  while (argIndex < args.length && args[argIndex].charAt(0) == '-' && !args[argIndex].equals("-")) {
    if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
      train = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      treebankPath = treebankDescription.first();
      trainFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-train2")) {
      // train = true; // cdm july 2005: should require -train for this
      Triple<String, FileFilter, Double> treebankDescription = ArgUtils.getWeightedTreebankDescription(args, argIndex, "-train2");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      secondaryTreebankPath = treebankDescription.first();
      secondaryTrainFilter = treebankDescription.second();
      secondaryTreebankWeight = treebankDescription.third();
    } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
      try {
        op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).getDeclaredConstructor().newInstance();
      } catch (ClassNotFoundException e) {
        log.info("Class not found: " + args[argIndex + 1]);
        throw new RuntimeException(e);
      } catch (NoSuchMethodException e) {
        log.info("Method not found: " + args[argIndex + 1]);
        throw new RuntimeException(e);
      } catch (InstantiationException | InvocationTargetException e) {
        log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
        throw new RuntimeException(e);
      } catch (IllegalAccessException e) {
        log.info("Illegal access: " + e);
        throw new RuntimeException(e);
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
      // sets encoding for TreebankLangParserParams
      // redone later to override any serialized parser one read in
      encoding = args[argIndex + 1];
      op.tlpParams.setInputEncoding(encoding);
      op.tlpParams.setOutputEncoding(encoding);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
      tokenized = true;
      argIndex += 1;
    } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
      try {
        escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
      } catch (Exception e) {
        log.info("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
      tokenizerOptions = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
      tokenizerFactoryClass = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
      tokenizerMethod = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
      sentenceDelimiter = args[argIndex + 1];
      if (sentenceDelimiter.equalsIgnoreCase("newline")) {
        sentenceDelimiter = "\n";
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
      elementDelimiter = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
      tagDelimiter = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") || args[argIndex].equalsIgnoreCase("-model")) {
      // load the parser from a binary serialized file
      // the next argument must be the path to the parser file
      serializedInputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
      // load the parser from a declarative text file
      // the next argument must be the path to the parser file
      textInputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
      saveToSerializedFile = true;
      if (ArgUtils.numSubArgs(args, argIndex) < 1) {
        log.info("Missing path: -saveToSerializedFile filename");
      } else {
        serializedOutputFileOrUrl = args[argIndex + 1];
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
      // save the parser to a declarative text file
      saveToTextFile = true;
      textOutputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
      // save the training trees to a binary file
      op.trainOptions.trainTreeFile = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testPath = treebankDescription.first();
      testFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-tune")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      tunePath = treebankDescription.first();
      tuneFilter = treebankDescription.second();
    } else {
      int oldIndex = argIndex;
      argIndex = op.setOptionOrWarn(args, argIndex);
      optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
    }
  }
  if (tuneFilter != null || tunePath != null) {
    if (tunePath == null) {
      if (treebankPath == null) {
        throw new RuntimeException("No tune treebank path specified...");
      } else {
        log.info("No tune treebank path specified. Using train path: \"" + treebankPath + '\"');
        tunePath = treebankPath;
      }
    }
    tuneTreebank = op.tlpParams.testMemoryTreebank();
    tuneTreebank.loadPath(tunePath, tuneFilter);
  }
  if (!train && op.testOptions.verbose) {
    StringUtils.logInvocationString(log, args);
  }
  // lp is always initialized in the next if-then-else block
  LexicalizedParser lp;
  if (train) {
    StringUtils.logInvocationString(log, args);
    // so we train a parser using the treebank
    GrammarCompactor compactor = null;
    if (op.trainOptions.compactGrammar() == 3) {
      compactor = new ExactGrammarCompactor(op, false, false);
    }
    Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
    Treebank secondaryTrainTreebank = null;
    if (secondaryTreebankPath != null) {
      secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
    }
    List<List<TaggedWord>> extraTaggedWords = null;
    if (op.trainOptions.taggedFiles != null) {
      extraTaggedWords = new ArrayList<>();
      List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
      for (TaggedFileRecord record : fileRecords) {
        for (List<TaggedWord> sentence : record.reader()) {
          extraTaggedWords.add(sentence);
        }
      }
    }
    lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords);
  } else if (textInputFileOrUrl != null) {
    // so we load the parser from a text grammar file
    lp = getParserFromTextFile(textInputFileOrUrl, op);
  } else {
    // so we load a serialized parser
    if (serializedInputFileOrUrl == null && argIndex < args.length) {
      // the next argument must be the path to the serialized parser
      serializedInputFileOrUrl = args[argIndex];
      argIndex++;
    }
    if (serializedInputFileOrUrl == null) {
      log.info("No grammar specified, exiting...");
      return;
    }
    String[] extraArgs = new String[optionArgs.size()];
    extraArgs = optionArgs.toArray(extraArgs);
    try {
      lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
      op = lp.op;
    } catch (IllegalArgumentException e) {
      log.info("Error loading parser, exiting...");
      throw e;
    }
  }
  // set up tokenizerFactory with options if provided
  if (tokenizerFactoryClass != null || tokenizerOptions != null) {
    try {
      if (tokenizerFactoryClass != null) {
        Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
        Method factoryMethod;
        if (tokenizerOptions != null) {
          factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
          tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
        } else {
          factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
          tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
        }
      } else {
        // we have options but no tokenizer factory: use the parser
        // langpack's factory and set its options
        tokenizerFactory = lp.op.langpack().getTokenizerFactory();
        tokenizerFactory.setOptions(tokenizerOptions);
      }
    } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
      log.info("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
      throw new RuntimeException(e);
    }
  }
  // re-apply any user-specified encoding so that it is not
  // overwritten by one specified in the serialized parser
  if (encoding != null) {
    op.tlpParams.setInputEncoding(encoding);
    op.tlpParams.setOutputEncoding(encoding);
  }
  if (testFilter != null || testPath != null) {
    if (testPath == null) {
      if (treebankPath == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPath + '\"');
        testPath = treebankPath;
      }
    }
    testTreebank = op.tlpParams.testMemoryTreebank();
    testTreebank.loadPath(testPath, testFilter);
  }
  op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
  // Now what do we do with the parser we've made
  if (saveToTextFile) {
    // save the parser in textGrammar format
    if (textOutputFileOrUrl != null) {
      lp.saveParserToTextFile(textOutputFileOrUrl);
    } else {
      log.info("Usage: must specify a text grammar output path");
    }
  }
  if (saveToSerializedFile) {
    if (serializedOutputFileOrUrl != null) {
      lp.saveParserToSerialized(serializedOutputFileOrUrl);
    } else if (textOutputFileOrUrl == null && testTreebank == null) {
      // no saving/parsing request has been specified
      log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
    }
  }
  if (op.testOptions.verbose || train) {
    // Tell the user a little or a lot about what we have made
    // get lexicon size separately as it may have its own prints in it....
    String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";
    log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
    log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t' + (lp.ug != null ? lp.ug.numRules() : "") + '\t' + (lp.bg != null ? lp.bg.numRules() : "") + '\t' + lexNumRules);
    log.info("ParserPack is " + op.tlpParams.getClass().getName());
    log.info("Lexicon is " + lp.lex.getClass().getName());
    if (op.testOptions.verbose) {
      log.info("Tags are: " + lp.tagIndex);
      // log.info("States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out by the below printOptions command if the flag -printStates is given (at training time)!
    }
    printOptions(false, op);
  }
  if (testTreebank != null) {
    // test parser on treebank
    EvaluateTreebank evaluator = new EvaluateTreebank(lp);
    evaluator.testOnTreebank(testTreebank);
  } else if (argIndex >= args.length) {
    // no more arguments, so we just parse our own test sentence
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    ParserQuery pq = lp.parserQuery();
    if (pq.parse(op.tlpParams.defaultTestSentence())) {
      lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
    } else {
      pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
    }
  } else {
    // We parse filenames given by the remaining arguments
    ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp);
  }
}
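To make the -escaper option from the Javadoc concrete: an escaper is any class implementing Function<List<HasWord>, List<HasWord>> that can be instantiated by reflection (so it needs an accessible no-argument constructor). A minimal sketch follows; the class name is ours, and PTBEscapingProcessor is the real, fuller implementation the Javadoc mentions:

// Hypothetical escaper: map raw round brackets to their PTB escapes in place.
public class BracketEscaper implements Function<List<HasWord>, List<HasWord>> {
  @Override
  public List<HasWord> apply(List<HasWord> sentence) {
    for (HasWord token : sentence) {
      if ("(".equals(token.word())) {
        token.setWord("-LRB-");
      } else if (")".equals(token.word())) {
        token.setWord("-RRB-");
      }
    }
    return sentence;
  }
}

It would then be selected on the command line with -escaper BracketEscaper (fully qualified if the class is in a package).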