Use of edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor in project CoreNLP by stanfordnlp.
In class ParseFiles, method parseFiles.
/**
 * Parses every input named in {@code args} from index {@code argIndex} onward and prints the
 * resulting trees, then reports throughput and failure statistics to {@code pwErr}.
 *
 * <p>An argument equal to {@code "-"} is read from standard input; any other argument is handed
 * to {@code DocumentPreprocessor} as a file name. When {@code op.testOptions.writeOutputFiles} is
 * set, the trees for each input go to their own file named {@code <input>.<ext>} (extension
 * {@code "stp"} unless {@code op.testOptions.outputFilesExtension} is set; {@code '/'} is replaced
 * by {@code '_'} when the input name parses as a URL), placed in
 * {@code op.testOptions.outputFilesDirectory} if that is non-empty. Otherwise trees are written
 * to {@code pwOut}.
 *
 * <p>The per-file output writer is closed in a {@code finally} block so that the file handle is
 * released even when parsing or result processing throws.
 *
 * @param args command-line style argument array; entries from {@code argIndex} on are inputs
 * @param argIndex index of the first input in {@code args}
 * @param tokenized consulted only when {@code tokenizerFactory} is {@code null}: if {@code true}
 *     no tokenizer factory is installed, otherwise {@code tlp.getTokenizerFactory()} is used
 * @param tokenizerFactory tokenizer factory to install on the preprocessor, or {@code null}
 * @param elementDelimiter passed to the preprocessor; {@code null} selects {@code DocType.Plain},
 *     anything else selects {@code DocType.XML}
 * @param sentenceDelimiter passed to the preprocessor unchanged
 * @param escaper passed to the preprocessor unchanged
 * @param tagDelimiter passed to the preprocessor unchanged
 * @throws RuntimeIOException if the stdin reader or an output file cannot be opened
 */
public void parseFiles(String[] args, int argIndex, boolean tokenized, TokenizerFactory<? extends HasWord> tokenizerFactory, String elementDelimiter, String sentenceDelimiter, Function<List<HasWord>, List<HasWord>> escaper, String tagDelimiter) {
  final DocType docType = (elementDelimiter == null) ? DocType.Plain : DocType.XML;

  if (op.testOptions.verbose) {
    if (tokenizerFactory != null) {
      pwErr.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
    }
  }

  final Timing timer = new Timing();

  // Loop over the files
  for (int i = argIndex; i < args.length; i++) {
    final String filename = args[i];

    final DocumentPreprocessor documentPreprocessor;
    if (filename.equals("-")) {
      try {
        documentPreprocessor = new DocumentPreprocessor(IOUtils.readerFromStdin(op.tlpParams.getInputEncoding()), docType);
      } catch (IOException e) {
        throw new RuntimeIOException(e);
      }
    } else {
      documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
    }

    // Unused values are null per the main() method invocation below
    // null is the default for these properties
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEscaper(escaper);
    documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
    documentPreprocessor.setTagDelimiter(tagDelimiter);
    documentPreprocessor.setElementDelimiter(elementDelimiter);
    if (tokenizerFactory == null) {
      documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
    } else {
      documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    }

    // Set up the output: either the shared pwOut or a dedicated per-input file
    final boolean writeToFile = op.testOptions.writeOutputFiles;
    PrintWriter pwo = pwOut;
    if (writeToFile) {
      String normalizedName = filename;
      try {
        // this will throw if not a URL
        new URL(normalizedName);
        normalizedName = normalizedName.replaceAll("/", "_");
      } catch (MalformedURLException ignored) {
        // It isn't a URL, so the name is used as given
      }
      String ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
      String fname = normalizedName + '.' + ext;
      if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.isEmpty()) {
        String fseparator = System.getProperty("file.separator");
        if (fseparator == null || fseparator.isEmpty()) {
          fseparator = "/";
        }
        // Only the last path component is kept when redirecting into the output directory
        File fnameFile = new File(fname);
        fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.getName();
      }
      try {
        pwo = op.tlpParams.pw(new FileOutputStream(fname));
      } catch (IOException ioe) {
        throw new RuntimeIOException(ioe);
      }
    }

    int num = 0;
    try {
      treePrint.printHeader(pwo, op.tlpParams.getOutputEncoding());
      pwErr.println("Parsing file: " + filename);
      int numProcessed = 0;
      if (op.testOptions.testingThreads != 1) {
        MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
        for (List<HasWord> sentence : documentPreprocessor) {
          num++;
          numSents++;
          int len = sentence.size();
          numWords += len;
          pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
          wrapper.put(sentence);
          // Drain whatever results are already available before queueing more work
          while (wrapper.peek()) {
            ParserQuery pq = wrapper.poll();
            processResults(pq, numProcessed++, pwo);
          }
        }
        // Wait for the remaining sentences, then drain the rest of the results
        wrapper.join();
        while (wrapper.peek()) {
          ParserQuery pq = wrapper.poll();
          processResults(pq, numProcessed++, pwo);
        }
      } else {
        ParserQuery pq = pqFactory.parserQuery();
        for (List<HasWord> sentence : documentPreprocessor) {
          num++;
          numSents++;
          int len = sentence.size();
          numWords += len;
          pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
          pq.parseAndReport(sentence, pwErr);
          processResults(pq, numProcessed++, pwo);
        }
      }
      treePrint.printFooter(pwo);
    } finally {
      // Release the per-input file even if parsing threw; the shared pwOut is never closed here
      if (writeToFile) {
        pwo.close();
      }
    }
    pwErr.println("Parsed file: " + filename + " [" + num + " sentences].");
  }

  long millis = timer.stop();

  if (summary) {
    if (pcfgLL != null) {
      pcfgLL.display(false, pwErr);
    }
    if (depLL != null) {
      depLL.display(false, pwErr);
    }
    if (factLL != null) {
      factLL.display(false, pwErr);
    }
  }

  if (saidMemMessage) {
    ParserUtils.printOutOfMemory(pwErr);
  }

  double wordspersec = numWords / (((double) millis) / 1000);
  double sentspersec = numSents / (((double) millis) / 1000);
  // easier way!
  NumberFormat nf = new DecimalFormat("0.00");
  pwErr.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
  if (numFallback > 0) {
    pwErr.println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
  }
  if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
    pwErr.println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
    if (numUnparsable > 0) {
      pwErr.println(" " + numUnparsable + " were not parsable with non-zero probability.");
    }
    if (numNoMemory > 0) {
      pwErr.println(" " + numNoMemory + " were skipped because of insufficient memory.");
    }
    if (numSkipped > 0) {
      pwErr.println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
    }
  }
}
Use of edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor in project CoreNLP by stanfordnlp.
In class EvaluateTreebank, method testOnTreebank.
/**
 * Tests the parser on a treebank. Parses are written to stdout, and various other information
 * is written to stderr and stdout, particularly if {@code op.testOptions.verbose} is true.
 *
 * <p>When {@code op.testOptions.quietEvaluation} is set, both streams are replaced by a
 * {@code NullOutputStream} while parsing; the error stream is pointed back at
 * {@code System.err} before the evaluation summaries are displayed.
 *
 * <p>If {@code op.testOptions.writeOutputFiles} or
 * {@code op.testOptions.outputkBestEquivocation} is set, the corresponding file is opened for
 * the duration of the run. Failure to open either file is reported via a stack trace and the run
 * continues without it. Both files are closed in a {@code finally} block, so they are released
 * even if parsing or evaluation throws.
 *
 * @param testTreebank The treebank to parse
 * @return The labeled precision/recall F<sub>1</sub> (EVALB measure) of the parser on the
 *     treebank as reported by {@code factLB}, or {@code 0.0} when {@code factLB} is {@code null}
 */
public double testOnTreebank(Treebank testTreebank) {
  log.info("Testing on treebank");
  Timing treebankTotalTimer = new Timing();
  TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
  TreebankLangParserParams tlpParams = op.tlpParams;
  TreebankLanguagePack tlp = op.langpack();

  PrintWriter pwOut, pwErr;
  if (op.testOptions.quietEvaluation) {
    NullOutputStream quiet = new NullOutputStream();
    pwOut = tlpParams.pw(quiet);
    pwErr = tlpParams.pw(quiet);
  } else {
    pwOut = tlpParams.pw();
    pwErr = tlpParams.pw(System.err);
  }

  if (op.testOptions.verbose) {
    pwErr.print("Testing ");
    pwErr.println(testTreebank.textualSummary(tlp));
  }
  if (op.testOptions.evalb) {
    EvalbFormatWriter.initEVALBfiles(tlpParams);
  }

  PrintWriter pwFileOut = null;
  PrintWriter pwStats = null;
  try {
    if (op.testOptions.writeOutputFiles) {
      String fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
      try {
        pwFileOut = op.tlpParams.pw(new FileOutputStream(fname));
      } catch (IOException ioe) {
        // Best effort: evaluation proceeds without the output file
        ioe.printStackTrace();
      }
    }

    if (op.testOptions.outputkBestEquivocation != null) {
      try {
        pwStats = op.tlpParams.pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
      } catch (IOException ioe) {
        // Best effort: evaluation proceeds without the stats file
        ioe.printStackTrace();
      }
    }

    if (op.testOptions.testingThreads != 1) {
      MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));

      // Gold trees are queued in submission order and paired with results as they are polled
      LinkedList<Tree> goldTrees = new LinkedList<>();
      for (Tree goldTree : testTreebank) {
        List<? extends HasWord> sentence = getInputSentence(goldTree);
        goldTrees.add(goldTree);

        pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
        wrapper.put(sentence);
        while (wrapper.peek()) {
          ParserQuery pq = wrapper.poll();
          Tree finishedGold = goldTrees.poll();
          processResults(pq, finishedGold, pwErr, pwOut, pwFileOut, pwStats, treePrint);
        }
      }
      // for tree iterator
      wrapper.join();
      while (wrapper.peek()) {
        ParserQuery pq = wrapper.poll();
        Tree finishedGold = goldTrees.poll();
        processResults(pq, finishedGold, pwErr, pwOut, pwFileOut, pwStats, treePrint);
      }
    } else {
      ParserQuery pq = pqFactory.parserQuery();
      for (Tree goldTree : testTreebank) {
        final List<CoreLabel> sentence = getInputSentence(goldTree);
        pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
        pq.parseAndReport(sentence, pwErr);
        processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
      }
      // for tree iterator
    }

    // Done parsing...print the results of the evaluations
    treebankTotalTimer.done("Testing on treebank");
    if (op.testOptions.quietEvaluation) {
      // Summaries are shown even in quiet mode, so switch back to the real error stream
      pwErr = tlpParams.pw(System.err);
    }
    if (saidMemMessage) {
      ParserUtils.printOutOfMemory(pwErr);
    }
    if (op.testOptions.evalb) {
      EvalbFormatWriter.closeEVALBfiles();
    }
    if (numSkippedEvals != 0) {
      pwErr.printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
    }

    // only created here so we know what parser types are supported...
    ParserQuery pq = pqFactory.parserQuery();
    if (summary) {
      if (pcfgLB != null) {
        pcfgLB.display(false, pwErr);
      }
      if (pcfgChildSpecific != null) {
        pcfgChildSpecific.display(false, pwErr);
      }
      if (pcfgLA != null) {
        pcfgLA.display(false, pwErr);
      }
      if (pcfgCB != null) {
        pcfgCB.display(false, pwErr);
      }
      if (pcfgDA != null) {
        pcfgDA.display(false, pwErr);
      }
      if (pcfgTA != null) {
        pcfgTA.display(false, pwErr);
      }
      if (pcfgLL != null && pq.getPCFGParser() != null) {
        pcfgLL.display(false, pwErr);
      }
      if (depDA != null) {
        depDA.display(false, pwErr);
      }
      if (depTA != null) {
        depTA.display(false, pwErr);
      }
      if (depLL != null && pq.getDependencyParser() != null) {
        depLL.display(false, pwErr);
      }
      if (factLB != null) {
        factLB.display(false, pwErr);
      }
      if (factChildSpecific != null) {
        factChildSpecific.display(false, pwErr);
      }
      if (factLA != null) {
        factLA.display(false, pwErr);
      }
      if (factCB != null) {
        factCB.display(false, pwErr);
      }
      if (factDA != null) {
        factDA.display(false, pwErr);
      }
      if (factTA != null) {
        factTA.display(false, pwErr);
      }
      if (factLL != null && pq.getFactoredParser() != null) {
        factLL.display(false, pwErr);
      }
      if (pcfgCatE != null) {
        pcfgCatE.display(false, pwErr);
      }
      for (Eval eval : evals) {
        eval.display(false, pwErr);
      }
      for (BestOfTopKEval eval : topKEvals) {
        eval.display(false, pwErr);
      }
    }

    // these ones only have a display mode, so display if turned on!!
    if (pcfgRUO != null) {
      pcfgRUO.display(true, pwErr);
    }
    if (pcfgCUO != null) {
      pcfgCUO.display(true, pwErr);
    }

    if (tsv) {
      // One header row and one value row; a cell is left empty when its evaluator is absent,
      // but the tab separators are always emitted so the columns stay aligned.
      NumberFormat nf = new DecimalFormat("0.00");
      pwErr.println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
      if (factLB != null) {
        pwErr.print(nf.format(factLB.getEvalbF1Percent()));
      }
      pwErr.print("\t");
      if (pq.getDependencyParser() != null && factDA != null) {
        pwErr.print(nf.format(factDA.getEvalbF1Percent()));
      }
      pwErr.print("\t");
      if (factLB != null) {
        pwErr.print(nf.format(factLB.getExactPercent()));
      }
      pwErr.print("\t");
      if (pcfgLB != null) {
        pwErr.print(nf.format(pcfgLB.getEvalbF1Percent()));
      }
      pwErr.print("\t");
      if (pq.getDependencyParser() != null && depDA != null) {
        pwErr.print(nf.format(depDA.getEvalbF1Percent()));
      }
      pwErr.print("\t");
      if (pq.getPCFGParser() != null && factTA != null) {
        pwErr.print(nf.format(factTA.getEvalbF1Percent()));
      }
      pwErr.print("\t");
      if (factLB != null) {
        pwErr.print(factLB.getNum());
      }
      pwErr.println();
    }

    double f1 = 0.0;
    if (factLB != null) {
      f1 = factLB.getEvalbF1();
    }

    if (parserQueryEvals != null) {
      for (ParserQueryEval parserQueryEval : parserQueryEvals) {
        parserQueryEval.display(false, pwErr);
      }
    }

    return f1;
  } finally {
    // Close files (if they were opened), whether or not the run completed normally
    if (pwFileOut != null) {
      pwFileOut.close();
    }
    if (pwStats != null) {
      pwStats.close();
    }
  }
}
Aggregations