Search in sources :

Example 1 with ParsingThreadsafeProcessor

Use of edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor in project CoreNLP by stanfordnlp.

In the class ParseFiles, method parseFiles.

/**
 * Parses every input named in {@code args}, starting at index {@code argIndex}, and
 * prints the results through {@code treePrint}.
 *
 * <p>An argument equal to {@code "-"} is read from standard input; every other argument
 * is handed to {@code DocumentPreprocessor} by name. When
 * {@code op.testOptions.writeOutputFiles} is set, the output for each input goes to its
 * own file named {@code <input>.<ext>} (extension {@code "stp"} unless
 * {@code op.testOptions.outputFilesExtension} is set), placed in
 * {@code op.testOptions.outputFilesDirectory} when that is non-empty; otherwise output
 * goes to {@code pwOut}. A per-input output file is closed even if parsing that input
 * throws. Progress and the final throughput summary are written to {@code pwErr}.
 *
 * @param args              argument array; entries from {@code argIndex} onward are inputs
 * @param argIndex          index in {@code args} of the first input
 * @param tokenized         consulted only when {@code tokenizerFactory} is {@code null}: if
 *                          {@code true} no tokenizer is installed, otherwise
 *                          {@code tlp.getTokenizerFactory()} is used
 * @param tokenizerFactory  tokenizer factory to install, or {@code null}
 * @param elementDelimiter  passed to the preprocessor; a non-null value selects
 *                          {@code DocType.XML}, {@code null} selects {@code DocType.Plain}
 * @param sentenceDelimiter passed to the preprocessor
 * @param escaper           passed to the preprocessor
 * @param tagDelimiter      passed to the preprocessor
 * @throws RuntimeIOException if the standard-input reader or an output file cannot be opened
 */
public void parseFiles(String[] args, int argIndex, boolean tokenized, TokenizerFactory<? extends HasWord> tokenizerFactory, String elementDelimiter, String sentenceDelimiter, Function<List<HasWord>, List<HasWord>> escaper, String tagDelimiter) {
    final DocType docType = (elementDelimiter == null) ? DocType.Plain : DocType.XML;
    if (op.testOptions.verbose) {
        if (tokenizerFactory != null) {
            pwErr.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        }
    }
    final Timing timer = new Timing();
    //Loop over the files
    for (int i = argIndex; i < args.length; i++) {
        final String filename = args[i];
        final DocumentPreprocessor documentPreprocessor;
        if (filename.equals("-")) {
            try {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.readerFromStdin(op.tlpParams.getInputEncoding()), docType);
            } catch (IOException e) {
                throw new RuntimeIOException(e);
            }
        } else {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
        }
        //Unused values are null per the main() method invocation below
        //null is the default for these properties
        documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
        documentPreprocessor.setEscaper(escaper);
        documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
        documentPreprocessor.setTagDelimiter(tagDelimiter);
        documentPreprocessor.setElementDelimiter(elementDelimiter);
        if (tokenizerFactory == null) {
            documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
        } else {
            documentPreprocessor.setTokenizerFactory(tokenizerFactory);
        }
        //Setup the output
        PrintWriter pwo = pwOut;
        // Set only when this iteration opened its own file; the shared pwOut must never be closed here.
        boolean ownsOutput = false;
        if (op.testOptions.writeOutputFiles) {
            String normalizedName = filename;
            try {
                // this will exception if not a URL
                new URL(normalizedName);
                normalizedName = normalizedName.replaceAll("/", "_");
            } catch (MalformedURLException ignored) {
                //It isn't a URL, so silently ignore
            }
            String ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            String fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.isEmpty()) {
                String fseparator = System.getProperty("file.separator");
                if (fseparator == null || fseparator.isEmpty()) {
                    fseparator = "/";
                }
                // Only the last path component of the input name is kept inside the output directory.
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.getName();
            }
            try {
                pwo = op.tlpParams.pw(new FileOutputStream(fname));
            } catch (IOException ioe) {
                throw new RuntimeIOException(ioe);
            }
            ownsOutput = true;
        }
        int num = 0;
        try {
            treePrint.printHeader(pwo, op.tlpParams.getOutputEncoding());
            pwErr.println("Parsing file: " + filename);
            int numProcessed = 0;
            if (op.testOptions.testingThreads != 1) {
                MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
                for (List<HasWord> sentence : documentPreprocessor) {
                    num++;
                    numSents++;
                    int len = sentence.size();
                    numWords += len;
                    pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
                    wrapper.put(sentence);
                    // Drain whatever has finished so far before submitting more work.
                    while (wrapper.peek()) {
                        ParserQuery pq = wrapper.poll();
                        processResults(pq, numProcessed++, pwo);
                    }
                }
                wrapper.join();
                // Collect the results that were still pending when input ran out.
                while (wrapper.peek()) {
                    ParserQuery pq = wrapper.poll();
                    processResults(pq, numProcessed++, pwo);
                }
            } else {
                ParserQuery pq = pqFactory.parserQuery();
                for (List<HasWord> sentence : documentPreprocessor) {
                    num++;
                    numSents++;
                    int len = sentence.size();
                    numWords += len;
                    pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
                    pq.parseAndReport(sentence, pwErr);
                    processResults(pq, numProcessed++, pwo);
                }
            }
            treePrint.printFooter(pwo);
        } finally {
            // Release the per-file writer on failure as well as on success.
            if (ownsOutput) {
                pwo.close();
            }
        }
        pwErr.println("Parsed file: " + filename + " [" + num + " sentences].");
    }
    long millis = timer.stop();
    if (summary) {
        if (pcfgLL != null) {
            pcfgLL.display(false, pwErr);
        }
        if (depLL != null) {
            depLL.display(false, pwErr);
        }
        if (factLL != null) {
            factLL.display(false, pwErr);
        }
    }
    if (saidMemMessage) {
        ParserUtils.printOutOfMemory(pwErr);
    }
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    // easier way!
    NumberFormat nf = new DecimalFormat("0.00");
    pwErr.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
    if (numFallback > 0) {
        pwErr.println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println("    " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println("    " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println("    " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) MalformedURLException(java.net.MalformedURLException) MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) DecimalFormat(java.text.DecimalFormat) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) URL(java.net.URL) ParsingThreadsafeProcessor(edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor) FileOutputStream(java.io.FileOutputStream) List(java.util.List) Timing(edu.stanford.nlp.util.Timing) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) File(java.io.File) DocType(edu.stanford.nlp.process.DocumentPreprocessor.DocType) PrintWriter(java.io.PrintWriter) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) NumberFormat(java.text.NumberFormat)

Example 2 with ParsingThreadsafeProcessor

Use of edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor in project CoreNLP by stanfordnlp.

In the class EvaluateTreebank, method testOnTreebank.

/**
 * Test the parser on a treebank. Parses are written to the writer returned by
 * {@code tlpParams.pw()} and diagnostics to a writer over {@code System.err};
 * when {@code op.testOptions.quietEvaluation} is set, both are replaced by a
 * {@code NullOutputStream} while parsing, and the {@code System.err} writer is
 * restored for the final evaluation report.
 *
 * <p>If {@code op.testOptions.writeOutputFiles} is set, a file named
 * {@code <outputFilesPrefix>.<outputFilesExtension>} is opened and passed to
 * {@code processResults}; if {@code op.testOptions.outputkBestEquivocation} is
 * non-null, a statistics file of that name is opened as well. A failure to open
 * either file is reported with a stack trace and the run continues without it.
 * Both files are closed even if parsing or evaluation throws.
 *
 * @param testTreebank The treebank to parse
 * @return The labeled precision/recall F<sub>1</sub> (EVALB measure)
 *         of the parser on the treebank, taken from {@code factLB};
 *         {@code 0.0} when {@code factLB} is {@code null}.
 */
public double testOnTreebank(Treebank testTreebank) {
    log.info("Testing on treebank");
    Timing treebankTotalTimer = new Timing();
    TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = op.langpack();
    PrintWriter pwOut, pwErr;
    if (op.testOptions.quietEvaluation) {
        NullOutputStream quiet = new NullOutputStream();
        pwOut = tlpParams.pw(quiet);
        pwErr = tlpParams.pw(quiet);
    } else {
        pwOut = tlpParams.pw();
        pwErr = tlpParams.pw(System.err);
    }
    if (op.testOptions.verbose) {
        pwErr.print("Testing ");
        pwErr.println(testTreebank.textualSummary(tlp));
    }
    if (op.testOptions.evalb) {
        EvalbFormatWriter.initEVALBfiles(tlpParams);
    }
    PrintWriter pwFileOut = null;
    PrintWriter pwStats = null;
    double f1 = 0.0;
    try {
        if (op.testOptions.writeOutputFiles) {
            String fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
            try {
                pwFileOut = op.tlpParams.pw(new FileOutputStream(fname));
            } catch (IOException ioe) {
                // Best effort: carry on without the output file.
                ioe.printStackTrace();
            }
        }
        if (op.testOptions.outputkBestEquivocation != null) {
            try {
                pwStats = op.tlpParams.pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
            } catch (IOException ioe) {
                // Best effort: carry on without the statistics file.
                ioe.printStackTrace();
            }
        }
        if (op.testOptions.testingThreads != 1) {
            MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
            // Gold trees are queued in submission order and taken from the head as results are polled.
            LinkedList<Tree> goldTrees = new LinkedList<>();
            for (Tree goldTree : testTreebank) {
                List<? extends HasWord> sentence = getInputSentence(goldTree);
                goldTrees.add(goldTree);
                pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
                wrapper.put(sentence);
                while (wrapper.peek()) {
                    ParserQuery pq = wrapper.poll();
                    Tree finishedGold = goldTrees.poll();
                    processResults(pq, finishedGold, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            // for tree iterator
            wrapper.join();
            // Collect the results that were still pending when the treebank ran out.
            while (wrapper.peek()) {
                ParserQuery pq = wrapper.poll();
                Tree finishedGold = goldTrees.poll();
                processResults(pq, finishedGold, pwErr, pwOut, pwFileOut, pwStats, treePrint);
            }
        } else {
            ParserQuery pq = pqFactory.parserQuery();
            for (Tree goldTree : testTreebank) {
                final List<CoreLabel> sentence = getInputSentence(goldTree);
                pwErr.println("Parsing [len. " + sentence.size() + "]: " + SentenceUtils.listToString(sentence));
                pq.parseAndReport(sentence, pwErr);
                processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
            }
            // for tree iterator
        }
        //Done parsing...print the results of the evaluations
        treebankTotalTimer.done("Testing on treebank");
        if (op.testOptions.quietEvaluation) {
            // Parsing was silenced; the evaluation report below still goes to System.err.
            pwErr = tlpParams.pw(System.err);
        }
        if (saidMemMessage) {
            ParserUtils.printOutOfMemory(pwErr);
        }
        if (op.testOptions.evalb) {
            EvalbFormatWriter.closeEVALBfiles();
        }
        if (numSkippedEvals != 0) {
            pwErr.printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
        }
        // only created here so we know what parser types are supported...
        ParserQuery pq = pqFactory.parserQuery();
        if (summary) {
            if (pcfgLB != null) {
                pcfgLB.display(false, pwErr);
            }
            if (pcfgChildSpecific != null) {
                pcfgChildSpecific.display(false, pwErr);
            }
            if (pcfgLA != null) {
                pcfgLA.display(false, pwErr);
            }
            if (pcfgCB != null) {
                pcfgCB.display(false, pwErr);
            }
            if (pcfgDA != null) {
                pcfgDA.display(false, pwErr);
            }
            if (pcfgTA != null) {
                pcfgTA.display(false, pwErr);
            }
            if (pcfgLL != null && pq.getPCFGParser() != null) {
                pcfgLL.display(false, pwErr);
            }
            if (depDA != null) {
                depDA.display(false, pwErr);
            }
            if (depTA != null) {
                depTA.display(false, pwErr);
            }
            if (depLL != null && pq.getDependencyParser() != null) {
                depLL.display(false, pwErr);
            }
            if (factLB != null) {
                factLB.display(false, pwErr);
            }
            if (factChildSpecific != null) {
                factChildSpecific.display(false, pwErr);
            }
            if (factLA != null) {
                factLA.display(false, pwErr);
            }
            if (factCB != null) {
                factCB.display(false, pwErr);
            }
            if (factDA != null) {
                factDA.display(false, pwErr);
            }
            if (factTA != null) {
                factTA.display(false, pwErr);
            }
            if (factLL != null && pq.getFactoredParser() != null) {
                factLL.display(false, pwErr);
            }
            if (pcfgCatE != null) {
                pcfgCatE.display(false, pwErr);
            }
            for (Eval eval : evals) {
                eval.display(false, pwErr);
            }
            for (BestOfTopKEval eval : topKEvals) {
                eval.display(false, pwErr);
            }
        }
        // these ones only have a display mode, so display if turned on!!
        if (pcfgRUO != null) {
            pcfgRUO.display(true, pwErr);
        }
        if (pcfgCUO != null) {
            pcfgCUO.display(true, pwErr);
        }
        if (tsv) {
            // One header row and one value row; a column stays empty when its evaluator is unavailable.
            NumberFormat nf = new DecimalFormat("0.00");
            pwErr.println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
            if (factLB != null) {
                pwErr.print(nf.format(factLB.getEvalbF1Percent()));
            }
            pwErr.print("\t");
            if (pq.getDependencyParser() != null && factDA != null) {
                pwErr.print(nf.format(factDA.getEvalbF1Percent()));
            }
            pwErr.print("\t");
            if (factLB != null) {
                pwErr.print(nf.format(factLB.getExactPercent()));
            }
            pwErr.print("\t");
            if (pcfgLB != null) {
                pwErr.print(nf.format(pcfgLB.getEvalbF1Percent()));
            }
            pwErr.print("\t");
            if (pq.getDependencyParser() != null && depDA != null) {
                pwErr.print(nf.format(depDA.getEvalbF1Percent()));
            }
            pwErr.print("\t");
            if (pq.getPCFGParser() != null && factTA != null) {
                pwErr.print(nf.format(factTA.getEvalbF1Percent()));
            }
            pwErr.print("\t");
            if (factLB != null) {
                pwErr.print(factLB.getNum());
            }
            pwErr.println();
        }
        if (factLB != null) {
            f1 = factLB.getEvalbF1();
        }
    } finally {
        //Close files (if necessary), on failure as well as on success
        if (pwFileOut != null) {
            pwFileOut.close();
        }
        if (pwStats != null) {
            pwStats.close();
        }
    }
    if (parserQueryEvals != null) {
        for (ParserQueryEval parserQueryEval : parserQueryEvals) {
            parserQueryEval.display(false, pwErr);
        }
    }
    return f1;
}
Also used : DecimalFormat(java.text.DecimalFormat) TreePrint(edu.stanford.nlp.trees.TreePrint) Tree(edu.stanford.nlp.trees.Tree) TreebankLanguagePack(edu.stanford.nlp.trees.TreebankLanguagePack) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) LeafAncestorEval(edu.stanford.nlp.parser.metrics.LeafAncestorEval) AbstractEval(edu.stanford.nlp.parser.metrics.AbstractEval) TaggingEval(edu.stanford.nlp.parser.metrics.TaggingEval) TopMatchEval(edu.stanford.nlp.parser.metrics.TopMatchEval) FilteredEval(edu.stanford.nlp.parser.metrics.FilteredEval) Eval(edu.stanford.nlp.parser.metrics.Eval) UnlabeledAttachmentEval(edu.stanford.nlp.parser.metrics.UnlabeledAttachmentEval) BestOfTopKEval(edu.stanford.nlp.parser.metrics.BestOfTopKEval) ParserQueryEval(edu.stanford.nlp.parser.metrics.ParserQueryEval) PrintWriter(java.io.PrintWriter) MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) IOException(java.io.IOException) ParserQueryEval(edu.stanford.nlp.parser.metrics.ParserQueryEval) LinkedList(java.util.LinkedList) ParsingThreadsafeProcessor(edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor) FileOutputStream(java.io.FileOutputStream) Timing(edu.stanford.nlp.util.Timing) BestOfTopKEval(edu.stanford.nlp.parser.metrics.BestOfTopKEval) NullOutputStream(edu.stanford.nlp.io.NullOutputStream) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) NumberFormat(java.text.NumberFormat)

Aggregations

ParserQuery (edu.stanford.nlp.parser.common.ParserQuery)2 ParsingThreadsafeProcessor (edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor)2 Timing (edu.stanford.nlp.util.Timing)2 MulticoreWrapper (edu.stanford.nlp.util.concurrent.MulticoreWrapper)2 FileOutputStream (java.io.FileOutputStream)2 IOException (java.io.IOException)2 PrintWriter (java.io.PrintWriter)2 DecimalFormat (java.text.DecimalFormat)2 NumberFormat (java.text.NumberFormat)2 List (java.util.List)2 NullOutputStream (edu.stanford.nlp.io.NullOutputStream)1 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)1 HasWord (edu.stanford.nlp.ling.HasWord)1 AbstractEval (edu.stanford.nlp.parser.metrics.AbstractEval)1 BestOfTopKEval (edu.stanford.nlp.parser.metrics.BestOfTopKEval)1 Eval (edu.stanford.nlp.parser.metrics.Eval)1 FilteredEval (edu.stanford.nlp.parser.metrics.FilteredEval)1 LeafAncestorEval (edu.stanford.nlp.parser.metrics.LeafAncestorEval)1 ParserQueryEval (edu.stanford.nlp.parser.metrics.ParserQueryEval)1 TaggingEval (edu.stanford.nlp.parser.metrics.TaggingEval)1