Search in sources :

Example 6 with Triple

use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.

the class NERDemo method main.

/**
 * Demo entry point: loads a sequence classifier and prints its NER output in several forms.
 *
 * <p>The model path defaults to a hard-coded location and is replaced by {@code args[0]}
 * when at least one argument is given. When a second argument is given, the file named by
 * {@code args[1]} is processed; otherwise two hard-coded example sentences are used.
 *
 * <p>NOTE(review): this listing looks lossy - closing braces, the terminators of the two
 * block comments, and some statements (for example the bodies of the back-to-back
 * {@code for} headers) are not present here. Any nesting described in the comments below
 * is inferred from indentation only.
 *
 * @param args optional: {@code args[0]} = serialized classifier path, {@code args[1]} = file to tag
 * @throws Exception declared by the signature; presumably raised by model loading or file
 *         reading - confirm against the called APIs
 */
public static void main(String[] args) throws Exception {
    // Default model location; overridden by the first command-line argument when supplied.
    String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
    if (args.length > 0) {
        serializedClassifier = args[0];
    // Load the classifier from whichever path was chosen above.
    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);
    // File mode: args[1] names the input file.
    if (args.length > 1) {
        /* For the file, it shows (1) how to run NER on a String, (2) how
         to get the entities in the String with character offsets, and
         (3) how to run NER on a whole file (without loading it into a String).
        // (1) Read the file into a String, classify it, and print each token as word/answer.
        String fileContents = IOUtils.slurpFile(args[1]);
        List<List<CoreLabel>> out = classifier.classify(fileContents);
        for (List<CoreLabel> sentence : out) {
            for (CoreLabel word : sentence) {
                System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        // (3) Same printing, but classifying straight from the file path.
        out = classifier.classifyFile(args[1]);
        for (List<CoreLabel> sentence : out) {
            for (CoreLabel word : sentence) {
                System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        // (2) Each triple is used as (first, second, third): first is printed as a label,
        // second and third are used as begin/end indices into fileContents.
        List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
        for (Triple<String, Integer, Integer> item : list) {
            System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third()));
        // k-best labelings (k = 10) and per-token probabilities for the same file,
        // both using the classifier's plain-text reader/writer.
        System.out.println("Ten best entity labelings");
        DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
        classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);
        System.out.println("Per-token marginalized probabilities");
        classifier.printProbs(args[1], readerAndWriter);
    // -- This code prints out the first order (token pair) clique probabilities.
    // -- But that output is a bit overwhelming, so we leave it commented out by default.
    // System.out.println("---");
    // System.out.println("First Order Clique Probabilities");
    // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);
    } else {
        /* For the hard-coded String, it shows how to run it on a single
         sentence, and how to do this and produce several formats, including
         slash tags and an inline XML output format. It also shows the full
         contents of the {@code CoreLabel}s that are constructed by the
         classifier. And it shows getting out the probabilities of different
         assignments and an n-best list of classifications with probabilities.
        // Hard-coded input sentences used when no file argument is given.
        String[] example = { "Good afternoon Rajat Raina, how are you today?", "I go to school at Stanford University, which is located in California." };
        // NOTE(review): two consecutive loop headers with nothing between them - a statement
        // appears to be missing from this listing.
        for (String str : example) {
        for (String str : example) {
            // This one puts in spaces and newlines between tokens, so just print not println.
            System.out.print(classifier.classifyToString(str, "slashTags", false));
        for (String str : example) {
            // This one is best for dealing with the output as a TSV (tab-separated column) file.
            // The first column gives entities, the second their classes, and the third the remaining text in a document
            System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
        // NOTE(review): again two consecutive loop headers with no visible body for the first.
        for (String str : example) {
        for (String str : example) {
            // "xml" output format, with the third argument set to true (unlike the other formats here).
            System.out.println(classifier.classifyToString(str, "xml", true));
        for (String str : example) {
            // "tsv" output format.
            System.out.print(classifier.classifyToString(str, "tsv", false));
        // This gets out entities with character offsets
        // j is printed as the sentence index; no increment of j is visible in this listing.
        int j = 0;
        for (String str : example) {
            List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
            for (Triple<String, Integer, Integer> trip : triples) {
                // Reads trip.third as a field, whereas first and second go through accessor methods.
                System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n", trip.first(), trip.second(), trip.third, j);
        // This prints out all the details of what is stored for each token
        // i is a running counter printed as a prefix for each CoreLabel, across all sentences.
        int i = 0;
        for (String str : example) {
            for (List<CoreLabel> lcl : classifier.classify(str)) {
                for (CoreLabel cl : lcl) {
                    System.out.print(i++ + ": ");
Also used : Triple(edu.stanford.nlp.util.Triple) CoreLabel(edu.stanford.nlp.ling.CoreLabel) List(java.util.List)

Example 7 with Triple

use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.

the class ChineseSimWordAvgDepGrammar method probTBwithSimWords.

/**
 * An alternative kind of smoothing.
 * The first one is "probSimilarWordAvg", implemented by Galen.
 * This one tries to modify "probTB" in MLEDependencyGrammar using the simWords list we have.
 * -pichuan
 */
// Scores a single dependency. The score interpolates (weight: interp) a word-level
// estimate - smoothed with counts pooled over "similar" head words and with the
// tag-only-head estimate - against a tag-level back-off estimate, and multiplies the
// result by the probability of not stopping.
//
// Visible return values: 0.0 when the argument word is -2 and the head is the root; the
// stop probability when the argument word is -2 otherwise; 1.0 when
// op.testOptions.prunePunc is set and pruneTW(aTW) holds; otherwise the score, forced to
// 0.0 if it is NaN or below MIN_PROBABILITY.
//
// NOTE(review): closing braces and the bodies of the two sim2arg/sim2head loops are not
// present in this listing; block nesting described here is inferred from indentation.
private double probTBwithSimWords(IntDependency dependency) {
    // Direction is only kept when the grammar is directional.
    boolean leftHeaded = dependency.leftHeaded && directional;
    // Word id -1 combined with the original tag: tag-only stand-ins for head and argument.
    IntTaggedWord unknownHead = new IntTaggedWord(-1, dependency.head.tag);
    IntTaggedWord unknownArg = new IntTaggedWord(-1, dependency.arg.tag);
    if (verbose) {
        System.out.println("Generating " + dependency);
    short distance = dependency.distance;
    // int hW = dependency.head.word;
    // int aW = dependency.arg.word;
    IntTaggedWord aTW = dependency.arg;
    // IntTaggedWord hTW = dependency.head;
    double pb_stop_hTWds = getStopProb(dependency);
    boolean isRoot = rootTW(dependency.head);
    // Argument word id -2 is handled as the stop case.
    if (dependency.arg.word == -2) {
        // did we generate stop?
        if (isRoot) {
            return 0.0;
        return pb_stop_hTWds;
    // Probability of continuing (not stopping); forced to 1.0 for a root head.
    double pb_go_hTWds = 1.0 - pb_stop_hTWds;
    if (isRoot) {
        pb_go_hTWds = 1.0;
    // generate the argument
    int valenceBinDistance = valenceBin(distance);
    // KEY:
    // c_     count of
    // p_     MLE prob of
    // pb_    MAP prob of
    // a      arg
    // h      head
    // T      tag
    // W      word
    // d      direction
    // ds     distance
    // Counts below are read from argCounter, one per head/arg combination named in the KEY;
    // "temp" is reused as the lookup key each time.
    IntDependency temp = new IntDependency(dependency.head, dependency.arg, leftHeaded, valenceBinDistance);
    double c_aTW_hTWd = argCounter.getCount(temp);
    temp = new IntDependency(dependency.head, unknownArg, leftHeaded, valenceBinDistance);
    double c_aT_hTWd = argCounter.getCount(temp);
    temp = new IntDependency(dependency.head, wildTW, leftHeaded, valenceBinDistance);
    double c_hTWd = argCounter.getCount(temp);
    temp = new IntDependency(unknownHead, dependency.arg, leftHeaded, valenceBinDistance);
    double c_aTW_hTd = argCounter.getCount(temp);
    temp = new IntDependency(unknownHead, unknownArg, leftHeaded, valenceBinDistance);
    double c_aT_hTd = argCounter.getCount(temp);
    temp = new IntDependency(unknownHead, wildTW, leftHeaded, valenceBinDistance);
    double c_hTd = argCounter.getCount(temp);
    // Head-independent argument counts are looked up with direction false and distance -1.
    temp = new IntDependency(wildTW, dependency.arg, false, -1);
    double c_aTW = argCounter.getCount(temp);
    temp = new IntDependency(wildTW, unknownArg, false, -1);
    double c_aT = argCounter.getCount(temp);
    // do the magic
    // Relative frequencies; each falls back to a constant when its guard count is zero.
    double p_aTW_hTd = (c_hTd > 0.0 ? c_aTW_hTd / c_hTd : 0.0);
    double p_aT_hTd = (c_hTd > 0.0 ? c_aT_hTd / c_hTd : 0.0);
    // Note: the guard tests c_aTW while the divisor is c_aT.
    double p_aTW_aT = (c_aTW > 0.0 ? c_aTW / c_aT : 1.0);
    // = (c_aTW_hTWd + smooth_aTW_hTWd * p_aTW_hTd) / (c_hTWd + smooth_aTW_hTWd);
    double pb_aTW_hTWd;
    // Tag-level argument given word-level head, smoothed toward the tag-only-head estimate.
    double pb_aT_hTWd = (c_aT_hTWd + smooth_aT_hTWd * p_aT_hTd) / (c_hTWd + smooth_aT_hTWd);
    // = (interp * pb_aTW_hTWd + (1.0 - interp) * p_aTW_aT * pb_aT_hTWd) * pb_go_hTWds;
    double score;
    /* smooth by simWords -pichuan */
    // Similar-word lists are keyed by (word id, basic category of the tag).
    List<Triple<Integer, String, Double>> sim2arg = simArgMap.get(new Pair<>(dependency.arg.word, stringBasicCategory(dependency.arg.tag)));
    List<Triple<Integer, String, Double>> sim2head = simHeadMap.get(new Pair<>(dependency.head.word, stringBasicCategory(dependency.head.tag)));
    List<Integer> simArg = new ArrayList<>();
    List<Integer> simHead = new ArrayList<>();
    // NOTE(review): the bodies of the next two loops are missing from this listing, so no
    // visible statement fills simArg or simHead; simArg is also never read below.
    if (sim2arg != null) {
        for (Triple<Integer, String, Double> t : sim2arg) {
    if (sim2head != null) {
        for (Triple<Integer, String, Double> t : sim2head) {
    // Pool counts over every similar head word, keeping the original head tag.
    double cSim_aTW_hTd = 0;
    double cSim_hTd = 0;
    for (int h : simHead) {
        IntTaggedWord hWord = new IntTaggedWord(h, dependency.head.tag);
        // NOTE(review): these lookups use dependency.leftHeaded / dependency.distance directly,
        // unlike the counts above, which use leftHeaded / valenceBinDistance - confirm intended.
        temp = new IntDependency(hWord, dependency.arg, dependency.leftHeaded, dependency.distance);
        cSim_aTW_hTd += argCounter.getCount(temp);
        temp = new IntDependency(hWord, wildTW, dependency.leftHeaded, dependency.distance);
        cSim_hTd += argCounter.getCount(temp);
    // P(Wa,Ta|Th)
    double pSim_aTW_hTd = (cSim_hTd > 0.0 ? cSim_aTW_hTd / cSim_hTd : 0.0);
    if (debug) {
        //if (simHead.size() > 0 && cSim_hTd == 0.0) {
        if (pSim_aTW_hTd > 0.0) {
            //System.out.println("# simHead("+dependency.head.word+"-"+wordNumberer.object(dependency.head.word)+") =\t"+cSim_hTd);
            System.out.println(dependency + "\t" + pSim_aTW_hTd);
    //pb_aTW_hTWd = (c_aTW_hTWd + smooth_aTW_hTWd * pSim_aTW_hTd + smooth_aTW_hTWd * p_aTW_hTd) / (c_hTWd + smooth_aTW_hTWd + smooth_aTW_hTWd);
    //if (pSim_aTW_hTd > 0.0) {
    // Hard-coded local smoothing weights: one for the similar-word term, one (twice as
    // large) for the tag-only-head term.
    double smoothSim_aTW_hTWd = 17.7;
    double smooth_aTW_hTWd = 17.7 * 2;
    //smooth_aTW_hTWd = smooth_aTW_hTWd*2;
    pb_aTW_hTWd = (c_aTW_hTWd + smoothSim_aTW_hTWd * pSim_aTW_hTd + smooth_aTW_hTWd * p_aTW_hTd) / (c_hTWd + smoothSim_aTW_hTWd + smooth_aTW_hTWd);
    // NOTE(review): by indentation these three prints sit outside any debug/verbose guard,
    // i.e. they would run on every call - confirm whether that is intended.
    System.out.println(c_aTW_hTWd + " + " + smoothSim_aTW_hTWd + " * " + pSim_aTW_hTd + " + " + smooth_aTW_hTWd + " * " + p_aTW_hTd);
    System.out.println("--------------------------------  = " + pb_aTW_hTWd);
    System.out.println(c_hTWd + " + " + smoothSim_aTW_hTWd + " + " + smooth_aTW_hTWd);
    //pb_aT_hTWd = (c_aT_hTWd + smooth_aT_hTWd * p_aT_hTd) / (c_hTWd + smooth_aT_hTWd);
    // Interpolate the word-level and tag-level estimates, then scale by the go probability.
    score = (interp * pb_aTW_hTWd + (1.0 - interp) * p_aTW_aT * pb_aT_hTWd) * pb_go_hTWds;
    if (verbose) {
        NumberFormat nf = NumberFormat.getNumberInstance();
        System.out.println("  c_aTW_hTWd: " + c_aTW_hTWd + "; c_aT_hTWd: " + c_aT_hTWd + "; c_hTWd: " + c_hTWd);
        System.out.println("  c_aTW_hTd: " + c_aTW_hTd + "; c_aT_hTd: " + c_aT_hTd + "; c_hTd: " + c_hTd);
        System.out.println("  Generated with pb_go_hTWds: " + nf.format(pb_go_hTWds) + " pb_aTW_hTWd: " + nf.format(pb_aTW_hTWd) + " p_aTW_aT: " + nf.format(p_aTW_aT) + " pb_aT_hTWd: " + nf.format(pb_aT_hTWd));
        System.out.println("  NoDist score: " + score);
    // With prunePunc enabled, arguments accepted by pruneTW get a fixed score of 1.0.
    if (op.testOptions.prunePunc && pruneTW(aTW)) {
        return 1.0;
    // Sanitize the result: NaN and values below MIN_PROBABILITY both become 0.0.
    if (Double.isNaN(score)) {
        score = 0.0;
    if (score < MIN_PROBABILITY) {
        score = 0.0;
    return score;
Also used : Triple(edu.stanford.nlp.util.Triple) NumberFormat(java.text.NumberFormat)

Example 8 with Triple

use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.

the class LexicalizedParser method main.

   * A main program for using the parser with various options.
   * This program can be used for building and serializing
   * a parser from treebank data, for parsing sentences from a file
   * or URL using a serialized or text grammar parser,
   * and (mainly for parser quality testing)
   * for training and testing a parser on a treebank all in one go.
   * <p>
   * Sample Usages:
   * <ul>
   *   <li> <b>Train a parser (saved to <i>serializedGrammarFilename</i>)
   *      from a directory of trees (<i>trainFilesPath</i>, with an optional <i>fileRange</i>, e.g., 0-1000):</b>
   *    {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -saveToSerializedFile serializedGrammarFilename}
   *   </li>
   *   <li> <b>Train a parser (not saved) from a directory of trees, and test it (reporting scores) on a directory of trees</b>
   *    {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -testTreebank testFilePath [fileRange] }
   *   </li>
   *   <li> <b>Parse one or more files, given a serialized grammar and a list of files</b>
   *    {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] serializedGrammarPath filename [filename]*}
   *   </li>
   *   <li> <b>Test and report scores for a serialized grammar on trees in an output directory</b>
   *    {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -loadFromSerializedFile serializedGrammarPath -testTreebank testFilePath [fileRange]}
   *   </li>
   * </ul>
   * If the {@code serializedGrammarPath} ends in {@code .gz},
   * then the grammar is written and read as a compressed file (GZip).
   * If the {@code serializedGrammarPath} is a URL, starting with
   * {@code http://}, then the parser is read from the URL.
   * A fileRange specifies a numeric value that must be included within a
   * filename for it to be used in training or testing (this works well with
   * most current treebanks).  It can be specified like a range of pages to be
   * printed, for instance as {@code 200-2199} or
   * {@code 1-300,500-725,9000} or just as {@code 1} (if all your
   * trees are in a single file, either omit this parameter or just give a dummy
   * argument such as {@code 0}).
   * If the filename to parse is "-" then the parser parses from stdin.
   * If no files are supplied to parse, then a hardwired sentence
   * is parsed.
   * <p>
   * The parser can write a grammar as either a serialized Java object file
   * or in a text format (or as both), specified with the following options:
   * <blockquote>{@code
   * java edu.stanford.nlp.parser.lexparser.LexicalizedParser
   * [-v] -train
   * trainFilesPath [fileRange] [-saveToSerializedFile grammarPath]
   * [-saveToTextFile grammarPath]
   * }</blockquote>
   * <p>
   * In the same position as the verbose flag ({@code -v}), many other
   * options can be specified.  The most useful to an end user are:
   * <ul>
   * <LI>{@code -tLPP class} Specify a different
   * TreebankLangParserParams, for when using a different language or
   * treebank (the default is English Penn Treebank). <i>This option MUST occur
   * before any other language-specific options that are used (or else they
   * are ignored!).</i>
   * (It's usually a good idea to specify this option even when loading a
   * serialized grammar; it is necessary if the language pack specifies a
   * needed character encoding or you wish to specify language-specific
   * options on the command line.)</LI>
   * <LI>{@code -encoding charset} Specify the character encoding of the
   * input and output files.  This will override the value in the
   * {@code TreebankLangParserParams}, provided this option appears
   * <i>after</i> any {@code -tLPP} option.</LI>
   * <LI>{@code -tokenized} Says that the input is already separated
   * into whitespace-delimited tokens.  If this option is specified, any
   * tokenizer specified for the language is ignored, and a universal (Unicode)
   * tokenizer, which divides only on whitespace, is used.
   * Unless you also specify
   * {@code -escaper}, the tokens <i>must</i> all be correctly
   * tokenized tokens of the appropriate treebank for the parser to work
   * well (for instance, if using the Penn English Treebank, you must have
   * coded "(" as "-LRB-", "3/4" as "3\/4", etc.)</LI>
   * <li>{@code -escaper class} Specify a class of type
   * {@link Function}&lt;List&lt;HasWord&gt;,List&lt;HasWord&gt;&gt; to do
   * customized escaping of tokenized text.  This class will be run over the
   * tokenized text and can fix the representation of tokens. For instance,
   * it could change "(" to "-LRB-" for the Penn English Treebank.  A
   * provided escaper that does such things for the Penn English Treebank is
   * {@code edu.stanford.nlp.process.PTBEscapingProcessor}.</li>
   * <li>{@code -tokenizerFactory class} Specifies a
   * TokenizerFactory class to be used for tokenization</li>
   * <li>{@code -tokenizerOptions options} Specifies options to a
   * TokenizerFactory class to be used for tokenization.   A comma-separated
   * list. For PTBTokenizer, options of interest include
   * {@code americanize=false} and {@code asciiQuotes} (for German).
   * Note that any choice of tokenizer options that conflicts with the
   * tokenization used in the parser training data will likely degrade parser
   * performance. </li>
   * <li>{@code -sentences token } Specifies a token that marks sentence
   * boundaries.  A value of {@code newline} causes sentence breaking on
   * newlines.  A value of {@code onePerElement} causes each element
   * (using the XML {@code -parseInside} option) to be treated as a
   * sentence. All other tokens will be interpreted literally, and must be
   * exactly the same as tokens returned by the tokenizer.  For example,
   * you might specify "|||" and put that symbol sequence as a token between
   * sentences.
   * If no explicit sentence breaking option is chosen, sentence breaking
   * is done based on a set of language-particular sentence-ending patterns.
   * </li>
   * <LI>{@code -parseInside element} Specifies that parsing should only
   * be done for tokens inside the indicated XML-style
   * elements (done as simple pattern matching, rather than XML parsing).
   * For example, if this is specified as {@code sentence}, then
   * the text inside the {@code sentence} element
   * would be parsed.
   * Using "-parseInside s" gives you support for the input format of
   * Charniak's parser. Sentences cannot span elements. Whether the
   * contents of the element are treated as one sentence or potentially
   * multiple sentences is controlled by the {@code -sentences} flag.
   * The default is potentially multiple sentences.
   * This option gives support for extracting and parsing
   * text from very simple SGML and XML documents, and is provided as a
   * user convenience for that purpose. If you want to really parse XML
   * documents before NLP parsing them, you should use an XML parser, and then
   * call to a LexicalizedParser on appropriate CDATA.
   * <LI>{@code -tagSeparator char} Specifies to look for tags on words
   * following the word and separated from it by a special character
   * {@code char}.  For instance, many tagged corpora have the
   * representation "house/NN" and you would use {@code -tagSeparator /}.
   * Notes: This option requires that the input be pretokenized.
   * The separator has to be only a single character, and there is no
   * escaping mechanism. However, splitting is done on the <i>last</i>
   * instance of the character in the token, so that cases like
   * "3\/4/CD" are handled correctly.  The parser will in all normal
   * circumstances use the tag you provide, but will override it in the
   * case of very common words in cases where the tag that you provide
   * is not one that it regards as a possible tagging for the word.
   * The parser supports a format where only some of the words in a sentence
   * have a tag (if you are calling the parser programmatically, you indicate
   * them by having them implement the {@code HasTag} interface).
   * You can do this at the command-line by only having tags after some words,
   * but you are limited by the fact that there is no way to escape the
   * tagSeparator character.</LI>
   * <LI>{@code -maxLength leng} Specify the longest sentence that
   * will be parsed (and hence indirectly the amount of memory
   * needed for the parser). If this is not specified, the parser will
   * try to dynamically grow its parse chart when long sentences are
   * encountered, but may run out of memory trying to do so.</LI>
   * <LI>{@code -outputFormat styles} Choose the style(s) of output
   * sentences: {@code penn} for prettyprinting as in the Penn
   * treebank files, or {@code oneline} for printing sentences one
   * per line, {@code words}, {@code wordsAndTags},
   * {@code dependencies}, {@code typedDependencies},
   * or {@code typedDependenciesCollapsed}.
   * Multiple options may be specified as a comma-separated
   * list.  See TreePrint class for further documentation.</LI>
   * <LI>{@code -outputFormatOptions} Provide options that control the
   * behavior of various {@code -outputFormat} choices, such as
   * {@code lexicalize}, {@code stem}, {@code markHeadNodes},
   * or {@code xml}.  {@link edu.stanford.nlp.trees.TreePrint}
   * Options are specified as a comma-separated list.</LI>
   * <LI>{@code -writeOutputFiles} Write output files corresponding
   * to the input files, with the same name but a {@code ".stp"}
   * file extension.  The format of these files depends on the
   * {@code outputFormat} option.  (If not specified, output is sent
   * to stdout.)</LI>
   * <LI>{@code -outputFilesExtension} The extension that is appended to
   * the filename that is being parsed to produce an output file name (with the
   * -writeOutputFiles option). The default is {@code stp}.  Don't
   * include the period.
   * <LI>{@code -outputFilesDirectory} The directory in which output
   * files are written (when the -writeOutputFiles option is specified).
   * If not specified, output files are written in the same directory as the
   * input files.
   * <LI>{@code -nthreads} Parsing files and testing on treebanks
   * can use multiple threads.  This option tells the parser how many
   * threads to use.  A negative number indicates to use as many
   * threads as the machine has cores.
   * </ul>
   * See also the package documentation for more details and examples of use.
   * @param args Command line arguments, as above
public static void main(String[] args) {
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank testTreebank = null;
    Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilter = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;
    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    // whether or not the input file has already been tokenized
    boolean tokenized = false;
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {"Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
    Options op = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encoding = null;
    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            train = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPath = treebankDescription.first();
            trainFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-train2")) {
            // train = true;     // cdm july 2005: should require -train for this
            Triple<String, FileFilter, Double> treebankDescription = ArgUtils.getWeightedTreebankDescription(args, argIndex, "-train2");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            secondaryTreebankPath = treebankDescription.first();
            secondaryTrainFilter = treebankDescription.second();
            secondaryTreebankWeight = treebankDescription.third();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            try {
                op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
            } catch (ClassNotFoundException e) {
      "Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException e) {
      "Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
      "Illegal access" + e);
                throw new RuntimeException(e);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            // redone later to override any serialized parser one read in
            encoding = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
            tokenized = true;
            argIndex += 1;
        } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
            try {
                escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
            } catch (Exception e) {
      "Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
            tokenizerOptions = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
            tokenizerFactoryClass = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
            tokenizerMethod = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
            sentenceDelimiter = args[argIndex + 1];
            if (sentenceDelimiter.equalsIgnoreCase("newline")) {
                sentenceDelimiter = "\n";
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
            elementDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
            tagDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") || args[argIndex].equalsIgnoreCase("-model")) {
            // load the parser from a binary serialized file
            // the next argument must be the path to the parser file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
            // load the parser from declarative text file
            // the next argument must be the path to the parser file
            textInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
            saveToSerializedFile = true;
            if (ArgUtils.numSubArgs(args, argIndex) < 1) {
      "Missing path: -saveToSerialized filename");
            } else {
                serializedOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
            // save the parser to declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
            // save the training trees to a binary file
            op.trainOptions.trainTreeFile = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPath = treebankDescription.first();
            testFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tune")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            tunePath = treebankDescription.first();
            tuneFilter = treebankDescription.second();
        } else {
            int oldIndex = argIndex;
            argIndex = op.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
    if (tuneFilter != null || tunePath != null) {
        if (tunePath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No tune treebank path specified...");
            } else {
      "No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');
                tunePath = treebankPath;
        tuneTreebank = op.tlpParams.testMemoryTreebank();
        tuneTreebank.loadPath(tunePath, tuneFilter);
    if (!train && op.testOptions.verbose) {
        StringUtils.logInvocationString(log, args);
    // always initialized in next if-then-else block
    LexicalizedParser lp;
    if (train) {
        StringUtils.logInvocationString(log, args);
        // so we train a parser using the treebank
        GrammarCompactor compactor = null;
        if (op.trainOptions.compactGrammar() == 3) {
            compactor = new ExactGrammarCompactor(op, false, false);
        Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
        Treebank secondaryTrainTreebank = null;
        if (secondaryTreebankPath != null) {
            secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
        List<List<TaggedWord>> extraTaggedWords = null;
        if (op.trainOptions.taggedFiles != null) {
            extraTaggedWords = new ArrayList<>();
            List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
            for (TaggedFileRecord record : fileRecords) {
                for (List<TaggedWord> sentence : record.reader()) {
        lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords);
    } else if (textInputFileOrUrl != null) {
        // so we load the parser from a text grammar file
        lp = getParserFromTextFile(textInputFileOrUrl, op);
    } else {
        // so we load a serialized parser
        if (serializedInputFileOrUrl == null && argIndex < args.length) {
            // the next argument must be the path to the serialized parser
            serializedInputFileOrUrl = args[argIndex];
        if (serializedInputFileOrUrl == null) {
  "No grammar specified, exiting...");
        String[] extraArgs = new String[optionArgs.size()];
        extraArgs = optionArgs.toArray(extraArgs);
        try {
            lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
            op = lp.op;
        } catch (IllegalArgumentException e) {
  "Error loading parser, exiting...");
            throw e;
    // set up tokenizerFactory with options if provided
    if (tokenizerFactoryClass != null || tokenizerOptions != null) {
        try {
            if (tokenizerFactoryClass != null) {
                Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
                Method factoryMethod;
                if (tokenizerOptions != null) {
                    factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
                } else {
                    factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
            } else {
                // have options but no tokenizer factory.  use the parser
                // langpack's factory and set its options
                tokenizerFactory = lp.op.langpack().getTokenizerFactory();
        } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
  "Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
            throw new RuntimeException(e);
    if (encoding != null) {
    if (testFilter != null || testPath != null) {
        if (testPath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
      "No test treebank path specified.  Using train path: \"" + treebankPath + '\"');
                testPath = treebankPath;
        testTreebank = op.tlpParams.testMemoryTreebank();
        testTreebank.loadPath(testPath, testFilter);
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    // Now what do we do with the parser we've made
    if (saveToTextFile) {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null) {
        } else {
  "Usage: must specify a text grammar output path");
    if (saveToSerializedFile) {
        if (serializedOutputFileOrUrl != null) {
        } else if (textOutputFileOrUrl == null && testTreebank == null) {
            // no saving/parsing request has been specified
  "usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
    if (op.testOptions.verbose || train) {
        // Tell the user a little or a lot about what we have made
        // get lexicon size separately as it may have its own prints in it....
        String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";"Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");"Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t' + ( != null ? : "") + '\t' + ( != null ? : "") + '\t' + lexNumRules);"ParserPack is " + op.tlpParams.getClass().getName());"Lexicon is " + lp.lex.getClass().getName());
        if (op.testOptions.verbose) {
  "Tags are: " + lp.tagIndex);
        //"States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out by the below printOptions command if the flag -printStates is given (at training time)!
        printOptions(false, op);
    if (testTreebank != null) {
        // test parser on treebank
        EvaluateTreebank evaluator = new EvaluateTreebank(lp);
    } else if (argIndex >= args.length) {
        // no more arguments, so we just parse our own test sentence
        PrintWriter pwOut =;
        PrintWriter pwErr =;
        ParserQuery pq = lp.parserQuery();
        if (pq.parse(op.tlpParams.defaultTestSentence())) {
            lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
        } else {
            pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
    } else {
        // We parse filenames given by the remaining arguments
        ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp);
Also used : TaggedFileRecord(edu.stanford.nlp.tagger.io.TaggedFileRecord) Pair(edu.stanford.nlp.util.Pair) HasWord(edu.stanford.nlp.ling.HasWord) TokenizerFactory(edu.stanford.nlp.process.TokenizerFactory) Method(java.lang.reflect.Method) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) InvocationTargetException(java.lang.reflect.InvocationTargetException) InvocationTargetException(java.lang.reflect.InvocationTargetException) Triple(edu.stanford.nlp.util.Triple) TaggedWord(edu.stanford.nlp.ling.TaggedWord) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery)

Example 9 with Triple

use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.

the class TreeAnnotatorAndBinarizer method getAnnotatedBinaryTreebankFromTreebank.

/** @return A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank. */
public static Triple<Treebank, Treebank, Treebank> getAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op) {
    // Purpose: runs the train treebank (always) and the secondary and tune treebanks (when
    // non-null) through one shared CompositeTreeTransformer and returns the three results
    // as a Triple in the order (train, secondary, tune).
    // NOTE(review): this listing appears to have lost lines during extraction -- closing
    // braces, logging calls, and the bodies of several if-blocks are not visible. Confirm
    // every detail against the upstream CoreNLP source before relying on it.
    // setup tree transforms
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
    // Verbose mode prints a label for the train treebank and, when present, the secondary one.
    if (op.testOptions.verbose) {
        PrintWriter pwErr =;
        pwErr.print("Training ");
        if (secondaryTreebank != null) {
            pwErr.print("Secondary training ");
    // The single transformer that is applied to every treebank at the end of this method.
    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    // NOTE(review): the bodies of the next two conditionals are missing here; presumably they
    // register the pre-transformer and the Collins punctuation transformer -- TODO confirm.
    if (op.trainOptions.preTransformer != null) {
    if (op.trainOptions.collinsPunc) {
        CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }"Binarizing trees...");
    // Pick the binarizer constructor from the leftToRight training option; the leftToRight
    // variant additionally receives the language pack's head finder and a LeftHeadFinder.
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), !op.trainOptions.predictSplits, op);
    } else {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams.headFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), !op.trainOptions.predictSplits, op);
    // NOTE(review): how wordFunctionTransformer is used is not visible in this listing.
    if (op.wordFunction != null) {
        TreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
    // wholeTreebank is the data the split statistics below are gathered from: the train
    // treebank alone, or train and secondary combined when a secondary treebank was given.
    Treebank wholeTreebank;
    if (secondaryTreebank == null) {
        wholeTreebank = trainTreebank;
    } else {
        wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    // Selective split: store the split categories computed from wholeTreebank into
    // op.trainOptions.splitters, then let removeDeleteSplittersFromSplitters adjust them.
    if (op.trainOptions.selectiveSplit) {
        op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        removeDeleteSplittersFromSplitters(tlp, op);
        if (op.testOptions.verbose) {
            List<String> list = new ArrayList<>(op.trainOptions.splitters);
  "Parent split categories: " + list);
    // Selective post-split: wholeTreebank is replaced by its TreeAnnotator-transformed version
    // and the resulting split categories are stored into op.trainOptions.postSplitters.
    if (op.trainOptions.selectivePostSplit) {
        // Do all the transformations once just to learn selective splits on annotated categories
        TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
        wholeTreebank = wholeTreebank.transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
        if (op.testOptions.verbose) {
  "Parent post annotation split categories: " + op.trainOptions.postSplitters);
    if (op.trainOptions.hSelSplit) {
        // We run through all the trees once just to gather counts for hSelSplit!
        // printTreeTransformations is saved, forced to 0 for the counting pass, then restored.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        // NOTE(review): the loop body is missing from this listing.
        for (Tree tree : wholeTreebank) {
        op.trainOptions.printTreeTransformations = ptt;
    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.transform(trainTransformer);
    if (secondaryTreebank != null) {
        secondaryTreebank = secondaryTreebank.transform(trainTransformer);
    // NOTE(review): the bodies of the two print* conditionals are missing from this listing.
    if (op.trainOptions.printAnnotatedStateCounts) {
    if (op.trainOptions.printAnnotatedRuleCounts) {
    if (tuneTreebank != null) {
        tuneTreebank = tuneTreebank.transform(trainTransformer);
    if (op.testOptions.verbose) {
    // secondaryTreebank and tuneTreebank are passed through as null when they were null on entry.
    return new Triple<>(trainTreebank, secondaryTreebank, tuneTreebank);
Also used : Triple(edu.stanford.nlp.util.Triple) PrintWriter(java.io.PrintWriter)

Example 10 with Triple

use of edu.stanford.nlp.util.Triple in project CoreNLP by stanfordnlp.

the class Evalb method main.

/**
   * Run the Evalb scoring metric on guess/gold input. The default language is English.
   *
   * @param args command-line options followed by the gold file path and the guess file path
   */
public static void main(String[] args) {
    // Purpose: command-line entry point that reads a gold and a guess treebank, walks them in
    // lockstep, scores each tree pair with the Evalb metric, and prints the results.
    // NOTE(review): this listing appears to have lost lines during extraction -- closing
    // braces, usage/exit calls, the iterator next() calls, and the counter updates are not
    // visible. Confirm every detail against the upstream CoreNLP source.
    if (args.length < minArgs) {;
    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    final TreebankLangParserParams tlpp = language.params;
    final int maxGoldYield = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
    final boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
    // The "s" option does double duty: its presence turns on sorting by F1, and its integer
    // value (default 0) is the number of worst trees to emit.
    final boolean sortByF1 = PropertiesUtils.hasProperty(options, "s");
    int worstKTreesToEmit = PropertiesUtils.getInt(options, "s", 0);
    // The queue only exists when sorting was requested; it is null otherwise.
    PriorityQueue<Triple<Double, Tree, Tree>> queue = sortByF1 ? new PriorityQueue<>(2000, new F1Comparator()) : null;
    boolean doCatLevel = PropertiesUtils.getBool(options, "c", false);
    String labelRegex = options.getProperty("f", null);
    String encoding = options.getProperty("e", "UTF-8");
    // Positional arguments are read from the empty-string property and split on whitespace.
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != minArgs) {;
    // First positional argument is the gold file, second is the guess file.
    String goldFile = parsedArgs[0];
    String guessFile = parsedArgs[1];
    // Command-line has been parsed. Configure the metric for evaluation.
    final PrintWriter pwOut =;
    // NOTE(review): the statements that load guessFile/goldFile into these treebanks are not
    // visible in this listing.
    final Treebank guessTreebank = tlpp.diskTreebank();
    pwOut.println("GUESS TREEBANK:");
    final Treebank goldTreebank = tlpp.diskTreebank();
    pwOut.println("GOLD TREEBANK:");
    final Evalb metric = new Evalb("Evalb LP/LR", true);
    // Per-category evaluation is optional; evalbCat stays null unless the "c" option is set.
    final EvalbByCat evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
    final TreeTransformer tc = tlpp.collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    final Iterator<Tree> guessItr = guessTreebank.iterator();
    // NOTE(review): these counters start at 0; their increments are not visible in this listing.
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // Consume both treebanks in lockstep until either one runs out.
    while (guessItr.hasNext() && goldItr.hasNext()) {
        // NOTE(review): the initializers were lost; presumably guessItr.next() / goldItr.next().
        Tree guessTree =;
        List<Label> guessYield = guessTree.yield();
        Tree goldTree =;
        List<Label> goldYield = goldTree.yield();
        // Check that we should evaluate this tree
        if (goldYield.size() > maxGoldYield) {
        // Only trees with equal yields can be evaluated
        if (goldYield.size() != guessYield.size()) {
            pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
        // Both trees go through the collinizer before scoring.
        final Tree evalGuess = tc.transformTree(guessTree);
        final Tree evalGold = tc.transformTree(goldTree);
        // The writer is handed to evaluate() only in verbose mode; null is passed otherwise.
        metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
        if (doCatLevel)
            evalbCat.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
        // The untransformed trees are stored together with the F1 of the pair just scored.
        if (sortByF1)
            storeTrees(queue, guessTree, goldTree, metric.getLastF1());
    // Trees left over in either iterator mean the two files differ in length.
    if (guessItr.hasNext() || goldItr.hasNext()) {
        // NOTE(review): the format string places the '.' after %n, so the period is printed at
        // the start of the following line -- confirm whether that is intended.
        System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    metric.display(true, pwOut);
    if (doCatLevel) {
        evalbCat.display(true, pwOut);
    if (sortByF1)
        emitSortedTrees(queue, worstKTreesToEmit, guessFile);
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Triple(edu.stanford.nlp.util.Triple) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)


Triple (edu.stanford.nlp.util.Triple)12 CoreLabel (edu.stanford.nlp.ling.CoreLabel)4 Pair (edu.stanford.nlp.util.Pair)3 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)2 Tree (edu.stanford.nlp.trees.Tree)2 CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)2 PrintWriter (java.io.PrintWriter)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 TransducerGraph (edu.stanford.nlp.fsm.TransducerGraph)1 Language (edu.stanford.nlp.international.Language)1 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)1 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 HasWord (edu.stanford.nlp.ling.HasWord)1 Label (edu.stanford.nlp.ling.Label)1 TaggedWord (edu.stanford.nlp.ling.TaggedWord)1 SequenceMatchResult (edu.stanford.nlp.ling.tokensregex.SequenceMatchResult)1 TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher)1 TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern)1 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)1