Search in sources :

Example 1 with TreebankLangParserParams

Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in the CoreNLP project by stanfordnlp.

From the class TsarfatyEval, method main.

/**
   * Run the scoring metric on guess/gold input. This method performs "Collinization." 
   * The default language is English.
   * 
   * @param args option flags (see {@code usage}) followed by two required
   *             positional arguments: the gold file, then the guess file
   */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    // Defaults: English, no yield limits, quiet, segmentation (not tag) mode.
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    int maxGuessYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    boolean skipGuess = false;
    boolean tagMode = false;
    String guessFile = null;
    String goldFile = null;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-y":
                    maxGoldYield = Integer.parseInt(args[++i].trim());
                    break;
                case "-t":
                    tagMode = true;
                    break;
                case "-v":
                    VERBOSE = true;
                    break;
                case "-g":
                    maxGuessYield = Integer.parseInt(args[++i].trim());
                    skipGuess = true;
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            // Required positional parameters: gold file, then guess file.
            // Guard against a missing second argument, which previously threw
            // ArrayIndexOutOfBoundsException instead of printing usage.
            if (i + 1 >= args.length) {
                System.out.println(usage.toString());
                System.exit(-1);
            }
            goldFile = args[i++];
            guessFile = args[i];
            break;
        }
    }
    // Both positional arguments are mandatory; previously a missing pair led
    // to a NullPointerException in loadPath() below.
    if (goldFile == null || guessFile == null) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
    final TreeTransformer tc = tlpp.collinizer();
    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    for (final Tree guess : guessTreebank) {
        final ArrayList<Label> guessSent = guess.yield();
        // Skip over-long guess trees before doing any transformation or
        // character-yield work on them.
        if (guessSent.size() > maxGuessYield) {
            skippedGuessTrees++;
            continue;
        }
        final Tree evalGuess = tc.transformTree(guess);
        final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
        boolean doneEval = false;
        while (goldItr.hasNext() && !doneEval) {
            final Tree gold = goldItr.next();
            goldLineId++;
            final ArrayList<Label> goldSent = gold.yield();
            // Skip over-long gold trees before computing their character yield.
            if (goldSent.size() > maxGoldYield) {
                continue;
            }
            final Tree evalGold = tc.transformTree(gold);
            final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
            if (goldChars.length() != guessChars.length()) {
                pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
                skippedGuessTrees++;
                //Default evalb behavior -- skip this guess tree
                break;
            }
            eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
            //Move to the next guess parse
            doneEval = true;
        }
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    eval.display(true, pwOut);
    pwOut.println();
    pwOut.close();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) Label(edu.stanford.nlp.ling.Label) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 2 with TreebankLangParserParams

Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in the CoreNLP project by stanfordnlp.

From the class CollinsDepEval, method main.

/**
   * Scores a guess treebank against a gold treebank using the Collins
   * head-dependency metric.
   *
   * @param args option flags followed by the gold file and the guess file
   */
public static void main(String[] args) {
    if (args.length < MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    boolean verbose = PropertiesUtils.getBool(options, "v", false);
    Language lang = PropertiesUtils.get(options, "l", Language.English, Language.class);
    int maxGoldYield = PropertiesUtils.getInt(options, "g", Integer.MAX_VALUE);
    int maxGuessYield = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }
    File goldFile = new File(parsedArgs[0]);
    File guessFile = new File(parsedArgs[1]);
    final TreebankLangParserParams tlpp = lang.params;
    final PrintWriter pwOut = tlpp.pw();
    // Load and summarize both treebanks (guess first, then gold).
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final CollinsDepEval depEval = new CollinsDepEval("CollinsDep", true, tlpp.headFinder(), tlpp.treebankLanguagePack().startSymbol());
    final TreeTransformer tc = tlpp.collinizer();
    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    for (final Tree guessTree : guessTreebank) {
        final Tree transformedGuess = tc.transformTree(guessTree);
        if (guessTree.yield().size() > maxGuessYield) {
            skippedGuessTrees++;
            continue;
        }
        // Advance through the gold trees until one is scored against this
        // guess tree, or a yield mismatch forces us to skip the guess tree.
        boolean scored = false;
        while (!scored && goldItr.hasNext()) {
            final Tree goldTree = goldItr.next();
            final Tree transformedGold = tc.transformTree(goldTree);
            goldLineId++;
            if (goldTree.yield().size() > maxGoldYield) {
                continue;
            }
            if (transformedGold.yield().size() != transformedGuess.yield().size()) {
                pwOut.println("Yield mismatch at gold line " + goldLineId);
                skippedGuessTrees++;
                //Default evalb behavior -- skip this guess tree
                break;
            }
            depEval.evaluate(transformedGuess, transformedGold, (verbose ? pwOut : null));
            //Move to the next guess parse
            scored = true;
        }
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", ((maxGuessYield < Integer.MAX_VALUE) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    depEval.display(true, pwOut);
    pwOut.close();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) File(java.io.File) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 3 with TreebankLangParserParams

Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in the CoreNLP project by stanfordnlp.

From the class LeafAncestorEval, method main.

/**
   * Runs the leaf-ancestor metric over a gold/guess treebank pair.
   * Execute with no arguments for usage.
   */
public static void main(String[] args) {
    if (!validateCommandLine(args)) {
        log.info(USAGE);
        System.exit(-1);
    }
    final TreebankLangParserParams tlpp = LANGUAGE.params;
    final PrintWriter pwOut = tlpp.pw();
    // Load and summarize both treebanks (guess first, then gold).
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final LeafAncestorEval metric = new LeafAncestorEval("LeafAncestor");
    final TreeTransformer tc = tlpp.collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    final Iterator<Tree> guessItr = guessTreebank.iterator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // The two treebanks are walked in lockstep: one gold tree per guess tree.
    while (guessItr.hasNext() && goldItr.hasNext()) {
        final Tree guess = guessItr.next();
        final List<Label> guessYield = guess.yield();
        guessLineId++;
        final Tree gold = goldItr.next();
        final List<Label> goldYield = gold.yield();
        goldLineId++;
        if (goldYield.size() > MAX_GOLD_YIELD) {
            // Over-length gold trees are excluded from scoring.
            skippedGuessTrees++;
        } else if (goldYield.size() != guessYield.size()) {
            // Only trees with equal yields can be evaluated.
            pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
            skippedGuessTrees++;
        } else {
            metric.evaluate(tc.transformTree(guess), tc.transformTree(gold), (VERBOSE ? pwOut : null));
        }
    }
    if (guessItr.hasNext() || goldItr.hasNext()) {
        System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees);
    metric.display(true, pwOut);
    pwOut.close();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) Tree(edu.stanford.nlp.trees.Tree) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 4 with TreebankLangParserParams

Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in the CoreNLP project by stanfordnlp.

From the class GrammaticalStructureConversionUtils, method convertTrees.

/**
   * Given sentences or trees, output the typed dependencies.
   * <p>

   * By default, the method outputs the collapsed typed dependencies with
   * processing of conjuncts. The input can be given as plain text (one sentence
   * by line) using the option -sentFile, or as trees using the option
   * -treeFile. For -sentFile, the input has to be strictly one sentence per
   * line. You can specify where to find a parser with -parserFile
   * serializedParserPath. See LexicalizedParser for more flexible processing of
   * text files (including with Stanford Dependencies output). The above options
   * assume a file as input. You can also feed trees (only) via stdin by using
   * the option -filter.  If one does not specify a -parserFile, one
   * can specify which language pack to use with -tLPP, This option
   * specifies a class which determines which GrammaticalStructure to
   * use, which HeadFinder to use, etc.  It will default to
   * edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams,
   * but any TreebankLangParserParams can be specified.
   * <p>
   * If no method of producing trees is given other than to use the
   * LexicalizedParser, but no parser is specified, a default parser
   * is used, the English parser.  You can specify options to load
   * with the parser using the -parserOpts flag.  If the default
   * parser is used, and no options are provided, the option
   * -retainTmpSubcategories is used.
   * <p>
   * The following options can be used to specify the types of dependencies
   * wanted: </p>
   * <ul>
   * <li> -collapsed collapsed dependencies
   * <li> -basic non-collapsed dependencies that preserve a tree structure
   * <li> -nonCollapsed non-collapsed dependencies that do not preserve a tree
   * structure (the basic dependencies plus the extra ones)
   * <li> -CCprocessed
   * collapsed dependencies and conjunctions processed (dependencies are added
   * for each conjunct) -- this is the default if no options are passed
   * <li> -collapsedTree collapsed dependencies retaining a tree structure
   * <li> -makeCopulaHead Contrary to the approach argued for in the SD papers,
   *  nevertheless make the verb 'to be' the head, not the predicate noun, adjective,
   *  etc. (However, when the verb 'to be' is used as an auxiliary verb, the main
   *  verb is still treated as the head.)
   * <li> -originalDependencies generate the dependencies using the original converter
   * instead of the Universal Dependencies converter.
   * </ul>
   * <p>
   * The {@code -conllx} option will output the dependencies in the CoNLL format,
   * instead of in the standard Stanford format (relation(governor,dependent))
   * and will retain punctuation by default.
   * When used in the "collapsed" format, words such as prepositions, conjunctions
   * which get collapsed into the grammatical relations and are not part of the
   * sentence per se anymore will be annotated with "erased" as grammatical relation
   * and attached to the fake "ROOT" node with index 0.
   * <p/><p>
   * There is also an option to retain dependencies involving punctuation:
   * {@code -keepPunct}
   * </p><p>
   * The {@code -extraSep} option used with -nonCollapsed will print the basic
   * dependencies first, then a separator ======, and then the extra
   * dependencies that do not preserve the tree structure. The -test option is
   * used for debugging: it prints the grammatical structure, as well as the
   * basic, collapsed and CCprocessed dependencies. It also checks the
   * connectivity of the collapsed dependencies. If the collapsed dependencies
   * list doesn't constitute a connected graph, it prints the possible offending
   * nodes (one of them is the real root of the graph).
   * </p><p>
   * Using the -conllxFile, you can pass a file containing Stanford dependencies
   * in the CoNLL format (e.g., the basic dependencies), and obtain another
   * representation using one of the representation options.
   * </p><p>
   * Usage: <br>
   * <code>java edu.stanford.nlp.trees.GrammaticalStructure [-treeFile FILE | -sentFile FILE | -conllxFile FILE | -filter] <br>
   * [-collapsed -basic -CCprocessed -test -generateOriginalDependencies]</code>
   *
   * @param args Command-line arguments, as above
   */
@SuppressWarnings("unchecked")
public static void convertTrees(String[] args, String defaultLang) {
    /* Use a tree normalizer that removes all empty nodes.
       This prevents wrong indexing of the nodes in the dependency relations. */
    // gsBank is assigned directly only when dependencies are read pre-built
    // (the altreader or conllx branches); otherwise it is constructed from
    // trees just before the output loop below.
    Iterable<GrammaticalStructure> gsBank = null;
    Properties props = StringUtils.argsToProperties(args);
    String language = props.getProperty("language", defaultLang);
    ConverterOptions opts = ConverterOptions.getConverterOptions(language);
    MemoryTreebank tb = new MemoryTreebank(opts.treeNormalizer);
    Iterable<Tree> trees = tb;
    String encoding = props.getProperty("encoding", "utf-8");
    // Re-wrap stdout so that every dependency printed below honors the
    // requested encoding.
    try {
        System.setOut(new PrintStream(System.out, true, encoding));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    // Extract all recognized command-line options from the Properties map.
    String treeFileName = props.getProperty("treeFile");
    String sentFileName = props.getProperty("sentFile");
    String conllXFileName = props.getProperty("conllxFile");
    String altDepPrinterName = props.getProperty("altprinter");
    String altDepReaderName = props.getProperty("altreader");
    String altDepReaderFilename = props.getProperty("altreaderfile");
    String filter = props.getProperty("filter");
    boolean makeCopulaHead = props.getProperty("makeCopulaHead") != null;
    boolean generateOriginalDependencies = props.getProperty("originalDependencies") != null || opts.stanfordDependencies;
    // TODO: if a parser is specified, load this from the parser
    // instead of ever loading it from this way
    String tLPP = props.getProperty("tLPP", opts.tlPPClassName);
    TreebankLangParserParams params = ReflectionLoading.loadByReflection(tLPP);
    params.setGenerateOriginalDependencies(generateOriginalDependencies);
    if (makeCopulaHead) {
        // TODO: generalize and allow for more options
        String[] options = { "-makeCopulaHead" };
        params.setOptionFlag(options, 0);
    }
    // Choose the input source. Exactly one branch runs: no source given ->
    // print usage and fall back to a tiny built-in example tree; altreader ->
    // pre-built dependencies; treeFile/filter -> constituency trees;
    // conllx -> dependencies in CoNLL format; otherwise parse raw sentences.
    if (sentFileName == null && (altDepReaderName == null || altDepReaderFilename == null) && treeFileName == null && conllXFileName == null && filter == null) {
        try {
            System.err.printf("Usage: java %s%n", GrammaticalStructure.class.getCanonicalName());
            System.err.println("Options:");
            System.err.println("  Dependency representation:");
            System.err.println("    -basic:\t\tGenerate basic dependencies.");
            System.err.println("    -enhanced:\t\tGenerate enhanced dependencies, currently only implemented for English UD.");
            System.err.println("    -enhanced++:\tGenerate enhanced++ dependencies (default), currently only implemented for English UD.");
            System.err.println("    -collapsed:\t\tGenerate collapsed dependencies, deprecated.");
            System.err.println("    -CCprocessed:\tGenerate CC-processed dependencies, deprecated.");
            System.err.println("    -collapsedTree:\tGenerate collapsed-tree dependencies, deprecated.");
            System.err.println("");
            System.err.println("  Input:");
            System.err.println("    -treeFile <FILE>:\tConvert from constituency trees in <FILE>");
            System.err.println("    -sentFile <FILE>:\tParse and convert sentences from <FILE>. Only implemented for English.");
            System.err.println("");
            System.err.println("  Output:");
            System.err.println("    -conllx:\t\tOutput dependencies in CoNLL format.");
            System.err.println("");
            System.err.println("  Language:");
            System.err.println("    -language [en|zh|en-sd|zh-sd]:\t (Universal English Dependencies, Universal Chinese Dependencies, English Stanford Dependencies, Chinese Stanford Dependencies)");
            System.err.println("");
            System.err.println("");
            System.err.println("");
            System.err.println("Example:");
            // Demo input so the example run below still produces output.
            TreeReader tr = new PennTreeReader(new StringReader("((S (NP (NNP Sam)) (VP (VBD died) (NP-TMP (NN today)))))"));
            tb.add(tr.readTree());
        } catch (Exception e) {
            log.info("Horrible error: " + e);
            e.printStackTrace();
        }
    } else if (altDepReaderName != null && altDepReaderFilename != null) {
        DependencyReader altDepReader = loadAlternateDependencyReader(altDepReaderName);
        try {
            gsBank = altDepReader.readDependencies(altDepReaderFilename);
        } catch (IOException e) {
            log.info("Error reading " + altDepReaderFilename);
            return;
        }
    } else if (treeFileName != null) {
        tb.loadPath(treeFileName);
    } else if (filter != null) {
        // -filter: read trees from stdin.
        tb.load(IOUtils.readerFromStdin());
    } else if (conllXFileName != null) {
        try {
            gsBank = params.readGrammaticalStructureFromFile(conllXFileName);
        } catch (RuntimeIOException e) {
            log.info("Error reading " + conllXFileName);
            return;
        }
    } else {
        // -sentFile: parse sentences lazily with a (possibly default) parser.
        String parserFile = props.getProperty("parserFile");
        String parserOpts = props.getProperty("parserOpts");
        boolean tokenized = props.getProperty("tokenized") != null;
        Function<List<? extends HasWord>, Tree> lp = loadParser(parserFile, parserOpts, makeCopulaHead);
        trees = new LazyLoadTreesByParsing(sentFileName, encoding, tokenized, lp);
        // necessarily have to use LexicalizedParser
        // NOTE(review): getTLPParams is looked up reflectively, so this assumes
        // lp exposes that method (i.e. is a LexicalizedParser) -- confirm.
        try {
            Method method = lp.getClass().getMethod("getTLPParams");
            params = (TreebankLangParserParams) method.invoke(lp);
            params.setGenerateOriginalDependencies(generateOriginalDependencies);
        } catch (Exception cnfe) {
            throw new RuntimeException(cnfe);
        }
    }
    // treats the output according to the options passed
    boolean basic = props.getProperty("basic") != null;
    boolean collapsed = props.getProperty("collapsed") != null;
    boolean CCprocessed = props.getProperty("CCprocessed") != null;
    boolean collapsedTree = props.getProperty("collapsedTree") != null;
    boolean nonCollapsed = props.getProperty("nonCollapsed") != null;
    boolean extraSep = props.getProperty("extraSep") != null;
    boolean parseTree = props.getProperty("parseTree") != null;
    boolean test = props.getProperty("test") != null;
    //always keep punctuation marks
    boolean keepPunct = true;
    boolean conllx = props.getProperty("conllx") != null;
    // todo: Support checkConnected on more options (including basic)
    boolean checkConnected = props.getProperty("checkConnected") != null;
    boolean portray = props.getProperty("portray") != null;
    boolean enhanced = props.getProperty("enhanced") != null;
    boolean enhancedPlusPlus = props.getProperty("enhanced++") != null;
    // If requested load alternative printer
    DependencyPrinter altDepPrinter = null;
    if (altDepPrinterName != null) {
        altDepPrinter = loadAlternateDependencyPrinter(altDepPrinterName);
    }
    // log.info("First tree in tb is");
    // log.info(((MemoryTreebank) tb).get(0));
    Method m = null;
    if (test) {
        // Do this by reflection to avoid this becoming a dependency when we distribute the parser
        try {
            Class sgf = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphFactory");
            m = sgf.getDeclaredMethod("makeFromTree", GrammaticalStructure.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, Predicate.class, String.class, int.class);
        } catch (Exception e) {
            log.info("Test cannot check for cycles in tree format (classes not available)");
        }
    }
    // Dependencies were not read pre-built: derive them from the trees.
    if (gsBank == null) {
        gsBank = new TreeBankGrammaticalStructureWrapper(trees, keepPunct, params);
    }
    // Main output loop: one GrammaticalStructure per input tree/sentence.
    for (GrammaticalStructure gs : gsBank) {
        Tree tree;
        if (gsBank instanceof TreeBankGrammaticalStructureWrapper) {
            // log.info("Using TreeBankGrammaticalStructureWrapper branch");
            tree = ((TreeBankGrammaticalStructureWrapper) gsBank).getOriginalTree(gs);
        // log.info("Tree is: ");
        // log.info(t);
        } else {
            // log.info("Using gs.root() branch");
            // recover tree
            tree = gs.root();
        // log.info("Tree from gs is");
        // log.info(t);
        }
        if (test) {
            // print the grammatical structure, the basic, collapsed and CCprocessed
            System.out.println("============= parse tree =======================");
            tree.pennPrint();
            System.out.println();
            System.out.println("------------- GrammaticalStructure -------------");
            System.out.println(gs);
            boolean allConnected = true;
            boolean connected;
            // Collects possible roots of the first disconnected dependency list found.
            Collection<TypedDependency> bungRoots = null;
            System.out.println("------------- basic dependencies ---------------");
            List<TypedDependency> gsb = gs.typedDependencies(GrammaticalStructure.Extras.NONE);
            System.out.println(StringUtils.join(gsb, "\n"));
            connected = GrammaticalStructure.isConnected(gsb);
            if (!connected && bungRoots == null) {
                bungRoots = GrammaticalStructure.getRoots(gsb);
            }
            allConnected = connected && allConnected;
            System.out.println("------------- non-collapsed dependencies (basic + extra) ---------------");
            List<TypedDependency> gse = gs.typedDependencies(GrammaticalStructure.Extras.MAXIMAL);
            System.out.println(StringUtils.join(gse, "\n"));
            connected = GrammaticalStructure.isConnected(gse);
            if (!connected && bungRoots == null) {
                bungRoots = GrammaticalStructure.getRoots(gse);
            }
            allConnected = connected && allConnected;
            System.out.println("------------- collapsed dependencies -----------");
            System.out.println(StringUtils.join(gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), "\n"));
            System.out.println("------------- collapsed dependencies tree -----------");
            System.out.println(StringUtils.join(gs.typedDependenciesCollapsedTree(), "\n"));
            System.out.println("------------- CCprocessed dependencies --------");
            List<TypedDependency> gscc = gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL);
            System.out.println(StringUtils.join(gscc, "\n"));
            System.out.println("-----------------------------------------------");
            // connectivity tests
            connected = GrammaticalStructure.isConnected(gscc);
            if (!connected && bungRoots == null) {
                bungRoots = GrammaticalStructure.getRoots(gscc);
            }
            allConnected = connected && allConnected;
            if (allConnected) {
                System.out.println("dependencies form connected graphs.");
            } else {
                System.out.println("dependency graph NOT connected! possible offending nodes: " + bungRoots);
            }
            // libraries
            // m was resolved reflectively above; null means the semgraph
            // classes are unavailable and the DAG check is skipped.
            if (m != null) {
                try {
                    // the first arg is null because it's a static method....
                    Object semGraph = m.invoke(null, gs, false, true, false, false, false, false, null, null, 0);
                    Class sg = Class.forName("edu.stanford.nlp.semgraph.SemanticGraph");
                    Method mDag = sg.getDeclaredMethod("isDag");
                    boolean isDag = (Boolean) mDag.invoke(semGraph);
                    System.out.println("tree dependencies form a DAG: " + isDag);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } else // end of "test" output
        {
            // Normal (non-test) output: print each requested representation,
            // preceded by a header when more than one representation is shown.
            if (parseTree) {
                System.out.println("============= parse tree =======================");
                tree.pennPrint();
                System.out.println();
            }
            if (basic) {
                if (collapsed || CCprocessed || collapsedTree || nonCollapsed || enhanced || enhancedPlusPlus) {
                    System.out.println("------------- basic dependencies ---------------");
                }
                if (altDepPrinter == null) {
                    printDependencies(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree, conllx, false, opts.convertToUPOS);
                } else {
                    System.out.println(altDepPrinter.dependenciesToString(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree));
                }
            }
            if (nonCollapsed) {
                if (basic || CCprocessed || collapsed || collapsedTree) {
                    System.out.println("----------- non-collapsed dependencies (basic + extra) -----------");
                }
                printDependencies(gs, gs.allTypedDependencies(), tree, conllx, extraSep, opts.convertToUPOS);
            }
            if (collapsed) {
                if (basic || CCprocessed || collapsedTree || nonCollapsed) {
                    System.out.println("----------- collapsed dependencies -----------");
                }
                printDependencies(gs, gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
            }
            if (CCprocessed) {
                if (basic || collapsed || collapsedTree || nonCollapsed) {
                    System.out.println("---------- CCprocessed dependencies ----------");
                }
                List<TypedDependency> deps = gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL);
                if (checkConnected) {
                    if (!GrammaticalStructure.isConnected(deps)) {
                        log.info("Graph is not connected for:");
                        log.info(tree);
                        log.info("possible offending nodes: " + GrammaticalStructure.getRoots(deps));
                    }
                }
                printDependencies(gs, deps, tree, conllx, false, opts.convertToUPOS);
            }
            if (collapsedTree) {
                if (basic || CCprocessed || collapsed || nonCollapsed) {
                    System.out.println("----------- collapsed dependencies tree -----------");
                }
                printDependencies(gs, gs.typedDependenciesCollapsedTree(), tree, conllx, false, opts.convertToUPOS);
            }
            if (enhanced) {
                if (basic || enhancedPlusPlus) {
                    System.out.println("----------- enhanced dependencies tree -----------");
                }
                printDependencies(gs, gs.typedDependenciesEnhanced(), tree, conllx, false, opts.convertToUPOS);
            }
            if (enhancedPlusPlus) {
                if (basic || enhanced) {
                    System.out.println("----------- enhanced++ dependencies tree -----------");
                }
                printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
            }
            // default use: enhanced++ for UD, CCprocessed for SD (to parallel what happens within the parser)
            if (!basic && !collapsed && !CCprocessed && !collapsedTree && !nonCollapsed && !enhanced && !enhancedPlusPlus) {
                if (generateOriginalDependencies) {
                    printDependencies(gs, gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
                } else {
                    printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
                }
            }
        }
        if (portray) {
            try {
                // put up a window showing it
                Class sgu = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphUtils");
                Method mRender = sgu.getDeclaredMethod("render", GrammaticalStructure.class, String.class);
                // the first arg is null because it's a static method....
                mRender.invoke(null, gs, "Collapsed, CC processed deps");
            } catch (Exception e) {
                throw new RuntimeException("Couldn't use swing to portray semantic graph", e);
            }
        }
    }
// end for
}
Also used : TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Predicate(java.util.function.Predicate) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) Method(java.lang.reflect.Method) InvocationTargetException(java.lang.reflect.InvocationTargetException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException)

Example 5 with TreebankLangParserParams

use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.

the class TreebankFactoredLexiconStats method main.

//  private static String stripTag(String tag) {
//    if (tag.startsWith("DT")) {
//      String newTag = tag.substring(2, tag.length());
//      return newTag.length() > 0 ? newTag : tag;
//    }
//    return tag;
//  }
/**
   * Computes and prints lexicon statistics (token, word-type, tag, lemma,
   * rich-morphological-tag, and reduced-feature-tag counts) for a factored
   * Arabic or French treebank.
   *
   * @param args exactly three arguments:
   *             (1) a {@code Language} constant name (Arabic or French),
   *             (2) the treebank path, and
   *             (3) a comma-separated list of {@code MorphoFeatureType}
   *                 names to activate in the morphological feature spec.
   */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.printf("Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName());
        System.exit(-1);
    }
    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = language.params;
    // Only Arabic and French are supported here; any non-Arabic language is
    // configured with the French factored options.
    if (language.equals(Language.Arabic)) {
        String[] options = { "-arabicFactored" };
        tlpp.setOptionFlag(options, 0);
    } else {
        String[] options = { "-frenchFactored" };
        tlpp.setOptionFlag(options, 0);
    }
    Treebank tb = tlpp.diskTreebank();
    tb.loadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    String[] features = args[2].trim().split(",");
    for (String feature : features) {
        morphoSpec.activate(MorphoFeatureType.valueOf(feature));
    }
    // Counters (initial capacities are rough size hints, not limits)
    Counter<String> wordTagCounter = new ClassicCounter<>(30000);
    Counter<String> morphTagCounter = new ClassicCounter<>(500);
    //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
    Counter<String> morphCounter = new ClassicCounter<>(500);
    Counter<String> wordCounter = new ClassicCounter<>(30000);
    Counter<String> tagCounter = new ClassicCounter<>(300);
    Counter<String> lemmaCounter = new ClassicCounter<>(25000);
    Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);
    Counter<String> richTagCounter = new ClassicCounter<>(1000);
    Counter<String> reducedTagCounter = new ClassicCounter<>(500);
    Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);
    Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();
    TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000);
    TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500);
    TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300);
    int numTrees = 0;
    for (Tree tree : tb) {
        // Apply the language-specific tree transformation to every internal node.
        for (Tree subTree : tree) {
            if (!subTree.isLeaf()) {
                tlpp.transformTree(subTree, tree);
            }
        }
        List<Label> pretermList = tree.preTerminalYield();
        List<Label> yield = tree.yield();
        assert yield.size() == pretermList.size();
        int yieldLen = yield.size();
        for (int i = 0; i < yieldLen; ++i) {
            String tag = pretermList.get(i).value();
            String word = yield.get(i).value();
            // The morphological analysis is stashed in the CoreLabel's originalText field.
            String morph = ((CoreLabel) yield.get(i)).originalText();
            // Note: if there is no lemma, then we use the surface form.
            Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
            String lemma = lemmaTag.first();
            String richTag = lemmaTag.second();
            // WSGDEBUG
            if (tag.contains("MW"))
                lemma += "-MWE";
            lemmaCounter.incrementCount(lemma);
            lemmaTagCounter.incrementCount(lemma + tag);
            richTagCounter.incrementCount(richTag);
            String reducedTag = morphoSpec.strToFeatures(richTag).toString();
            reducedTagCounter.incrementCount(reducedTag);
            reducedTagLemmaCounter.incrementCount(reducedTag + lemma);
            wordTagCounter.incrementCount(word + tag);
            morphTagCounter.incrementCount(morph + tag);
            morphCounter.incrementCount(morph);
            wordCounter.incrementCount(word);
            tagCounter.incrementCount(tag);
            reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            if (wordLemmaMap.containsKey(word)) {
                wordLemmaMap.get(word).add(lemma);
            } else {
                Set<String> lemmas = Generics.newHashSet(1);
                // Bug fix: record the first lemma seen for this word. Without this
                // add(), every word's set started empty, so single-occurrence words
                // were all reported as "NO LEMMAS FOR WORD" below and the first
                // lemma of every word was silently dropped.
                lemmas.add(lemma);
                wordLemmaMap.put(word, lemmas);
            }
            lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
            reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.incrementCount(tag, reducedTag);
        }
        ++numTrees;
    }
    // Barf...
    System.out.println("Language: " + language.toString());
    System.out.printf("#trees:\t%d%n", numTrees);
    System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
    System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
    System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
    System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
    System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
    System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
    System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
    System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
    System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
    System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
    System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());
    // Extra diagnostics: words with zero or multiple lemmas, and words whose
    // single lemma maps to more than one reduced (feature) tag.
    System.out.println("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
        String word = wordLemmas.getKey();
        Set<String> lemmas = wordLemmas.getValue();
        if (lemmas.size() == 0) {
            sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.size() > 1) {
            sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
            continue;
        }
        String lemma = lemmas.iterator().next();
        Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
        if (reducedTags.size() > 1) {
            System.out.printf("%s --> %s%n", word, lemma);
            for (String reducedTag : reducedTags) {
                int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
                String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
                System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.out.println();
        }
    }
    System.out.println("==================");
    System.out.println(sbNoLemma.toString());
    System.out.println(sbMultLemmas.toString());
    System.out.println("==================");
    // Dump, per POS tag (sorted), the distribution of reduced feature tags.
    List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
    Collections.sort(tags);
    for (String tag : tags) {
        System.out.println(tag);
        Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
        for (String reducedTag : reducedTags) {
            int count = tagReducedTagCounter.getCount(tag, reducedTag);
            //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.out.printf("\t%s\t%d%n", reducedTag, count);
        }
        System.out.println();
    }
    System.out.println("==================");
}
Also used : FrenchMorphoFeatureSpecification(edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification) Set(java.util.Set) Treebank(edu.stanford.nlp.trees.Treebank) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) MorphoFeatureSpecification(edu.stanford.nlp.international.morph.MorphoFeatureSpecification) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) FrenchMorphoFeatureSpecification(edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification) TwoDimensionalIntCounter(edu.stanford.nlp.stats.TwoDimensionalIntCounter) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Map(java.util.Map)

Aggregations

TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)17 Tree (edu.stanford.nlp.trees.Tree)15 Language (edu.stanford.nlp.international.Language)14 PrintWriter (java.io.PrintWriter)13 Label (edu.stanford.nlp.ling.Label)8 EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams)8 DiskTreebank (edu.stanford.nlp.trees.DiskTreebank)7 Treebank (edu.stanford.nlp.trees.Treebank)7 ArrayList (java.util.ArrayList)7 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)6 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)6 Properties (java.util.Properties)6 CoreLabel (edu.stanford.nlp.ling.CoreLabel)5 Map (java.util.Map)3 ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification)1 FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification)1 MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification)1 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)1 Lexicon (edu.stanford.nlp.parser.lexparser.Lexicon)1 Options (edu.stanford.nlp.parser.lexparser.Options)1