Search in sources :

Example 1 with StringLabelFactory

use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.

the class ParentAnnotationStats method main.

/**
 * Calculate parent annotation statistics suitable for doing
 * selective parent splitting in the PCFGParser inside
 * FactoredParser.  <p>
 * Usage: java edu.stanford.nlp.parser.lexparser.ParentAnnotationStats
 * [-tags] [-cutOff value] treebankPath
 * <p>
 * With {@code -cutOff value}, the set returned by {@code getSplitCategories}
 * (called with {@code value} for both cutoff arguments) is printed instead of
 * the full statistics.
 *
 * @param args Optional flags ({@code -tags}, {@code -cutOff <number>})
 *             followed by the path to the Treebank
 */
public static void main(String[] args) {
    final String usage = "Usage: java edu.stanford.nlp.parser.lexparser.ParentAnnotationStats [-tags] treebankPath";
    if (args.length < 1) {
        System.out.println(usage);
        return;
    }
    boolean doTags = false;
    boolean useCutOff = false;
    double cutOff = 0.0;
    int i = 0;
    // Check the bound before dereferencing: an argument list consisting only of
    // options would otherwise run off the end of the array.
    while (i < args.length && args[i].startsWith("-")) {
        if (args[i].equals("-tags")) {
            doTags = true;
            i++;
        } else if (args[i].equals("-cutOff") && i + 1 < args.length) {
            useCutOff = true;
            cutOff = Double.parseDouble(args[i + 1]);
            i += 2;
        } else {
            // Also reached by a trailing "-cutOff" that has no value after it.
            log.info("Unknown option: " + args[i]);
            i++;
        }
    }
    if (i >= args.length) {
        // Every argument was consumed as an option, so no treebank path is left.
        System.out.println(usage);
        return;
    }
    Treebank treebank = new DiskTreebank(in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new StringLabelFactory()), new BobChrisTreeNormalizer()));
    treebank.loadPath(args[i]);
    if (useCutOff) {
        Set<String> splitters = getSplitCategories(treebank, doTags, 0, cutOff, cutOff, null);
        System.out.println(splitters);
    } else {
        ParentAnnotationStats pas = new ParentAnnotationStats(null, doTags);
        treebank.apply(pas);
        pas.printStats();
    }
}
Also used : StringLabelFactory(edu.stanford.nlp.ling.StringLabelFactory)

Example 2 with StringLabelFactory

use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.

the class SisterAnnotationStats method main.

/**
 * Calculate sister annotation statistics suitable for doing
 * selective sister splitting in the PCFGParser inside the
 * FactoredParser.
 * <p>
 * Usage: SisterAnnotationStats treebankPath [encoding]
 *
 * @param args The path to the Treebank, optionally followed by the character
 *             encoding used to read it (defaults to UTF-8)
 */
public static void main(String[] args) {
    // NOTE(review): this block prints the KL divergence of a fixed two-entry
    // counter against itself before any argument is looked at. It looks like a
    // leftover sanity check; kept so the program's output is unchanged.
    ClassicCounter<String> c = new ClassicCounter<>();
    c.setCount("A", 0);
    c.setCount("B", 1);
    double d = Counters.klDivergence(c, c);
    System.out.println("KL Divergence: " + d);
    if (args.length < 1) {
        // The message previously named ParentAnnotationStats and omitted the
        // optional encoding argument.
        System.out.println("Usage: SisterAnnotationStats treebankPath [encoding]");
        return;
    }
    String encoding = (args.length > 1) ? args[1] : "UTF-8";
    SisterAnnotationStats pas = new SisterAnnotationStats();
    Treebank treebank = new DiskTreebank(in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new StringLabelFactory()), new BobChrisTreeNormalizer()), encoding);
    treebank.loadPath(args[0]);
    treebank.apply(pas);
    pas.printStats();
}
Also used : StringLabelFactory(edu.stanford.nlp.ling.StringLabelFactory) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter)

Example 3 with StringLabelFactory

use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.

the class TregexPattern method main.

/**
 * Prints out all matches of a tree pattern on each tree in the path. Usage:
 *
 * {@code
 * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h <node-name>]]* pattern filepath
 * }
 *
 * Arguments:
 *
 * <ul>
 * <li>{@code pattern}: the tree
 * pattern which optionally names some set of nodes (i.e., gives it the "handle") {@code =name} (for some arbitrary
 * string "name")
 * <li> {@code filepath}: the path to files with trees. If this is a directory, there will be recursive descent and the pattern will be run on all files beneath the specified directory.
 * </ul>
 *
 * Options:
 *
 * <ul>
 * <li> {@code -C} suppresses printing of matches, so only the
 * number of matches is printed.
 * <li> {@code -w} causes ONLY the whole of a tree that matches to be printed.
 * <li> {@code -W} causes the whole of a tree that matches to be printed ALSO.
 * <li> {@code -f} causes the filename to be printed.
 * <li> {@code -i <filename>} causes the pattern to be matched to be read from {@code <filename>} rather than the command line.  Don't specify a pattern when this option is used.
 * <li> {@code -o} Specifies that each tree node can be reported only once as the root of a match (by default a node will
 * be printed once for every <em>way</em> the pattern matches).
 * <li> {@code -s} causes trees to be printed all on one line (by default they are pretty printed).
 * <li> {@code -n} causes the number of the tree in which the match was found to be
 * printed before every match.
 * <li> {@code -u} causes only the label of each matching node to be printed, not complete subtrees.
 * <li> {@code -t} causes only the yield (terminal words) of the selected node to be printed (or the yield of the whole tree, if the {@code -w} option is used).
 * <li> {@code -encoding <charset_encoding>} option allows specification of character encoding of trees.
 * <li> {@code -h <node-handle>} If a {@code -h} option is given, the root tree node will not be printed.  Instead,
 * for each {@code node-handle} specified, the node matched and given that handle will be printed.  Multiple nodes can be printed by using the
 * {@code -h} option multiple times on a single command line.
 * <li> {@code -hf <headfinder-class-name>} use the specified {@link HeadFinder} class to determine headship relations.
 * <li> {@code -hfArg <string>} pass a string argument in to the {@link HeadFinder} class's constructor.  {@code -hfArg} can be used multiple times to pass in multiple arguments.
 * <li> {@code -trf <TreeReaderFactory-class-name>} use the specified {@link TreeReaderFactory} class to read trees from files.
 * <li> {@code -e <extension>} Only attempt to read files with the given extension. If not provided, will attempt to read all files.</li>
 * <li> {@code -v} print every tree that contains no matches of the specified pattern, but print no matches to the pattern.
 *
 * <li> {@code -x} Instead of the matched subtree, print the matched subtree's identifying number as defined in {@code tgrep2}: a
 * unique identifier for the subtree and is in the form s:n, where s is an integer specifying
 * the sentence number in the corpus (starting with 1), and n is an integer giving the order
 * in which the node is encountered in a depth-first search starting with 1 at top node in the
 * sentence tree.
 *
 * <li> {@code -extract <code> <tree-file>} extracts the subtree s:n specified by {@code code} from the specified {@code tree-file}.
 *     Overrides all other behavior of tregex.  Can't specify multiple encodings etc. yet.
 * <li> {@code -extractFile <code-file> <tree-file>} extracts every subtree specified by the subtree codes in
 *     {@code code-file}, which must appear exactly one per line, from the specified {@code tree-file}.
 *     Overrides all other behavior of tregex. Can't specify multiple encodings etc. yet.
 * <li> {@code -filter} causes this to act as a filter, reading tree input from stdin
 * <li> {@code -T} causes all trees to be printed as processed (for debugging purposes).  Otherwise only matching nodes are printed.
 * <li> {@code -macros <filename>} filename with macro substitutions to use.  file with tab separated lines original-tab-replacement
 * </ul>
 *
 * @param args command-line options followed by the pattern and, optionally, the tree file path
 * @throws IOException if an I/O problem occurs before matching starts, e.g. the
 *         requested encoding is not supported when the error writer is created
 */
public static void main(String[] args) throws IOException {
    Timing.startTime();
    StringBuilder treePrintFormats = new StringBuilder();
    String printNonMatchingTreesOption = "-v";
    String subtreeCodeOption = "-x";
    String extractSubtreesOption = "-extract";
    String extractSubtreesFileOption = "-extractFile";
    String inputFileOption = "-i";
    String headFinderOption = "-hf";
    String headFinderArgOption = "-hfArg";
    String trfOption = "-trf";
    String extensionOption = "-e";
    String extension = null;
    String headFinderClassName = null;
    String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY;
    String treeReaderFactoryClassName = null;
    String printHandleOption = "-h";
    // Registered below as taking two arguments, but not otherwise read in this method.
    String markHandleOption = "-k";
    String encodingOption = "-encoding";
    String encoding = "UTF-8";
    String macroOption = "-macros";
    String macroFilename = "";
    String yieldOnly = "-t";
    String printAllTrees = "-T";
    String quietMode = "-C";
    String wholeTreeOnlyMode = "-w";
    String wholeTreeAlsoMode = "-W";
    String filenameOption = "-f";
    String oneMatchPerRootNodeMode = "-o";
    String reportTreeNumbers = "-n";
    String rootLabelOnly = "-u";
    String oneLine = "-s";
    String uniqueTrees = "-q";
    // Maps each recognized flag to the number of arguments it consumes.
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(extractSubtreesOption, 2);
    flagMap.put(extractSubtreesFileOption, 2);
    flagMap.put(subtreeCodeOption, 0);
    flagMap.put(printNonMatchingTreesOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(inputFileOption, 1);
    flagMap.put(printHandleOption, 1);
    flagMap.put(markHandleOption, 2);
    flagMap.put(headFinderOption, 1);
    flagMap.put(headFinderArgOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(extensionOption, 1);
    flagMap.put(macroOption, 1);
    flagMap.put(yieldOnly, 0);
    flagMap.put(quietMode, 0);
    flagMap.put(wholeTreeOnlyMode, 0);
    flagMap.put(wholeTreeAlsoMode, 0);
    flagMap.put(printAllTrees, 0);
    flagMap.put(filenameOption, 0);
    flagMap.put(oneMatchPerRootNodeMode, 0);
    flagMap.put(reportTreeNumbers, 0);
    flagMap.put(rootLabelOnly, 0);
    flagMap.put(oneLine, 0);
    flagMap.put(uniqueTrees, 0);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    // From here on, args holds only what argsToMap stored under the null key
    // (used below as the pattern and the file path).
    args = argsMap.get(null);
    if (argsMap.containsKey(encodingOption)) {
        encoding = argsMap.get(encodingOption)[0];
        log.info("Encoding set to " + encoding);
    }
    PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);
    if (argsMap.containsKey(extractSubtreesOption)) {
        List<String> subTreeStrings = Collections.singletonList(argsMap.get(extractSubtreesOption)[0]);
        extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesOption)[1]);
        return;
    }
    if (argsMap.containsKey(extractSubtreesFileOption)) {
        String[] extractFileArgs = argsMap.get(extractSubtreesFileOption);
        // First argument is the file of subtree codes, one per line. "\r\n" is tried
        // first so Windows line endings do not produce empty codes between lines.
        List<String> subTreeStrings = Arrays.asList(IOUtils.slurpFile(extractFileArgs[0]).split("\r\n|\n|\r"));
        // Second argument is the tree file; previously the code file was passed again by mistake.
        extractSubtrees(subTreeStrings, extractFileArgs[1]);
        return;
    }
    if (args.length < 1) {
        errPW.println("Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-W] [-f] [-o] [-n] [-s] [-filter]  [-hf class] [-trf class] [-h handle]* [-e ext] pattern [filepath]");
        return;
    }
    String matchString = args[0];
    if (argsMap.containsKey(macroOption)) {
        macroFilename = argsMap.get(macroOption)[0];
    }
    if (argsMap.containsKey(headFinderOption)) {
        headFinderClassName = argsMap.get(headFinderOption)[0];
        errPW.println("Using head finder " + headFinderClassName + "...");
    }
    if (argsMap.containsKey(headFinderArgOption)) {
        headFinderArgs = argsMap.get(headFinderArgOption);
    }
    if (argsMap.containsKey(trfOption)) {
        treeReaderFactoryClassName = argsMap.get(trfOption)[0];
        errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "...");
    }
    if (argsMap.containsKey(extensionOption)) {
        extension = argsMap.get(extensionOption)[0];
    }
    if (argsMap.containsKey(printAllTrees)) {
        TRegexTreeVisitor.printTree = true;
    }
    if (argsMap.containsKey(inputFileOption)) {
        String inputFile = argsMap.get(inputFileOption)[0];
        matchString = IOUtils.slurpFile(inputFile, encoding);
        // Shift the remaining arguments right by one so the later index checks
        // (args.length == 1, args[last]) behave as if a pattern had been given
        // on the command line. Slot 0 is left null.
        String[] newArgs = new String[args.length + 1];
        System.arraycopy(args, 0, newArgs, 1, args.length);
        args = newArgs;
    }
    if (argsMap.containsKey(quietMode)) {
        TRegexTreeVisitor.printMatches = false;
        TRegexTreeVisitor.printNumMatchesToStdOut = true;
    }
    if (argsMap.containsKey(printNonMatchingTreesOption)) {
        TRegexTreeVisitor.printNonMatchingTrees = true;
    }
    if (argsMap.containsKey(subtreeCodeOption)) {
        TRegexTreeVisitor.printSubtreeCode = true;
        TRegexTreeVisitor.printMatches = false;
    }
    if (argsMap.containsKey(wholeTreeOnlyMode)) {
        TRegexTreeVisitor.printWholeTreeOnly = true;
    }
    if (argsMap.containsKey(wholeTreeAlsoMode)) {
        TRegexTreeVisitor.printWholeTreeAlso = true;
    }
    if (argsMap.containsKey(filenameOption)) {
        TRegexTreeVisitor.printFilename = true;
    }
    if (argsMap.containsKey(oneMatchPerRootNodeMode)) {
        TRegexTreeVisitor.oneMatchPerRootNode = true;
    }
    if (argsMap.containsKey(reportTreeNumbers)) {
        TRegexTreeVisitor.reportTreeNumbers = true;
    }
    // Only one output format is chosen; precedence is -u, then -s, then -t, else Penn.
    if (argsMap.containsKey(rootLabelOnly)) {
        treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(',');
    } else if (argsMap.containsKey(oneLine)) {
        // display short form
        treePrintFormats.append("oneline,");
    } else if (argsMap.containsKey(yieldOnly)) {
        treePrintFormats.append("words,");
    } else {
        treePrintFormats.append("penn,");
    }
    if (argsMap.containsKey(uniqueTrees)) {
        TRegexTreeVisitor.printOnlyUniqueTrees = true;
    }
    HeadFinder hf = new CollinsHeadFinder();
    if (headFinderClassName != null) {
        // The head finder is built reflectively through a constructor that takes
        // one String per -hfArg value.
        Class<?>[] hfArgClasses = new Class<?>[headFinderArgs.length];
        for (int i = 0; i < hfArgClasses.length; i++) {
            hfArgClasses[i] = String.class;
        }
        try {
            // cast to Object[] necessary to avoid varargs-related warning.
            hf = (HeadFinder) Class.forName(headFinderClassName).getConstructor(hfArgClasses).newInstance((Object[]) headFinderArgs);
        } catch (Exception e) {
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new RuntimeException("Error occurred while constructing HeadFinder: " + e, e);
        }
    }
    TRegexTreeVisitor.tp = new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack());
    try {
        // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``");
        TregexPatternCompiler tpc = new TregexPatternCompiler(hf);
        Macros.addAllMacros(tpc, macroFilename, encoding);
        TregexPattern p = tpc.compile(matchString);
        errPW.println("Pattern string:\n" + p.pattern());
        errPW.println("Parsed representation:");
        p.prettyPrint(errPW);
        String[] handles = argsMap.get(printHandleOption);
        if (argsMap.containsKey("-filter")) {
            TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
            // has to be in memory since we're not storing it on disk
            treebank = new MemoryTreebank(trf, encoding);
            // read from stdin; the reader is closed even if loading fails
            try (Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding))) {
                ((MemoryTreebank) treebank).load(reader);
            }
        } else if (args.length == 1) {
            // Only a pattern was supplied: run it against a built-in sample tree.
            errPW.println("using default tree");
            TreeReader r = new PennTreeReader(new StringReader("(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"), new LabeledScoredTreeFactory(new StringLabelFactory()));
            Tree t = r.readTree();
            treebank = new MemoryTreebank();
            treebank.add(t);
        } else {
            int last = args.length - 1;
            errPW.println("Reading trees from file(s) " + args[last]);
            TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
            treebank = new DiskTreebank(trf, encoding);
            treebank.loadPath(args[last], extension, true);
        }
        TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding);
        treebank.apply(vis);
        Timing.endTime();
        if (TRegexTreeVisitor.printMatches) {
            errPW.println("There were " + vis.numMatches() + " matches in total.");
        }
        if (TRegexTreeVisitor.printNumMatchesToStdOut) {
            System.out.println(vis.numMatches());
        }
    } catch (IOException e) {
        log.warn(e);
    } catch (TregexParseException e) {
        // Report the pattern actually compiled; args[0] is null when -i was used.
        errPW.println("Error parsing expression: " + matchString);
        errPW.println("Parse exception: " + e);
    }
}
Also used : StringLabelFactory(edu.stanford.nlp.ling.StringLabelFactory)

Example 4 with StringLabelFactory

use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.

the class DependencyIndexITest method testPositions.

@Test
public void testPositions() {
    // The same sentence is built through four different routes; each resulting
    // tree is handed to checkTree.
    final String pennText = "(S (NP (NNP Mary)) (VP (VBD had) (NP (DT a) (JJ little) (NN lamb))) (. .))";
    try {
        // 1. a tree loaded from a reader, using StringLabelFactory
        PennTreeReader stringLabelReader = new PennTreeReader(new StringReader(pennText), new LabeledScoredTreeFactory(new StringLabelFactory()));
        Tree fromStringLabels = stringLabelReader.readTree();
        checkTree(fromStringLabels);

        // 2. a tree created using Tree.valueOf()
        Tree fromValueOf = Tree.valueOf(pennText);
        checkTree(fromValueOf);

        // 3. a tree loaded from a reader, using CoreLabelFactory
        PennTreeReader coreLabelReader = new PennTreeReader(new StringReader(pennText), new LabeledScoredTreeFactory(CoreLabel.factory()));
        Tree fromCoreLabels = coreLabelReader.readTree();
        checkTree(fromCoreLabels);

        // 4. a tree generated by the parser; leaves are indexed before checking
        LexicalizedParser lexParser = LexicalizedParser.loadModel();
        Tree parsed = lexParser.parse("Mary had a little lamb .");
        parsed.indexLeaves();
        checkTree(parsed);
    } catch (IOException e) {
        // this should never happen
        fail("IOException shouldn't happen.");
    }
}
Also used : PennTreeReader(edu.stanford.nlp.trees.PennTreeReader) StringLabelFactory(edu.stanford.nlp.ling.StringLabelFactory) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) StringReader(java.io.StringReader) Tree(edu.stanford.nlp.trees.Tree) IOException(java.io.IOException) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory) Test(org.junit.Test)

Example 5 with StringLabelFactory

use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.

the class TreeAnnotatorAndBinarizer method transformTree.

/**
 * Annotates the given tree, optionally post-splits it, adds a root, and
 * binarizes it (converting to CNF as well when {@code forceCNF} is set).
 * <p>
 * The tree t is normally expected to be a Penn-Treebank-style tree
 * in which the top node is an extra node that has a unary expansion.
 * If this isn't the case, an extra node is added and the user is warned.
 *
 * @param t the tree to transform
 * @return the binarized (and possibly CNF-converted) tree
 */
@Override
public Tree transformTree(Tree t) {
    if (trainOptions.printTreeTransformations > 0) {
        trainOptions.printTrainTree(null, "ORIGINAL TREE:", t);
    }

    // Annotation, followed by the optional selective post-split.
    Tree annotated = annotator.transformTree(t);
    if (trainOptions.selectivePostSplit) {
        annotated = postSplitter.transformTree(annotated);
    }
    if (trainOptions.printTreeTransformations > 0) {
        trainOptions.printTrainTree(trainOptions.printAnnotatedPW, "ANNOTATED TREE:", annotated);
    }

    // Optional bookkeeping: rule counts are taken over a copy of the annotated tree.
    if (trainOptions.printAnnotatedRuleCounts) {
        Tree copy = annotated.deepCopy(new LabeledScoredTreeFactory(), new StringLabelFactory());
        for (Tree local : copy.localTrees()) {
            annotatedRuleCounts.incrementCount(local);
        }
    }
    // State counts cover every non-leaf node of the annotated tree.
    if (trainOptions.printAnnotatedStateCounts) {
        for (Tree node : annotated) {
            if (node.isLeaf()) {
                continue;
            }
            annotatedStateCounts.incrementCount(node.label().value());
        }
    }

    // if we add the ROOT first, then we don't know how to percolate the heads at the top
    // this creates a few non-binarized rules at the top
    addRoot(annotated);
    Tree binarized = binarizer.transformTree(annotated);
    if (trainOptions.printTreeTransformations > 0) {
        trainOptions.printTrainTree(trainOptions.printBinarizedPW, "BINARIZED TREE:", binarized);
        // One fewer tree remains to be printed.
        trainOptions.printTreeTransformations--;
    }

    if (!forceCNF) {
        return binarized;
    }
    return new CNFTransformers.ToCNFTransformer().transformTree(binarized);
}
Also used : StringLabelFactory(edu.stanford.nlp.ling.StringLabelFactory)

Aggregations

StringLabelFactory (edu.stanford.nlp.ling.StringLabelFactory)7 StringReader (java.io.StringReader)3 LabeledScoredTreeFactory (edu.stanford.nlp.trees.LabeledScoredTreeFactory)2 PennTreeReader (edu.stanford.nlp.trees.PennTreeReader)2 Tree (edu.stanford.nlp.trees.Tree)2 IOException (java.io.IOException)2 LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser)1 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)1 WindowAdapter (java.awt.event.WindowAdapter)1 WindowEvent (java.awt.event.WindowEvent)1 Test (org.junit.Test)1