use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.
the class ParentAnnotationStats method main.
/**
 * Calculate parent annotation statistics suitable for doing
 * selective parent splitting in the PCFGParser inside
 * FactoredParser. <p>
 * Usage: java edu.stanford.nlp.parser.lexparser.ParentAnnotationStats
 * [-tags] [-cutOff value] treebankPath
 * <p>
 * When {@code -cutOff} is given, the set returned by {@code getSplitCategories}
 * is printed; otherwise a {@code ParentAnnotationStats} visitor is applied to the
 * treebank and its statistics are printed. If no treebank path remains after the
 * options, the usage message is printed and nothing is loaded.
 *
 * @param args Optional flags ({@code -tags}, {@code -cutOff value}) followed by
 *     the path to the Treebank
 */
public static void main(String[] args) {
  final String usage = "Usage: java edu.stanford.nlp.parser.lexparser.ParentAnnotationStats [-tags] treebankPath";
  if (args.length < 1) {
    System.out.println(usage);
    return;
  }

  boolean doTags = false;
  boolean useCutOff = false;
  double cutOff = 0.0;
  int i = 0;
  // The bounds check must come first: a command line consisting only of options
  // would otherwise index past the end of args.
  while (i < args.length && args[i].startsWith("-")) {
    if (args[i].equals("-tags")) {
      doTags = true;
      i++;
    } else if (args[i].equals("-cutOff") && i + 1 < args.length) {
      useCutOff = true;
      cutOff = Double.parseDouble(args[i + 1]);
      i += 2;
    } else {
      log.info("Unknown option: " + args[i]);
      i++;
    }
  }

  // Every argument was consumed as an option, so there is no treebank path to load.
  if (i >= args.length) {
    System.out.println(usage);
    return;
  }

  Treebank treebank = new DiskTreebank(in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new StringLabelFactory()), new BobChrisTreeNormalizer()));
  treebank.loadPath(args[i]);

  if (useCutOff) {
    // The same cut-off value is passed for both threshold parameters.
    Set<String> splitters = getSplitCategories(treebank, doTags, 0, cutOff, cutOff, null);
    System.out.println(splitters);
  } else {
    ParentAnnotationStats pas = new ParentAnnotationStats(null, doTags);
    treebank.apply(pas);
    pas.printStats();
  }
}
use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.
the class SisterAnnotationStats method main.
/**
 * Calculate sister annotation statistics suitable for doing
 * selective sister splitting in the PCFGParser inside the
 * FactoredParser.
 * <p>
 * Usage: SisterAnnotationStats treebankPath [encoding]
 *
 * @param args The path to the Treebank, optionally followed by the character
 *     encoding used to read it (defaults to UTF-8)
 */
public static void main(String[] args) {
  // Start-up check kept from the original: prints the KL divergence of a small
  // counter against itself before any argument handling.
  ClassicCounter<String> c = new ClassicCounter<>();
  c.setCount("A", 0);
  c.setCount("B", 1);
  double d = Counters.klDivergence(c, c);
  System.out.println("KL Divergence: " + d);

  String encoding = "UTF-8";
  if (args.length > 1) {
    encoding = args[1];
  }

  if (args.length < 1) {
    // The message previously named ParentAnnotationStats and hid the encoding argument.
    System.out.println("Usage: SisterAnnotationStats treebankPath [encoding]");
  } else {
    SisterAnnotationStats pas = new SisterAnnotationStats();
    Treebank treebank = new DiskTreebank(in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new StringLabelFactory()), new BobChrisTreeNormalizer()), encoding);
    treebank.loadPath(args[0]);
    treebank.apply(pas);
    pas.printStats();
  }
}
use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.
the class TregexPattern method main.
/**
 * Prints out all matches of a tree pattern on each tree in the path. Usage:
 *
 * {@code
 * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h <node-name>]]* pattern filepath
 * }
 *
 * Arguments:
 *
 * <ul>
 * <li>{@code pattern}: the tree
 * pattern which optionally names some set of nodes (i.e., gives it the "handle") {@code =name} (for some arbitrary
 * string "name")
 * <li> {@code filepath}: the path to files with trees. If this is a directory, there will be recursive descent and the pattern will be run on all files beneath the specified directory.
 * </ul>
 *
 * Options:
 *
 * <ul>
 * <li> {@code -C} suppresses printing of matches, so only the
 * number of matches is printed.
 * <li> {@code -w} causes ONLY the whole of a tree that matches to be printed.
 * <li> {@code -W} causes the whole of a tree that matches to be printed ALSO.
 * <li> {@code -f} causes the filename to be printed.
 * <li> {@code -i <filename>} causes the pattern to be matched to be read from {@code <filename>} rather than the command line. Don't specify a pattern when this option is used.
 * <li> {@code -o} Specifies that each tree node can be reported only once as the root of a match (by default a node will
 * be printed once for every <em>way</em> the pattern matches).
 * <li> {@code -s} causes trees to be printed all on one line (by default they are pretty printed).
 * <li> {@code -n} causes the number of the tree in which the match was found to be
 * printed before every match.
 * <li> {@code -u} causes only the label of each matching node to be printed, not complete subtrees.
 * <li> {@code -t} causes only the yield (terminal words) of the selected node to be printed (or the yield of the whole tree, if the {@code -w} option is used).
 * <li> {@code -encoding <charset_encoding>} option allows specification of character encoding of trees.
 * <li> {@code -h <node-handle>} If a {@code -h} option is given, the root tree node will not be printed. Instead,
 * for each {@code node-handle} specified, the node matched and given that handle will be printed. Multiple nodes can be printed by using the
 * {@code -h} option multiple times on a single command line.
 * <li> {@code -hf <headfinder-class-name>} use the specified {@link HeadFinder} class to determine headship relations.
 * <li> {@code -hfArg <string>} pass a string argument in to the {@link HeadFinder} class's constructor. {@code -hfArg} can be used multiple times to pass in multiple arguments.
 * <li> {@code -trf <TreeReaderFactory-class-name>} use the specified {@link TreeReaderFactory} class to read trees from files.
 * <li> {@code -e <extension>} Only attempt to read files with the given extension. If not provided, will attempt to read all files.</li>
 * <li> {@code -v} print every tree that contains no matches of the specified pattern, but print no matches to the pattern.
 *
 * <li> {@code -x} Instead of the matched subtree, print the matched subtree's identifying number as defined in {@code tgrep2}: a
 * unique identifier for the subtree and is in the form s:n, where s is an integer specifying
 * the sentence number in the corpus (starting with 1), and n is an integer giving the order
 * in which the node is encountered in a depth-first search starting with 1 at top node in the
 * sentence tree.
 *
 * <li> {@code -extract <code> <tree-file>} extracts the subtree s:n specified by {@code code} from the specified {@code tree-file}.
 * Overrides all other behavior of tregex. Can't specify multiple encodings etc. yet.
 * <li> {@code -extractFile <code-file> <tree-file>} extracts every subtree specified by the subtree codes in
 * {@code code-file}, which must appear exactly one per line, from the specified {@code tree-file}.
 * Overrides all other behavior of tregex. Can't specify multiple encodings etc. yet.
 * <li> {@code -filter} causes this to act as a filter, reading tree input from stdin
 * <li> {@code -T} causes all trees to be printed as processed (for debugging purposes). Otherwise only matching nodes are printed.
 * <li> {@code -macros <filename>} filename with macro substitutions to use. file with tab separated lines original-tab-replacement
 * </ul>
 *
 * @param args command-line options followed by the pattern (unless {@code -i} is used) and, optionally, the tree file path
 * @throws IOException if the pattern file or the subtree code file cannot be read, or the error writer cannot be created
 *     for the requested encoding
 */
public static void main(String[] args) throws IOException {
  Timing.startTime();

  StringBuilder treePrintFormats = new StringBuilder();
  String printNonMatchingTreesOption = "-v";
  String subtreeCodeOption = "-x";
  String extractSubtreesOption = "-extract";
  String extractSubtreesFileOption = "-extractFile";
  String inputFileOption = "-i";
  String headFinderOption = "-hf";
  String headFinderArgOption = "-hfArg";
  String trfOption = "-trf";
  String extensionOption = "-e";
  String extension = null;
  String headFinderClassName = null;
  String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY;
  String treeReaderFactoryClassName = null;
  String printHandleOption = "-h";
  String markHandleOption = "-k";
  String encodingOption = "-encoding";
  String encoding = "UTF-8";
  String macroOption = "-macros";
  String macroFilename = "";
  String yieldOnly = "-t";
  String printAllTrees = "-T";
  String quietMode = "-C";
  String wholeTreeOnlyMode = "-w";
  String wholeTreeAlsoMode = "-W";
  String filenameOption = "-f";
  String oneMatchPerRootNodeMode = "-o";
  String reportTreeNumbers = "-n";
  String rootLabelOnly = "-u";
  String oneLine = "-s";
  String uniqueTrees = "-q";

  // Number of values each flag consumes from the command line.
  Map<String, Integer> flagMap = Generics.newHashMap();
  flagMap.put(extractSubtreesOption, 2);
  flagMap.put(extractSubtreesFileOption, 2);
  flagMap.put(subtreeCodeOption, 0);
  flagMap.put(printNonMatchingTreesOption, 0);
  flagMap.put(encodingOption, 1);
  flagMap.put(inputFileOption, 1);
  flagMap.put(printHandleOption, 1);
  flagMap.put(markHandleOption, 2);
  flagMap.put(headFinderOption, 1);
  flagMap.put(headFinderArgOption, 1);
  flagMap.put(trfOption, 1);
  flagMap.put(extensionOption, 1);
  flagMap.put(macroOption, 1);
  flagMap.put(yieldOnly, 0);
  flagMap.put(quietMode, 0);
  flagMap.put(wholeTreeOnlyMode, 0);
  flagMap.put(wholeTreeAlsoMode, 0);
  flagMap.put(printAllTrees, 0);
  flagMap.put(filenameOption, 0);
  flagMap.put(oneMatchPerRootNodeMode, 0);
  flagMap.put(reportTreeNumbers, 0);
  flagMap.put(rootLabelOnly, 0);
  flagMap.put(oneLine, 0);
  flagMap.put(uniqueTrees, 0);

  Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
  // The null key holds the positional (non-flag) arguments.
  args = argsMap.get(null);

  if (argsMap.containsKey(encodingOption)) {
    encoding = argsMap.get(encodingOption)[0];
    log.info("Encoding set to " + encoding);
  }
  PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);

  if (argsMap.containsKey(extractSubtreesOption)) {
    String[] extractArgs = argsMap.get(extractSubtreesOption);
    List<String> subTreeStrings = Collections.singletonList(extractArgs[0]);
    extractSubtrees(subTreeStrings, extractArgs[1]);
    return;
  }
  if (argsMap.containsKey(extractSubtreesFileOption)) {
    String[] extractArgs = argsMap.get(extractSubtreesFileOption);
    // "\r\n" must be tried first so a Windows line ending is one separator, not two
    // (which would yield an empty subtree code between them).
    List<String> subTreeStrings = Arrays.asList(IOUtils.slurpFile(extractArgs[0]).split("\r\n|\n|\r"));
    // extractArgs[0] is the code file, extractArgs[1] is the tree file.
    extractSubtrees(subTreeStrings, extractArgs[1]);
    return;
  }

  // A positional pattern is only required when the pattern is not read from a file via -i.
  boolean patternFromFile = argsMap.containsKey(inputFileOption);
  if (args.length < 1 && !patternFromFile) {
    errPW.println("Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-W] [-f] [-o] [-n] [-s] [-filter] [-hf class] [-trf class] [-h handle]* [-e ext] pattern [filepath]");
    return;
  }
  String matchString = patternFromFile ? null : args[0];

  if (argsMap.containsKey(macroOption)) {
    macroFilename = argsMap.get(macroOption)[0];
  }
  if (argsMap.containsKey(headFinderOption)) {
    headFinderClassName = argsMap.get(headFinderOption)[0];
    errPW.println("Using head finder " + headFinderClassName + "...");
  }
  if (argsMap.containsKey(headFinderArgOption)) {
    headFinderArgs = argsMap.get(headFinderArgOption);
  }
  if (argsMap.containsKey(trfOption)) {
    treeReaderFactoryClassName = argsMap.get(trfOption)[0];
    errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "...");
  }
  if (argsMap.containsKey(extensionOption)) {
    extension = argsMap.get(extensionOption)[0];
  }
  if (argsMap.containsKey(printAllTrees)) {
    TRegexTreeVisitor.printTree = true;
  }
  if (patternFromFile) {
    String inputFile = argsMap.get(inputFileOption)[0];
    matchString = IOUtils.slurpFile(inputFile, encoding);
    // Shift the positional arguments right by one so that the slot normally holding
    // the pattern exists and the length-based checks below behave the same.
    String[] newArgs = new String[args.length + 1];
    System.arraycopy(args, 0, newArgs, 1, args.length);
    args = newArgs;
  }
  if (argsMap.containsKey(quietMode)) {
    TRegexTreeVisitor.printMatches = false;
    TRegexTreeVisitor.printNumMatchesToStdOut = true;
  }
  if (argsMap.containsKey(printNonMatchingTreesOption)) {
    TRegexTreeVisitor.printNonMatchingTrees = true;
  }
  if (argsMap.containsKey(subtreeCodeOption)) {
    TRegexTreeVisitor.printSubtreeCode = true;
    TRegexTreeVisitor.printMatches = false;
  }
  if (argsMap.containsKey(wholeTreeOnlyMode)) {
    TRegexTreeVisitor.printWholeTreeOnly = true;
  }
  if (argsMap.containsKey(wholeTreeAlsoMode)) {
    TRegexTreeVisitor.printWholeTreeAlso = true;
  }
  if (argsMap.containsKey(filenameOption)) {
    TRegexTreeVisitor.printFilename = true;
  }
  if (argsMap.containsKey(oneMatchPerRootNodeMode)) {
    TRegexTreeVisitor.oneMatchPerRootNode = true;
  }
  if (argsMap.containsKey(reportTreeNumbers)) {
    TRegexTreeVisitor.reportTreeNumbers = true;
  }

  // Only one output format is selected; -u takes precedence over -s, which takes precedence over -t.
  if (argsMap.containsKey(rootLabelOnly)) {
    treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(',');
  } else if (argsMap.containsKey(oneLine)) {
    // display short form
    treePrintFormats.append("oneline,");
  } else if (argsMap.containsKey(yieldOnly)) {
    treePrintFormats.append("words,");
  } else {
    treePrintFormats.append("penn,");
  }

  if (argsMap.containsKey(uniqueTrees)) {
    TRegexTreeVisitor.printOnlyUniqueTrees = true;
  }

  HeadFinder hf = new CollinsHeadFinder();
  if (headFinderClassName != null) {
    // Every -hfArg value is handed to the constructor as a String.
    Class<?>[] hfArgClasses = new Class<?>[headFinderArgs.length];
    Arrays.fill(hfArgClasses, String.class);
    try {
      // cast to Object[] necessary to avoid varargs-related warning.
      hf = (HeadFinder) Class.forName(headFinderClassName).getConstructor(hfArgClasses).newInstance((Object[]) headFinderArgs);
    } catch (Exception e) {
      // Keep the original exception as the cause so its stack trace is not lost.
      throw new RuntimeException("Error occurred while constructing HeadFinder: " + e, e);
    }
  }

  TRegexTreeVisitor.tp = new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack());

  try {
    // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``");
    TregexPatternCompiler tpc = new TregexPatternCompiler(hf);
    Macros.addAllMacros(tpc, macroFilename, encoding);
    TregexPattern p = tpc.compile(matchString);
    errPW.println("Pattern string:\n" + p.pattern());
    errPW.println("Parsed representation:");
    p.prettyPrint(errPW);

    String[] handles = argsMap.get(printHandleOption);
    if (argsMap.containsKey("-filter")) {
      TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
      // has to be in memory since we're not storing it on disk
      treebank = new MemoryTreebank(trf, encoding);
      // read from stdin
      try (Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding))) {
        ((MemoryTreebank) treebank).load(reader);
      }
    } else if (args.length == 1) {
      // Only a pattern was supplied: run it against a built-in example tree.
      errPW.println("using default tree");
      TreeReader r = new PennTreeReader(new StringReader("(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"), new LabeledScoredTreeFactory(new StringLabelFactory()));
      Tree t = r.readTree();
      treebank = new MemoryTreebank();
      treebank.add(t);
    } else {
      int last = args.length - 1;
      errPW.println("Reading trees from file(s) " + args[last]);
      TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
      treebank = new DiskTreebank(trf, encoding);
      treebank.loadPath(args[last], extension, true);
    }

    TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding);
    treebank.apply(vis);
    Timing.endTime();
    if (TRegexTreeVisitor.printMatches) {
      errPW.println("There were " + vis.numMatches() + " matches in total.");
    }
    if (TRegexTreeVisitor.printNumMatchesToStdOut) {
      System.out.println(vis.numMatches());
    }
  } catch (IOException e) {
    log.warn(e);
  } catch (TregexParseException e) {
    // Report the pattern actually compiled; args[0] is null when the pattern came from -i.
    errPW.println("Error parsing expression: " + matchString);
    errPW.println("Parse exception: " + e);
  }
}
use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.
the class DependencyIndexITest method testPositions.
@Test
public void testPositions() {
  // The same bracketed tree is used for the three reader/valueOf cases.
  final String pennText = "(S (NP (NNP Mary)) (VP (VBD had) (NP (DT a) (JJ little) (NN lamb))) (. .))";
  try {
    // Case 1: tree loaded from a reader, with labels built by a StringLabelFactory
    PennTreeReader stringLabelReader = new PennTreeReader(new StringReader(pennText), new LabeledScoredTreeFactory(new StringLabelFactory()));
    checkTree(stringLabelReader.readTree());

    // Case 2: tree created using Tree.valueOf()
    checkTree(Tree.valueOf(pennText));

    // Case 3: tree loaded from a reader, with labels built by the CoreLabel factory
    PennTreeReader coreLabelReader = new PennTreeReader(new StringReader(pennText), new LabeledScoredTreeFactory(CoreLabel.factory()));
    checkTree(coreLabelReader.readTree());

    // Case 4: tree generated by the parser; its leaves are indexed before checking
    LexicalizedParser lexParser = LexicalizedParser.loadModel();
    Tree parsed = lexParser.parse("Mary had a little lamb .");
    parsed.indexLeaves();
    checkTree(parsed);
  } catch (IOException e) {
    // reading from an in-memory string should never fail
    fail("IOException shouldn't happen.");
  }
}
use of edu.stanford.nlp.ling.StringLabelFactory in project CoreNLP by stanfordnlp.
the class TreeAnnotatorAndBinarizer method transformTree.
/**
 * Annotates the given tree, optionally post-splits it, adds a root, and binarizes it
 * (converting to CNF when {@code forceCNF} is set).
 * <p>
 * The tree t is normally expected to be a Penn-Treebank-style tree
 * in which the top node is an extra node that has a unary expansion.
 * If this isn't the case, an extra node is added and the user is warned.
 *
 * @param t the tree to transform
 * @return the binarized (and possibly CNF-converted) tree
 */
@Override
public Tree transformTree(Tree t) {
  if (trainOptions.printTreeTransformations > 0) {
    trainOptions.printTrainTree(null, "ORIGINAL TREE:", t);
  }

  Tree annotated = annotator.transformTree(t);
  if (trainOptions.selectivePostSplit) {
    annotated = postSplitter.transformTree(annotated);
  }
  if (trainOptions.printTreeTransformations > 0) {
    trainOptions.printTrainTree(trainOptions.printAnnotatedPW, "ANNOTATED TREE:", annotated);
  }

  if (trainOptions.printAnnotatedRuleCounts) {
    // Rules are counted on a deep copy built with a LabeledScoredTreeFactory and StringLabelFactory.
    Tree plainCopy = annotated.deepCopy(new LabeledScoredTreeFactory(), new StringLabelFactory());
    for (Tree localTree : plainCopy.localTrees()) {
      annotatedRuleCounts.incrementCount(localTree);
    }
  }

  if (trainOptions.printAnnotatedStateCounts) {
    for (Tree node : annotated) {
      if (node.isLeaf()) {
        continue;
      }
      annotatedStateCounts.incrementCount(node.label().value());
    }
  }

  // if we add the ROOT first, then we don't know how to percolate the heads at the top
  // this creates a few non-binarized rules at the top
  addRoot(annotated);

  Tree binarized = binarizer.transformTree(annotated);
  if (trainOptions.printTreeTransformations > 0) {
    trainOptions.printTrainTree(trainOptions.printBinarizedPW, "BINARIZED TREE:", binarized);
    // Each printed tree uses up one unit of the remaining print budget.
    trainOptions.printTreeTransformations--;
  }

  if (!forceCNF) {
    return binarized;
  }
  return new CNFTransformers.ToCNFTransformer().transformTree(binarized);
}
Aggregations