use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.
the class CacheParseHypotheses method main.
/**
 * Parses a set of treebanks with the given parser model and writes the resulting
 * list of (tree, cached hypotheses) pairs to the output file.
 * <br>
 * Recognized arguments (flag names are case-insensitive):
 * <ul>
 * <li>{@code -model} / {@code -parser}: parser model to load (required)</li>
 * <li>{@code -output}: file the cache is written to (required)</li>
 * <li>{@code -treebank}: treebank description; may be repeated, at least one is required</li>
 * <li>{@code -dvKBest}: number of hypothesis trees per tree, default 200</li>
 * <li>{@code -numThreads}: number of threads handed to the multicore wrapper, default 1</li>
 * </ul>
 * An example of a command line is
 * <br>
 * java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
 * <br>
 * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
 * <br>
 * java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed 026-270,301-499,600-999
 *
 * @param args command-line arguments as described above
 * @throws IOException declared for the I/O performed while writing the cache
 * @throws IllegalArgumentException if an argument is unknown, a flag is missing its value,
 *         or one of the required arguments was not supplied
 */
public static void main(String[] args) throws IOException {
  String parserModel = null;
  String output = null;
  List<Pair<String, FileFilter>> treebanks = Generics.newArrayList();
  int dvKBest = 200;
  int numThreads = 1;

  // No automatic increment: each branch advances argIndex by the number of tokens it consumed.
  for (int argIndex = 0; argIndex < args.length; ) {
    final String flag = args[argIndex];
    if (flag.equalsIgnoreCase("-dvKBest")) {
      dvKBest = Integer.parseInt(requireOptionValue(args, argIndex));
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-parser") || flag.equalsIgnoreCase("-model")) {
      parserModel = requireOptionValue(args, argIndex);
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-output")) {
      output = requireOptionValue(args, argIndex);
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-treebank")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-treebank");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      treebanks.add(treebankDescription);
    } else if (flag.equalsIgnoreCase("-numThreads")) {
      numThreads = Integer.parseInt(requireOptionValue(args, argIndex));
      argIndex += 2;
    } else {
      throw new IllegalArgumentException("Unknown argument " + flag);
    }
  }

  if (parserModel == null) {
    throw new IllegalArgumentException("Need to supply a parser model with -model");
  }
  if (output == null) {
    throw new IllegalArgumentException("Need to supply an output filename with -output");
  }
  if (treebanks.isEmpty()) {
    throw new IllegalArgumentException("Need to supply a treebank with -treebank");
  }

  log.info("Writing output to " + output);
  log.info("Loading parser model " + parserModel);
  log.info("Writing " + dvKBest + " hypothesis trees for each tree");

  LexicalizedParser parser = LexicalizedParser.loadModel(parserModel, "-dvKBest", Integer.toString(dvKBest));
  CacheParseHypotheses cacher = new CacheParseHypotheses(parser);
  TreeTransformer transformer = DVParser.buildTrainTransformer(parser.getOp());

  // Load every requested treebank, transform it, and pool all trees into one list.
  List<Tree> sentences = new ArrayList<>();
  for (Pair<String, FileFilter> description : treebanks) {
    log.info("Reading trees from " + description.first);
    Treebank treebank = parser.getOp().tlpParams.memoryTreebank();
    treebank.loadPath(description.first, description.second);
    treebank = treebank.transform(transformer);
    sentences.addAll(treebank);
  }
  log.info("Processing " + sentences.size() + " trees");

  List<Pair<Tree, byte[]>> cache = Generics.newArrayList();
  // The same transformer instance is handed to the multi-threaded processor below,
  // so it is wrapped in a SynchronizedTreeTransformer first.
  transformer = new SynchronizedTreeTransformer(transformer);
  MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper = new MulticoreWrapper<>(numThreads, new CacheProcessor(cacher, parser, dvKBest, transformer));
  for (Tree tree : sentences) {
    wrapper.put(tree);
    // Pick up whatever is already finished while more work is being submitted.
    collectFinishedTrees(wrapper, cache);
  }
  wrapper.join();
  // After join(), collect the results that were still pending.
  collectFinishedTrees(wrapper, cache);

  System.out.println("Finished processing " + cache.size() + " trees");
  IOUtils.writeObjectToFile(cache, output);
}

/**
 * Returns the token following the flag at {@code argIndex}.
 *
 * @throws IllegalArgumentException if the flag is the last token on the command line
 */
private static String requireOptionValue(String[] args, int argIndex) {
  if (argIndex + 1 >= args.length) {
    throw new IllegalArgumentException("Missing value for argument " + args[argIndex]);
  }
  return args[argIndex + 1];
}

/**
 * Moves every result currently available from the wrapper into the cache,
 * printing a progress line each time the cache size reaches a multiple of 10.
 */
private static void collectFinishedTrees(MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper, List<Pair<Tree, byte[]>> cache) {
  while (wrapper.peek()) {
    cache.add(wrapper.poll());
    if (cache.size() % 10 == 0) {
      System.out.println("Processed " + cache.size() + " trees");
    }
  }
}
use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.
the class TaggingEval method main.
/**
 * Run the scoring metric on guess/gold input. This method performs "Collinization."
 * The default language is English.
 * <br>
 * Options handled here: {@code -l} (language, resolved with {@code Language.valueOf}),
 * {@code -y} (maximum gold yield; longer gold trees are skipped), {@code -v} (verbose),
 * {@code -c} (sets {@code TaggingEval.doCatLevelEval}), {@code -e} (input encoding,
 * default UTF-8). Any other option prints the usage message and exits with status -1.
 *
 * @param args options followed by the gold file and then the guess file
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = Integer.MAX_VALUE;
  boolean verbose = false;
  String encoding = "UTF-8";

  Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);
  for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
    final String key = opt.getKey();
    if (key == null) {
      // The null key holds the non-option arguments; they are handled after the loop.
      continue;
    }
    if (key.equals("-l")) {
      Language lang = Language.valueOf(opt.getValue()[0].trim());
      tlpp = lang.params;
    } else if (key.equals("-y")) {
      maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
    } else if (key.equals("-v")) {
      verbose = true;
    } else if (key.equals("-c")) {
      TaggingEval.doCatLevelEval = true;
    } else if (key.equals("-e")) {
      encoding = opt.getValue()[0];
    } else {
      log.info(usage.toString());
      System.exit(-1);
    }
  }

  // Non-option arguments located at key null. This must run outside the option loop:
  // when no options are given the loop body never gets past the null-key check, and
  // the file names would otherwise stay unset.
  String[] rest = argsMap.get(null);
  if (rest == null || rest.length < minArgs) {
    log.info(usage.toString());
    System.exit(-1);
  }
  final String goldFile = rest[0];
  final String guessFile = rest[1];

  tlpp.setInputEncoding(encoding);
  final PrintWriter pwOut = tlpp.pw();

  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());

  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());

  final TaggingEval metric = new TaggingEval("Tagging LP/LR");
  final TreeTransformer tc = tlpp.collinizer();

  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  final Iterator<Tree> guessItr = guessTreebank.iterator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;

  // Walk both treebanks in lockstep; stop as soon as either one runs out.
  while (guessItr.hasNext() && goldItr.hasNext()) {
    Tree guessTree = guessItr.next();
    List<Label> guessYield = guessTree.yield();
    guessLineId++;

    Tree goldTree = goldItr.next();
    List<Label> goldYield = goldTree.yield();
    goldLineId++;

    // Check that we should evaluate this tree
    if (goldYield.size() > maxGoldYield) {
      skippedGuessTrees++;
      continue;
    }

    // Only trees with equal yields can be evaluated
    if (goldYield.size() != guessYield.size()) {
      pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }

    final Tree evalGuess = tc.transformTree(guessTree);
    final Tree evalGold = tc.transformTree(goldTree);
    metric.evaluate(evalGuess, evalGold, (verbose ? pwOut : null));
  }

  if (guessItr.hasNext() || goldItr.hasNext()) {
    // The period belongs before the line break, not at the start of the next line.
    System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
  }

  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.display(true, pwOut);
  pwOut.println();
  pwOut.close();
}
use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.
the class UnlabeledAttachmentEval method main.
/**
 * Run the Evalb scoring metric on guess/gold input. The default language is English.
 * <br>
 * Options handled here: {@code -l} (language, resolved with {@code Language.valueOf}),
 * {@code -y} (maximum gold yield; longer gold trees are skipped), {@code -v} (verbose),
 * {@code -e} (input encoding, default UTF-8). Any other option prints the usage
 * message and exits with status -1.
 *
 * @param args options followed by the gold file and then the guess file
 */
public static void main(String[] args) {
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = Integer.MAX_VALUE;
  boolean verbose = false;
  String encoding = "UTF-8";

  Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);
  for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
    final String key = opt.getKey();
    if (key == null) {
      // The null key holds the non-option arguments; they are handled after the loop.
      continue;
    }
    if (key.equals("-l")) {
      Language lang = Language.valueOf(opt.getValue()[0].trim());
      tlpp = lang.params;
    } else if (key.equals("-y")) {
      maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
    } else if (key.equals("-v")) {
      verbose = true;
    } else if (key.equals("-e")) {
      encoding = opt.getValue()[0];
    } else {
      log.info(usage.toString());
      System.exit(-1);
    }
  }

  // Non-option arguments located at key null. This must run outside the option loop:
  // when no options are given the loop body never gets past the null-key check, and
  // the file names would otherwise stay unset.
  String[] rest = argsMap.get(null);
  if (rest == null || rest.length < minArgs) {
    log.info(usage.toString());
    System.exit(-1);
  }
  final String goldFile = rest[0];
  final String guessFile = rest[1];

  tlpp.setInputEncoding(encoding);
  final PrintWriter pwOut = tlpp.pw();

  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());

  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());

  final UnlabeledAttachmentEval metric = new UnlabeledAttachmentEval("UAS LP/LR", true, tlpp.headFinder());
  final TreeTransformer tc = tlpp.collinizer();

  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  final Iterator<Tree> guessItr = guessTreebank.iterator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;

  // Walk both treebanks in lockstep; stop as soon as either one runs out.
  while (guessItr.hasNext() && goldItr.hasNext()) {
    Tree guessTree = guessItr.next();
    List<Label> guessYield = guessTree.yield();
    guessLineId++;

    Tree goldTree = goldItr.next();
    List<Label> goldYield = goldTree.yield();
    goldLineId++;

    // Check that we should evaluate this tree
    if (goldYield.size() > maxGoldYield) {
      skippedGuessTrees++;
      continue;
    }

    // Only trees with equal yields can be evaluated
    if (goldYield.size() != guessYield.size()) {
      pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }

    // Both transformed trees get their leaves indexed before being scored.
    final Tree evalGuess = tc.transformTree(guessTree);
    evalGuess.indexLeaves(true);
    final Tree evalGold = tc.transformTree(goldTree);
    evalGold.indexLeaves(true);
    metric.evaluate(evalGuess, evalGold, (verbose ? pwOut : null));
  }

  if (guessItr.hasNext() || goldItr.hasNext()) {
    // The period belongs before the line break, not at the start of the next line.
    System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
  }

  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.display(true, pwOut);
  pwOut.println();
  pwOut.close();
}
use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.
the class Evalb method main.
/**
 * Run the Evalb scoring metric on guess/gold input. The default language is English.
 * <br>
 * Properties read from the command line: {@code l} (language), {@code y} (maximum gold
 * yield; longer gold trees are skipped), {@code v} (verbose), {@code s} (when present,
 * tree pairs are stored with their F1 and the given number of trees is passed to
 * {@code emitSortedTrees}), {@code c} (also run the per-category metric), {@code f}
 * (label regex handed to the per-category metric), {@code e} (input encoding, default
 * UTF-8). Exactly {@code minArgs} positional arguments are required, otherwise the usage
 * message is printed and the program exits with status -1.
 *
 * @param args options plus the gold file followed by the guess file
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    log.info(usage());
    System.exit(-1);
  }

  Properties options = StringUtils.argsToProperties(args, optionArgDefs());
  Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
  final TreebankLangParserParams tlpp = language.params;
  final int maxGoldYield = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
  final boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
  final boolean sortByF1 = PropertiesUtils.hasProperty(options, "s");
  int worstKTreesToEmit = PropertiesUtils.getInt(options, "s", 0);
  // The queue is only allocated when sorting by F1 was requested; every use below is guarded by sortByF1.
  PriorityQueue<Triple<Double, Tree, Tree>> queue = sortByF1 ? new PriorityQueue<>(2000, new F1Comparator()) : null;
  boolean doCatLevel = PropertiesUtils.getBool(options, "c", false);
  String labelRegex = options.getProperty("f", null);
  String encoding = options.getProperty("e", "UTF-8");

  // Positional arguments are stored under the empty property key, separated by whitespace.
  String[] parsedArgs = options.getProperty("", "").split("\\s+");
  if (parsedArgs.length != minArgs) {
    log.info(usage());
    System.exit(-1);
  }
  String goldFile = parsedArgs[0];
  String guessFile = parsedArgs[1];

  // Command-line has been parsed. Configure the metric for evaluation.
  tlpp.setInputEncoding(encoding);
  final PrintWriter pwOut = tlpp.pw();

  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());

  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());

  final Evalb metric = new Evalb("Evalb LP/LR", true);
  // Only created when category-level evaluation was requested; every use below is guarded by doCatLevel.
  final EvalbByCat evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
  final TreeTransformer tc = tlpp.collinizer();

  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  final Iterator<Tree> guessItr = guessTreebank.iterator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;

  // Walk both treebanks in lockstep; stop as soon as either one runs out.
  while (guessItr.hasNext() && goldItr.hasNext()) {
    Tree guessTree = guessItr.next();
    List<Label> guessYield = guessTree.yield();
    guessLineId++;

    Tree goldTree = goldItr.next();
    List<Label> goldYield = goldTree.yield();
    goldLineId++;

    // Check that we should evaluate this tree
    if (goldYield.size() > maxGoldYield) {
      skippedGuessTrees++;
      continue;
    }

    // Only trees with equal yields can be evaluated
    if (goldYield.size() != guessYield.size()) {
      pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }

    final Tree evalGuess = tc.transformTree(guessTree);
    final Tree evalGold = tc.transformTree(goldTree);
    metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
    if (doCatLevel) {
      evalbCat.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
    }
    if (sortByF1) {
      // The untransformed trees are stored, keyed by the F1 of the pair just evaluated.
      storeTrees(queue, guessTree, goldTree, metric.getLastF1());
    }
  }

  if (guessItr.hasNext() || goldItr.hasNext()) {
    // The period belongs before the line break, not at the start of the next line.
    System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
  }

  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.display(true, pwOut);
  pwOut.println();
  if (doCatLevel) {
    evalbCat.display(true, pwOut);
    pwOut.println();
  }
  if (sortByF1) {
    emitSortedTrees(queue, worstKTreesToEmit, guessFile);
  }
  pwOut.close();
}
use of edu.stanford.nlp.trees.TreeTransformer in project CoreNLP by stanfordnlp.
the class GrammarCoverageChecker method testOnTreebank.
/**
 * Annotates each tree of the test treebank and, for every tree yielded by iterating the
 * annotated tree that is neither a leaf nor a preterminal and has at least two children,
 * prints its rule form followed by the score computed for it.
 *
 * @param pd parser passed through to the local-tree scoring
 * @param tlpParams language parameters used to build the tree annotator
 * @param testTreebank trees to check
 * @param treebankRoot path handed to the English split-category lookup
 * @param stateIndex state index passed through to the local-tree scoring
 */
private void testOnTreebank(LexicalizedParser pd, TreebankLangParserParams tlpParams, Treebank testTreebank, String treebankRoot, Index<String> stateIndex) {
  Timing.startTime();
  TreeTransformer annotator = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);

  // CDM: Aug 2004: With new implementation of treebank split categories,
  // I've hardwired this to load English ones.  Otherwise need training data.
  // op.trainOptions.splitters = new HashSet(Arrays.asList(op.tlpParams.splitters()));
  op.trainOptions.splitters = ParentAnnotationStats.getEnglishSplitCategories(treebankRoot);
  op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

  for (Tree goldTree : testTreebank) {
    final Tree annotatedTree = annotator.transformTree(goldTree);
    for (Tree localTree : annotatedTree) {
      // Only local trees with two or more children are scored against the grammar.
      final boolean scorable = !localTree.isLeaf()
          && !localTree.isPreTerminal()
          && localTree.children().length >= 2;
      if (scorable) {
        System.out.println(localTreeToRule(localTree));
        final double localScore = computeLocalTreeScore(localTree, stateIndex, pd);
        System.out.println("score: " + localScore);
      }
    }
  }
}
Aggregations