Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.
Class RHSFrequency, method main.
/**
 * Counts the right-hand sides (space-joined child labels) of every tree node
 * that matches a given root category and prints them, most frequent first,
 * as {@code rhs<TAB>count} lines.
 *
 * <p>Recognized options, each taking one value: {@code -l} (a
 * {@link Language} name selecting the treebank parameters) and {@code -e}
 * (input/output encoding, default UTF-8). The remaining arguments are read as
 * a root category followed by a treebank path. Options must precede that
 * pair, because the treebank is created with whatever encoding is known at
 * the time the pair is read.
 *
 * <p>Malformed command lines (an option without its value, a category without
 * a path, or no category/path pair at all) print the usage message and exit
 * with status -1 instead of failing with an
 * {@code ArrayIndexOutOfBoundsException} or {@code NullPointerException}.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  TregexPattern rootMatch = null;

  for (int i = 0; i < args.length; i++) {
    // Both an option and a positional category consume the following argument.
    if (i + 1 >= args.length) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;

        case "-e":
          encoding = args[++i];
          break;

        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }

    } else {
      // Positional pair: <root category> <treebank path>.
      rootMatch = TregexPattern.compile("@" + args[i++]);

      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      // NOTE: together with the loop increment this advances past the
      // argument that follows the path; kept as-is for compatibility.
      tb.loadPath(args[i++]);
    }
  }

  // Only options were supplied: there is nothing to count.
  if (tb == null) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  Counter<String> rhsCounter = new ClassicCounter<>();
  for (Tree t : tb) {
    TregexMatcher m = rootMatch.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      StringBuilder sb = new StringBuilder();
      for (Tree kid : match.children()) {
        sb.append(kid.value()).append(" ");
      }
      rhsCounter.incrementCount(sb.toString().trim());
    }
  }

  List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));

  // try-with-resources closes the writer even if printing fails part-way.
  try (PrintWriter pw = tlpp.pw()) {
    for (String rhs : biggestKeys) {
      pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
    }
  }
}
Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.
Class TaggingEval, method main.
/**
 * Run the scoring metric on guess/gold input. This method performs "Collinization."
 * The default language is English.
 *
 * <p>Recognized options: {@code -l} (a {@link Language} name), {@code -y}
 * (maximum gold yield; longer gold trees are skipped), {@code -v} (verbose),
 * {@code -c} (enable category-level evaluation) and {@code -e} (input
 * encoding, default UTF-8). Any other option prints the usage message and
 * exits with status -1. The non-option arguments are the gold file followed
 * by the guess file.
 *
 * <p>The non-option arguments are read after the option loop, so they are
 * picked up even when no option flag is present on the command line.
 *
 * @param args command-line options followed by the gold and guess file paths
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = Integer.MAX_VALUE;
  boolean VERBOSE = false;
  String encoding = "UTF-8";

  Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);

  for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
    // The null key holds the non-option arguments; handled after the loop.
    if (opt.getKey() == null) {
      continue;
    }
    if (opt.getKey().equals("-l")) {
      Language lang = Language.valueOf(opt.getValue()[0].trim());
      tlpp = lang.params;
    } else if (opt.getKey().equals("-y")) {
      maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
    } else if (opt.getKey().equals("-v")) {
      VERBOSE = true;
    } else if (opt.getKey().equals("-c")) {
      TaggingEval.doCatLevelEval = true;
    } else if (opt.getKey().equals("-e")) {
      encoding = opt.getValue()[0];
    } else {
      log.info(usage.toString());
      System.exit(-1);
    }
  }

  // Non-option arguments located at key null. This must run exactly once and
  // independently of the options: inside the loop it was skipped whenever the
  // command line carried no option flags, leaving both file names null.
  String[] rest = argsMap.get(null);
  if (rest == null || rest.length < minArgs) {
    log.info(usage.toString());
    System.exit(-1);
  }
  final String goldFile = rest[0];
  final String guessFile = rest[1];

  tlpp.setInputEncoding(encoding);
  final PrintWriter pwOut = tlpp.pw();

  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());

  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());

  final TaggingEval metric = new TaggingEval("Tagging LP/LR");

  final TreeTransformer tc = tlpp.collinizer();

  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  final Iterator<Tree> guessItr = guessTreebank.iterator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;

  // Walk both treebanks in lock-step; the n-th guess tree is scored against the n-th gold tree.
  while (guessItr.hasNext() && goldItr.hasNext()) {
    Tree guessTree = guessItr.next();
    List<Label> guessYield = guessTree.yield();
    guessLineId++;

    Tree goldTree = goldItr.next();
    List<Label> goldYield = goldTree.yield();
    goldLineId++;

    // Check that we should evaluate this tree
    if (goldYield.size() > maxGoldYield) {
      skippedGuessTrees++;
      continue;
    }

    // Only trees with equal yields can be evaluated
    if (goldYield.size() != guessYield.size()) {
      pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }

    final Tree evalGuess = tc.transformTree(guessTree);
    final Tree evalGold = tc.transformTree(goldTree);

    metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
  }

  if (guessItr.hasNext() || goldItr.hasNext()) {
    // The sentence-ending period belongs before the line break, not after it.
    System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
  }

  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.display(true, pwOut);
  pwOut.println();
  pwOut.close();
}
Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.
Class UnlabeledAttachmentEval, method main.
/**
 * Run the unlabeled attachment scoring metric on guess/gold input. The default
 * language is English.
 *
 * <p>Recognized options: {@code -l} (a {@link Language} name), {@code -y}
 * (maximum gold yield; longer gold trees are skipped), {@code -v} (verbose)
 * and {@code -e} (input encoding, default UTF-8). Any other option prints the
 * usage message and exits with status -1. The non-option arguments are the
 * gold file followed by the guess file.
 *
 * <p>The non-option arguments are read after the option loop, so they are
 * picked up even when no option flag is present on the command line.
 *
 * @param args command-line options followed by the gold and guess file paths
 */
public static void main(String[] args) {
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = Integer.MAX_VALUE;
  boolean VERBOSE = false;
  String encoding = "UTF-8";

  Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);

  for (Map.Entry<String, String[]> opt : argsMap.entrySet()) {
    // The null key holds the non-option arguments; handled after the loop.
    if (opt.getKey() == null) {
      continue;
    }
    if (opt.getKey().equals("-l")) {
      Language lang = Language.valueOf(opt.getValue()[0].trim());
      tlpp = lang.params;
    } else if (opt.getKey().equals("-y")) {
      maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());
    } else if (opt.getKey().equals("-v")) {
      VERBOSE = true;
    } else if (opt.getKey().equals("-e")) {
      encoding = opt.getValue()[0];
    } else {
      log.info(usage.toString());
      System.exit(-1);
    }
  }

  // Non-option arguments located at key null. This must run exactly once and
  // independently of the options: inside the loop it was skipped whenever the
  // command line carried no option flags, leaving both file names null.
  String[] rest = argsMap.get(null);
  if (rest == null || rest.length < minArgs) {
    log.info(usage.toString());
    System.exit(-1);
  }
  final String goldFile = rest[0];
  final String guessFile = rest[1];

  tlpp.setInputEncoding(encoding);
  final PrintWriter pwOut = tlpp.pw();

  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());

  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());

  final UnlabeledAttachmentEval metric = new UnlabeledAttachmentEval("UAS LP/LR", true, tlpp.headFinder());

  final TreeTransformer tc = tlpp.collinizer();

  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  final Iterator<Tree> guessItr = guessTreebank.iterator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;

  // Walk both treebanks in lock-step; the n-th guess tree is scored against the n-th gold tree.
  while (guessItr.hasNext() && goldItr.hasNext()) {
    Tree guessTree = guessItr.next();
    List<Label> guessYield = guessTree.yield();
    guessLineId++;

    Tree goldTree = goldItr.next();
    List<Label> goldYield = goldTree.yield();
    goldLineId++;

    // Check that we should evaluate this tree
    if (goldYield.size() > maxGoldYield) {
      skippedGuessTrees++;
      continue;
    }

    // Only trees with equal yields can be evaluated
    if (goldYield.size() != guessYield.size()) {
      pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }

    // Leaves are indexed on the transformed trees before they are handed to the metric.
    final Tree evalGuess = tc.transformTree(guessTree);
    evalGuess.indexLeaves(true);
    final Tree evalGold = tc.transformTree(goldTree);
    evalGold.indexLeaves(true);

    metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
  }

  if (guessItr.hasNext() || goldItr.hasNext()) {
    // The sentence-ending period belongs before the line break, not after it.
    System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
  }

  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.display(true, pwOut);
  pwOut.println();
  pwOut.close();
}
Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.
Class FactoredLexicon, method main.
/**
 * Trains a {@code FactoredLexicon} on a training treebank and reports, on
 * stderr, how often its best-scoring tag equals the gold tag for the events
 * extracted from a development treebank, followed by the share of errors per
 * gold tag.
 *
 * <p>Exactly four arguments are required: a {@link Language} name, a
 * comma-separated list of {@code MorphoFeatureType} names, the training
 * treebank path and the development treebank path. Only Arabic and French
 * are handled; any other language ends in an
 * {@link UnsupportedOperationException}.
 *
 * @param args language, features, train_file, dev_file
 */
public static void main(String[] args) {
  if (args.length != 4) {
    System.err.printf("Usage: java %s language features train_file dev_file%n", FactoredLexicon.class.getName());
    System.exit(-1);
  }

  // Command line options
  final Language language = Language.valueOf(args[0]);
  final TreebankLangParserParams tlpp = language.params;

  final Treebank trainTreebank = tlpp.diskTreebank();
  trainTreebank.loadPath(args[2]);
  final Treebank devTreebank = tlpp.diskTreebank();
  devTreebank.loadPath(args[3]);

  final Options options = getOptions(language);

  // Pick the language-specific feature specification and parser flag.
  final MorphoFeatureSpecification morphoSpec;
  final String factoredFlag;
  if (language == Language.Arabic) {
    morphoSpec = new ArabicMorphoFeatureSpecification();
    factoredFlag = "-arabicFactored";
  } else if (language == Language.French) {
    morphoSpec = new FrenchMorphoFeatureSpecification();
    factoredFlag = "-frenchFactored";
  } else {
    throw new UnsupportedOperationException();
  }
  tlpp.setOptionFlag(new String[] { factoredFlag }, 0);

  for (String featureName : args[1].trim().split(",")) {
    morphoSpec.activate(MorphoFeatureType.valueOf(featureName));
  }

  System.out.println("Language: " + language.toString());
  System.out.println("Features: " + args[1]);

  // Create word and tag indices
  // Save trees in a collection since the interface requires that....
  System.out.print("Loading training trees...");
  List<Tree> trainTrees = new ArrayList<>(19000);
  final Index<String> wordIndex = new HashIndex<>();
  final Index<String> tagIndex = new HashIndex<>();
  for (Tree root : trainTreebank) {
    for (Tree node : root) {
      if (node.isLeaf()) {
        continue;
      }
      tlpp.transformTree(node, root);
    }
    trainTrees.add(root);
  }
  System.out.printf("Done! (%d trees)%n", trainTrees.size());

  // Setup and train the lexicon.
  System.out.print("Collecting sufficient statistics for lexicon...");
  final FactoredLexicon lexicon = new FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
  lexicon.initializeTraining(trainTrees.size());
  lexicon.train(trainTrees, null);
  lexicon.finishTraining();
  System.out.println("Done!");
  // Drop the reference so the training trees can be reclaimed.
  trainTrees = null;

  // Load the tuning set
  System.out.print("Loading tuning set...");
  final List<FactoredLexiconEvent> tuningSet = getTuningSet(devTreebank, lexicon, tlpp);
  System.out.printf("...Done! (%d events)%n", tuningSet.size());

  // Print the probabilities that we obtain
  // TODO(spenceg): Implement tagging accuracy with FactLex
  int nCorrect = 0;
  final Counter<String> errors = new ClassicCounter<>();
  for (FactoredLexiconEvent event : tuningSet) {
    final Iterator<IntTaggedWord> candidates =
        lexicon.ruleIteratorByWord(event.word(), event.getLoc(), event.featureStr());
    final Counter<Integer> logScores = new ClassicCounter<>();
    boolean sawRule = false;
    int goldTagId = -1;

    while (candidates.hasNext()) {
      sawRule = true;
      final IntTaggedWord candidate = candidates.next();
      if (candidate.tag() == event.tagId()) {
        log.info("GOLD-");
        goldTagId = candidate.tag();
      }
      final float tagScore = lexicon.score(candidate, event.getLoc(), event.word(), event.featureStr());
      logScores.incrementCount(candidate.tag(), tagScore);
    }

    if (sawRule) {
      // Score the tagging
      final int hypTagId = Counters.argmax(logScores);
      if (hypTagId != goldTagId) {
        // A gold tag id that was never seen among the candidates stays at -1.
        final String goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.get(goldTagId);
        errors.incrementCount(goldTag);
      } else {
        ++nCorrect;
      }
    } else {
      System.err.printf("NO TAGGINGS: %s %s%n", event.word(), event.featureStr());
    }
    log.info();
  }

  // Output accuracy
  final double acc = (double) nCorrect / (double) tuningSet.size();
  System.err.printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);

  log.info("% of errors by type:");
  // Keys are ordered on the raw counts; the counter is normalized only afterwards.
  final List<String> biggestKeys = new ArrayList<>(errors.keySet());
  Collections.sort(biggestKeys, Counters.toComparator(errors, false, true));
  Counters.normalize(errors);
  for (String tag : biggestKeys) {
    System.err.printf("%s\t%.2f%n", tag, errors.getCount(tag) * 100.0);
  }
}
Use of edu.stanford.nlp.international.Language in the CoreNLP project by stanfordnlp.
Class Evalb, method main.
/**
 * Run the Evalb scoring metric on guess/gold input. The default language is English.
 *
 * <p>Options are read from the parsed properties: {@code l} (language),
 * {@code y} (maximum gold yield; longer gold trees are skipped), {@code v}
 * (verbose), {@code s} (store tree pairs with their F1 and emit that many of
 * them at the end), {@code c} (additionally evaluate by category), {@code f}
 * (label regex passed to the by-category metric) and {@code e} (input
 * encoding, default UTF-8). Exactly {@code minArgs} non-option arguments are
 * required, the gold file followed by the guess file; otherwise the usage
 * message is logged and the program exits with status -1.
 *
 * @param args command-line options followed by the gold and guess file paths
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    log.info(usage());
    System.exit(-1);
  }

  Properties options = StringUtils.argsToProperties(args, optionArgDefs());
  Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
  final TreebankLangParserParams tlpp = language.params;
  final int maxGoldYield = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
  final boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
  final boolean sortByF1 = PropertiesUtils.hasProperty(options, "s");
  int worstKTreesToEmit = PropertiesUtils.getInt(options, "s", 0);
  // The queue only exists when sorting was requested; every use below is guarded by sortByF1.
  PriorityQueue<Triple<Double, Tree, Tree>> queue = sortByF1 ? new PriorityQueue<>(2000, new F1Comparator()) : null;
  boolean doCatLevel = PropertiesUtils.getBool(options, "c", false);
  String labelRegex = options.getProperty("f", null);
  String encoding = options.getProperty("e", "UTF-8");

  // The non-option arguments are stored under the empty key as one whitespace-separated string.
  String[] parsedArgs = options.getProperty("", "").split("\\s+");
  if (parsedArgs.length != minArgs) {
    log.info(usage());
    System.exit(-1);
  }
  String goldFile = parsedArgs[0];
  String guessFile = parsedArgs[1];

  // Command-line has been parsed. Configure the metric for evaluation.
  tlpp.setInputEncoding(encoding);
  final PrintWriter pwOut = tlpp.pw();

  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());

  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());

  final Evalb metric = new Evalb("Evalb LP/LR", true);
  // Likewise only created when requested; guarded by doCatLevel below.
  final EvalbByCat evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;

  final TreeTransformer tc = tlpp.collinizer();

  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  final Iterator<Tree> guessItr = guessTreebank.iterator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;

  // Walk both treebanks in lock-step; the n-th guess tree is scored against the n-th gold tree.
  while (guessItr.hasNext() && goldItr.hasNext()) {
    Tree guessTree = guessItr.next();
    List<Label> guessYield = guessTree.yield();
    guessLineId++;

    Tree goldTree = goldItr.next();
    List<Label> goldYield = goldTree.yield();
    goldLineId++;

    // Check that we should evaluate this tree
    if (goldYield.size() > maxGoldYield) {
      skippedGuessTrees++;
      continue;
    }

    // Only trees with equal yields can be evaluated
    if (goldYield.size() != guessYield.size()) {
      pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }

    final Tree evalGuess = tc.transformTree(guessTree);
    final Tree evalGold = tc.transformTree(goldTree);

    metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));

    if (doCatLevel) {
      evalbCat.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
    }
    // The untransformed trees are stored, keyed by the F1 of the pair just evaluated.
    if (sortByF1) {
      storeTrees(queue, guessTree, goldTree, metric.getLastF1());
    }
  }

  if (guessItr.hasNext() || goldItr.hasNext()) {
    // The sentence-ending period belongs before the line break, not after it.
    System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
  }

  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.display(true, pwOut);
  pwOut.println();

  if (doCatLevel) {
    evalbCat.display(true, pwOut);
    pwOut.println();
  }

  if (sortByF1) {
    emitSortedTrees(queue, worstKTreesToEmit, guessFile);
  }

  pwOut.close();
}
Aggregations