Use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp:
the class TsarfatyEval, method main.
/**
 * Run the scoring metric on guess/gold input. This method performs "Collinization"
 * (collinizer-based tree normalization) before comparing trees.
 * The default language is English.
 *
 * @param args [options] goldFile guessFile — options: -l lang, -y maxGoldYield,
 *             -t tag mode, -v verbose, -g maxGuessYield
 */
public static void main(String[] args) {
// minArgs and usage are declared elsewhere in this class.
if (args.length < minArgs) {
System.out.println(usage.toString());
System.exit(-1);
}
// Default to English; overridden by the -l option below.
TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
int maxGoldYield = Integer.MAX_VALUE;
int maxGuessYield = Integer.MAX_VALUE;
boolean VERBOSE = false;
boolean skipGuess = false;
boolean tagMode = false;
String guessFile = null;
String goldFile = null;
// Parse flag options; the first non-flag argument starts the two required
// positional parameters (gold file, then guess file).
for (int i = 0; i < args.length; i++) {
if (args[i].startsWith("-")) {
switch(args[i]) {
case "-l":
// Select the language; its params supply treebank reader/collinizer.
Language lang = Language.valueOf(args[++i].trim());
tlpp = lang.params;
break;
case "-y":
// Maximum yield (sentence length) for gold trees; longer trees are skipped.
maxGoldYield = Integer.parseInt(args[++i].trim());
break;
case "-t":
// Evaluate tags (TsarfatyTAG) instead of segmentation (TsarfatySEG).
tagMode = true;
break;
case "-v":
VERBOSE = true;
break;
case "-g":
// Maximum yield for guess trees; also changes the summary wording below.
// NOTE(review): -y caps gold and -g caps guess here, which is the reverse
// of CollinsDepEval's flag naming — confirm against the usage string.
maxGuessYield = Integer.parseInt(args[++i].trim());
skipGuess = true;
break;
default:
System.out.println(usage.toString());
System.exit(-1);
}
} else {
//Required parameters
goldFile = args[i++];
guessFile = args[i];
break;
}
}
final PrintWriter pwOut = tlpp.pw();
final Treebank guessTreebank = tlpp.diskTreebank();
guessTreebank.loadPath(guessFile);
pwOut.println("GUESS TREEBANK:");
pwOut.println(guessTreebank.textualSummary());
final Treebank goldTreebank = tlpp.diskTreebank();
goldTreebank.loadPath(goldFile);
pwOut.println("GOLD TREEBANK:");
pwOut.println(goldTreebank.textualSummary());
final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
// Collinizer normalizes trees (strips punctuation etc.) before comparison.
final TreeTransformer tc = tlpp.collinizer();
//PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
//don't match, we need to keep looking for the next gold tree that matches.
//The evalb ref implementation differs slightly as it expects one tree per line. It assigns
//status as follows:
//
//  0 - Ok (yields match)
//  1 - length mismatch
//  2 - null parse e.g. (()).
//
//In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
final Iterator<Tree> goldItr = goldTreebank.iterator();
int goldLineId = 0;
int skippedGuessTrees = 0;
for (final Tree guess : guessTreebank) {
final Tree evalGuess = tc.transformTree(guess);
final ArrayList<Label> guessSent = guess.yield();
// Compare at the character level: strip all whitespace from the yield so
// segmentation differences don't prevent matching gold/guess pairs.
final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
if (guessSent.size() > maxGuessYield) {
skippedGuessTrees++;
continue;
}
boolean doneEval = false;
// Advance the gold iterator until a char-compatible gold tree is found
// (or the pair is declared a mismatch and this guess tree is skipped).
while (goldItr.hasNext() && !doneEval) {
final Tree gold = goldItr.next();
final Tree evalGold = tc.transformTree(gold);
goldLineId++;
final ArrayList<Label> goldSent = gold.yield();
final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
if (goldSent.size() > maxGoldYield) {
// Gold tree over the length cap: skip it and keep scanning gold trees.
continue;
} else if (goldChars.length() != guessChars.length()) {
pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
skippedGuessTrees++;
//Default evalb behavior -- skip this guess tree
break;
}
eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
//Move to the next guess parse
doneEval = true;
}
}
pwOut.println("================================================================================");
if (skippedGuessTrees != 0)
pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
// Print the final P/R/F1 summary and release the writer.
eval.display(true, pwOut);
pwOut.println();
pwOut.close();
}
Use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp:
the class CollinsDepEval, method main.
/**
 * Run the Collins-style dependency evaluation on a gold/guess treebank pair.
 * Trees are collinized before head-based dependencies are extracted and scored.
 *
 * @param args [options] goldFile guessFile — options: -v verbose, -l language,
 *             -g maxGoldYield, -y maxGuessYield
 */
public static void main(String[] args) {
// MIN_ARGS, usage(), and optionArgDefs() are declared elsewhere in this class.
if (args.length < MIN_ARGS) {
log.info(usage());
System.exit(-1);
}
Properties options = StringUtils.argsToProperties(args, optionArgDefs());
boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
Language LANGUAGE = PropertiesUtils.get(options, "l", Language.English, Language.class);
int MAX_GOLD_YIELD = PropertiesUtils.getInt(options, "g", Integer.MAX_VALUE);
int MAX_GUESS_YIELD = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
// The "" property collects the positional arguments (gold file, guess file).
String[] parsedArgs = options.getProperty("", "").split("\\s+");
if (parsedArgs.length != MIN_ARGS) {
log.info(usage());
System.exit(-1);
}
File goldFile = new File(parsedArgs[0]);
File guessFile = new File(parsedArgs[1]);
final TreebankLangParserParams tlpp = LANGUAGE.params;
final PrintWriter pwOut = tlpp.pw();
final Treebank guessTreebank = tlpp.diskTreebank();
guessTreebank.loadPath(guessFile);
pwOut.println("GUESS TREEBANK:");
pwOut.println(guessTreebank.textualSummary());
final Treebank goldTreebank = tlpp.diskTreebank();
goldTreebank.loadPath(goldFile);
pwOut.println("GOLD TREEBANK:");
pwOut.println(goldTreebank.textualSummary());
// Head finder and start symbol come from the language pack; they drive
// the dependency extraction inside CollinsDepEval.
final CollinsDepEval depEval = new CollinsDepEval("CollinsDep", true, tlpp.headFinder(), tlpp.treebankLanguagePack().startSymbol());
final TreeTransformer tc = tlpp.collinizer();
//PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
//don't match, we need to keep looking for the next gold tree that matches.
//The evalb ref implementation differs slightly as it expects one tree per line. It assigns
//status as follows:
//
//  0 - Ok (yields match)
//  1 - length mismatch
//  2 - null parse e.g. (()).
//
//In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
final Iterator<Tree> goldItr = goldTreebank.iterator();
int goldLineId = 0;
int skippedGuessTrees = 0;
for (final Tree guess : guessTreebank) {
final Tree evalGuess = tc.transformTree(guess);
if (guess.yield().size() > MAX_GUESS_YIELD) {
skippedGuessTrees++;
continue;
}
boolean doneEval = false;
// Advance the gold iterator until a yield-compatible gold tree is found
// (or this guess tree is declared a mismatch and skipped).
while (goldItr.hasNext() && !doneEval) {
final Tree gold = goldItr.next();
final Tree evalGold = tc.transformTree(gold);
goldLineId++;
if (gold.yield().size() > MAX_GOLD_YIELD) {
// Gold tree over the length cap: skip it and keep scanning gold trees.
continue;
} else if (evalGold.yield().size() != evalGuess.yield().size()) {
pwOut.println("Yield mismatch at gold line " + goldLineId);
skippedGuessTrees++;
//Default evalb behavior -- skip this guess tree
break;
}
depEval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
//Move to the next guess parse
doneEval = true;
}
}
pwOut.println("================================================================================");
if (skippedGuessTrees != 0)
pwOut.printf("%s %d guess trees\n", ((MAX_GUESS_YIELD < Integer.MAX_VALUE) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
depEval.display(true, pwOut);
pwOut.close();
}
Use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp:
the class TreebankFactoredLexiconStats, method main.
// private static String stripTag(String tag) {
// if (tag.startsWith("DT")) {
// String newTag = tag.substring(2, tag.length());
// return newTag.length() > 0 ? newTag : tag;
// }
// return tag;
// }
/**
 * Reads a treebank and prints factored-lexicon statistics: counts of words,
 * tags, lemmas, rich morphological tags, and reduced feature tags, plus
 * diagnostic listings of words with zero/multiple lemmas and ambiguous
 * lemma-to-feature mappings.
 *
 * @param args exactly three arguments: language name (a {@code Language} enum
 *             constant), treebank path, and a comma-separated list of
 *             {@code MorphoFeatureType} names to activate
 */
public static void main(String[] args) {
  if (args.length != 3) {
    System.err.printf("Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName());
    System.exit(-1);
  }
  Language language = Language.valueOf(args[0]);
  TreebankLangParserParams tlpp = language.params;
  // Only Arabic and French factored lexicons are handled; any non-Arabic
  // language falls through to the French configuration.
  if (language.equals(Language.Arabic)) {
    String[] options = { "-arabicFactored" };
    tlpp.setOptionFlag(options, 0);
  } else {
    String[] options = { "-frenchFactored" };
    tlpp.setOptionFlag(options, 0);
  }
  Treebank tb = tlpp.diskTreebank();
  tb.loadPath(args[1]);
  MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
  String[] features = args[2].trim().split(",");
  for (String feature : features) {
    morphoSpec.activate(MorphoFeatureType.valueOf(feature));
  }
  // Counters, pre-sized to the expected magnitude of each key space.
  Counter<String> wordTagCounter = new ClassicCounter<>(30000);
  Counter<String> morphTagCounter = new ClassicCounter<>(500);
  // Counter<String> signatureTagCounter = new ClassicCounter<String>();
  Counter<String> morphCounter = new ClassicCounter<>(500);
  Counter<String> wordCounter = new ClassicCounter<>(30000);
  Counter<String> tagCounter = new ClassicCounter<>(300);
  Counter<String> lemmaCounter = new ClassicCounter<>(25000);
  Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);
  Counter<String> richTagCounter = new ClassicCounter<>(1000);
  Counter<String> reducedTagCounter = new ClassicCounter<>(500);
  Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);
  Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();
  TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000);
  TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500);
  TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300);
  int numTrees = 0;
  for (Tree tree : tb) {
    // Apply the language-specific tree transformation to every internal node.
    for (Tree subTree : tree) {
      if (!subTree.isLeaf()) {
        tlpp.transformTree(subTree, tree);
      }
    }
    List<Label> pretermList = tree.preTerminalYield();
    List<Label> yield = tree.yield();
    assert yield.size() == pretermList.size();
    int yieldLen = yield.size();
    for (int i = 0; i < yieldLen; ++i) {
      String tag = pretermList.get(i).value();
      String word = yield.get(i).value();
      // The morphological analysis is stashed in the token's originalText.
      String morph = ((CoreLabel) yield.get(i)).originalText();
      // Note: if there is no lemma, then we use the surface form.
      Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
      String lemma = lemmaTag.first();
      String richTag = lemmaTag.second();
      // WSGDEBUG
      if (tag.contains("MW"))
        lemma += "-MWE";
      lemmaCounter.incrementCount(lemma);
      lemmaTagCounter.incrementCount(lemma + tag);
      richTagCounter.incrementCount(richTag);
      // Reduce the rich tag to the activated feature subset.
      String reducedTag = morphoSpec.strToFeatures(richTag).toString();
      reducedTagCounter.incrementCount(reducedTag);
      reducedTagLemmaCounter.incrementCount(reducedTag + lemma);
      wordTagCounter.incrementCount(word + tag);
      morphTagCounter.incrementCount(morph + tag);
      morphCounter.incrementCount(morph);
      wordCounter.incrementCount(word);
      tagCounter.incrementCount(tag);
      reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
      if (wordLemmaMap.containsKey(word)) {
        wordLemmaMap.get(word).add(lemma);
      } else {
        Set<String> lemmas = Generics.newHashSet(1);
        // Bug fix: the original code put an empty set into the map without
        // adding the current lemma, so each word's first lemma was lost and
        // words seen only once were misreported as "NO LEMMAS FOR WORD".
        lemmas.add(lemma);
        wordLemmaMap.put(word, lemmas);
      }
      lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
      reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
      tagReducedTagCounter.incrementCount(tag, reducedTag);
    }
    ++numTrees;
  }
  // Barf...
  System.out.println("Language: " + language.toString());
  System.out.printf("#trees:\t%d%n", numTrees);
  System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
  System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
  System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
  System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
  System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
  System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
  System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
  System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
  System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
  System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
  System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());
  // Extra diagnostics: words with no/multiple lemmas, and lemmas whose
  // reduced-tag distribution is ambiguous.
  System.out.println("==================");
  StringBuilder sbNoLemma = new StringBuilder();
  StringBuilder sbMultLemmas = new StringBuilder();
  for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
    String word = wordLemmas.getKey();
    Set<String> lemmas = wordLemmas.getValue();
    if (lemmas.size() == 0) {
      sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
      continue;
    }
    if (lemmas.size() > 1) {
      sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
      continue;
    }
    String lemma = lemmas.iterator().next();
    Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
    if (reducedTags.size() > 1) {
      System.out.printf("%s --> %s%n", word, lemma);
      for (String reducedTag : reducedTags) {
        int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
        String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
        System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
      }
      System.out.println();
    }
  }
  System.out.println("==================");
  System.out.println(sbNoLemma.toString());
  System.out.println(sbMultLemmas.toString());
  System.out.println("==================");
  // Per-POS-tag breakdown of reduced-tag counts, in sorted tag order.
  List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
  Collections.sort(tags);
  for (String tag : tags) {
    System.out.println(tag);
    Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
    for (String reducedTag : reducedTags) {
      int count = tagReducedTagCounter.getCount(tag, reducedTag);
      // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
      System.out.printf("\t%s\t%d%n", reducedTag, count);
    }
    System.out.println();
  }
  System.out.println("==================");
}
Use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp:
the class TreebankStats, method main.
/**
 * Command-line entry point: parses options, builds a {@code TreebankStats}
 * over the given corpus paths, optionally loads a split, and runs the report.
 *
 * @param args language followed by one or more corpus paths; options:
 *             -s splitPrefix, -w show words, -f paths are files, -o show OOV
 */
public static void main(String[] args) {
  if (args.length < MIN_ARGS) {
    log.info(usage());
    System.exit(-1);
  }
  final Properties props = StringUtils.argsToProperties(args, optArgDefs());
  final String splitPrefix = props.getProperty("s", null);
  final boolean SHOW_WORDS = PropertiesUtils.getBool(props, "w", false);
  final boolean pathsAreFiles = PropertiesUtils.getBool(props, "f", false);
  final boolean SHOW_OOV = PropertiesUtils.getBool(props, "o", false);
  // Positional arguments accumulate under the "" property key.
  final String[] positional = props.getProperty("", "").split("\\s+");
  if (positional.length != MIN_ARGS) {
    log.info(usage());
    System.exit(-1);
  }
  final Language language = Language.valueOf(positional[0]);
  // Everything after the language name is a corpus path.
  final List<String> corpusPaths = new ArrayList<>(positional.length - 1);
  for (int idx = 1; idx < positional.length; ++idx) {
    corpusPaths.add(positional[idx]);
  }
  final TreebankLangParserParams tlpp = language.params;
  final TreebankStats stats = new TreebankStats(language, corpusPaths, tlpp);
  // A split prefix is optional; warn (but continue) if it cannot be loaded.
  if (splitPrefix != null && !stats.useSplit(splitPrefix)) {
    log.info("Could not load split!");
  }
  stats.run(pathsAreFiles, SHOW_WORDS, SHOW_OOV);
}
Use of edu.stanford.nlp.international.Language in project CoreNLP by stanfordnlp:
the class RuleBranchingFactor, method main.
/**
 * Reads a treebank and reports branching-factor statistics for phrasal rules:
 * counts of unary vs. n-ary (n &gt; 1) expansions, mean/stddev branching
 * factor, and the entropy of the n-ary rule-type distribution.
 *
 * @param args treebank path; options: -l language, -e encoding
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage);
    System.exit(-1);
  }
  // Process command-line options
  final Properties props = StringUtils.argsToProperties(args, optionArgDefinitions);
  final String treebankPath = props.getProperty("");
  if (treebankPath == null || treebankPath.equals("")) {
    System.out.println(usage);
    System.exit(-1);
  }
  final Language lang = PropertiesUtils.get(props, "l", Language.English, Language.class);
  final TreebankLangParserParams tlpp = lang.params;
  final String charset = props.getProperty("e", "UTF-8");
  tlpp.setInputEncoding(charset);
  tlpp.setOutputEncoding(charset);
  final DiskTreebank treebank = tlpp.diskTreebank();
  treebank.loadPath(treebankPath);
  // Statistics accumulators
  final Counter<String> naryRuleTypes = new ClassicCounter<>(20000);
  final List<Integer> branchCounts = new ArrayList<>(20000);
  int treeCount = 0;
  int unaryCount = 0;
  int naryCount = 0;
  int branchTotal = 0;
  // Read the treebank
  final PrintWriter pw = tlpp.pw();
  for (Tree tree : treebank) {
    // Descend past a synthetic ROOT node so it is not counted as a rule.
    if (tree.value().equals("ROOT")) {
      tree = tree.firstChild();
    }
    ++treeCount;
    for (final Tree node : tree) {
      if (!node.isPhrasal()) {
        continue;
      }
      final int kids = node.numChildren();
      if (kids > 1) {
        ++naryCount;
        branchCounts.add(kids);
        branchTotal += kids;
        naryRuleTypes.incrementCount(treeToRuleString(node));
      } else {
        ++unaryCount;
      }
    }
  }
  final double mean = (double) branchTotal / (double) naryCount;
  System.out.printf("#trees:\t%d%n", treeCount);
  System.out.printf("#binary:\t%d%n", naryCount);
  System.out.printf("#binary types:\t%d%n", naryRuleTypes.keySet().size());
  System.out.printf("mean branching:\t%.4f%n", mean);
  System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branchCounts, mean));
  System.out.printf("rule entropy:\t%.5f%n", Counters.entropy(naryRuleTypes));
  System.out.printf("#unaries:\t%d%n", unaryCount);
}
Aggregations