use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.
the class TsarfatyEval method main.
/**
 * Run the scoring metric on guess/gold input. This method performs "Collinization."
 * The default language is English.
 * <p>
 * Recognized options: {@code -l <language>}, {@code -y <max gold yield>},
 * {@code -g <max guess yield>}, {@code -t} (tag mode) and {@code -v} (verbose).
 * The first non-option argument is taken as the gold file and the argument right
 * after it as the guess file; anything following them is ignored.
 * The usage text is printed and the JVM exits with status -1 when an option is
 * unknown, an option value is missing, or the two file arguments are not both present.
 *
 * @param args command-line options followed by the gold and guess treebank paths
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = Integer.MAX_VALUE;
  int maxGuessYield = Integer.MAX_VALUE;
  boolean verbose = false;
  boolean skipGuess = false;
  boolean tagMode = false;
  String guessFile = null;
  String goldFile = null;
  for (int i = 0; i < args.length; i++) {
    final String opt = args[i];
    if (opt.startsWith("-")) {
      // -l, -y and -g consume the following argument; print the usage text
      // instead of dying with an ArrayIndexOutOfBoundsException when it is absent.
      final boolean takesValue = "-l".equals(opt) || "-y".equals(opt) || "-g".equals(opt);
      if (takesValue && i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      switch (opt) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-y":
          maxGoldYield = Integer.parseInt(args[++i].trim());
          break;
        case "-t":
          tagMode = true;
          break;
        case "-v":
          verbose = true;
          break;
        case "-g":
          maxGuessYield = Integer.parseInt(args[++i].trim());
          skipGuess = true;
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
          return;
      }
    } else {
      // Required parameters: gold file first, guess file second.
      if (i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      goldFile = args[i];
      guessFile = args[i + 1];
      break;
    }
  }
  // Only options were supplied: there is nothing to load.
  if (goldFile == null || guessFile == null) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }

  final PrintWriter pwOut = tlpp.pw();
  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());
  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());
  final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
  final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
  final TreeTransformer tc = tlpp.collinizer();

  //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
  //don't match, we need to keep looking for the next gold tree that matches.
  //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
  //status as follows:
  //
  // 0 - Ok (yields match)
  // 1 - length mismatch
  // 2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  int goldLineId = 0;
  int skippedGuessTrees = 0;
  for (final Tree guess : guessTreebank) {
    final Tree evalGuess = tc.transformTree(guess);
    final ArrayList<Label> guessSent = guess.yield();
    // Yields are compared on whitespace-free character strings.
    final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
    if (guessSent.size() > maxGuessYield) {
      skippedGuessTrees++;
      continue;
    }
    boolean doneEval = false;
    while (goldItr.hasNext() && !doneEval) {
      final Tree gold = goldItr.next();
      final Tree evalGold = tc.transformTree(gold);
      goldLineId++;
      final ArrayList<Label> goldSent = gold.yield();
      final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
      if (goldSent.size() > maxGoldYield) {
        // Over-long gold tree: try the next gold tree against the same guess.
        continue;
      } else if (goldChars.length() != guessChars.length()) {
        pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
        skippedGuessTrees++;
        //Default evalb behavior -- skip this guess tree
        break;
      }
      eval.evaluate(evalGuess, evalGold, ((verbose) ? pwOut : null));
      //Move to the next guess parse
      doneEval = true;
    }
  }
  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
  }
  eval.display(true, pwOut);
  pwOut.println();
  pwOut.close();
}
use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.
the class PunctFrequencyDist method main.
/**
 * Counts the word types that carry a given tag in a treebank and prints them,
 * one per line as {@code word<TAB>count}, in descending order of count.
 * <p>
 * Recognized options: {@code -l <language>} and {@code -e <encoding>}
 * (default {@code UTF-8}). Non-option arguments are read in pairs: a tag
 * followed by a treebank path. The encoding is applied when the treebank is
 * created, i.e. at the first pair, so {@code -e} must precede it to take effect.
 * The usage text is printed and the JVM exits with status -1 on an unknown
 * option, a missing option value, an incomplete pair, or no pair at all.
 *
 * @param args command-line options followed by the tag and the treebank path
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  String puncTag = null;
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      final boolean takesValue = "-l".equals(args[i]) || "-e".equals(args[i]);
      // Guard the look-ahead so a trailing option prints usage instead of throwing.
      if (takesValue && i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
          return;
      }
    } else {
      // A tag must be followed by the path it applies to.
      if (i + 1 >= args.length || tlpp == null) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      puncTag = args[i++];
      if (tb == null) {
        tlpp.setInputEncoding(encoding);
        tlpp.setOutputEncoding(encoding);
        tb = tlpp.diskTreebank();
      }
      tb.loadPath(args[i]);
    }
  }
  // No tag/path pair was given: nothing to count.
  if (tb == null || puncTag == null) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }

  Counter<String> puncTypes = new ClassicCounter<>();
  for (Tree t : tb) {
    List<CoreLabel> taggedWords = t.taggedLabeledYield();
    for (CoreLabel word : taggedWords) {
      // Constant-first comparison: a leaf without a tag is skipped, not an NPE.
      if (puncTag.equals(word.tag())) {
        puncTypes.incrementCount(word.word());
      }
    }
  }
  List<String> biggestKeys = new ArrayList<>(puncTypes.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(puncTypes));
  // try-with-resources closes (and thereby flushes) the writer even if printing fails.
  try (PrintWriter pw = tlpp.pw()) {
    for (String wordType : biggestKeys) {
      pw.printf("%s\t%d%n", wordType, (int) puncTypes.getCount(wordType));
    }
  }
}
use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.
the class UNKPrinter method main.
/**
 * Prints the unknown-word signatures observed in the second half of a treebank,
 * one per line as {@code signature<TAB>count}, in descending order of count.
 * <p>
 * Every word is added to a running vocabulary. Once more than half of the trees
 * have been read, a word whose running count is still below 2 has its signature
 * (from the lexicon's unknown word model) counted.
 * <p>
 * Recognized options: {@code -l <language>} and {@code -e <encoding>}
 * (default {@code UTF-8}); every other argument is loaded as a treebank path.
 * The encoding is applied when the treebank is created, i.e. at the first path.
 * The usage text is printed and the JVM exits with status -1 on an unknown
 * option, a missing option value, or when no path is given.
 *
 * @param args command-line options followed by one or more treebank paths
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  Language lang = Language.English;
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      final boolean takesValue = "-l".equals(args[i]) || "-e".equals(args[i]);
      // Guard the look-ahead so a trailing option prints usage instead of throwing.
      if (takesValue && i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      switch (args[i]) {
        case "-l":
          lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
          return;
      }
    } else {
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
          return;
        }
        tlpp.setInputEncoding(encoding);
        tlpp.setOutputEncoding(encoding);
        tb = tlpp.diskTreebank();
      }
      tb.loadPath(args[i]);
    }
  }
  // No treebank path was given: nothing to scan.
  if (tb == null) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }

  PrintWriter pw = tlpp.pw();
  Options op = new Options();
  Options.LexOptions lexOptions = op.lexOptions;
  // Language-specific unknown-word settings; other languages keep the Options defaults.
  if (lang == Language.French) {
    lexOptions.useUnknownWordSignatures = 1;
    lexOptions.smartMutation = false;
    lexOptions.unknownSuffixSize = 2;
    lexOptions.unknownPrefixSize = 1;
  } else if (lang == Language.Arabic) {
    lexOptions.smartMutation = false;
    lexOptions.useUnknownWordSignatures = 9;
    lexOptions.unknownPrefixSize = 1;
    lexOptions.unknownSuffixSize = 1;
  }
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);

  // Signatures are only collected after this many trees have been read.
  int computeAfter = (int) (0.50 * tb.size());
  Counter<String> vocab = new ClassicCounter<>();
  Counter<String> unkCounter = new ClassicCounter<>();
  int treeId = 0;
  for (Tree t : tb) {
    List<Label> words = t.yield();
    int posId = 0;
    for (Label word : words) {
      final String form = word.value();
      vocab.incrementCount(form);
      if (treeId > computeAfter && vocab.getCount(form) < 2.0) {
        unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(form, posId));
      }
      // Advance for every token so posId is the word's index in the sentence;
      // previously it only moved when a signature was counted.
      posId++;
    }
    treeId++;
  }
  List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));
  for (String wordType : biggestKeys) {
    pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
  }
  pw.close();
}
use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.
the class VocabFrequency method main.
/**
 * Prints the vocabulary of a treebank, one word type per line as
 * {@code word<TAB>count}, in descending order of count.
 * <p>
 * Recognized options: {@code -l <language>} and {@code -e <encoding>}
 * (default {@code UTF-8}); every other argument is loaded as a treebank path.
 * The encoding is applied when the treebank is created, i.e. at the first path.
 * The usage text is printed and the JVM exits with status -1 on an unknown
 * option, a missing option value, or when no path is given.
 *
 * @param args command-line options followed by one or more treebank paths
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      final boolean takesValue = "-l".equals(args[i]) || "-e".equals(args[i]);
      // Guard the look-ahead so a trailing option prints usage instead of throwing.
      if (takesValue && i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
          return;
      }
    } else {
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
          return;
        }
        tlpp.setInputEncoding(encoding);
        tlpp.setOutputEncoding(encoding);
        tb = tlpp.diskTreebank();
      }
      tb.loadPath(args[i]);
    }
  }
  // No treebank path was given: nothing to count.
  if (tb == null) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }

  Counter<String> vocab = new ClassicCounter<>();
  for (Tree t : tb) {
    List<Label> words = t.yield();
    for (Label word : words) {
      vocab.incrementCount(word.value());
    }
  }
  List<String> biggestKeys = new ArrayList<>(vocab.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(vocab));
  // try-with-resources closes (and thereby flushes) the writer even if printing fails.
  try (PrintWriter pw = tlpp.pw()) {
    for (String wordType : biggestKeys) {
      pw.printf("%s\t%d%n", wordType, (int) vocab.getCount(wordType));
    }
  }
}
use of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams in project CoreNLP by stanfordnlp.
the class RHSFrequency method main.
/**
 * Counts the right-hand sides (the space-joined child labels) of every node
 * matching a given category in a treebank and prints them, one per line as
 * {@code rhs<TAB>count}, in descending order of count.
 * <p>
 * Recognized options: {@code -l <language>} and {@code -e <encoding>}
 * (default {@code UTF-8}). Non-option arguments are read in pairs: a category,
 * compiled as the Tregex pattern {@code "@" + category}, followed by a treebank
 * path. The encoding is applied when the treebank is created, i.e. at the first pair.
 * The usage text is printed and the JVM exits with status -1 on an unknown
 * option, a missing option value, an incomplete pair, or no pair at all.
 *
 * @param args command-line options followed by the category and the treebank path
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  TregexPattern rootMatch = null;
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      final boolean takesValue = "-l".equals(args[i]) || "-e".equals(args[i]);
      // Guard the look-ahead so a trailing option prints usage instead of throwing.
      if (takesValue && i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
          return;
      }
    } else {
      // A category must be followed by the path it applies to.
      if (i + 1 >= args.length || tlpp == null) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
      }
      rootMatch = TregexPattern.compile("@" + args[i++]);
      if (tb == null) {
        tlpp.setInputEncoding(encoding);
        tlpp.setOutputEncoding(encoding);
        tb = tlpp.diskTreebank();
      }
      // No post-increment here: the loop header already advances past the path.
      // Incrementing twice used to skip the argument that follows each pair.
      tb.loadPath(args[i]);
    }
  }
  // No category/path pair was given: nothing to match.
  if (tb == null || rootMatch == null) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }

  Counter<String> rhsCounter = new ClassicCounter<>();
  for (Tree t : tb) {
    TregexMatcher m = rootMatch.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      StringBuilder sb = new StringBuilder();
      for (Tree kid : match.children()) {
        sb.append(kid.value()).append(" ");
      }
      rhsCounter.incrementCount(sb.toString().trim());
    }
  }
  List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));
  // try-with-resources closes (and thereby flushes) the writer even if printing fails.
  try (PrintWriter pw = tlpp.pw()) {
    for (String rhs : biggestKeys) {
      pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
    }
  }
}
Aggregations