Use of edu.stanford.nlp.trees.TreeTransformer in the CoreNLP project by stanfordnlp.
From the class TsarfatyEval, method main.
/**
 * Run the scoring metric on guess/gold input. This method performs "Collinization,"
 * i.e. both treebanks are passed through the language pack's collinizer before
 * evaluation. The default language is English.
 *
 * @param args optional flags (-l lang, -y maxGoldYield, -g maxGuessYield, -t, -v)
 *             followed by two positional arguments: the gold treebank path and
 *             the guess treebank path
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  // Defaults: English, no yield limits, segmentation (not tag) mode.
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = Integer.MAX_VALUE;
  int maxGuessYield = Integer.MAX_VALUE;
  boolean VERBOSE = false;
  boolean skipGuess = false;
  boolean tagMode = false;
  String guessFile = null;
  String goldFile = null;

  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-y":
          maxGoldYield = Integer.parseInt(args[++i].trim());
          break;
        case "-t":
          tagMode = true;
          break;
        case "-v":
          VERBOSE = true;
          break;
        case "-g":
          maxGuessYield = Integer.parseInt(args[++i].trim());
          skipGuess = true;
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }
    } else {
      // Required positional parameters: gold file, then guess file.
      // Fix: the original indexed args[i + 1] unguarded and threw
      // ArrayIndexOutOfBoundsException when the guess path was missing.
      goldFile = args[i];
      if (i + 1 >= args.length) {
        System.out.println(usage.toString());
        System.exit(-1);
      }
      guessFile = args[i + 1];
      break;
    }
  }

  // Fix: the original continued with null paths (NPE inside loadPath) when no
  // positional arguments were supplied; fail with the usage message instead.
  if (goldFile == null || guessFile == null) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  final PrintWriter pwOut = tlpp.pw();

  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());

  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());

  final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
  final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
  final TreeTransformer tc = tlpp.collinizer();

  //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
  //don't match, we need to keep looking for the next gold tree that matches.
  //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
  //status as follows:
  //
  //  0 - Ok (yields match)
  //  1 - length mismatch
  //  2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  int goldLineId = 0;
  int skippedGuessTrees = 0;

  for (final Tree guess : guessTreebank) {
    final ArrayList<Label> guessSent = guess.yield();
    // Hoisted ahead of the collinization and char-string construction, which
    // are wasted work for a tree we are about to skip (the collinizer is a
    // pure transform, so ordering is otherwise unobservable).
    if (guessSent.size() > maxGuessYield) {
      skippedGuessTrees++;
      continue;
    }
    final Tree evalGuess = tc.transformTree(guess);
    // Whitespace-free character yield, for char-level comparison with gold.
    final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");

    boolean doneEval = false;
    while (goldItr.hasNext() && !doneEval) {
      final Tree gold = goldItr.next();
      final Tree evalGold = tc.transformTree(gold);
      goldLineId++;

      final ArrayList<Label> goldSent = gold.yield();
      final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");

      if (goldSent.size() > maxGoldYield) {
        // Over-length gold tree: keep scanning for the next matching gold tree.
        continue;
      } else if (goldChars.length() != guessChars.length()) {
        pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
        skippedGuessTrees++;
        //Default evalb behavior -- skip this guess tree
        break;
      }

      eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
      //Move to the next guess parse
      doneEval = true;
    }
  }

  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0)
    pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
  eval.display(true, pwOut);
  pwOut.println();
  pwOut.close();
}
Use of edu.stanford.nlp.trees.TreeTransformer in the CoreNLP project by stanfordnlp.
From the class Options, method setOptionFlag.
/**
 * Set an option in this object, based on a String array in the style of
 * commandline flags. The option is only processed with respect to
 * options directly known by the Options object.
 * Some options (there are many others; see the source code):
 * <ul>
 * <li> <code>-maxLength n</code> set the maximum length sentence to parse (inclusively)
 * <li> <code>-printTT</code> print the training trees in raw, annotated, and annotated+binarized form. Useful for debugging and other miscellany.
 * <li> <code>-printAnnotated filename</code> use only in conjunction with -printTT. Redirects printing of annotated training trees to <code>filename</code>.
 * <li> <code>-forceTags</code> when the parser is tested against a set of gold standard trees, use the tagged yield, instead of just the yield, as input.
 * </ul>
 *
 * @param args An array of options arguments, command-line style. E.g. {"-maxLength", "50"}.
 * @param i The index in args to start at when processing an option
 * @return The index in args of the position after the last element used in
 * processing this option, or the value i unchanged if a valid option couldn't
 * be processed starting at position i.
 */
protected int setOptionFlag(String[] args, int i) {
// One long if/else-if dispatch chain: each branch recognizes one flag
// (case-insensitively), mutates the relevant Options / trainOptions /
// testOptions / lexOptions fields, and advances i past the flag and its
// value (if any). Falling off the end of the chain returns i unchanged,
// which callers interpret as "flag not recognized here".
//
// NOTE(review): several value-taking flags below (e.g. -taggerSerializedFile,
// -outputTreeFormat, -wordClassesFile, -compactGrammar, -markFinalStates,
// -leftToRight, -smoothRules, -wordFunction, -splitCount, and most of the
// dv/qn training flags) read args[i + 1] WITHOUT the (i + 1 < args.length)
// guard used by earlier branches, so a trailing flag with a missing value
// throws ArrayIndexOutOfBoundsException -- confirm whether callers always
// supply the value before relying on this.
if (args[i].equalsIgnoreCase("-PCFG")) {
doDep = false;
doPCFG = true;
i++;
} else if (args[i].equalsIgnoreCase("-dep")) {
doDep = true;
doPCFG = false;
i++;
} else if (args[i].equalsIgnoreCase("-factored")) {
doDep = true;
doPCFG = true;
testOptions.useFastFactored = false;
i++;
} else if (args[i].equalsIgnoreCase("-fastFactored")) {
doDep = true;
doPCFG = true;
testOptions.useFastFactored = true;
i++;
} else if (args[i].equalsIgnoreCase("-noRecoveryTagging")) {
testOptions.noRecoveryTagging = true;
i++;
} else if (args[i].equalsIgnoreCase("-useLexiconToScoreDependencyPwGt")) {
testOptions.useLexiconToScoreDependencyPwGt = true;
i++;
} else if (args[i].equalsIgnoreCase("-useSmoothTagProjection")) {
useSmoothTagProjection = true;
i++;
} else if (args[i].equalsIgnoreCase("-useUnigramWordSmoothing")) {
useUnigramWordSmoothing = true;
i++;
} else if (args[i].equalsIgnoreCase("-useNonProjectiveDependencyParser")) {
testOptions.useNonProjectiveDependencyParser = true;
i++;
} else if (args[i].equalsIgnoreCase("-maxLength") && (i + 1 < args.length)) {
testOptions.maxLength = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-MAX_ITEMS") && (i + 1 < args.length)) {
testOptions.MAX_ITEMS = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-trainLength") && (i + 1 < args.length)) {
// train on only short sentences
trainOptions.trainLengthLimit = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-lengthNormalization")) {
testOptions.lengthNormalization = true;
i++;
} else if (args[i].equalsIgnoreCase("-iterativeCKY")) {
testOptions.iterativeCKY = true;
i++;
} else if (args[i].equalsIgnoreCase("-vMarkov") && (i + 1 < args.length)) {
// Vertical markovization order: 1 = no parent annotation, 2 = parent,
// >= 3 = parent + grandparent annotation.
int order = Integer.parseInt(args[i + 1]);
if (order <= 1) {
trainOptions.PA = false;
trainOptions.gPA = false;
} else if (order == 2) {
trainOptions.PA = true;
trainOptions.gPA = false;
} else if (order >= 3) {
trainOptions.PA = true;
trainOptions.gPA = true;
}
i += 2;
} else if (args[i].equalsIgnoreCase("-vSelSplitCutOff") && (i + 1 < args.length)) {
// A non-positive cutoff disables selective splitting entirely.
trainOptions.selectiveSplitCutOff = Double.parseDouble(args[i + 1]);
trainOptions.selectiveSplit = trainOptions.selectiveSplitCutOff > 0.0;
i += 2;
} else if (args[i].equalsIgnoreCase("-vSelPostSplitCutOff") && (i + 1 < args.length)) {
trainOptions.selectivePostSplitCutOff = Double.parseDouble(args[i + 1]);
trainOptions.selectivePostSplit = trainOptions.selectivePostSplitCutOff > 0.0;
i += 2;
} else if (args[i].equalsIgnoreCase("-deleteSplitters") && (i + 1 < args.length)) {
// Comma-separated list (whitespace around commas tolerated).
String[] toDel = args[i + 1].split(" *, *");
trainOptions.deleteSplitters = Generics.newHashSet(Arrays.asList(toDel));
i += 2;
} else if (args[i].equalsIgnoreCase("-postSplitWithBaseCategory")) {
trainOptions.postSplitWithBaseCategory = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-vPostMarkov") && (i + 1 < args.length)) {
// Same order semantics as -vMarkov, applied to the post-split annotations.
int order = Integer.parseInt(args[i + 1]);
if (order <= 1) {
trainOptions.postPA = false;
trainOptions.postGPA = false;
} else if (order == 2) {
trainOptions.postPA = true;
trainOptions.postGPA = false;
} else if (order >= 3) {
trainOptions.postPA = true;
trainOptions.postGPA = true;
}
i += 2;
} else if (args[i].equalsIgnoreCase("-hMarkov") && (i + 1 < args.length)) {
// Horizontal markovization; a negative order disables markov factoring.
int order = Integer.parseInt(args[i + 1]);
if (order >= 0) {
trainOptions.markovOrder = order;
trainOptions.markovFactor = true;
} else {
trainOptions.markovFactor = false;
}
i += 2;
} else if (args[i].equalsIgnoreCase("-distanceBins") && (i + 1 < args.length)) {
// Only 1 (off), 4 (coarse) and 5 (fine) bin counts are supported.
int numBins = Integer.parseInt(args[i + 1]);
if (numBins <= 1) {
distance = false;
} else if (numBins == 4) {
distance = true;
coarseDistance = true;
} else if (numBins == 5) {
distance = true;
coarseDistance = false;
} else {
throw new IllegalArgumentException("Invalid value for -distanceBin: " + args[i + 1]);
}
i += 2;
} else if (args[i].equalsIgnoreCase("-noStop")) {
genStop = false;
i++;
} else if (args[i].equalsIgnoreCase("-nonDirectional")) {
directional = false;
i++;
} else if (args[i].equalsIgnoreCase("-depWeight") && (i + 1 < args.length)) {
testOptions.depWeight = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-printPCFGkBest") && (i + 1 < args.length)) {
testOptions.printPCFGkBest = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-evalPCFGkBest") && (i + 1 < args.length)) {
testOptions.evalPCFGkBest = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-printFactoredKGood") && (i + 1 < args.length)) {
testOptions.printFactoredKGood = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-smoothTagsThresh") && (i + 1 < args.length)) {
lexOptions.smoothInUnknownsThreshold = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-unseenSmooth") && (i + 1 < args.length)) {
testOptions.unseenSmooth = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-fractionBeforeUnseenCounting") && (i + 1 < args.length)) {
trainOptions.fractionBeforeUnseenCounting = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-hSelSplitThresh") && (i + 1 < args.length)) {
// A non-positive threshold disables horizontal selective splitting.
trainOptions.HSEL_CUT = Integer.parseInt(args[i + 1]);
trainOptions.hSelSplit = trainOptions.HSEL_CUT > 0;
i += 2;
} else if (args[i].equalsIgnoreCase("-nohSelSplit")) {
trainOptions.hSelSplit = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-tagPA")) {
trainOptions.tagPA = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noTagPA")) {
trainOptions.tagPA = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-tagSelSplitCutOff") && (i + 1 < args.length)) {
trainOptions.tagSelectiveSplitCutOff = Double.parseDouble(args[i + 1]);
trainOptions.tagSelectiveSplit = trainOptions.tagSelectiveSplitCutOff > 0.0;
i += 2;
} else if (args[i].equalsIgnoreCase("-tagSelPostSplitCutOff") && (i + 1 < args.length)) {
trainOptions.tagSelectivePostSplitCutOff = Double.parseDouble(args[i + 1]);
trainOptions.tagSelectivePostSplit = trainOptions.tagSelectivePostSplitCutOff > 0.0;
i += 2;
} else if (args[i].equalsIgnoreCase("-noTagSplit")) {
trainOptions.noTagSplit = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-uwm") && (i + 1 < args.length)) {
// "uwm" = unknown word model (signature level).
lexOptions.useUnknownWordSignatures = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-unknownSuffixSize") && (i + 1 < args.length)) {
lexOptions.unknownSuffixSize = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-unknownPrefixSize") && (i + 1 < args.length)) {
lexOptions.unknownPrefixSize = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-uwModelTrainer") && (i + 1 < args.length)) {
lexOptions.uwModelTrainer = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-openClassThreshold") && (i + 1 < args.length)) {
trainOptions.openClassTypesThreshold = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-unary") && i + 1 < args.length) {
trainOptions.markUnary = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-unaryTags")) {
trainOptions.markUnaryTags = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-mutate")) {
lexOptions.smartMutation = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-useUnicodeType")) {
lexOptions.useUnicodeType = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-rightRec")) {
trainOptions.rightRec = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noRightRec")) {
trainOptions.rightRec = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-preTag")) {
testOptions.preTag = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-forceTags")) {
testOptions.forceTags = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-taggerSerializedFile")) {
// NOTE(review): no i + 1 bounds check, unlike most value-taking flags above.
testOptions.taggerSerializedFile = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-forceTagBeginnings")) {
testOptions.forceTagBeginnings = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noFunctionalForcing")) {
testOptions.noFunctionalForcing = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-scTags")) {
// Single-counting of tag->word rewrites (the opposite of -dcTags).
dcTags = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-dcTags")) {
dcTags = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-basicCategoryTagsInDependencyGrammar")) {
trainOptions.basicCategoryTagsInDependencyGrammar = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-evalb")) {
testOptions.evalb = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-v") || args[i].equalsIgnoreCase("-verbose")) {
testOptions.verbose = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-outputFilesDirectory") && i + 1 < args.length) {
testOptions.outputFilesDirectory = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-outputFilesExtension") && i + 1 < args.length) {
testOptions.outputFilesExtension = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-outputFilesPrefix") && i + 1 < args.length) {
testOptions.outputFilesPrefix = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-outputkBestEquivocation") && i + 1 < args.length) {
testOptions.outputkBestEquivocation = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-writeOutputFiles")) {
testOptions.writeOutputFiles = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-printAllBestParses")) {
testOptions.printAllBestParses = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-outputTreeFormat") || args[i].equalsIgnoreCase("-outputFormat")) {
testOptions.outputFormat = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-outputTreeFormatOptions") || args[i].equalsIgnoreCase("-outputFormatOptions")) {
testOptions.outputFormatOptions = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-addMissingFinalPunctuation")) {
testOptions.addMissingFinalPunctuation = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-flexiTag")) {
lexOptions.flexiTag = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-lexiTag")) {
lexOptions.flexiTag = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-useSignatureForKnownSmoothing")) {
lexOptions.useSignatureForKnownSmoothing = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-wordClassesFile")) {
lexOptions.wordClassesFile = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-compactGrammar")) {
trainOptions.compactGrammar = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-markFinalStates")) {
trainOptions.markFinalStates = args[i + 1].equalsIgnoreCase("true");
i += 2;
} else if (args[i].equalsIgnoreCase("-leftToRight")) {
// NOTE(review): uses case-sensitive equals("true") while -markFinalStates
// uses equalsIgnoreCase, so "-leftToRight TRUE" is silently treated as
// false -- confirm whether this asymmetry is intended.
trainOptions.leftToRight = args[i + 1].equals("true");
i += 2;
} else if (args[i].equalsIgnoreCase("-cnf")) {
forceCNF = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-smoothRules")) {
trainOptions.ruleSmoothing = true;
trainOptions.ruleSmoothingAlpha = Double.valueOf(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-nodePrune") && i + 1 < args.length) {
nodePrune = args[i + 1].equalsIgnoreCase("true");
i += 2;
} else if (args[i].equalsIgnoreCase("-noDoRecovery")) {
testOptions.doRecovery = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-acl03chinese")) {
// Preset bundle. Like the other presets below, i is deliberately NOT
// incremented so the flag can also be consumed by language-specific
// option handlers elsewhere.
trainOptions.markovOrder = 1;
trainOptions.markovFactor = true;
// no increment
} else if (args[i].equalsIgnoreCase("-wordFunction")) {
wordFunction = ReflectionLoading.loadByReflection(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-acl03pcfg")) {
// Preset bundle: PCFG-only settings from the ACL'03 paper.
doDep = false;
doPCFG = true;
// lexOptions.smoothInUnknownsThreshold = 30;
trainOptions.markUnary = 1;
trainOptions.PA = true;
trainOptions.gPA = false;
trainOptions.tagPA = true;
trainOptions.tagSelectiveSplit = false;
trainOptions.rightRec = true;
trainOptions.selectiveSplit = true;
trainOptions.selectiveSplitCutOff = 400.0;
trainOptions.markovFactor = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
lexOptions.useUnknownWordSignatures = 2;
lexOptions.flexiTag = true;
// DAN: Tag double-counting is BAD for PCFG-only parsing
dcTags = false;
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-jenny")) {
// Preset bundle: simplified PCFG settings (no markovization/splitting).
doDep = false;
doPCFG = true;
// lexOptions.smoothInUnknownsThreshold = 30;
trainOptions.markUnary = 1;
trainOptions.PA = false;
trainOptions.gPA = false;
trainOptions.tagPA = false;
trainOptions.tagSelectiveSplit = false;
trainOptions.rightRec = true;
trainOptions.selectiveSplit = false;
// trainOptions.selectiveSplitCutOff = 400.0;
trainOptions.markovFactor = false;
// trainOptions.markovOrder = 2;
trainOptions.hSelSplit = false;
lexOptions.useUnknownWordSignatures = 2;
lexOptions.flexiTag = true;
// DAN: Tag double-counting is BAD for PCFG-only parsing
dcTags = false;
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-goodPCFG")) {
// Preset bundle: acl03pcfg plus splitter deletion.
doDep = false;
doPCFG = true;
// op.lexOptions.smoothInUnknownsThreshold = 30;
trainOptions.markUnary = 1;
trainOptions.PA = true;
trainOptions.gPA = false;
trainOptions.tagPA = true;
trainOptions.tagSelectiveSplit = false;
trainOptions.rightRec = true;
trainOptions.selectiveSplit = true;
trainOptions.selectiveSplitCutOff = 400.0;
trainOptions.markovFactor = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
lexOptions.useUnknownWordSignatures = 2;
lexOptions.flexiTag = true;
// DAN: Tag double-counting is BAD for PCFG-only parsing
dcTags = false;
// Reuses the -deleteSplitters branch above; that branch consumes two
// args, so a successful recursive parse must return index 2.
String[] delSplit = { "-deleteSplitters", "VP^NP,VP^VP,VP^SINV,VP^SQ" };
if (this.setOptionFlag(delSplit, 0) != 2) {
log.info("Error processing deleteSplitters");
}
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-linguisticPCFG")) {
// Preset bundle: like acl03pcfg but with more linguistically motivated
// choices (no right recursion marking, richer unknown-word signatures).
doDep = false;
doPCFG = true;
// op.lexOptions.smoothInUnknownsThreshold = 30;
trainOptions.markUnary = 1;
trainOptions.PA = true;
trainOptions.gPA = false;
// on at the moment, but iffy
trainOptions.tagPA = true;
trainOptions.tagSelectiveSplit = false;
// not for linguistic
trainOptions.rightRec = false;
trainOptions.selectiveSplit = true;
trainOptions.selectiveSplitCutOff = 400.0;
trainOptions.markovFactor = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
// different from acl03pcfg
lexOptions.useUnknownWordSignatures = 5;
// different from acl03pcfg
lexOptions.flexiTag = false;
// DAN: Tag double-counting is BAD for PCFG-only parsing
dcTags = false;
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-ijcai03")) {
// Preset bundle: factored-parser settings from the IJCAI'03 paper.
doDep = true;
doPCFG = true;
trainOptions.markUnary = 0;
trainOptions.PA = true;
trainOptions.gPA = false;
trainOptions.tagPA = false;
trainOptions.tagSelectiveSplit = false;
trainOptions.rightRec = false;
trainOptions.selectiveSplit = true;
trainOptions.selectiveSplitCutOff = 300.0;
trainOptions.markovFactor = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
/// cdm: May 2005 compacting bad for factored?
trainOptions.compactGrammar = 0;
lexOptions.useUnknownWordSignatures = 2;
lexOptions.flexiTag = false;
dcTags = true;
// op.nodePrune = true; // cdm: May 2005: this doesn't help
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-goodFactored")) {
// Preset bundle: ijcai03 with richer unknown-word signatures.
doDep = true;
doPCFG = true;
trainOptions.markUnary = 0;
trainOptions.PA = true;
trainOptions.gPA = false;
trainOptions.tagPA = false;
trainOptions.tagSelectiveSplit = false;
trainOptions.rightRec = false;
trainOptions.selectiveSplit = true;
trainOptions.selectiveSplitCutOff = 300.0;
trainOptions.markovFactor = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
/// cdm: May 2005 compacting bad for factored?
trainOptions.compactGrammar = 0;
// different from ijcai03
lexOptions.useUnknownWordSignatures = 5;
lexOptions.flexiTag = false;
dcTags = true;
// op.nodePrune = true; // cdm: May 2005: this doesn't help
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-chineseFactored")) {
// Single counting tag->word rewrite is also much better for Chinese
// Factored. Bracketing F1 goes up about 0.7%.
dcTags = false;
lexOptions.useUnicodeType = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
trainOptions.markovFactor = true;
trainOptions.HSEL_CUT = 50;
// trainOptions.openClassTypesThreshold=1; // so can get unseen punctuation
// trainOptions.fractionBeforeUnseenCounting=0.0; // so can get unseen punctuation
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-arabicFactored")) {
doDep = true;
doPCFG = true;
// "false" seems to help Arabic about 0.1% F1
dcTags = false;
trainOptions.markovFactor = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
// 75 bit better than 50, 100 a bit worse
trainOptions.HSEL_CUT = 75;
trainOptions.PA = true;
trainOptions.gPA = false;
trainOptions.selectiveSplit = true;
trainOptions.selectiveSplitCutOff = 300.0;
// Helps PCFG and marginally factLB
trainOptions.markUnary = 1;
// trainOptions.compactGrammar = 0; // Doesn't seem to help or only 0.05% F1
lexOptions.useUnknownWordSignatures = 9;
lexOptions.unknownPrefixSize = 1;
lexOptions.unknownSuffixSize = 1;
// Arabic sentences are long enough that this helps a fraction
testOptions.MAX_ITEMS = 500000;
// don't increment i so it gets language specific stuff as well
} else if (args[i].equalsIgnoreCase("-frenchFactored")) {
doDep = true;
doPCFG = true;
//wsg2011: Setting to false improves F1 by 0.5%
dcTags = false;
trainOptions.markovFactor = true;
trainOptions.markovOrder = 2;
trainOptions.hSelSplit = true;
trainOptions.HSEL_CUT = 75;
trainOptions.PA = true;
trainOptions.gPA = false;
trainOptions.selectiveSplit = true;
trainOptions.selectiveSplitCutOff = 300.0;
//Unary rule marking bad for french..setting to 0 gives +0.3 F1
trainOptions.markUnary = 0;
lexOptions.useUnknownWordSignatures = 1;
lexOptions.unknownPrefixSize = 1;
lexOptions.unknownSuffixSize = 2;
} else if (args[i].equalsIgnoreCase("-chinesePCFG")) {
trainOptions.markovOrder = 2;
trainOptions.markovFactor = true;
trainOptions.HSEL_CUT = 5;
trainOptions.PA = true;
trainOptions.gPA = true;
trainOptions.selectiveSplit = false;
doDep = false;
doPCFG = true;
// Single counting tag->word rewrite is also much better for Chinese PCFG
// Bracketing F1 is up about 2% and tag accuracy about 1% (exact by 6%)
dcTags = false;
// no increment
} else if (args[i].equalsIgnoreCase("-printTT") && (i + 1 < args.length)) {
trainOptions.printTreeTransformations = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-printAnnotatedRuleCounts")) {
trainOptions.printAnnotatedRuleCounts = true;
i++;
} else if (args[i].equalsIgnoreCase("-printAnnotatedStateCounts")) {
trainOptions.printAnnotatedStateCounts = true;
i++;
} else if (args[i].equalsIgnoreCase("-printAnnotated") && (i + 1 < args.length)) {
// On I/O failure the writer is silently nulled out, disabling the
// printing rather than aborting option processing.
try {
trainOptions.printAnnotatedPW = tlpParams.pw(new FileOutputStream(args[i + 1]));
} catch (IOException ioe) {
trainOptions.printAnnotatedPW = null;
}
i += 2;
} else if (args[i].equalsIgnoreCase("-printBinarized") && (i + 1 < args.length)) {
try {
trainOptions.printBinarizedPW = tlpParams.pw(new FileOutputStream(args[i + 1]));
} catch (IOException ioe) {
trainOptions.printBinarizedPW = null;
}
i += 2;
} else if (args[i].equalsIgnoreCase("-printStates")) {
trainOptions.printStates = true;
i++;
} else if (args[i].equalsIgnoreCase("-preTransformer") && (i + 1 < args.length)) {
// Value is a comma-separated list of TreeTransformer class names; more
// than one is wrapped in a CompositeTreeTransformer applied in order.
String[] classes = args[i + 1].split(",");
i += 2;
if (classes.length == 1) {
trainOptions.preTransformer = ReflectionLoading.loadByReflection(classes[0], this);
} else if (classes.length > 1) {
CompositeTreeTransformer composite = new CompositeTreeTransformer();
trainOptions.preTransformer = composite;
for (String clazz : classes) {
TreeTransformer transformer = ReflectionLoading.loadByReflection(clazz, this);
composite.addTransformer(transformer);
}
}
} else if (args[i].equalsIgnoreCase("-taggedFiles") && (i + 1 < args.length)) {
trainOptions.taggedFiles = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-predictSplits")) {
// This is an experimental (and still in development)
// reimplementation of Berkeley's state splitting grammar.
trainOptions.predictSplits = true;
trainOptions.compactGrammar = 0;
i++;
} else if (args[i].equalsIgnoreCase("-splitCount")) {
trainOptions.splitCount = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-splitRecombineRate")) {
trainOptions.splitRecombineRate = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-trainingThreads") || args[i].equalsIgnoreCase("-nThreads")) {
// -nThreads sets BOTH training and testing thread counts; use
// -testingThreads afterwards to override the latter separately.
trainOptions.trainingThreads = Integer.parseInt(args[i + 1]);
testOptions.testingThreads = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-testingThreads")) {
testOptions.testingThreads = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-evals")) {
testOptions.evals = StringUtils.stringToProperties(args[i + 1], testOptions.evals);
i += 2;
} else if (args[i].equalsIgnoreCase("-fastFactoredCandidateMultiplier")) {
testOptions.fastFactoredCandidateMultiplier = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-fastFactoredCandidateAddend")) {
testOptions.fastFactoredCandidateAddend = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-quietEvaluation")) {
testOptions.quietEvaluation = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noquietEvaluation")) {
testOptions.quietEvaluation = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-simpleBinarizedLabels")) {
trainOptions.simpleBinarizedLabels = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noRebinarization")) {
trainOptions.noRebinarization = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-dvKBest")) {
// Also keeps rerankerKBest in sync with the DV training k-best size.
trainOptions.dvKBest = Integer.parseInt(args[i + 1]);
rerankerKBest = trainOptions.dvKBest;
i += 2;
} else if (args[i].equalsIgnoreCase("-regCost")) {
trainOptions.regCost = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-dvIterations") || args[i].equalsIgnoreCase("-trainingIterations")) {
trainOptions.trainingIterations = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-stalledIterationLimit")) {
trainOptions.stalledIterationLimit = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-dvBatchSize") || args[i].equalsIgnoreCase("-batchSize")) {
trainOptions.batchSize = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-qnIterationsPerBatch")) {
trainOptions.qnIterationsPerBatch = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-qnEstimates")) {
trainOptions.qnEstimates = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-qnTolerance")) {
trainOptions.qnTolerance = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-debugOutputFrequency")) {
trainOptions.debugOutputFrequency = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-maxTrainTimeSeconds")) {
trainOptions.maxTrainTimeSeconds = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-dvSeed") || args[i].equalsIgnoreCase("-randomSeed")) {
trainOptions.randomSeed = Long.parseLong(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-wordVectorFile")) {
lexOptions.wordVectorFile = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-numHid")) {
lexOptions.numHid = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-learningRate")) {
trainOptions.learningRate = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-deltaMargin")) {
trainOptions.deltaMargin = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-unknownNumberVector")) {
trainOptions.unknownNumberVector = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noUnknownNumberVector")) {
trainOptions.unknownNumberVector = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-unknownDashedWordVectors")) {
trainOptions.unknownDashedWordVectors = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noUnknownDashedWordVectors")) {
trainOptions.unknownDashedWordVectors = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-unknownCapsVector")) {
trainOptions.unknownCapsVector = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noUnknownCapsVector")) {
trainOptions.unknownCapsVector = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-unknownChineseYearVector")) {
trainOptions.unknownChineseYearVector = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noUnknownChineseYearVector")) {
trainOptions.unknownChineseYearVector = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-unknownChineseNumberVector")) {
trainOptions.unknownChineseNumberVector = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noUnknownChineseNumberVector")) {
trainOptions.unknownChineseNumberVector = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-unknownChinesePercentVector")) {
trainOptions.unknownChinesePercentVector = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noUnknownChinesePercentVector")) {
trainOptions.unknownChinesePercentVector = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-dvSimplifiedModel")) {
trainOptions.dvSimplifiedModel = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-scalingForInit")) {
trainOptions.scalingForInit = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-rerankerKBest")) {
rerankerKBest = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-baseParserWeight")) {
baseParserWeight = Double.parseDouble(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-unkWord")) {
trainOptions.unkWord = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-lowercaseWordVectors")) {
trainOptions.lowercaseWordVectors = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noLowercaseWordVectors")) {
trainOptions.lowercaseWordVectors = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-transformMatrixType")) {
trainOptions.transformMatrixType = TrainOptions.TransformMatrixType.valueOf(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-useContextWords")) {
trainOptions.useContextWords = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noUseContextWords")) {
trainOptions.useContextWords = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-trainWordVectors")) {
trainOptions.trainWordVectors = true;
i += 1;
} else if (args[i].equalsIgnoreCase("-noTrainWordVectors")) {
trainOptions.trainWordVectors = false;
i += 1;
} else if (args[i].equalsIgnoreCase("-markStrahler")) {
trainOptions.markStrahler = true;
i += 1;
}
// Unrecognized flag: i is returned unchanged.
return i;
}
Use of edu.stanford.nlp.trees.TreeTransformer in the CoreNLP project by stanfordnlp.
From the class CollinsDepEval, method main.
/**
 * Scores a guess treebank against a gold treebank with the Collins
 * head-dependency metric, after transforming both trees with the
 * language pack's collinizer.
 *
 * @param args options (-v verbose, -l language, -g max gold yield,
 *             -y max guess yield) followed by two positional arguments:
 *             goldFile guessFile
 */
public static void main(String[] args) {
  if (args.length < MIN_ARGS) {
    log.info(usage());
    System.exit(-1);
  }
  // Parse flags; positional (flag-less) arguments accumulate under the "" key.
  Properties options = StringUtils.argsToProperties(args, optionArgDefs());
  boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
  Language LANGUAGE = PropertiesUtils.get(options, "l", Language.English, Language.class);
  int MAX_GOLD_YIELD = PropertiesUtils.getInt(options, "g", Integer.MAX_VALUE);
  int MAX_GUESS_YIELD = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
  String[] parsedArgs = options.getProperty("", "").split("\\s+");
  if (parsedArgs.length != MIN_ARGS) {
    log.info(usage());
    System.exit(-1);
  }
  File goldFile = new File(parsedArgs[0]);
  File guessFile = new File(parsedArgs[1]);
  final TreebankLangParserParams tlpp = LANGUAGE.params;
  final PrintWriter pwOut = tlpp.pw();
  // Load both treebanks from disk and print their summaries.
  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());
  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());
  final CollinsDepEval depEval = new CollinsDepEval("CollinsDep", true, tlpp.headFinder(), tlpp.treebankLanguagePack().startSymbol());
  final TreeTransformer tc = tlpp.collinizer();
  //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
  //don't match, we need to keep looking for the next gold tree that matches.
  //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
  //status as follows:
  //
  //  0 - Ok (yields match)
  //  1 - length mismatch
  //  2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  int goldLineId = 0;
  int skippedGuessTrees = 0;
  for (final Tree guess : guessTreebank) {
    // NOTE(review): the guess tree is collinized before the yield-length check
    // below, so over-long guesses are transformed and then discarded anyway.
    final Tree evalGuess = tc.transformTree(guess);
    if (guess.yield().size() > MAX_GUESS_YIELD) {
      skippedGuessTrees++;
      continue;
    }
    boolean doneEval = false;
    // The gold iterator is shared across guess trees, so each gold tree is
    // consumed at most once; we advance it until a scorable gold tree appears
    // or this guess tree is abandoned on a yield mismatch.
    while (goldItr.hasNext() && !doneEval) {
      final Tree gold = goldItr.next();
      final Tree evalGold = tc.transformTree(gold);
      goldLineId++;
      if (gold.yield().size() > MAX_GOLD_YIELD) {
        // Over-long gold tree: drop it and try the next gold tree.
        continue;
      } else if (evalGold.yield().size() != evalGuess.yield().size()) {
        pwOut.println("Yield mismatch at gold line " + goldLineId);
        skippedGuessTrees++;
        //Default evalb behavior -- skip this guess tree
        break;
      }
      // Yields match after collinization: score the pair (verbose output only
      // when -v was given).
      depEval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
      //Move to the next guess parse
      doneEval = true;
    }
  }
  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0)
    pwOut.printf("%s %d guess trees\n", ((MAX_GUESS_YIELD < Integer.MAX_VALUE) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
  depEval.display(true, pwOut);
  pwOut.close();
}
Use of edu.stanford.nlp.trees.TreeTransformer in the CoreNLP project by stanfordnlp.
The main method of the class LeafAncestorEval.
/**
 * Runs the leaf-ancestor metric over a guess/gold treebank pair read in
 * lockstep. A pair is scored only when the (untransformed) yields have
 * equal length; both trees are collinized before scoring.
 *
 * Execute with no arguments for usage.
 */
public static void main(String[] args) {
  if (!validateCommandLine(args)) {
    log.info(USAGE);
    System.exit(-1);
  }
  final TreebankLangParserParams tlpp = LANGUAGE.params;
  final PrintWriter pwOut = tlpp.pw();
  // Load both treebanks from disk and print their summaries.
  final Treebank guessTreebank = tlpp.diskTreebank();
  guessTreebank.loadPath(guessFile);
  pwOut.println("GUESS TREEBANK:");
  pwOut.println(guessTreebank.textualSummary());
  final Treebank goldTreebank = tlpp.diskTreebank();
  goldTreebank.loadPath(goldFile);
  pwOut.println("GOLD TREEBANK:");
  pwOut.println(goldTreebank.textualSummary());
  final LeafAncestorEval metric = new LeafAncestorEval("LeafAncestor");
  final TreeTransformer tc = tlpp.collinizer();
  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //  0 - Ok (yields match)
  //  1 - length mismatch
  //  2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  final Iterator<Tree> goldItr = goldTreebank.iterator();
  final Iterator<Tree> guessItr = guessTreebank.iterator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;
  while (guessItr.hasNext() && goldItr.hasNext()) {
    Tree guessTree = guessItr.next();
    List<Label> guessYield = guessTree.yield();
    guessLineId++;
    Tree goldTree = goldItr.next();
    List<Label> goldYield = goldTree.yield();
    goldLineId++;
    // Check that we should evaluate this tree
    if (goldYield.size() > MAX_GOLD_YIELD) {
      skippedGuessTrees++;
      continue;
    }
    // Only trees with equal yields can be evaluated
    if (goldYield.size() != guessYield.size()) {
      pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }
    // Collinize only the pairs that will actually be scored.
    final Tree evalGuess = tc.transformTree(guessTree);
    final Tree evalGold = tc.transformTree(goldTree);
    metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
  }
  if (guessItr.hasNext() || goldItr.hasNext()) {
    // FIX: the period was previously placed after %n, so it printed on the
    // line following the message. It now ends the sentence before the newline.
    System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
  }
  pwOut.println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.display(true, pwOut);
  pwOut.close();
}
Use of edu.stanford.nlp.trees.TreeTransformer in the CoreNLP project by stanfordnlp.
The main method of the class ATBCorrector.
//For those trees that lack a sentence-final punc, add one.
// ("/^[^\\.!\\?]$/ >>- (__ > @ROOT <- __=loc) <: __\n"
// + "insert (PUNC .) $- loc\n"
// + "\n");
/**
 * Reads Arabic trees from the given file, applies the ATBCorrector
 * transformations to each tree, and writes the corrected trees to stdout.
 * A count of the trees written is reported on stderr.
 *
 * @param args a single argument: the treebank filename
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.info("Usage: java " + ATBCorrector.class.getName() + " filename\n");
    System.exit(-1);
  }
  TreeTransformer tt = new ATBCorrector();
  File f = new File(args[0]);
  // FIX: try-with-resources closes the reader even when readTree() throws;
  // the original only closed it on the success path (resource leak).
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"))) {
    TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      Tree fixedT = tt.transformTree(t);
      System.out.println(fixedT.toString());
    }
    tr.close();
    System.err.printf("Wrote %d trees%n", nTrees);
  } catch (IOException e) {
    // UnsupportedEncodingException and FileNotFoundException are both
    // IOException subclasses, so this single handler preserves the original
    // behavior (print the stack trace) for all three cases.
    e.printStackTrace();
  }
}
Aggregations