Example use of edu.stanford.nlp.trees.CompositeTreeTransformer in the CoreNLP project by stanfordnlp.
From the class Options, method setOptionFlag.
/**
 * Set an option in this object, based on a String array in the style of
 * commandline flags. The option is only processed with respect to
 * options directly known by the Options object.
 * Some options (there are many others; see the source code):
 * <ul>
 * <li> <code>-maxLength n</code> set the maximum length sentence to parse (inclusively)
 * <li> <code>-printTT n</code> takes an integer argument that is stored in
 *      <code>trainOptions.printTreeTransformations</code>; used to print the training trees
 *      in raw, annotated, and annotated+binarized form. Useful for debugging and other miscellany.
 * <li> <code>-printAnnotated filename</code> use only in conjunction with -printTT. Redirects printing of annotated training trees to <code>filename</code>.
 * <li> <code>-forceTags</code> when the parser is tested against a set of gold standard trees, use the tagged yield, instead of just the yield, as input.
 * </ul>
 * <p>
 * Flag names are matched case-insensitively. A flag that requires a value but
 * is the last element of {@code args} is treated as not processed (the
 * index is returned unchanged) rather than reading past the end of the array.
 * <p>
 * The "preset" flags (<code>-acl03pcfg</code>, <code>-goodPCFG</code>,
 * <code>-ijcai03</code>, <code>-chineseFactored</code>, ...) deliberately set their
 * fields but return {@code i} unchanged, so that the caller can additionally offer
 * the same flag to language-specific option handling.
 *
 * @param args An array of options arguments, command-line style. E.g. {"-maxLength", "50"}.
 * @param i The index in args to start at when processing an option
 * @return The index in args of the position after the last element used in
 * processing this option, or the value i unchanged if a valid option couldn't
 * be processed starting at position i.
 * @throws NumberFormatException if a numeric option value cannot be parsed
 * @throws IllegalArgumentException if <code>-distanceBins</code> is given an unsupported value
 */
protected int setOptionFlag(String[] args, int i) {
  final String flag = args[i];
  // Every value-taking flag below is guarded by hasValue, so `value` is only
  // read when args[i + 1] actually exists.
  final boolean hasValue = i + 1 < args.length;
  final String value = hasValue ? args[i + 1] : null;

  if (flag.equalsIgnoreCase("-PCFG")) {
    doDep = false;
    doPCFG = true;
    i++;
  } else if (flag.equalsIgnoreCase("-dep")) {
    doDep = true;
    doPCFG = false;
    i++;
  } else if (flag.equalsIgnoreCase("-factored")) {
    doDep = true;
    doPCFG = true;
    testOptions.useFastFactored = false;
    i++;
  } else if (flag.equalsIgnoreCase("-fastFactored")) {
    doDep = true;
    doPCFG = true;
    testOptions.useFastFactored = true;
    i++;
  } else if (flag.equalsIgnoreCase("-noRecoveryTagging")) {
    testOptions.noRecoveryTagging = true;
    i++;
  } else if (flag.equalsIgnoreCase("-useLexiconToScoreDependencyPwGt")) {
    testOptions.useLexiconToScoreDependencyPwGt = true;
    i++;
  } else if (flag.equalsIgnoreCase("-useSmoothTagProjection")) {
    useSmoothTagProjection = true;
    i++;
  } else if (flag.equalsIgnoreCase("-useUnigramWordSmoothing")) {
    useUnigramWordSmoothing = true;
    i++;
  } else if (flag.equalsIgnoreCase("-useNonProjectiveDependencyParser")) {
    testOptions.useNonProjectiveDependencyParser = true;
    i++;
  } else if (flag.equalsIgnoreCase("-maxLength") && hasValue) {
    testOptions.maxLength = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-MAX_ITEMS") && hasValue) {
    testOptions.MAX_ITEMS = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-trainLength") && hasValue) {
    // train on only short sentences
    trainOptions.trainLengthLimit = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-lengthNormalization")) {
    testOptions.lengthNormalization = true;
    i++;
  } else if (flag.equalsIgnoreCase("-iterativeCKY")) {
    testOptions.iterativeCKY = true;
    i++;
  } else if (flag.equalsIgnoreCase("-vMarkov") && hasValue) {
    // order <= 1: neither PA nor gPA; order 2: PA only; order >= 3: both
    int order = Integer.parseInt(value);
    if (order <= 1) {
      trainOptions.PA = false;
      trainOptions.gPA = false;
    } else if (order == 2) {
      trainOptions.PA = true;
      trainOptions.gPA = false;
    } else {
      trainOptions.PA = true;
      trainOptions.gPA = true;
    }
    i += 2;
  } else if (flag.equalsIgnoreCase("-vSelSplitCutOff") && hasValue) {
    trainOptions.selectiveSplitCutOff = Double.parseDouble(value);
    trainOptions.selectiveSplit = trainOptions.selectiveSplitCutOff > 0.0;
    i += 2;
  } else if (flag.equalsIgnoreCase("-vSelPostSplitCutOff") && hasValue) {
    trainOptions.selectivePostSplitCutOff = Double.parseDouble(value);
    trainOptions.selectivePostSplit = trainOptions.selectivePostSplitCutOff > 0.0;
    i += 2;
  } else if (flag.equalsIgnoreCase("-deleteSplitters") && hasValue) {
    // comma-separated list; spaces around the commas are dropped
    String[] toDel = value.split(" *, *");
    trainOptions.deleteSplitters = Generics.newHashSet(Arrays.asList(toDel));
    i += 2;
  } else if (flag.equalsIgnoreCase("-postSplitWithBaseCategory")) {
    trainOptions.postSplitWithBaseCategory = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-vPostMarkov") && hasValue) {
    // same order mapping as -vMarkov, applied to the post-split fields
    int order = Integer.parseInt(value);
    if (order <= 1) {
      trainOptions.postPA = false;
      trainOptions.postGPA = false;
    } else if (order == 2) {
      trainOptions.postPA = true;
      trainOptions.postGPA = false;
    } else {
      trainOptions.postPA = true;
      trainOptions.postGPA = true;
    }
    i += 2;
  } else if (flag.equalsIgnoreCase("-hMarkov") && hasValue) {
    // a negative order switches markov factoring off and leaves markovOrder untouched
    int order = Integer.parseInt(value);
    if (order >= 0) {
      trainOptions.markovOrder = order;
      trainOptions.markovFactor = true;
    } else {
      trainOptions.markovFactor = false;
    }
    i += 2;
  } else if (flag.equalsIgnoreCase("-distanceBins") && hasValue) {
    // only <= 1 (off), 4 (coarse) and 5 (fine) are accepted
    int numBins = Integer.parseInt(value);
    if (numBins <= 1) {
      distance = false;
    } else if (numBins == 4) {
      distance = true;
      coarseDistance = true;
    } else if (numBins == 5) {
      distance = true;
      coarseDistance = false;
    } else {
      throw new IllegalArgumentException("Invalid value for -distanceBins: " + value);
    }
    i += 2;
  } else if (flag.equalsIgnoreCase("-noStop")) {
    genStop = false;
    i++;
  } else if (flag.equalsIgnoreCase("-nonDirectional")) {
    directional = false;
    i++;
  } else if (flag.equalsIgnoreCase("-depWeight") && hasValue) {
    testOptions.depWeight = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-printPCFGkBest") && hasValue) {
    testOptions.printPCFGkBest = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-evalPCFGkBest") && hasValue) {
    testOptions.evalPCFGkBest = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-printFactoredKGood") && hasValue) {
    testOptions.printFactoredKGood = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-smoothTagsThresh") && hasValue) {
    lexOptions.smoothInUnknownsThreshold = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-unseenSmooth") && hasValue) {
    testOptions.unseenSmooth = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-fractionBeforeUnseenCounting") && hasValue) {
    trainOptions.fractionBeforeUnseenCounting = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-hSelSplitThresh") && hasValue) {
    trainOptions.HSEL_CUT = Integer.parseInt(value);
    trainOptions.hSelSplit = trainOptions.HSEL_CUT > 0;
    i += 2;
  } else if (flag.equalsIgnoreCase("-nohSelSplit")) {
    trainOptions.hSelSplit = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-tagPA")) {
    trainOptions.tagPA = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noTagPA")) {
    trainOptions.tagPA = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-tagSelSplitCutOff") && hasValue) {
    trainOptions.tagSelectiveSplitCutOff = Double.parseDouble(value);
    trainOptions.tagSelectiveSplit = trainOptions.tagSelectiveSplitCutOff > 0.0;
    i += 2;
  } else if (flag.equalsIgnoreCase("-tagSelPostSplitCutOff") && hasValue) {
    trainOptions.tagSelectivePostSplitCutOff = Double.parseDouble(value);
    trainOptions.tagSelectivePostSplit = trainOptions.tagSelectivePostSplitCutOff > 0.0;
    i += 2;
  } else if (flag.equalsIgnoreCase("-noTagSplit")) {
    trainOptions.noTagSplit = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-uwm") && hasValue) {
    lexOptions.useUnknownWordSignatures = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-unknownSuffixSize") && hasValue) {
    lexOptions.unknownSuffixSize = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-unknownPrefixSize") && hasValue) {
    lexOptions.unknownPrefixSize = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-uwModelTrainer") && hasValue) {
    lexOptions.uwModelTrainer = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-openClassThreshold") && hasValue) {
    trainOptions.openClassTypesThreshold = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-unary") && hasValue) {
    trainOptions.markUnary = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-unaryTags")) {
    trainOptions.markUnaryTags = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-mutate")) {
    lexOptions.smartMutation = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-useUnicodeType")) {
    lexOptions.useUnicodeType = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-rightRec")) {
    trainOptions.rightRec = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noRightRec")) {
    trainOptions.rightRec = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-preTag")) {
    testOptions.preTag = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-forceTags")) {
    testOptions.forceTags = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-taggerSerializedFile") && hasValue) {
    testOptions.taggerSerializedFile = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-forceTagBeginnings")) {
    testOptions.forceTagBeginnings = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noFunctionalForcing")) {
    testOptions.noFunctionalForcing = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-scTags")) {
    dcTags = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-dcTags")) {
    dcTags = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-basicCategoryTagsInDependencyGrammar")) {
    trainOptions.basicCategoryTagsInDependencyGrammar = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-evalb")) {
    testOptions.evalb = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-v") || flag.equalsIgnoreCase("-verbose")) {
    testOptions.verbose = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-outputFilesDirectory") && hasValue) {
    testOptions.outputFilesDirectory = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-outputFilesExtension") && hasValue) {
    testOptions.outputFilesExtension = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-outputFilesPrefix") && hasValue) {
    testOptions.outputFilesPrefix = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-outputkBestEquivocation") && hasValue) {
    testOptions.outputkBestEquivocation = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-writeOutputFiles")) {
    testOptions.writeOutputFiles = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-printAllBestParses")) {
    testOptions.printAllBestParses = true;
    i += 1;
  } else if ((flag.equalsIgnoreCase("-outputTreeFormat") || flag.equalsIgnoreCase("-outputFormat")) && hasValue) {
    testOptions.outputFormat = value;
    i += 2;
  } else if ((flag.equalsIgnoreCase("-outputTreeFormatOptions") || flag.equalsIgnoreCase("-outputFormatOptions")) && hasValue) {
    testOptions.outputFormatOptions = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-addMissingFinalPunctuation")) {
    testOptions.addMissingFinalPunctuation = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-flexiTag")) {
    lexOptions.flexiTag = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-lexiTag")) {
    lexOptions.flexiTag = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-useSignatureForKnownSmoothing")) {
    lexOptions.useSignatureForKnownSmoothing = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-wordClassesFile") && hasValue) {
    lexOptions.wordClassesFile = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-compactGrammar") && hasValue) {
    trainOptions.compactGrammar = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-markFinalStates") && hasValue) {
    trainOptions.markFinalStates = value.equalsIgnoreCase("true");
    i += 2;
  } else if (flag.equalsIgnoreCase("-leftToRight") && hasValue) {
    // NOTE: unlike -markFinalStates and -nodePrune, this value is compared
    // case-sensitively; kept as is so existing invocations behave the same.
    trainOptions.leftToRight = value.equals("true");
    i += 2;
  } else if (flag.equalsIgnoreCase("-cnf")) {
    forceCNF = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-smoothRules") && hasValue) {
    trainOptions.ruleSmoothing = true;
    trainOptions.ruleSmoothingAlpha = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-nodePrune") && hasValue) {
    nodePrune = value.equalsIgnoreCase("true");
    i += 2;
  } else if (flag.equalsIgnoreCase("-noDoRecovery")) {
    testOptions.doRecovery = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-acl03chinese")) {
    trainOptions.markovOrder = 1;
    trainOptions.markovFactor = true;
    // no increment
  } else if (flag.equalsIgnoreCase("-wordFunction") && hasValue) {
    wordFunction = ReflectionLoading.loadByReflection(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-acl03pcfg")) {
    doDep = false;
    doPCFG = true;
    // lexOptions.smoothInUnknownsThreshold = 30;
    trainOptions.markUnary = 1;
    trainOptions.PA = true;
    trainOptions.gPA = false;
    trainOptions.tagPA = true;
    trainOptions.tagSelectiveSplit = false;
    trainOptions.rightRec = true;
    trainOptions.selectiveSplit = true;
    trainOptions.selectiveSplitCutOff = 400.0;
    trainOptions.markovFactor = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    lexOptions.useUnknownWordSignatures = 2;
    lexOptions.flexiTag = true;
    // DAN: Tag double-counting is BAD for PCFG-only parsing
    dcTags = false;
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-jenny")) {
    doDep = false;
    doPCFG = true;
    // lexOptions.smoothInUnknownsThreshold = 30;
    trainOptions.markUnary = 1;
    trainOptions.PA = false;
    trainOptions.gPA = false;
    trainOptions.tagPA = false;
    trainOptions.tagSelectiveSplit = false;
    trainOptions.rightRec = true;
    trainOptions.selectiveSplit = false;
    // trainOptions.selectiveSplitCutOff = 400.0;
    trainOptions.markovFactor = false;
    // trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = false;
    lexOptions.useUnknownWordSignatures = 2;
    lexOptions.flexiTag = true;
    // DAN: Tag double-counting is BAD for PCFG-only parsing
    dcTags = false;
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-goodPCFG")) {
    doDep = false;
    doPCFG = true;
    // op.lexOptions.smoothInUnknownsThreshold = 30;
    trainOptions.markUnary = 1;
    trainOptions.PA = true;
    trainOptions.gPA = false;
    trainOptions.tagPA = true;
    trainOptions.tagSelectiveSplit = false;
    trainOptions.rightRec = true;
    trainOptions.selectiveSplit = true;
    trainOptions.selectiveSplitCutOff = 400.0;
    trainOptions.markovFactor = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    lexOptions.useUnknownWordSignatures = 2;
    lexOptions.flexiTag = true;
    // DAN: Tag double-counting is BAD for PCFG-only parsing
    dcTags = false;
    // Reuse the -deleteSplitters handling above via a synthetic two-element argument array.
    String[] delSplit = { "-deleteSplitters", "VP^NP,VP^VP,VP^SINV,VP^SQ" };
    if (this.setOptionFlag(delSplit, 0) != 2) {
      log.info("Error processing deleteSplitters");
    }
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-linguisticPCFG")) {
    doDep = false;
    doPCFG = true;
    // op.lexOptions.smoothInUnknownsThreshold = 30;
    trainOptions.markUnary = 1;
    trainOptions.PA = true;
    trainOptions.gPA = false;
    // on at the moment, but iffy
    trainOptions.tagPA = true;
    trainOptions.tagSelectiveSplit = false;
    // not for linguistic
    trainOptions.rightRec = false;
    trainOptions.selectiveSplit = true;
    trainOptions.selectiveSplitCutOff = 400.0;
    trainOptions.markovFactor = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    // different from acl03pcfg
    lexOptions.useUnknownWordSignatures = 5;
    // different from acl03pcfg
    lexOptions.flexiTag = false;
    // DAN: Tag double-counting is BAD for PCFG-only parsing
    dcTags = false;
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-ijcai03")) {
    doDep = true;
    doPCFG = true;
    trainOptions.markUnary = 0;
    trainOptions.PA = true;
    trainOptions.gPA = false;
    trainOptions.tagPA = false;
    trainOptions.tagSelectiveSplit = false;
    trainOptions.rightRec = false;
    trainOptions.selectiveSplit = true;
    trainOptions.selectiveSplitCutOff = 300.0;
    trainOptions.markovFactor = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    /// cdm: May 2005 compacting bad for factored?
    trainOptions.compactGrammar = 0;
    lexOptions.useUnknownWordSignatures = 2;
    lexOptions.flexiTag = false;
    dcTags = true;
    // op.nodePrune = true; // cdm: May 2005: this doesn't help
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-goodFactored")) {
    doDep = true;
    doPCFG = true;
    trainOptions.markUnary = 0;
    trainOptions.PA = true;
    trainOptions.gPA = false;
    trainOptions.tagPA = false;
    trainOptions.tagSelectiveSplit = false;
    trainOptions.rightRec = false;
    trainOptions.selectiveSplit = true;
    trainOptions.selectiveSplitCutOff = 300.0;
    trainOptions.markovFactor = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    /// cdm: May 2005 compacting bad for factored?
    trainOptions.compactGrammar = 0;
    // different from ijcai03
    lexOptions.useUnknownWordSignatures = 5;
    lexOptions.flexiTag = false;
    dcTags = true;
    // op.nodePrune = true; // cdm: May 2005: this doesn't help
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-chineseFactored")) {
    // Single counting tag->word rewrite is also much better for Chinese
    // Factored. Bracketing F1 goes up about 0.7%.
    dcTags = false;
    lexOptions.useUnicodeType = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    trainOptions.markovFactor = true;
    trainOptions.HSEL_CUT = 50;
    // trainOptions.openClassTypesThreshold=1; // so can get unseen punctuation
    // trainOptions.fractionBeforeUnseenCounting=0.0; // so can get unseen punctuation
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-arabicFactored")) {
    doDep = true;
    doPCFG = true;
    // "false" seems to help Arabic about 0.1% F1
    dcTags = false;
    trainOptions.markovFactor = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    // 75 bit better than 50, 100 a bit worse
    trainOptions.HSEL_CUT = 75;
    trainOptions.PA = true;
    trainOptions.gPA = false;
    trainOptions.selectiveSplit = true;
    trainOptions.selectiveSplitCutOff = 300.0;
    // Helps PCFG and marginally factLB
    trainOptions.markUnary = 1;
    // trainOptions.compactGrammar = 0; // Doesn't seem to help or only 0.05% F1
    lexOptions.useUnknownWordSignatures = 9;
    lexOptions.unknownPrefixSize = 1;
    lexOptions.unknownSuffixSize = 1;
    // Arabic sentences are long enough that this helps a fraction
    testOptions.MAX_ITEMS = 500000;
    // don't increment i so it gets language specific stuff as well
  } else if (flag.equalsIgnoreCase("-frenchFactored")) {
    doDep = true;
    doPCFG = true;
    //wsg2011: Setting to false improves F1 by 0.5%
    dcTags = false;
    trainOptions.markovFactor = true;
    trainOptions.markovOrder = 2;
    trainOptions.hSelSplit = true;
    trainOptions.HSEL_CUT = 75;
    trainOptions.PA = true;
    trainOptions.gPA = false;
    trainOptions.selectiveSplit = true;
    trainOptions.selectiveSplitCutOff = 300.0;
    //Unary rule marking bad for french..setting to 0 gives +0.3 F1
    trainOptions.markUnary = 0;
    lexOptions.useUnknownWordSignatures = 1;
    lexOptions.unknownPrefixSize = 1;
    lexOptions.unknownSuffixSize = 2;
    // no increment (same as the other presets)
  } else if (flag.equalsIgnoreCase("-chinesePCFG")) {
    trainOptions.markovOrder = 2;
    trainOptions.markovFactor = true;
    trainOptions.HSEL_CUT = 5;
    trainOptions.PA = true;
    trainOptions.gPA = true;
    trainOptions.selectiveSplit = false;
    doDep = false;
    doPCFG = true;
    // Single counting tag->word rewrite is also much better for Chinese PCFG
    // Bracketing F1 is up about 2% and tag accuracy about 1% (exact by 6%)
    dcTags = false;
    // no increment
  } else if (flag.equalsIgnoreCase("-printTT") && hasValue) {
    trainOptions.printTreeTransformations = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-printAnnotatedRuleCounts")) {
    trainOptions.printAnnotatedRuleCounts = true;
    i++;
  } else if (flag.equalsIgnoreCase("-printAnnotatedStateCounts")) {
    trainOptions.printAnnotatedStateCounts = true;
    i++;
  } else if (flag.equalsIgnoreCase("-printAnnotated") && hasValue) {
    try {
      trainOptions.printAnnotatedPW = tlpParams.pw(new FileOutputStream(value));
    } catch (IOException ioe) {
      // Best effort: fall back to no writer, but report why instead of failing silently.
      log.info("Could not open file for -printAnnotated: " + value + " (" + ioe + ")");
      trainOptions.printAnnotatedPW = null;
    }
    i += 2;
  } else if (flag.equalsIgnoreCase("-printBinarized") && hasValue) {
    try {
      trainOptions.printBinarizedPW = tlpParams.pw(new FileOutputStream(value));
    } catch (IOException ioe) {
      // Best effort: fall back to no writer, but report why instead of failing silently.
      log.info("Could not open file for -printBinarized: " + value + " (" + ioe + ")");
      trainOptions.printBinarizedPW = null;
    }
    i += 2;
  } else if (flag.equalsIgnoreCase("-printStates")) {
    trainOptions.printStates = true;
    i++;
  } else if (flag.equalsIgnoreCase("-preTransformer") && hasValue) {
    // One class name is loaded directly; several comma-separated names are
    // chained, in the order given, inside a CompositeTreeTransformer.
    String[] classes = value.split(",");
    i += 2;
    if (classes.length == 1) {
      trainOptions.preTransformer = ReflectionLoading.loadByReflection(classes[0], this);
    } else if (classes.length > 1) {
      CompositeTreeTransformer composite = new CompositeTreeTransformer();
      trainOptions.preTransformer = composite;
      for (String clazz : classes) {
        TreeTransformer transformer = ReflectionLoading.loadByReflection(clazz, this);
        composite.addTransformer(transformer);
      }
    }
  } else if (flag.equalsIgnoreCase("-taggedFiles") && hasValue) {
    trainOptions.taggedFiles = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-predictSplits")) {
    // This is an experimental (and still in development)
    // reimplementation of Berkeley's state splitting grammar.
    trainOptions.predictSplits = true;
    trainOptions.compactGrammar = 0;
    i++;
  } else if (flag.equalsIgnoreCase("-splitCount") && hasValue) {
    trainOptions.splitCount = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-splitRecombineRate") && hasValue) {
    trainOptions.splitRecombineRate = Double.parseDouble(value);
    i += 2;
  } else if ((flag.equalsIgnoreCase("-trainingThreads") || flag.equalsIgnoreCase("-nThreads")) && hasValue) {
    // one value configures both the training and the testing thread counts
    int threads = Integer.parseInt(value);
    trainOptions.trainingThreads = threads;
    testOptions.testingThreads = threads;
    i += 2;
  } else if (flag.equalsIgnoreCase("-testingThreads") && hasValue) {
    testOptions.testingThreads = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-evals") && hasValue) {
    testOptions.evals = StringUtils.stringToProperties(value, testOptions.evals);
    i += 2;
  } else if (flag.equalsIgnoreCase("-fastFactoredCandidateMultiplier") && hasValue) {
    testOptions.fastFactoredCandidateMultiplier = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-fastFactoredCandidateAddend") && hasValue) {
    testOptions.fastFactoredCandidateAddend = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-quietEvaluation")) {
    testOptions.quietEvaluation = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noquietEvaluation")) {
    testOptions.quietEvaluation = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-simpleBinarizedLabels")) {
    trainOptions.simpleBinarizedLabels = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noRebinarization")) {
    trainOptions.noRebinarization = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-dvKBest") && hasValue) {
    // also overwrites rerankerKBest with the same value
    trainOptions.dvKBest = Integer.parseInt(value);
    rerankerKBest = trainOptions.dvKBest;
    i += 2;
  } else if (flag.equalsIgnoreCase("-regCost") && hasValue) {
    trainOptions.regCost = Double.parseDouble(value);
    i += 2;
  } else if ((flag.equalsIgnoreCase("-dvIterations") || flag.equalsIgnoreCase("-trainingIterations")) && hasValue) {
    trainOptions.trainingIterations = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-stalledIterationLimit") && hasValue) {
    trainOptions.stalledIterationLimit = Integer.parseInt(value);
    i += 2;
  } else if ((flag.equalsIgnoreCase("-dvBatchSize") || flag.equalsIgnoreCase("-batchSize")) && hasValue) {
    trainOptions.batchSize = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-qnIterationsPerBatch") && hasValue) {
    trainOptions.qnIterationsPerBatch = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-qnEstimates") && hasValue) {
    trainOptions.qnEstimates = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-qnTolerance") && hasValue) {
    trainOptions.qnTolerance = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-debugOutputFrequency") && hasValue) {
    trainOptions.debugOutputFrequency = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-maxTrainTimeSeconds") && hasValue) {
    trainOptions.maxTrainTimeSeconds = Integer.parseInt(value);
    i += 2;
  } else if ((flag.equalsIgnoreCase("-dvSeed") || flag.equalsIgnoreCase("-randomSeed")) && hasValue) {
    trainOptions.randomSeed = Long.parseLong(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-wordVectorFile") && hasValue) {
    lexOptions.wordVectorFile = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-numHid") && hasValue) {
    lexOptions.numHid = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-learningRate") && hasValue) {
    trainOptions.learningRate = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-deltaMargin") && hasValue) {
    trainOptions.deltaMargin = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-unknownNumberVector")) {
    trainOptions.unknownNumberVector = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noUnknownNumberVector")) {
    trainOptions.unknownNumberVector = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-unknownDashedWordVectors")) {
    trainOptions.unknownDashedWordVectors = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noUnknownDashedWordVectors")) {
    trainOptions.unknownDashedWordVectors = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-unknownCapsVector")) {
    trainOptions.unknownCapsVector = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noUnknownCapsVector")) {
    trainOptions.unknownCapsVector = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-unknownChineseYearVector")) {
    trainOptions.unknownChineseYearVector = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noUnknownChineseYearVector")) {
    trainOptions.unknownChineseYearVector = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-unknownChineseNumberVector")) {
    trainOptions.unknownChineseNumberVector = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noUnknownChineseNumberVector")) {
    trainOptions.unknownChineseNumberVector = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-unknownChinesePercentVector")) {
    trainOptions.unknownChinesePercentVector = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noUnknownChinesePercentVector")) {
    trainOptions.unknownChinesePercentVector = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-dvSimplifiedModel")) {
    trainOptions.dvSimplifiedModel = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-scalingForInit") && hasValue) {
    trainOptions.scalingForInit = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-rerankerKBest") && hasValue) {
    rerankerKBest = Integer.parseInt(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-baseParserWeight") && hasValue) {
    baseParserWeight = Double.parseDouble(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-unkWord") && hasValue) {
    trainOptions.unkWord = value;
    i += 2;
  } else if (flag.equalsIgnoreCase("-lowercaseWordVectors")) {
    trainOptions.lowercaseWordVectors = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noLowercaseWordVectors")) {
    trainOptions.lowercaseWordVectors = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-transformMatrixType") && hasValue) {
    trainOptions.transformMatrixType = TrainOptions.TransformMatrixType.valueOf(value);
    i += 2;
  } else if (flag.equalsIgnoreCase("-useContextWords")) {
    trainOptions.useContextWords = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noUseContextWords")) {
    trainOptions.useContextWords = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-trainWordVectors")) {
    trainOptions.trainWordVectors = true;
    i += 1;
  } else if (flag.equalsIgnoreCase("-noTrainWordVectors")) {
    trainOptions.trainWordVectors = false;
    i += 1;
  } else if (flag.equalsIgnoreCase("-markStrahler")) {
    trainOptions.markStrahler = true;
    i += 1;
  }
  return i;
}
Example use of edu.stanford.nlp.trees.CompositeTreeTransformer in the CoreNLP project by stanfordnlp.
From the class ShiftReduceParser, method binarizeTreebank.
/**
 * Runs every tree of a treebank through a binarizer followed by a
 * {@code BasicCategoryTreeTransformer}, then converts the labels to core labels,
 * percolates head annotations and indexes the leaves.
 *
 * @param treebank the trees to transform; the transformed treebank returned by
 *                 {@code treebank.transform(...)} is what gets iterated
 * @param op       supplies the head finder and language pack used to build the transformers
 * @return the processed trees, in treebank iteration order
 */
public static List<Tree> binarizeTreebank(Treebank treebank, Options op) {
  // Build the two-stage pipeline: binarize first, then apply the basic-category transformer.
  TreeBinarizer treeBinarizer = TreeBinarizer.simpleTreeBinarizer(op.tlpParams.headFinder(), op.tlpParams.treebankLanguagePack());
  BasicCategoryTreeTransformer basicCategoryTransformer = new BasicCategoryTreeTransformer(op.langpack());
  CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
  pipeline.addTransformer(treeBinarizer);
  pipeline.addTransformer(basicCategoryTransformer);
  Treebank transformedTreebank = treebank.transform(pipeline);

  HeadFinder headFinderForBinaryTrees = new BinaryHeadFinder(op.tlpParams.headFinder());
  List<Tree> result = Generics.newArrayList();
  for (Tree binarized : transformedTreebank) {
    Trees.convertToCoreLabels(binarized);
    binarized.percolateHeadAnnotations(headFinderForBinaryTrees);
    // Leaves are numbered starting at 1 because downstream tools expect
    // 1-based indices; code internal to the srparser renormalizes the
    // indices itself, so the 1-based numbering has to be applied here.
    binarized.indexLeaves(1, true);
    result.add(binarized);
  }
  return result;
}
Aggregations