Use of edu.stanford.nlp.parser.lexparser.Options in project CoreNLP by stanfordnlp.
Example: the main method of the class CharacterLevelTagExtender.
/**
 * For testing -- CURRENTLY BROKEN!!!
 *
 * Trains a character-tag Chinese PCFG on the trees selected by trainNums,
 * serializes it to "chineseCharTagPCFG.ser.gz", then parses the trees
 * selected by testNums and writes gold/guess tag sequences, yields, and
 * Penn-style trees to "out.chi" (GB18030 encoding).
 *
 * @param args treebankPath trainNums testNums
 * @throws IOException if the output file cannot be opened
 */
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    throw new RuntimeException("args: treebankPath trainNums testNums");
  }
  ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
  ctpp.charTags = true;
  // TODO: these options are getting clobbered by reading in the
  // parser object (unless it's a text file parser?)
  Options op = new Options(ctpp);
  op.doDep = false;
  op.testOptions.maxLength = 90;
  LexicalizedParser lp;
  try {
    FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
    lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
    try {
      String filename = "chineseCharTagPCFG.ser.gz";
      log.info("Writing parser in serialized format to file " + filename + " ");
      System.err.flush();
      // try-with-resources: previously the stream leaked if writeObject threw
      try (ObjectOutputStream out = IOUtils.writeStreamFromString(filename)) {
        out.writeObject(lp);
      }
      log.info("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  } catch (IllegalArgumentException e) {
    // NOTE(review): args[1] is the training number-range spec, not a model
    // path -- loading a serialized model from it looks suspicious; verify
    // the intended fallback before relying on this branch.
    lp = LexicalizedParser.loadModel(args[1], op);
  }
  FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
  MemoryTreebank testTreebank = ctpp.memoryTreebank();
  testTreebank.loadPath(new File(args[0]), testFilt);
  // try-with-resources so "out.chi" is flushed and closed on all exit paths
  // (previously the PrintWriter was never closed)
  try (PrintWriter pw = new PrintWriter(
          new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true)) {
    WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
    WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
    EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
    // System.out.println("Preterminals:" + preterminals);
    System.out.println("Testing...");
    for (Tree gold : testTreebank) {
      Tree tree;
      try {
        tree = lp.parseTree(gold.yieldHasWord());
        if (tree == null) {
          System.out.println("Failed to parse " + gold.yieldHasWord());
          continue;
        }
      } catch (Exception e) {
        // best-effort evaluation: log the failure and move to the next tree
        e.printStackTrace();
        continue;
      }
      // drop the treebank ROOT wrapper before printing the gold tree
      gold = gold.firstChild();
      pw.println(SentenceUtils.listToString(gold.preTerminalYield()));
      pw.println(SentenceUtils.listToString(gold.yield()));
      gold.pennPrint(pw);
      pw.println(tree.preTerminalYield());
      pw.println(tree.yield());
      tree.pennPrint(pw);
      // Collection allBrackets = WordCatConstituent.allBrackets(tree);
      // Collection goldBrackets = WordCatConstituent.allBrackets(gold);
      // eval.eval(allBrackets, goldBrackets);
      eval.displayLast();
    }
    System.out.println();
    System.out.println();
    eval.display();
  }
}
Use of edu.stanford.nlp.parser.lexparser.Options in project CoreNLP by stanfordnlp.
Example: the loadTagger method of the class ParserGrammar.
/**
 * Returns the POS tagger configured by testOptions.taggerSerializedFile,
 * loading it reflectively on first use and whenever the configured path
 * changes; returns null when pre-tagging is disabled.
 */
public Function<List<? extends HasWord>, List<TaggedWord>> loadTagger() {
  Options op = getOp();
  // Guard clause: no tagger is needed unless pre-tagging is requested.
  if (!op.testOptions.preTag) {
    return null;
  }
  // TODO: rather coarse synchronization
  synchronized (this) {
    String requestedPath = op.testOptions.taggerSerializedFile;
    // Reload only when the configured model path differs from the cached one.
    if (!requestedPath.equals(taggerPath)) {
      taggerPath = requestedPath;
      tagger = ReflectionLoading.loadByReflection("edu.stanford.nlp.tagger.maxent.MaxentTagger", taggerPath);
    }
    return tagger;
  }
}
Use of edu.stanford.nlp.parser.lexparser.Options in project CoreNLP by stanfordnlp.
Example: the setUp method of the class ReorderingOracleTest.
/** Prepares the fixture: binarizes the reference trees for the oracle tests. */
public void setUp() {
  Options parserOptions = new Options();
  // Collect the reference trees into a treebank of the language-default type.
  Treebank trainingBank = parserOptions.tlpParams.memoryTreebank();
  trainingBank.addAll(Arrays.asList(correctTrees));
  binarizedTrees = ShiftReduceParser.binarizeTreebank(trainingBank, parserOptions);
}
Use of edu.stanford.nlp.parser.lexparser.Options in project CoreNLP by stanfordnlp.
Example: the main method of the class CombineDVModels.
/**
 * Combines the DVModels embedded in several serialized DVParsers into a
 * single parser with a CombinedDVModelReranker, saves it to -model, and
 * optionally evaluates it on -testTreebank.
 *
 * Arguments: -model path, -baseModels path..., [-testTreebank path filter],
 * remaining arguments are forwarded to Options.setOptions.
 *
 * @throws IOException if a model cannot be read or written
 * @throws ClassNotFoundException if a serialized model has an unknown class
 */
public static void main(String[] args) throws IOException, ClassNotFoundException {
  String modelPath = null;
  List<String> baseModelPaths = null;
  String testTreebankPath = null;
  FileFilter testTreebankFilter = null;
  List<String> unusedArgs = new ArrayList<>();
  for (int argIndex = 0; argIndex < args.length; ) {
    if (args[argIndex].equalsIgnoreCase("-model")) {
      modelPath = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testTreebankPath = treebankDescription.first();
      testTreebankFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-baseModels")) {
      argIndex++;
      baseModelPaths = new ArrayList<>();
      // Consume paths until the next flag (an argument starting with '-').
      while (argIndex < args.length && args[argIndex].charAt(0) != '-') {
        baseModelPaths.add(args[argIndex++]);
      }
      if (baseModelPaths.isEmpty()) {
        throw new IllegalArgumentException("Found an argument -baseModels with no actual models named");
      }
    } else {
      // Unknown flags are forwarded to the parser options later.
      unusedArgs.add(args[argIndex++]);
    }
  }
  String[] newArgs = unusedArgs.toArray(new String[0]);
  LexicalizedParser underlyingParser = null;
  Options options = null;
  LexicalizedParser combinedParser = null;
  if (baseModelPaths != null) {
    List<DVModel> dvparsers = new ArrayList<>();
    for (String baseModelPath : baseModelPaths) {
      log.info("Loading serialized DVParser from " + baseModelPath);
      LexicalizedParser dvparser = LexicalizedParser.loadModel(baseModelPath);
      Reranker reranker = dvparser.reranker;
      if (!(reranker instanceof DVModelReranker)) {
        throw new IllegalArgumentException("Expected parsers with DVModel embedded");
      }
      dvparsers.add(((DVModelReranker) reranker).getModel());
      // The first base model supplies the underlying parser and options.
      if (underlyingParser == null) {
        underlyingParser = dvparser;
        options = underlyingParser.getOp();
        // TODO: other parser's options?
        options.setOptions(newArgs);
      }
      log.info("... done");
    }
    combinedParser = LexicalizedParser.copyLexicalizedParser(underlyingParser);
    CombinedDVModelReranker reranker = new CombinedDVModelReranker(options, dvparsers);
    combinedParser.reranker = reranker;
    combinedParser.saveParserToSerialized(modelPath);
  } else {
    throw new IllegalArgumentException("Need to specify -model to load an already prepared CombinedParser");
  }
  Treebank testTreebank = null;
  if (testTreebankPath != null) {
    log.info("Reading in trees from " + testTreebankPath);
    if (testTreebankFilter != null) {
      log.info("Filtering on " + testTreebankFilter);
    }
    testTreebank = combinedParser.getOp().tlpParams.memoryTreebank();
    testTreebank.loadPath(testTreebankPath, testTreebankFilter);
    log.info("Read in " + testTreebank.size() + " trees for testing");
    EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.getOp(), null, combinedParser);
    evaluator.testOnTreebank(testTreebank);
  }
}
Use of edu.stanford.nlp.parser.lexparser.Options in project CoreNLP by stanfordnlp.
Example: the main method of the class UNKPrinter.
/**
 * Reads treebank files, applies the language-specific unknown-word model to
 * every word type that is rare in the second half of the treebank, and prints
 * the resulting UNK signature classes sorted by descending frequency.
 *
 * Arguments: [-l language] [-e encoding] treebankPath...
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }
  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  Language lang = Language.English;
  for (int i = 0; i < args.length; i++) {
    if (args[i].startsWith("-")) {
      switch(args[i]) {
        case "-l":
          lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;
        case "-e":
          encoding = args[++i];
          break;
        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }
    } else {
      // First non-flag argument: lazily create the treebank reader.
      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
        } else {
          tlpp.setInputEncoding(encoding);
          tlpp.setOutputEncoding(encoding);
          tb = tlpp.diskTreebank();
        }
      }
      tb.loadPath(args[i]);
    }
  }
  PrintWriter pw = tlpp.pw();
  Options op = new Options();
  Options.LexOptions lexOptions = op.lexOptions;
  // Language-specific unknown-word model settings.
  if (lang == Language.French) {
    lexOptions.useUnknownWordSignatures = 1;
    lexOptions.smartMutation = false;
    lexOptions.unknownSuffixSize = 2;
    lexOptions.unknownPrefixSize = 1;
  } else if (lang == Language.Arabic) {
    lexOptions.smartMutation = false;
    lexOptions.useUnknownWordSignatures = 9;
    lexOptions.unknownPrefixSize = 1;
    lexOptions.unknownSuffixSize = 1;
  }
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
  // Only count UNK signatures in the second half of the treebank, so the
  // first half establishes which word types are known.
  int computeAfter = (int) (0.50 * tb.size());
  Counter<String> vocab = new ClassicCounter<>();
  Counter<String> unkCounter = new ClassicCounter<>();
  int treeId = 0;
  for (Tree t : tb) {
    List<Label> yield = t.yield();
    int posId = 0;
    for (Label word : yield) {
      vocab.incrementCount(word.value());
      // Braces added: the original unbraced 'if' had commented-out code
      // between the condition and its statement, inviting dangling-body bugs.
      if (treeId > computeAfter && vocab.getCount(word.value()) < 2.0) {
        // if(lex.getUnknownWordModel().getSignature(word.value(), posId++).equals("UNK"))
        // pw.println(word.value());
        // NOTE(review): posId only advances for rare words because of the
        // post-increment inside this branch; if getSignature expects the
        // token's position in the sentence, it should be incremented on
        // every iteration -- verify against the unknown-word model's contract.
        unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(word.value(), posId++));
      }
    }
    treeId++;
  }
  List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));
  for (String wordType : biggestKeys) {
    pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
  }
  // Duplicate pw.close() removed (it was called twice back-to-back).
  pw.close();
}
Aggregations