Use of edu.stanford.nlp.util.HashIndex in project CoreNLP by stanfordnlp:
the class FactoredParser, method main.
/* some documentation for Roger's convenience
* {pcfg,dep,combo}{PE,DE,TE} are precision/dep/tagging evals for the models
* parser is the PCFG parser
* dparser is the dependency parser
* bparser is the combining parser
* during testing:
* tree is the test tree (gold tree)
* binaryTree is the gold tree binarized
* tree2b is the best PCFG parse, binarized
* tree2 is the best PCFG parse (debinarized)
* tree3 is the dependency parse, binarized
* tree3db is the dependency parse, debinarized
* tree4 is the best combo parse, binarized and then debinarized
* tree4b is the best combo parse, binarized
*/
public static void main(String[] args) {
Options op = new Options(new EnglishTreebankParserParams());
// op.tlpParams may be changed to something else later, so don't use it till
// after options are parsed.
StringUtils.logInvocationString(log, args);
String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
String serializeFile = null;
int i = 0;
while (i < args.length && args[i].startsWith("-")) {
if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
path = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
trainLow = Integer.parseInt(args[i + 1]);
trainHigh = Integer.parseInt(args[i + 2]);
i += 3;
} else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
testLow = Integer.parseInt(args[i + 1]);
testHigh = Integer.parseInt(args[i + 2]);
i += 3;
} else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
serializeFile = args[i + 1];
i += 2;
} else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
try {
op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
} catch (ClassNotFoundException e) {
log.info("Class not found: " + args[i + 1]);
throw new RuntimeException(e);
} catch (InstantiationException e) {
log.info("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
throw new RuntimeException(e);
} catch (IllegalAccessException e) {
log.info("illegal access" + e);
throw new RuntimeException(e);
}
i += 2;
} else if (args[i].equals("-encoding")) {
// sets encoding for TreebankLangParserParams
op.tlpParams.setInputEncoding(args[i + 1]);
op.tlpParams.setOutputEncoding(args[i + 1]);
i += 2;
} else {
i = op.setOptionOrWarn(args, i);
}
}
// System.out.println(tlpParams.getClass());
TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();
op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
// BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
PrintWriter pw = op.tlpParams.pw();
op.testOptions.display();
op.trainOptions.display();
op.display();
op.tlpParams.display();
// setup tree transforms
Treebank trainTreebank = op.tlpParams.memoryTreebank();
MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
// Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
// String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
// blippTreebank.loadPath(blippPath, "", true);
Timing.startTime();
log.info("Reading trees...");
testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
if (op.testOptions.increasingLength) {
Collections.sort(testTreebank, new TreeLengthComparator());
}
trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
Timing.tick("done.");
log.info("Binarizing trees...");
TreeAnnotatorAndBinarizer binarizer;
if (!op.trainOptions.leftToRight) {
binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
} else {
binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
}
CollinsPuncTransformer collinsPuncTransformer = null;
if (op.trainOptions.collinsPunc) {
collinsPuncTransformer = new CollinsPuncTransformer(tlp);
}
TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
List<Tree> binaryTrainTrees = new ArrayList<>();
if (op.trainOptions.selectiveSplit) {
op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack());
if (op.trainOptions.deleteSplitters != null) {
List<String> deleted = new ArrayList<>();
for (String del : op.trainOptions.deleteSplitters) {
String baseDel = tlp.basicCategory(del);
boolean checkBasic = del.equals(baseDel);
for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
String elem = it.next();
String baseElem = tlp.basicCategory(elem);
boolean delStr = (checkBasic && baseElem.equals(baseDel)) || elem.equals(del);
if (delStr) {
it.remove();
deleted.add(elem);
}
}
}
log.info("Removed from vertical splitters: " + deleted);
}
}
if (op.trainOptions.selectivePostSplit) {
TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
Treebank annotatedTB = trainTreebank.transform(myTransformer);
op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack());
}
if (op.trainOptions.hSelSplit) {
binarizer.setDoSelectiveSplit(false);
for (Tree tree : trainTreebank) {
if (op.trainOptions.collinsPunc) {
tree = collinsPuncTransformer.transformTree(tree);
}
//tree.pennPrint(tlpParams.pw());
tree = binarizer.transformTree(tree);
//binaryTrainTrees.add(tree);
}
binarizer.setDoSelectiveSplit(true);
}
for (Tree tree : trainTreebank) {
if (op.trainOptions.collinsPunc) {
tree = collinsPuncTransformer.transformTree(tree);
}
tree = binarizer.transformTree(tree);
binaryTrainTrees.add(tree);
}
if (op.testOptions.verbose) {
binarizer.dumpStats();
}
List<Tree> binaryTestTrees = new ArrayList<>();
for (Tree tree : testTreebank) {
if (op.trainOptions.collinsPunc) {
tree = collinsPuncTransformer.transformTree(tree);
}
tree = binarizer.transformTree(tree);
binaryTestTrees.add(tree);
}
// binarization
Timing.tick("done.");
BinaryGrammar bg = null;
UnaryGrammar ug = null;
DependencyGrammar dg = null;
// DependencyGrammar dgBLIPP = null;
Lexicon lex = null;
Index<String> stateIndex = new HashIndex<>();
// extract grammars
Extractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
if (op.doPCFG) {
log.info("Extracting PCFG...");
Pair<UnaryGrammar, BinaryGrammar> bgug = null;
if (op.trainOptions.cheatPCFG) {
List<Tree> allTrees = new ArrayList<>(binaryTrainTrees);
allTrees.addAll(binaryTestTrees);
bgug = bgExtractor.extract(allTrees);
} else {
bgug = bgExtractor.extract(binaryTrainTrees);
}
bg = bgug.second;
bg.splitRules();
ug = bgug.first;
ug.purgeRules();
Timing.tick("done.");
}
log.info("Extracting Lexicon...");
Index<String> wordIndex = new HashIndex<>();
Index<String> tagIndex = new HashIndex<>();
lex = op.tlpParams.lex(op, wordIndex, tagIndex);
lex.initializeTraining(binaryTrainTrees.size());
lex.train(binaryTrainTrees);
lex.finishTraining();
Timing.tick("done.");
if (op.doDep) {
log.info("Extracting Dependencies...");
binaryTrainTrees.clear();
Extractor<DependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
// dgBLIPP = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams,true));
// DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
//dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new TransformTreeDependency(tlpParams));
//dg = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams));
// dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
// uses information about whether the words are known or not; discards unknown words
dg = dgExtractor.extract(binaryTrainTrees);
Timing.tick("done.");
//System.out.print("Extracting Unknown Word Model...");
//UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
//Timing.tick("done.");
System.out.print("Tuning Dependency Model...");
dg.tune(binaryTestTrees);
//System.out.println("TUNE DEPS: "+tuneDeps);
Timing.tick("done.");
}
BinaryGrammar boundBG = bg;
UnaryGrammar boundUG = ug;
GrammarProjection gp = new NullGrammarProjection(bg, ug);
// serialization
if (serializeFile != null) {
log.info("Serializing parser...");
LexicalizedParser parser = new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
parser.saveParserToSerialized(serializeFile);
Timing.tick("done.");
}
// test: pcfg-parse and output
ExhaustivePCFGParser parser = null;
if (op.doPCFG) {
parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
}
ExhaustiveDependencyParser dparser = ((op.doDep && !op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null);
Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null);
//Scorer scorer = parser;
BiLexPCFGParser bparser = null;
if (op.doPCFG && op.doDep) {
bparser = (op.testOptions.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex);
}
Evalb pcfgPE = new Evalb("pcfg PE", true);
Evalb comboPE = new Evalb("combo PE", true);
AbstractEval pcfgCB = new Evalb.CBEval("pcfg CB", true);
AbstractEval pcfgTE = new TaggingEval("pcfg TE");
AbstractEval comboTE = new TaggingEval("combo TE");
AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE");
AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
AbstractEval depTE = new TaggingEval("depnd TE");
AbstractEval depDE = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter());
AbstractEval comboDE = new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter());
if (op.testOptions.evalb) {
EvalbFormatWriter.initEVALBfiles(op.tlpParams);
}
// int[] countByLength = new int[op.testOptions.maxLength+1];
// Use a reflection ruse, so one can run this without needing the
// tagger. Using a function rather than a MaxentTagger means we
// can distribute a version of the parser that doesn't include the
// entire tagger.
Function<List<? extends HasWord>, ArrayList<TaggedWord>> tagger = null;
if (op.testOptions.preTag) {
try {
Class[] argsClass = { String.class };
Object[] arguments = new Object[] { op.testOptions.taggerSerializedFile };
tagger = (Function<List<? extends HasWord>, ArrayList<TaggedWord>>) Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger").getConstructor(argsClass).newInstance(arguments);
} catch (Exception e) {
log.info(e);
log.info("Warning: No pretagging of sentences will be done.");
}
}
for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
Tree tree = testTreebank.get(tNum);
int testTreeLen = tree.yield().size();
if (testTreeLen > op.testOptions.maxLength) {
continue;
}
Tree binaryTree = binaryTestTrees.get(tNum);
// countByLength[testTreeLen]++;
System.out.println("-------------------------------------");
System.out.println("Number: " + (tNum + 1));
System.out.println("Length: " + testTreeLen);
//tree.pennPrint(pw);
// System.out.println("XXXX The binary tree is");
// binaryTree.pennPrint(pw);
//System.out.println("Here are the tags in the lexicon:");
//System.out.println(lex.showTags());
//System.out.println("Here's the tagnumberer:");
//System.out.println(Numberer.getGlobalNumberer("tags").toString());
long timeMil1 = System.currentTimeMillis();
Timing.tick("Starting parse.");
if (op.doPCFG) {
//log.info(op.testOptions.forceTags);
if (op.testOptions.forceTags) {
if (tagger != null) {
//System.out.println("Using a tagger to set tags");
//System.out.println("Tagged sentence as: " + tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
parser.parse(addLast(tagger.apply(cutLast(wordify(binaryTree.yield())))));
} else {
//System.out.println("Forcing tags to match input.");
parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
}
} else {
// System.out.println("XXXX Parsing " + binaryTree.yield());
parser.parse(binaryTree.yieldHasWord());
}
//Timing.tick("Done with pcfg phase.");
}
if (op.doDep) {
dparser.parse(binaryTree.yieldHasWord());
//Timing.tick("Done with dependency phase.");
}
boolean bothPassed = false;
if (op.doPCFG && op.doDep) {
bothPassed = bparser.parse(binaryTree.yieldHasWord());
//Timing.tick("Done with combination phase.");
}
long timeMil2 = System.currentTimeMillis();
long elapsed = timeMil2 - timeMil1;
log.info("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
//System.out.println("PCFG Best Parse:");
Tree tree2b = null;
Tree tree2 = null;
//System.out.println("Got full best parse...");
if (op.doPCFG) {
tree2b = parser.getBestParse();
tree2 = debinarizer.transformTree(tree2b);
}
//System.out.println("Debinarized parse...");
//tree2.pennPrint();
//System.out.println("DepG Best Parse:");
Tree tree3 = null;
Tree tree3db = null;
if (op.doDep) {
tree3 = dparser.getBestParse();
// was (but wrong): Tree tree3db = debinarizer.transformTree(tree2);
tree3db = debinarizer.transformTree(tree3);
tree3.pennPrint(pw);
}
//tree.pennPrint();
//((Tree)binaryTrainTrees.get(tNum)).pennPrint();
//System.out.println("Combo Best Parse:");
Tree tree4 = null;
if (op.doPCFG && op.doDep) {
try {
tree4 = bparser.getBestParse();
if (tree4 == null) {
tree4 = tree2b;
}
} catch (NullPointerException e) {
log.info("Blocked, using PCFG parse!");
tree4 = tree2b;
}
}
if (op.doPCFG && !bothPassed) {
tree4 = tree2b;
}
//tree4.pennPrint();
if (op.doDep) {
depDE.evaluate(tree3, binaryTree, pw);
depTE.evaluate(tree3db, tree, pw);
}
TreeTransformer tc = op.tlpParams.collinizer();
TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
if (op.doPCFG) {
// System.out.println("XXXX Best PCFG was: ");
// tree2.pennPrint();
// System.out.println("XXXX Transformed best PCFG is: ");
// tc.transformTree(tree2).pennPrint();
//System.out.println("True Best Parse:");
//tree.pennPrint();
//tc.transformTree(tree).pennPrint();
pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
Tree tree4b = null;
if (op.doDep) {
comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
tree4b = tree4;
tree4 = debinarizer.transformTree(tree4);
if (op.nodePrune) {
NodePruner np = new NodePruner(parser, debinarizer);
tree4 = np.prune(tree4);
}
//tree4.pennPrint();
comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
}
//pcfgTE.evaluate(tree2, tree);
pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
if (op.doDep) {
comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
}
System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));
//tc.transformTree(tree2).pennPrint();
tree2.pennPrint(pw);
if (op.doDep) {
System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
// tc.transformTree(tree4).pennPrint(pw);
tree4.pennPrint(pw);
}
System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
/*
if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
System.out.println("SCORE INVERSION");
parser.validateBinarizedTree(binaryTree,0);
}
*/
tree.pennPrint(pw);
}
if (op.testOptions.evalb) {
if (op.doPCFG && op.doDep) {
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
} else if (op.doPCFG) {
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
} else if (op.doDep) {
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
}
}
}
if (op.testOptions.evalb) {
EvalbFormatWriter.closeEVALBfiles();
}
// op.testOptions.display();
if (op.doPCFG) {
pcfgPE.display(false, pw);
System.out.println("Grammar size: " + stateIndex.size());
pcfgCB.display(false, pw);
if (op.doDep) {
comboPE.display(false, pw);
}
pcfgTE.display(false, pw);
pcfgTEnoPunct.display(false, pw);
if (op.doDep) {
comboTE.display(false, pw);
comboTEnoPunct.display(false, pw);
}
}
if (op.doDep) {
depTE.display(false, pw);
depDE.display(false, pw);
}
if (op.doPCFG && op.doDep) {
comboDE.display(false, pw);
}
// pcfgPE.printGoodBad();
}
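Across all of the snippets collected here, HashIndex plays the same role: it implements edu.stanford.nlp.util.Index and assigns each object a stable, dense integer id, so a grammar, lexicon, and parser can share one mapping between symbols and array positions. A minimal sketch of that use, assuming only the calls actually exercised in this file (add, indexOf, get, size):

Index<String> stateIndex = new HashIndex<>();
stateIndex.add("NP"); // first add assigns the next free id
stateIndex.add("NP"); // re-adding an element keeps its existing id
stateIndex.add("VP");
int id = stateIndex.indexOf("NP"); // forward lookup: element to id
String sym = stateIndex.get(id); // reverse lookup: id to element, "NP"
int n = stateIndex.size(); // 2 distinct elements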
Use of edu.stanford.nlp.util.HashIndex in project CoreNLP by stanfordnlp:
the class ChineseCharacterBasedLexiconTraining, method main.
public static void main(String[] args) throws IOException {
Map<String, Integer> flagsToNumArgs = Generics.newHashMap();
flagsToNumArgs.put("-parser", Integer.valueOf(3));
flagsToNumArgs.put("-lex", Integer.valueOf(3));
flagsToNumArgs.put("-test", Integer.valueOf(2));
flagsToNumArgs.put("-out", Integer.valueOf(1));
flagsToNumArgs.put("-lengthPenalty", Integer.valueOf(1));
flagsToNumArgs.put("-penaltyType", Integer.valueOf(1));
flagsToNumArgs.put("-maxLength", Integer.valueOf(1));
flagsToNumArgs.put("-stats", Integer.valueOf(2));
Map<String, String[]> argMap = StringUtils.argsToMap(args, flagsToNumArgs);
boolean eval = argMap.containsKey("-eval");
PrintWriter pw = null;
if (argMap.containsKey("-out")) {
pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap.get("-out"))[0]), "GB18030"), true);
}
log.info("ChineseCharacterBasedLexicon called with args:");
ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
for (int i = 0; i < args.length; i++) {
ctpp.setOptionFlag(args, i);
log.info(" " + args[i]);
}
log.info();
Options op = new Options(ctpp);
if (argMap.containsKey("-stats")) {
String[] statArgs = (argMap.get("-stats"));
MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
FileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
rawTrainTreebank.loadPath(new File(statArgs[0]), trainFilt);
log.info("Done reading trees.");
MemoryTreebank trainTreebank;
if (argMap.containsKey("-annotate")) {
trainTreebank = new MemoryTreebank();
TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
for (Tree tree : rawTrainTreebank) {
trainTreebank.add(annotator.transformTree(tree));
}
log.info("Done annotating trees.");
} else {
trainTreebank = rawTrainTreebank;
}
printStats(trainTreebank, pw);
System.exit(0);
}
int maxLength = 1000000;
// Test.verbose = true;
if (argMap.containsKey("-norm")) {
op.testOptions.lengthNormalization = true;
}
if (argMap.containsKey("-maxLength")) {
maxLength = Integer.parseInt((argMap.get("-maxLength"))[0]);
}
op.testOptions.maxLength = 120;
boolean combo = argMap.containsKey("-combo");
if (combo) {
ctpp.useCharacterBasedLexicon = true;
op.testOptions.maxSpanForTags = 10;
op.doDep = false;
op.dcTags = false;
}
LexicalizedParser lp = null;
Lexicon lex = null;
if (argMap.containsKey("-parser")) {
String[] parserArgs = (argMap.get("-parser"));
if (parserArgs.length > 1) {
FileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
lp = LexicalizedParser.trainFromTreebank(parserArgs[0], trainFilt, op);
if (parserArgs.length == 3) {
String filename = parserArgs[2];
log.info("Writing parser in serialized format to file " + filename + " ");
System.err.flush();
ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
out.writeObject(lp);
out.close();
log.info("done.");
}
} else {
String parserFile = parserArgs[0];
lp = LexicalizedParser.loadModel(parserFile, op);
}
lex = lp.getLexicon();
op = lp.getOp();
ctpp = (ChineseTreebankParserParams) op.tlpParams;
}
if (argMap.containsKey("-rad")) {
ctpp.useUnknownCharacterModel = true;
}
if (argMap.containsKey("-lengthPenalty")) {
ctpp.lengthPenalty = Double.parseDouble((argMap.get("-lengthPenalty"))[0]);
}
if (argMap.containsKey("-penaltyType")) {
ctpp.penaltyType = Integer.parseInt((argMap.get("-penaltyType"))[0]);
}
if (argMap.containsKey("-lex")) {
String[] lexArgs = (argMap.get("-lex"));
if (lexArgs.length > 1) {
Index<String> wordIndex = new HashIndex<>();
Index<String> tagIndex = new HashIndex<>();
lex = ctpp.lex(op, wordIndex, tagIndex);
MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
FileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
rawTrainTreebank.loadPath(new File(lexArgs[0]), trainFilt);
log.info("Done reading trees.");
MemoryTreebank trainTreebank;
if (argMap.containsKey("-annotate")) {
trainTreebank = new MemoryTreebank();
TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
for (Tree tree : rawTrainTreebank) {
tree = annotator.transformTree(tree);
trainTreebank.add(tree);
}
log.info("Done annotating trees.");
} else {
trainTreebank = rawTrainTreebank;
}
lex.initializeTraining(trainTreebank.size());
lex.train(trainTreebank);
lex.finishTraining();
log.info("Done training lexicon.");
if (lexArgs.length == 3) {
String filename = lexArgs[2];
log.info("Writing lexicon in serialized format to file " + filename + " ");
System.err.flush();
ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
out.writeObject(lex);
out.close();
log.info("done.");
}
} else {
String lexFile = lexArgs.length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
log.info("Reading Lexicon from file " + lexFile);
ObjectInputStream in = IOUtils.readStreamFromString(lexFile);
try {
lex = (Lexicon) in.readObject();
} catch (ClassNotFoundException e) {
throw new RuntimeException("Bad serialized file: " + lexFile);
}
in.close();
}
}
if (argMap.containsKey("-test")) {
boolean segmentWords = ctpp.segment;
boolean parse = lp != null;
assert (parse || segmentWords);
// WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
// WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
WordSegmenter seg = null;
if (segmentWords) {
seg = (WordSegmenter) lex;
}
String[] testArgs = (argMap.get("-test"));
MemoryTreebank testTreebank = op.tlpParams.memoryTreebank();
FileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
testTreebank.loadPath(new File(testArgs[0]), testFilt);
TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
TreeTransformer collinizer = ctpp.collinizer();
WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
List<String> evalTypes = new ArrayList<>(3);
boolean goodPOS = false;
if (segmentWords) {
evalTypes.add(WordCatConstituent.wordType);
if (ctpp.segmentMarkov && !parse) {
evalTypes.add(WordCatConstituent.tagType);
goodPOS = true;
}
}
if (parse) {
evalTypes.add(WordCatConstituent.tagType);
evalTypes.add(WordCatConstituent.catType);
if (combo) {
evalTypes.add(WordCatConstituent.wordType);
goodPOS = true;
}
}
TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
log.info("Testing...");
for (Tree goldTop : testTreebank) {
Tree gold = goldTop.firstChild();
List<HasWord> goldSentence = gold.yieldHasWord();
if (goldSentence.size() > maxLength) {
log.info("Skipping sentence; too long: " + goldSentence.size());
continue;
} else {
log.info("Processing sentence; length: " + goldSentence.size());
}
List<HasWord> s;
if (segmentWords) {
StringBuilder goldCharBuf = new StringBuilder();
for (HasWord aGoldSentence : goldSentence) {
StringLabel word = (StringLabel) aGoldSentence;
goldCharBuf.append(word.value());
}
String goldChars = goldCharBuf.toString();
s = seg.segment(goldChars);
} else {
s = goldSentence;
}
Tree tree;
if (parse) {
tree = lp.parseTree(s);
if (tree == null) {
throw new RuntimeException("PARSER RETURNED NULL!!!");
}
} else {
tree = Trees.toFlatTree(s);
tree = subcategoryStripper.transformTree(tree);
}
if (pw != null) {
if (parse) {
tree.pennPrint(pw);
} else {
Iterator<HasWord> sentIter = s.iterator();
for (; ; ) {
HasWord word = sentIter.next();
pw.print(word.word());
if (sentIter.hasNext()) {
pw.print(" ");
} else {
break;
}
}
}
pw.println();
}
if (eval) {
Collection ourBrackets, goldBrackets;
ourBrackets = proc.allBrackets(tree);
goldBrackets = proc.allBrackets(gold);
if (goodPOS) {
ourBrackets.addAll(proc.commonWordTagTypeBrackets(tree, gold));
goldBrackets.addAll(proc.commonWordTagTypeBrackets(gold, tree));
}
basicEval.eval(ourBrackets, goldBrackets);
System.out.println("\nScores:");
basicEval.displayLast();
Tree collinsTree = collinizer.transformTree(tree);
Tree collinsGold = collinizer.transformTree(gold);
ourBrackets = proc.allBrackets(collinsTree);
goldBrackets = proc.allBrackets(collinsGold);
if (goodPOS) {
ourBrackets.addAll(proc.commonWordTagTypeBrackets(collinsTree, collinsGold));
goldBrackets.addAll(proc.commonWordTagTypeBrackets(collinsGold, collinsTree));
}
collinsEval.eval(ourBrackets, goldBrackets);
System.out.println("\nCollinized scores:");
collinsEval.displayLast();
System.out.println();
}
}
if (eval) {
basicEval.display();
System.out.println();
collinsEval.display();
}
}
}
Use of edu.stanford.nlp.util.HashIndex in project CoreNLP by stanfordnlp:
the class ChineseLexiconAndWordSegmenter, method main.
/** This method lets you train and test a segmenter relative to a
* Treebank.
* <p>
* <i>Implementation note:</i> This method is largely cloned from
* LexicalizedParser's main method. Should the original be extended to
* train segmenters too, so the two copies don't go out of sync?
*/
public static void main(String[] args) {
boolean train = false;
boolean saveToSerializedFile = false;
boolean saveToTextFile = false;
String serializedInputFileOrUrl = null;
String textInputFileOrUrl = null;
String serializedOutputFileOrUrl = null;
String textOutputFileOrUrl = null;
String treebankPath = null;
Treebank testTreebank = null;
// Treebank tuneTreebank = null;
String testPath = null;
FileFilter testFilter = null;
FileFilter trainFilter = null;
String encoding = null;
// variables needed to process the files to be parsed
TokenizerFactory<Word> tokenizerFactory = null;
// DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
// whether or not the input file has already been tokenized
boolean tokenized = false;
Function<List<HasWord>, List<HasWord>> escaper = new ChineseEscaper();
// int tagDelimiter = -1;
// String sentenceDelimiter = "\n";
// boolean fromXML = false;
int argIndex = 0;
if (args.length < 1) {
log.info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
return;
}
Options op = new Options();
op.tlpParams = new ChineseTreebankParserParams();
// while loop through option arguments
while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
if (args[argIndex].equalsIgnoreCase("-train")) {
train = true;
saveToSerializedFile = true;
int numSubArgs = numSubArgs(args, argIndex);
argIndex++;
if (numSubArgs > 1) {
treebankPath = args[argIndex];
argIndex++;
} else {
throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
}
if (numSubArgs == 2) {
trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs >= 3) {
try {
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
trainFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
trainFilter = new NumberRangesFileFilter(args[argIndex], true);
argIndex++;
}
}
} else if (args[argIndex].equalsIgnoreCase("-encoding")) {
// sets encoding for TreebankLangParserParams
encoding = args[argIndex + 1];
op.tlpParams.setInputEncoding(encoding);
op.tlpParams.setOutputEncoding(encoding);
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
// load the parser from a binary serialized file
// the next argument must be the path to the parser file
serializedInputFileOrUrl = args[argIndex + 1];
argIndex += 2;
// doesn't make sense to load from TextFile -pichuan
// } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
// // load the parser from declarative text file
// // the next argument must be the path to the parser file
// textInputFileOrUrl = args[argIndex + 1];
// argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
saveToSerializedFile = true;
serializedOutputFileOrUrl = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
// save the parser to declarative text file
saveToTextFile = true;
textOutputFileOrUrl = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-treebank")) {
// the next argument is the treebank path and range for testing
int numSubArgs = numSubArgs(args, argIndex);
argIndex++;
if (numSubArgs == 1) {
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs > 1) {
testPath = args[argIndex++];
if (numSubArgs == 2) {
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs >= 3) {
try {
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
testFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
}
}
}
} else {
int j = op.tlpParams.setOptionFlag(args, argIndex);
if (j == argIndex) {
log.info("Unknown option ignored: " + args[argIndex]);
j++;
}
argIndex = j;
}
}
// end while loop through arguments
TreebankLangParserParams tlpParams = op.tlpParams;
// all other arguments are order dependent and
// are processed in order below
ChineseLexiconAndWordSegmenter cs = null;
if (!train && op.testOptions.verbose) {
System.out.println("Currently " + new Date());
printArgs(args, System.out);
}
if (train) {
printArgs(args, System.out);
// so we train a parser using the treebank
if (treebankPath == null) {
// the next arg must be the treebank path, since it wasn't given earlier
treebankPath = args[argIndex];
argIndex++;
if (args.length > argIndex + 1) {
try {
// the next two args might be the range
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
trainFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
trainFilter = new NumberRangesFileFilter(args[argIndex], true);
argIndex++;
}
}
}
Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
Index<String> wordIndex = new HashIndex<>();
Index<String> tagIndex = new HashIndex<>();
cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
} else if (textInputFileOrUrl != null) {
// so we load the segmenter from a text grammar file
// XXXXX fix later -pichuan
//cs = new LexicalizedParser(textInputFileOrUrl, true, op);
} else {
// so we load a serialized segmenter
if (serializedInputFileOrUrl == null) {
// the next argument must be the path to the serialized parser
serializedInputFileOrUrl = args[argIndex];
argIndex++;
}
try {
cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
} catch (IllegalArgumentException e) {
log.info("Error loading segmenter, exiting...");
System.exit(0);
}
}
// the following has to go after reading parser to make sure
// op and tlpParams are the same for train and test
TreePrint treePrint = op.testOptions.treePrint(tlpParams);
if (testFilter != null) {
if (testPath == null) {
if (treebankPath == null) {
throw new RuntimeException("No test treebank path specified...");
} else {
log.info("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
testPath = treebankPath;
}
}
testTreebank = tlpParams.testMemoryTreebank();
testTreebank.loadPath(testPath, testFilter);
}
op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));
// -- Roger
if (op.testOptions.verbose) {
log.info("Lexicon is " + cs.getClass().getName());
}
PrintWriter pwOut = tlpParams.pw();
PrintWriter pwErr = tlpParams.pw(System.err);
// Now what do we do with the parser we've made
if (saveToTextFile) {
// save the parser to textGrammar format
if (textOutputFileOrUrl != null) {
saveSegmenterDataToText(cs, textOutputFileOrUrl);
} else {
log.info("Usage: must specify a text segmenter data output path");
}
}
if (saveToSerializedFile) {
if (serializedOutputFileOrUrl == null && argIndex < args.length) {
// the next argument must be the path to serialize to
serializedOutputFileOrUrl = args[argIndex];
argIndex++;
}
if (serializedOutputFileOrUrl != null) {
saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
} else if (textOutputFileOrUrl == null && testTreebank == null) {
// no saving/parsing request has been specified
log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
}
}
/* --------------------- Testing part!!!! ----------------------- */
if (op.testOptions.verbose) {
// printOptions(false, op);
}
if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
// test parser on treebank
if (testTreebank == null) {
// the next argument is the treebank path and range for testing
testTreebank = tlpParams.testMemoryTreebank();
if (args.length < argIndex + 4) {
testTreebank.loadPath(args[argIndex + 1]);
} else {
int testlow = Integer.parseInt(args[argIndex + 2]);
int testhigh = Integer.parseInt(args[argIndex + 3]);
testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
}
}
/* TODO - test segmenting on treebank. -pichuan */
// lp.testOnTreebank(testTreebank);
// } else if (argIndex >= args.length) {
// // no more arguments, so we just parse our own test sentence
// if (lp.parse(op.tlpParams.defaultTestSentence())) {
// treePrint.printTree(lp.getBestParse(), pwOut);
// } else {
// pwErr.println("Error. Can't parse test sentence: " +
// lp.parse(op.tlpParams.defaultTestSentence()));
// }
}
//wsg2010: This code block doesn't actually do anything. It appears to read and tokenize a file, and then just print it.
// There are easier ways to do that. This code was copied from an old version of LexicalizedParser.
// else {
// // We parse filenames given by the remaining arguments
// int numWords = 0;
// Timing timer = new Timing();
// // set the tokenizer
// if (tokenized) {
// tokenizerFactory = WhitespaceTokenizer.factory();
// }
// TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
// if (tokenizerFactory == null) {
// tokenizerFactory = (TokenizerFactory<Word>) tlp.getTokenizerFactory();
// }
// documentPreprocessor.setTokenizerFactory(tokenizerFactory);
// documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
// if (encoding != null) {
// documentPreprocessor.setEncoding(encoding);
// }
// timer.start();
// for (int i = argIndex; i < args.length; i++) {
// String filename = args[i];
// try {
// List document = null;
// if (fromXML) {
// document = documentPreprocessor.getSentencesFromXML(filename, sentenceDelimiter, tokenized);
// } else {
// document = documentPreprocessor.getSentencesFromText(filename, escaper, sentenceDelimiter, tagDelimiter);
// }
// log.info("Segmenting file: " + filename + " with " + document.size() + " sentences.");
// PrintWriter pwo = pwOut;
// if (op.testOptions.writeOutputFiles) {
// try {
// pwo = tlpParams.pw(new FileOutputStream(filename + ".stp"));
// } catch (IOException ioe) {
// ioe.printStackTrace();
// }
// }
// int num = 0;
// treePrint.printHeader(pwo, tlp.getEncoding());
// for (Iterator it = document.iterator(); it.hasNext();) {
// num++;
// List sentence = (List) it.next();
// int len = sentence.size();
// numWords += len;
//// pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + sentence);
// pwo.println(Sentence.listToString(sentence));
// }
// treePrint.printFooter(pwo);
// if (op.testOptions.writeOutputFiles) {
// pwo.close();
// }
// } catch (IOException e) {
// pwErr.println("Couldn't find file: " + filename);
// }
//
// } // end for each file
// long millis = timer.stop();
// double wordspersec = numWords / (((double) millis) / 1000);
// NumberFormat nf = new DecimalFormat("0.00"); // easier way!
// pwErr.println("Segmented " + numWords + " words at " + nf.format(wordspersec) + " words per second.");
// }
}
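Going by the usage string printed above, a training invocation would look roughly like the following; the treebank path, file-number range, and output filename are hypothetical:

java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter \
  -train /path/to/ctb/mrg 1 199 chineseSegmenter.ser.gz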
Use of edu.stanford.nlp.util.HashIndex in project CoreNLP by stanfordnlp:
the class UNKPrinter, method main.
public static void main(String[] args) {
if (args.length < minArgs) {
System.out.println(usage.toString());
System.exit(-1);
}
TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
DiskTreebank tb = null;
String encoding = "UTF-8";
Language lang = Language.English;
for (int i = 0; i < args.length; i++) {
if (args[i].startsWith("-")) {
switch(args[i]) {
case "-l":
lang = Language.valueOf(args[++i].trim());
tlpp = lang.params;
break;
case "-e":
encoding = args[++i];
break;
default:
System.out.println(usage.toString());
System.exit(-1);
}
} else {
if (tb == null) {
if (tlpp == null) {
System.out.println(usage.toString());
System.exit(-1);
} else {
tlpp.setInputEncoding(encoding);
tlpp.setOutputEncoding(encoding);
tb = tlpp.diskTreebank();
}
}
tb.loadPath(args[i]);
}
}
PrintWriter pw = tlpp.pw();
Options op = new Options();
Options.LexOptions lexOptions = op.lexOptions;
if (lang == Language.French) {
lexOptions.useUnknownWordSignatures = 1;
lexOptions.smartMutation = false;
lexOptions.unknownSuffixSize = 2;
lexOptions.unknownPrefixSize = 1;
} else if (lang == Language.Arabic) {
lexOptions.smartMutation = false;
lexOptions.useUnknownWordSignatures = 9;
lexOptions.unknownPrefixSize = 1;
lexOptions.unknownSuffixSize = 1;
}
Index<String> wordIndex = new HashIndex<>();
Index<String> tagIndex = new HashIndex<>();
Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
int computeAfter = (int) (0.50 * tb.size());
Counter<String> vocab = new ClassicCounter<>();
Counter<String> unkCounter = new ClassicCounter<>();
int treeId = 0;
for (Tree t : tb) {
List<Label> yield = t.yield();
int posId = 0;
for (Label word : yield) {
vocab.incrementCount(word.value());
if (treeId > computeAfter && vocab.getCount(word.value()) < 2.0) {
// if (lex.getUnknownWordModel().getSignature(word.value(), posId++).equals("UNK"))
// pw.println(word.value());
unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(word.value(), posId++));
}
}
treeId++;
}
List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));
for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
pw.close();
}
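The heart of the loop above is lex.getUnknownWordModel().getSignature(word, position), which maps a rare word to its unknown-word signature class before counting. In isolation the lookup looks like the sketch below; the word and position are placeholders, and the exact signature string returned depends on the lexOptions settings chosen earlier:

// hypothetical word and position; the signature class varies with useUnknownWordSignatures
String sig = lex.getUnknownWordModel().getSignature("blogosphere", 0);
pw.printf("%s -> %s%n", "blogosphere", sig);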
Use of edu.stanford.nlp.util.HashIndex in project CoreNLP by stanfordnlp:
the class ShiftReduceParser, method train.
private void train(List<Pair<String, FileFilter>> trainTreebankPath, Pair<String, FileFilter> devTreebankPath, String serializedPath) {
log.info("Training method: " + op.trainOptions().trainingMethod);
List<Tree> binarizedTrees = Generics.newArrayList();
for (Pair<String, FileFilter> treebank : trainTreebankPath) {
binarizedTrees.addAll(readBinarizedTreebank(treebank.first(), treebank.second()));
}
int nThreads = op.trainOptions.trainingThreads;
nThreads = nThreads <= 0 ? Runtime.getRuntime().availableProcessors() : nThreads;
Tagger tagger = null;
if (op.testOptions.preTag) {
Timing retagTimer = new Timing();
tagger = Tagger.loadModel(op.testOptions.taggerSerializedFile);
redoTags(binarizedTrees, tagger, nThreads);
retagTimer.done("Retagging");
}
Set<String> knownStates = findKnownStates(binarizedTrees);
Set<String> rootStates = findRootStates(binarizedTrees);
Set<String> rootOnlyStates = findRootOnlyStates(binarizedTrees, rootStates);
log.info("Known states: " + knownStates);
log.info("States which occur at the root: " + rootStates);
log.info("States which only occur at the root: " + rootStates);
Timing transitionTimer = new Timing();
List<List<Transition>> transitionLists = CreateTransitionSequence.createTransitionSequences(binarizedTrees, op.compoundUnaries, rootStates, rootOnlyStates);
Index<Transition> transitionIndex = new HashIndex<>();
for (List<Transition> transitions : transitionLists) {
transitionIndex.addAll(transitions);
}
transitionTimer.done("Converting trees into transition lists");
log.info("Number of transitions: " + transitionIndex.size());
Random random = new Random(op.trainOptions.randomSeed);
Treebank devTreebank = null;
if (devTreebankPath != null) {
devTreebank = readTreebank(devTreebankPath.first(), devTreebankPath.second());
}
PerceptronModel newModel = new PerceptronModel(this.op, transitionIndex, knownStates, rootStates, rootOnlyStates);
newModel.trainModel(serializedPath, tagger, random, binarizedTrees, transitionLists, devTreebank, nThreads);
this.model = newModel;
}
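One detail worth noting: HashIndex deduplicates on insertion, so the addAll calls over every transition list leave exactly one id per distinct Transition, and the logged transitionIndex.size() is the number of unique transitions, not the total observed. A compact illustration, with String stand-ins for the Transition type:

Index<String> idx = new HashIndex<>();
idx.addAll(Arrays.asList("Shift", "Unary-NP", "Shift"));
idx.addAll(Arrays.asList("Binary-VP", "Shift"));
// idx.size() == 3: each distinct element keeps a single id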