Use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.
The class CharacterLevelTagExtender, method main.
/**
* for testing -- CURRENTLY BROKEN!!!
*
* @param args input dir and output filename
* @throws IOException
*/
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    throw new RuntimeException("args: treebankPath trainNums testNums");
  }
  ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
  ctpp.charTags = true;
  // TODO: these options are getting clobbered by reading in the
  // parser object (unless it's a text file parser?)
  Options op = new Options(ctpp);
  op.doDep = false;
  op.testOptions.maxLength = 90;
  LexicalizedParser lp;
  try {
    FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
    lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
    try {
      String filename = "chineseCharTagPCFG.ser.gz";
      log.info("Writing parser in serialized format to file " + filename + " ");
      System.err.flush();
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(lp);
      out.close();
      log.info("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  } catch (IllegalArgumentException e) {
    lp = LexicalizedParser.loadModel(args[1], op);
  }
  FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
  MemoryTreebank testTreebank = ctpp.memoryTreebank();
  testTreebank.loadPath(new File(args[0]), testFilt);
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
  WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
  WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
  EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
  // System.out.println("Preterminals:" + preterminals);
  System.out.println("Testing...");
  for (Tree gold : testTreebank) {
    Tree tree;
    try {
      tree = lp.parseTree(gold.yieldHasWord());
      if (tree == null) {
        System.out.println("Failed to parse " + gold.yieldHasWord());
        continue;
      }
    } catch (Exception e) {
      e.printStackTrace();
      continue;
    }
    gold = gold.firstChild();
    pw.println(SentenceUtils.listToString(gold.preTerminalYield()));
    pw.println(SentenceUtils.listToString(gold.yield()));
    gold.pennPrint(pw);
    pw.println(tree.preTerminalYield());
    pw.println(tree.yield());
    tree.pennPrint(pw);
    // Collection allBrackets = WordCatConstituent.allBrackets(tree);
    // Collection goldBrackets = WordCatConstituent.allBrackets(gold);
    // eval.eval(allBrackets, goldBrackets);
    eval.displayLast();
  }
  System.out.println();
  System.out.println();
  eval.display();
}
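For contrast with this training-and-serialization workflow, here is a minimal load-and-parse sketch using the same LexicalizedParser.loadModel and parseTree calls; the englishPCFG model path is the stock distribution default, shown only as a placeholder for whatever serialized grammar you have (e.g. the chineseCharTagPCFG.ser.gz written above).

import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class LoadAndParseSketch {

  public static void main(String[] args) {
    // Load a serialized grammar; swap in your own model path as needed.
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // Build a pre-tokenized sentence and parse it.
    List<HasWord> sentence = SentenceUtils.toWordList("This", "is", "an", "easy", "sentence", ".");
    Tree tree = lp.parseTree(sentence);
    if (tree == null) {
      System.out.println("Failed to parse " + sentence);
    } else {
      tree.pennPrint();
    }
  }
}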
Use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.
The class TaggerParserPosTagCompatibilityITest, method testTagSet4.
private static void testTagSet4(String[] lexParsers, String[] maxentTaggers, String[] srParsers, String[] nnDepParsers) {
  LexicalizedParser lp = LexicalizedParser.loadModel(lexParsers[0]);
  Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
  for (String name : maxentTaggers) {
    MaxentTagger tagger = new MaxentTagger(name);
    assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
        "left - right: " + Sets.diff(tagSet, tagger.tagSet()) +
        "; right - left: " + Sets.diff(tagger.tagSet(), tagSet) + "\n",
        tagSet, tagger.tagSet());
  }
  for (String name : lexParsers) {
    LexicalizedParser lp2 = LexicalizedParser.loadModel(name);
    Set<String> tagSet2 = lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
    assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
        "left - right: " + Sets.diff(tagSet, tagSet2) +
        "; right - left: " + Sets.diff(tagSet2, tagSet) + "\n",
        tagSet, tagSet2);
  }
  for (String name : srParsers) {
    ShiftReduceParser srp = ShiftReduceParser.loadModel(name);
    assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
        "left - right: " + Sets.diff(tagSet, srp.tagSet()) +
        "; right - left: " + Sets.diff(srp.tagSet(), tagSet) + "\n",
        tagSet, srp.tagSet());
  }
  for (String name : nnDepParsers) {
    DependencyParser dp = DependencyParser.loadFromModelFile(name);
    assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
        "left - right: " + Sets.diff(tagSet, dp.getPosSet()) +
        "; right - left: " + Sets.diff(dp.getPosSet(), tagSet) + "\n",
        tagSet, dp.getPosSet());
  }
}
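The heart of this test is Lexicon.tagSet, which returns a model's tag inventory mapped through the language pack's basic-category function. A minimal sketch of inspecting a single model's tag set, assuming the stock englishPCFG model path:

import java.util.Set;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

public class TagSetSketch {

  public static void main(String[] args) {
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // Map each tag through the basic-category function so functional
    // annotations do not create spurious mismatches.
    Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
    System.out.println("Tags known to the parser: " + tagSet);
  }
}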
Use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.
The class BuildBinarizedDataset, method main.
/**
* Turns a text file into trees for use in a RNTN classifier such as
* the treebank used in the Sentiment project.
* <br>
* The expected input file is one sentence per line, with sentences
* separated by blank lines. The first line has the main label of the sentence together with the full sentence.
* Lines after the first sentence line but before the blank line
* are treated as labeled sub-phrases: each starts with a label
* followed by the list of tokens that label applies to. Any phrase
* without its own label takes on the main sentence label!
* For example:
* <br>
* <code>
* 1 Today is not a good day.<br>
* 3 good<br>
* 3 good day <br>
* 3 a good day <br>
* <br>
* (next block starts here) <br>
* </code>
* By default the englishPCFG parser is used. This can be changed
* with the <code>-parserModel</code> flag. Specify an input file
* with <code>-input</code>.
* <br>
* If a sentiment model is provided with -sentimentModel, that model
* will be used to prelabel the sentences. Any spans with given
* labels will then be used to adjust those labels.
*/
public static void main(String[] args) {
  CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
  String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  String inputPath = null;
  String sentimentModelPath = null;
  SentimentModel sentimentModel = null;
  for (int argIndex = 0; argIndex < args.length; ) {
    if (args[argIndex].equalsIgnoreCase("-input")) {
      inputPath = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
      parserModel = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
      sentimentModelPath = args[argIndex + 1];
      argIndex += 2;
    } else {
      log.info("Unknown argument " + args[argIndex]);
      System.exit(2);
    }
  }
  if (inputPath == null) {
    throw new IllegalArgumentException("Must specify input file with -input");
  }
  LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
  TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
  if (sentimentModelPath != null) {
    sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
  }
  String text = IOUtils.slurpFileNoExceptions(inputPath);
  // need blank line to make a new chunk
  String[] chunks = text.split("\\n\\s*\\n+");
  for (String chunk : chunks) {
    if (chunk.trim().isEmpty()) {
      continue;
    }
    // The expected format is that line 0 will be the text of the
    // sentence, and each subsequent line, if any, will be a value
    // followed by the sequence of tokens that get that value.
    // Here we take the first line and tokenize it as one sentence.
    String[] lines = chunk.trim().split("\\n");
    String sentence = lines[0];
    StringReader sin = new StringReader(sentence);
    DocumentPreprocessor document = new DocumentPreprocessor(sin);
    document.setSentenceFinalPuncWords(new String[] { "\n" });
    List<HasWord> tokens = document.iterator().next();
    Integer mainLabel = Integer.valueOf(tokens.get(0).word());
    // System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
    tokens = tokens.subList(1, tokens.size());
    // log.info(tokens);
    Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
    for (int i = 1; i < lines.length; ++i) {
      extractLabels(spanToLabels, tokens, lines[i]);
    }
    // TODO: add an option which treats the spans as constraints when parsing
    Tree tree = parser.apply(tokens);
    Tree binarized = binarizer.transformTree(tree);
    Tree collapsedUnary = transformer.transformTree(binarized);
    // label here and then use the user-given labels to adjust
    if (sentimentModel != null) {
      Trees.convertToCoreLabels(collapsedUnary);
      SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
      scorer.forwardPropagateTree(collapsedUnary);
      setPredictedLabels(collapsedUnary);
    } else {
      setUnknownLabels(collapsedUnary, mainLabel);
    }
    Trees.convertToCoreLabels(collapsedUnary);
    collapsedUnary.indexSpans();
    for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
      setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
    }
    System.out.println(collapsedUnary);
    // System.out.println();
  }
}
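Stripped of the labeling machinery, the core pipeline above is: tokenize raw text with DocumentPreprocessor, parse the tokens with LexicalizedParser.apply, and binarize the result. A minimal sketch under that reading, assuming the stock englishPCFG model path:

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.Tree;

public class TokenizeParseBinarizeSketch {

  public static void main(String[] args) {
    LexicalizedParser parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
    // DocumentPreprocessor splits raw text into sentences of HasWord tokens.
    DocumentPreprocessor document = new DocumentPreprocessor(new StringReader("Today is not a good day."));
    for (List<HasWord> sentence : document) {
      Tree tree = parser.apply(sentence);
      Tree binarized = binarizer.transformTree(tree);
      binarized.pennPrint();
    }
  }
}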
Use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.
The class CrossValidateTestOptions, method main.
public static void main(String[] args) throws IOException, ClassNotFoundException {
  String dvmodelFile = null;
  String lexparserFile = null;
  String testTreebankPath = null;
  FileFilter testTreebankFilter = null;
  List<String> unusedArgs = new ArrayList<>();
  for (int argIndex = 0; argIndex < args.length; ) {
    if (args[argIndex].equalsIgnoreCase("-lexparser")) {
      lexparserFile = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testTreebankPath = treebankDescription.first();
      testTreebankFilter = treebankDescription.second();
    } else {
      unusedArgs.add(args[argIndex++]);
    }
  }
  log.info("Loading lexparser from: " + lexparserFile);
  String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
  LexicalizedParser lexparser = LexicalizedParser.loadModel(lexparserFile, newArgs);
  log.info("... done");
  Treebank testTreebank = null;
  if (testTreebankPath != null) {
    log.info("Reading in trees from " + testTreebankPath);
    if (testTreebankFilter != null) {
      log.info("Filtering on " + testTreebankFilter);
    }
    testTreebank = lexparser.getOp().tlpParams.memoryTreebank();
    testTreebank.loadPath(testTreebankPath, testTreebankFilter);
    log.info("Read in " + testTreebank.size() + " trees for testing");
  }
  double[] labelResults = new double[weights.length];
  double[] tagResults = new double[weights.length];
  for (int i = 0; i < weights.length; ++i) {
    lexparser.getOp().baseParserWeight = weights[i];
    EvaluateTreebank evaluator = new EvaluateTreebank(lexparser);
    evaluator.testOnTreebank(testTreebank);
    labelResults[i] = evaluator.getLBScore();
    tagResults[i] = evaluator.getTagScore();
  }
  for (int i = 0; i < weights.length; ++i) {
    log.info("LexicalizedParser weight " + weights[i] + ": labeled " + labelResults[i] + " tag " + tagResults[i]);
  }
}
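The evaluation itself reduces to a few calls on EvaluateTreebank. A minimal sketch of scoring one parser on one treebank, assuming args[0] names a serialized model and args[1] a treebank directory readable by the model's tlpParams:

import edu.stanford.nlp.parser.lexparser.EvaluateTreebank;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Treebank;

public class EvaluateSketch {

  public static void main(String[] args) {
    LexicalizedParser lexparser = LexicalizedParser.loadModel(args[0]);
    // Use the model's own treebank reader so tokenization and tree
    // format match what the parser was trained on.
    Treebank testTreebank = lexparser.getOp().tlpParams.memoryTreebank();
    testTreebank.loadPath(args[1]);
    EvaluateTreebank evaluator = new EvaluateTreebank(lexparser);
    evaluator.testOnTreebank(testTreebank);
    System.out.println("labeled F1: " + evaluator.getLBScore() + ", tagging accuracy: " + evaluator.getTagScore());
  }
}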
Use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.
The class DumpMatrices, method main.
public static void main(String[] args) throws IOException {
  String modelPath = null;
  String outputDir = null;
  for (int argIndex = 0; argIndex < args.length; ) {
    if (args[argIndex].equalsIgnoreCase("-model")) {
      modelPath = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-output")) {
      outputDir = args[argIndex + 1];
      argIndex += 2;
    } else {
      log.info("Unknown argument " + args[argIndex]);
      help();
    }
  }
  if (outputDir == null || modelPath == null) {
    help();
  }
  File outputFile = new File(outputDir);
  FileSystem.checkNotExistsOrFail(outputFile);
  FileSystem.mkdirOrFail(outputFile);
  LexicalizedParser parser = LexicalizedParser.loadModel(modelPath);
  DVModel model = DVParser.getModelFromLexicalizedParser(parser);
  String binaryWDir = outputDir + File.separator + "binaryW";
  FileSystem.mkdirOrFail(binaryWDir);
  for (TwoDimensionalMap.Entry<String, String, SimpleMatrix> entry : model.binaryTransform) {
    String filename = binaryWDir + File.separator + entry.getFirstKey() + "_" + entry.getSecondKey() + ".txt";
    dumpMatrix(filename, entry.getValue());
  }
  String binaryScoreDir = outputDir + File.separator + "binaryScore";
  FileSystem.mkdirOrFail(binaryScoreDir);
  for (TwoDimensionalMap.Entry<String, String, SimpleMatrix> entry : model.binaryScore) {
    String filename = binaryScoreDir + File.separator + entry.getFirstKey() + "_" + entry.getSecondKey() + ".txt";
    dumpMatrix(filename, entry.getValue());
  }
  String unaryWDir = outputDir + File.separator + "unaryW";
  FileSystem.mkdirOrFail(unaryWDir);
  for (Map.Entry<String, SimpleMatrix> entry : model.unaryTransform.entrySet()) {
    String filename = unaryWDir + File.separator + entry.getKey() + ".txt";
    dumpMatrix(filename, entry.getValue());
  }
  String unaryScoreDir = outputDir + File.separator + "unaryScore";
  FileSystem.mkdirOrFail(unaryScoreDir);
  for (Map.Entry<String, SimpleMatrix> entry : model.unaryScore.entrySet()) {
    String filename = unaryScoreDir + File.separator + entry.getKey() + ".txt";
    dumpMatrix(filename, entry.getValue());
  }
  String embeddingFile = outputDir + File.separator + "embeddings.txt";
  FileWriter fout = new FileWriter(embeddingFile);
  BufferedWriter bout = new BufferedWriter(fout);
  for (Map.Entry<String, SimpleMatrix> entry : model.wordVectors.entrySet()) {
    bout.write(entry.getKey());
    SimpleMatrix vector = entry.getValue();
    for (int i = 0; i < vector.numRows(); ++i) {
      bout.write(" " + vector.get(i, 0));
    }
    bout.write("\n");
  }
  bout.close();
  fout.close();
}
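The dumpMatrix helper is not shown in this excerpt. What follows is a hypothetical stand-in, assuming the helper writes one matrix row per line with space-separated values; the actual CoreNLP implementation may format its output differently.

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import org.ejml.simple.SimpleMatrix;

public class DumpMatrixSketch {

  // Hypothetical reimplementation of DumpMatrices.dumpMatrix: one row
  // per line, values separated by single spaces.
  public static void dumpMatrix(String filename, SimpleMatrix matrix) throws IOException {
    try (BufferedWriter bout = new BufferedWriter(new FileWriter(filename))) {
      for (int row = 0; row < matrix.numRows(); ++row) {
        for (int col = 0; col < matrix.numCols(); ++col) {
          if (col > 0) {
            bout.write(" ");
          }
          bout.write(Double.toString(matrix.get(row, col)));
        }
        bout.newLine();
      }
    }
  }
}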