Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.
The class JointParsingModel, method run.
public boolean run(File trainTreebankFile, File testTreebankFile, InputStream inputStream) {
op = new Options();
op.tlpParams = new ArabicTreebankParserParams();
op.setOptions("-arabicFactored");
op.testOptions.maxLength = maxSentLen;
//500000 is the default for Arabic, but we have substantially more edges now
op.testOptions.MAX_ITEMS = 5000000;
op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
// WSG: Just set this to some high value so that extractBestParse()
// actually calls the lattice reader (e.g., this says that we can't have a word longer than
// 80 characters...seems sensible for Arabic).
op.testOptions.maxSpanForTags = 80;
treePrint = op.testOptions.treePrint(op.tlpParams);
debinarizer = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
subcategoryStripper = op.tlpParams.subcategoryStripper();
Timing.startTime();
final Treebank trainTreebank = op.tlpParams.diskTreebank();
trainTreebank.loadPath(trainTreebankFile);
lp = getParserDataFromTreebank(trainTreebank);
makeParsers();
if (VERBOSE) {
op.display();
String lexNumRules = (pparser != null) ? Integer.toString(lp.lex.numRules()) : "";
log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t' + (pparser != null ? lp.ug.numRules() : "") + '\t' + (pparser != null ? lp.bg.numRules() : "") + '\t' + lexNumRules);
log.info("ParserPack is " + op.tlpParams.getClass().getName());
log.info("Lexicon is " + lp.lex.getClass().getName());
}
return parse(inputStream);
}
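
The method above follows CoreNLP's usual treebank-loading pattern: build an Options, obtain a Treebank implementation from its tlpParams (here via diskTreebank()), then call loadPath. A minimal standalone sketch of that pattern; the class name and the path argument are placeholders, not part of CoreNLP:

import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;

public class TreebankLoadSketch {
  public static void main(String[] args) {
    // DiskTreebank reads trees lazily from disk, as op.tlpParams.diskTreebank() does in the snippet above;
    // memoryTreebank() would instead hold all trees in RAM.
    Treebank treebank = new DiskTreebank();
    // args[0] is a placeholder: a file or directory of Penn-Treebank-style bracketed trees
    treebank.loadPath(args[0]);
    System.out.println("Loaded " + treebank.size() + " trees");
    for (Tree tree : treebank) {
      System.out.println(tree.yield().size() + " words");
    }
  }
}
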
Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.
The class CacheParseHypotheses, method main.
/**
* An example of a command line is
* <br>
* java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
* <br>
* java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
* <br>
* java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed 026-270,301-499,600-999
*/
public static void main(String[] args) throws IOException {
String parserModel = null;
String output = null;
List<Pair<String, FileFilter>> treebanks = Generics.newArrayList();
int dvKBest = 200;
int numThreads = 1;
for (int argIndex = 0; argIndex < args.length; ) {
if (args[argIndex].equalsIgnoreCase("-dvKBest")) {
dvKBest = Integer.valueOf(args[argIndex + 1]);
argIndex += 2;
continue;
}
if (args[argIndex].equalsIgnoreCase("-parser") || args[argIndex].equals("-model")) {
parserModel = args[argIndex + 1];
argIndex += 2;
continue;
}
if (args[argIndex].equalsIgnoreCase("-output")) {
output = args[argIndex + 1];
argIndex += 2;
continue;
}
if (args[argIndex].equalsIgnoreCase("-treebank")) {
Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-treebank");
argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
treebanks.add(treebankDescription);
continue;
}
if (args[argIndex].equalsIgnoreCase("-numThreads")) {
numThreads = Integer.valueOf(args[argIndex + 1]);
argIndex += 2;
continue;
}
throw new IllegalArgumentException("Unknown argument " + args[argIndex]);
}
if (parserModel == null) {
throw new IllegalArgumentException("Need to supply a parser model with -model");
}
if (output == null) {
throw new IllegalArgumentException("Need to supply an output filename with -output");
}
if (treebanks.size() == 0) {
throw new IllegalArgumentException("Need to supply a treebank with -treebank");
}
log.info("Writing output to " + output);
log.info("Loading parser model " + parserModel);
log.info("Writing " + dvKBest + " hypothesis trees for each tree");
LexicalizedParser parser = LexicalizedParser.loadModel(parserModel, "-dvKBest", Integer.toString(dvKBest));
CacheParseHypotheses cacher = new CacheParseHypotheses(parser);
TreeTransformer transformer = DVParser.buildTrainTransformer(parser.getOp());
List<Tree> sentences = new ArrayList<>();
for (Pair<String, FileFilter> description : treebanks) {
log.info("Reading trees from " + description.first);
Treebank treebank = parser.getOp().tlpParams.memoryTreebank();
treebank.loadPath(description.first, description.second);
treebank = treebank.transform(transformer);
sentences.addAll(treebank);
}
log.info("Processing " + sentences.size() + " trees");
List<Pair<Tree, byte[]>> cache = Generics.newArrayList();
transformer = new SynchronizedTreeTransformer(transformer);
MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper = new MulticoreWrapper<>(numThreads, new CacheProcessor(cacher, parser, dvKBest, transformer));
for (Tree tree : sentences) {
wrapper.put(tree);
while (wrapper.peek()) {
cache.add(wrapper.poll());
if (cache.size() % 10 == 0) {
System.out.println("Processed " + cache.size() + " trees");
}
}
}
wrapper.join();
while (wrapper.peek()) {
cache.add(wrapper.poll());
if (cache.size() % 10 == 0) {
System.out.println("Processed " + cache.size() + " trees");
}
}
System.out.println("Finished processing " + cache.size() + " trees");
IOUtils.writeObjectToFile(cache, output);
}
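
The cache written on the last line is simply a serialized List<Pair<Tree, byte[]>>, pairing each gold tree with its compressed top-k parse hypotheses, so it can be read back with the same IOUtils helper. A hedged sketch of the reading side (the class name and file argument are placeholders; turning the hypothesis bytes back into trees is handled elsewhere in the DVParser code and is not shown):

import java.util.List;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Pair;

public class ReadHypothesisCacheSketch {
  public static void main(String[] args) throws Exception {
    // args[0] is a placeholder for a cache file written by CacheParseHypotheses, e.g. cached.train.simple.ser.gz
    List<Pair<Tree, byte[]>> cache = IOUtils.readObjectFromFile(args[0]);
    System.out.println("Read " + cache.size() + " cached entries");
    for (Pair<Tree, byte[]> entry : cache) {
      System.out.println(entry.first().yield().size() + " words, " + entry.second().length + " bytes of compressed hypotheses");
    }
  }
}
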
Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.
The class CombineDVModels, method main.
public static void main(String[] args) throws IOException, ClassNotFoundException {
String modelPath = null;
List<String> baseModelPaths = null;
String testTreebankPath = null;
FileFilter testTreebankFilter = null;
List<String> unusedArgs = new ArrayList<>();
for (int argIndex = 0; argIndex < args.length; ) {
if (args[argIndex].equalsIgnoreCase("-model")) {
modelPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
testTreebankPath = treebankDescription.first();
testTreebankFilter = treebankDescription.second();
} else if (args[argIndex].equalsIgnoreCase("-baseModels")) {
argIndex++;
baseModelPaths = new ArrayList<>();
while (argIndex < args.length && args[argIndex].charAt(0) != '-') {
baseModelPaths.add(args[argIndex++]);
}
if (baseModelPaths.size() == 0) {
throw new IllegalArgumentException("Found an argument -baseModels with no actual models named");
}
} else {
unusedArgs.add(args[argIndex++]);
}
}
String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
LexicalizedParser underlyingParser = null;
Options options = null;
LexicalizedParser combinedParser = null;
if (baseModelPaths != null) {
List<DVModel> dvparsers = new ArrayList<>();
for (String baseModelPath : baseModelPaths) {
log.info("Loading serialized DVParser from " + baseModelPath);
LexicalizedParser dvparser = LexicalizedParser.loadModel(baseModelPath);
Reranker reranker = dvparser.reranker;
if (!(reranker instanceof DVModelReranker)) {
throw new IllegalArgumentException("Expected parsers with DVModel embedded");
}
dvparsers.add(((DVModelReranker) reranker).getModel());
if (underlyingParser == null) {
underlyingParser = dvparser;
options = underlyingParser.getOp();
// TODO: other parser's options?
options.setOptions(newArgs);
}
log.info("... done");
}
combinedParser = LexicalizedParser.copyLexicalizedParser(underlyingParser);
CombinedDVModelReranker reranker = new CombinedDVModelReranker(options, dvparsers);
combinedParser.reranker = reranker;
combinedParser.saveParserToSerialized(modelPath);
} else {
throw new IllegalArgumentException("Need to specify -model to load an already prepared CombinedParser");
}
Treebank testTreebank = null;
if (testTreebankPath != null) {
log.info("Reading in trees from " + testTreebankPath);
if (testTreebankFilter != null) {
log.info("Filtering on " + testTreebankFilter);
}
testTreebank = combinedParser.getOp().tlpParams.memoryTreebank();
testTreebank.loadPath(testTreebankPath, testTreebankFilter);
log.info("Read in " + testTreebank.size() + " trees for testing");
EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.getOp(), null, combinedParser);
evaluator.testOnTreebank(testTreebank);
}
}
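
CombineDVModels is driven entirely from the command line. Given the flag handling above, a hypothetical invocation (all paths and the file range are placeholders) might look like:

java -mx4g edu.stanford.nlp.parser.dvparser.CombineDVModels -model combined.ser.gz -baseModels fold0.ser.gz fold1.ser.gz fold2.ser.gz -testTreebank /path/to/wsj 2200-2219

Each -baseModels entry must be a serialized LexicalizedParser whose reranker is a DVModelReranker; the combined parser is saved to the -model path and, if -testTreebank is given, evaluated on that treebank.
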
Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.
The class BaseLexicon, method main.
/** Provides some testing and opportunities for exploration of the
* probabilities of a BaseLexicon. What's here currently probably
* only works for the English Penn Treebank, as it uses default
* constructors. Of the words given to test on,
* the first is treated as sentence initial, and the rest as not
* sentence initial.
*
* @param args The command line arguments:
* java BaseLexicon treebankPath fileRange unknownWordModel words*
*/
public static void main(String[] args) {
if (args.length < 3) {
log.info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
return;
}
System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
Treebank tb = new DiskTreebank();
tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
// TODO: change this interface so the lexicon creates its own indices?
Index<String> wordIndex = new HashIndex<>();
Index<String> tagIndex = new HashIndex<>();
Options op = new Options();
op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
lex.initializeTraining(tb.size());
lex.train(tb);
lex.finishTraining();
System.out.println("done.");
System.out.println();
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(4);
List<String> impos = new ArrayList<>();
for (int i = 3; i < args.length; i++) {
if (lex.isKnown(args[i])) {
System.out.println(args[i] + " is a known word. Log probabilities [log P(w|t)] for its taggings are:");
for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.addToIndex(args[i]), i - 3, null); it.hasNext(); ) {
IntTaggedWord iTW = it.next();
System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word), null)));
}
} else {
String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
System.out.println(args[i] + " is an unknown word. Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init" : "non-init") + " is: " + sig);
impos.clear();
List<String> lis = new ArrayList<>(tagIndex.objectsList());
Collections.sort(lis);
for (String tStr : lis) {
IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
double score = lex.score(iTW, 1, args[i], null);
if (score == Float.NEGATIVE_INFINITY) {
impos.add(tStr);
} else {
System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
}
}
if (impos.size() > 0) {
System.out.println(args[i] + " impossible tags: " + impos);
}
}
System.out.println();
}
}
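
Following the usage string at the top of main, a hypothetical invocation (treebank path, file range, and test words are placeholders) would be:

java edu.stanford.nlp.parser.lexparser.BaseLexicon /path/to/wsj/mrg 200-270 5 the xyzzy

Here 5 selects the unknown-word signature level (args[2]), and the trained lexicon then scores "the" as a sentence-initial word and "xyzzy", if unseen in training, through the unknown-word model.
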
Use of edu.stanford.nlp.trees.Treebank in project CoreNLP by stanfordnlp.
The class ShiftReduceParser, method train.
private void train(List<Pair<String, FileFilter>> trainTreebankPath, Pair<String, FileFilter> devTreebankPath, String serializedPath) {
log.info("Training method: " + op.trainOptions().trainingMethod);
List<Tree> binarizedTrees = Generics.newArrayList();
for (Pair<String, FileFilter> treebank : trainTreebankPath) {
binarizedTrees.addAll(readBinarizedTreebank(treebank.first(), treebank.second()));
}
int nThreads = op.trainOptions.trainingThreads;
nThreads = nThreads <= 0 ? Runtime.getRuntime().availableProcessors() : nThreads;
Tagger tagger = null;
if (op.testOptions.preTag) {
Timing retagTimer = new Timing();
tagger = Tagger.loadModel(op.testOptions.taggerSerializedFile);
redoTags(binarizedTrees, tagger, nThreads);
retagTimer.done("Retagging");
}
Set<String> knownStates = findKnownStates(binarizedTrees);
Set<String> rootStates = findRootStates(binarizedTrees);
Set<String> rootOnlyStates = findRootOnlyStates(binarizedTrees, rootStates);
log.info("Known states: " + knownStates);
log.info("States which occur at the root: " + rootStates);
log.info("States which only occur at the root: " + rootStates);
Timing transitionTimer = new Timing();
List<List<Transition>> transitionLists = CreateTransitionSequence.createTransitionSequences(binarizedTrees, op.compoundUnaries, rootStates, rootOnlyStates);
Index<Transition> transitionIndex = new HashIndex<>();
for (List<Transition> transitions : transitionLists) {
transitionIndex.addAll(transitions);
}
transitionTimer.done("Converting trees into transition lists");
log.info("Number of transitions: " + transitionIndex.size());
Random random = new Random(op.trainOptions.randomSeed);
Treebank devTreebank = null;
if (devTreebankPath != null) {
devTreebank = readTreebank(devTreebankPath.first(), devTreebankPath.second());
}
PerceptronModel newModel = new PerceptronModel(this.op, transitionIndex, knownStates, rootStates, rootOnlyStates);
newModel.trainModel(serializedPath, tagger, random, binarizedTrees, transitionLists, devTreebank, nThreads);
this.model = newModel;
}
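
train() is private and is normally reached through ShiftReduceParser's command-line entry point. A hedged sketch of a typical training invocation, based on the published shift-reduce parser instructions rather than on this snippet (paths and file ranges are placeholders):

java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank /path/to/wsj/mrg 200-2199 -devTreebank /path/to/wsj/mrg 2200-2219 -preTag -taggerSerializedFile /path/to/english-left3words-distsim.tagger -serializedPath model.ser.gz

The -preTag and -taggerSerializedFile flags correspond to the op.testOptions.preTag branch above, which retags the binarized training trees before transition sequences are extracted.
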