Example use of java.io.FileFilter in the CoreNLP project (stanfordnlp):
the main method of the FindNearestNeighbors class.
/**
 * Parses a test treebank with a LexicalizedParser that has a DVModel
 * reranker attached, writes each sentence's root node vector to the
 * -output file, and logs, for every subtree with at most maxLength leaves,
 * the 100 nearest subtrees by Frobenius distance between node vectors.
 * <p>
 * Required flags: -model, -testTreebank, -output.  Any unrecognized flags
 * are passed through to {@code LexicalizedParser.loadModel}.
 *
 * @throws Exception if the model cannot be loaded, a sentence fails to
 *         parse, or the output file cannot be written
 */
public static void main(String[] args) throws Exception {
    String modelPath = null;
    String outputPath = null;
    String testTreebankPath = null;
    FileFilter testTreebankFilter = null;
    List<String> unusedArgs = new ArrayList<>();
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-model")) {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            // -testTreebank may carry sub-arguments (path plus optional file range)
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.first();
            testTreebankFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-output")) {
            outputPath = args[argIndex + 1];
            argIndex += 2;
        } else {
            unusedArgs.add(args[argIndex++]);
        }
    }
    if (modelPath == null) {
        throw new IllegalArgumentException("Need to specify -model");
    }
    if (testTreebankPath == null) {
        throw new IllegalArgumentException("Need to specify -testTreebank");
    }
    if (outputPath == null) {
        throw new IllegalArgumentException("Need to specify -output");
    }
    String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
    LexicalizedParser lexparser = LexicalizedParser.loadModel(modelPath, newArgs);
    // testTreebankPath was verified non-null above, so the treebank is always loaded
    log.info("Reading in trees from " + testTreebankPath);
    if (testTreebankFilter != null) {
        log.info("Filtering on " + testTreebankFilter);
    }
    Treebank testTreebank = lexparser.getOp().tlpParams.memoryTreebank();
    testTreebank.loadPath(testTreebankPath, testTreebankFilter);
    log.info("Read in " + testTreebank.size() + " trees for testing");
    // try-with-resources guarantees the output file is closed even on error.
    // The BufferedWriter is now actually used for the writes; the original
    // created one but wrote through the unbuffered FileWriter instead.
    try (BufferedWriter bout = new BufferedWriter(new FileWriter(outputPath))) {
        log.info("Parsing " + testTreebank.size() + " trees");
        int count = 0;
        List<ParseRecord> records = Generics.newArrayList();
        for (Tree goldTree : testTreebank) {
            List<Word> tokens = goldTree.yieldWords();
            ParserQuery parserQuery = lexparser.parserQuery();
            if (!parserQuery.parse(tokens)) {
                throw new AssertionError("Could not parse: " + tokens);
            }
            if (!(parserQuery instanceof RerankingParserQuery)) {
                throw new IllegalArgumentException("Expected a LexicalizedParser with a Reranker attached");
            }
            RerankingParserQuery rpq = (RerankingParserQuery) parserQuery;
            if (!(rpq.rerankerQuery() instanceof DVModelReranker.Query)) {
                throw new IllegalArgumentException("Expected a LexicalizedParser with a DVModel attached");
            }
            // best reranked hypothesis, with the per-node vectors attached
            DeepTree tree = ((DVModelReranker.Query) rpq.rerankerQuery()).getDeepTrees().get(0);
            SimpleMatrix rootVector = null;
            for (Map.Entry<Tree, SimpleMatrix> entry : tree.getVectors().entrySet()) {
                if (entry.getKey().label().value().equals("ROOT")) {
                    rootVector = entry.getValue();
                    break;
                }
            }
            if (rootVector == null) {
                throw new AssertionError("Could not find root nodevector");
            }
            bout.write(tokens + "\n");
            bout.write(tree.getTree() + "\n");
            for (int i = 0; i < rootVector.getNumElements(); ++i) {
                bout.write(" " + rootVector.get(i));
            }
            bout.write("\n\n\n");
            count++;
            if (count % 10 == 0) {
                log.info(" " + count);
            }
            records.add(new ParseRecord(tokens, goldTree, tree.getTree(), rootVector, tree.getVectors()));
        }
        log.info(" done parsing");
        // collect all subtrees small enough to compare
        List<Pair<Tree, SimpleMatrix>> subtrees = Generics.newArrayList();
        for (ParseRecord record : records) {
            for (Map.Entry<Tree, SimpleMatrix> entry : record.nodeVectors.entrySet()) {
                if (entry.getKey().getLeaves().size() <= maxLength) {
                    subtrees.add(Pair.makePair(entry.getKey(), entry.getValue()));
                }
            }
        }
        log.info("There are " + subtrees.size() + " subtrees in the set of trees");
        // bounded priority queue: keep only the 100 closest matches per subtree
        // (capacity 101 so we can insert, then evict the worst)
        PriorityQueue<ScoredObject<Pair<Tree, Tree>>> bestmatches = new PriorityQueue<>(101, ScoredComparator.DESCENDING_COMPARATOR);
        for (int i = 0; i < subtrees.size(); ++i) {
            log.info(subtrees.get(i).first().yieldWords());
            log.info(subtrees.get(i).first());
            for (int j = 0; j < subtrees.size(); ++j) {
                if (i == j) {
                    continue;
                }
                // TODO: look at basic category?
                double normF = subtrees.get(i).second().minus(subtrees.get(j).second()).normF();
                bestmatches.add(new ScoredObject<>(Pair.makePair(subtrees.get(i).first(), subtrees.get(j).first()), normF));
                if (bestmatches.size() > 100) {
                    bestmatches.poll();
                }
            }
            // drain the queue (worst first) and reverse, so matches log best-first
            List<ScoredObject<Pair<Tree, Tree>>> ordered = Generics.newArrayList();
            while (bestmatches.size() > 0) {
                ordered.add(bestmatches.poll());
            }
            Collections.reverse(ordered);
            for (ScoredObject<Pair<Tree, Tree>> pair : ordered) {
                log.info(" MATCHED " + pair.object().second.yieldWords() + " ... " + pair.object().second() + " with a score of " + pair.score());
            }
            log.info();
            log.info();
            bestmatches.clear();
        }
        /*
        for (int i = 0; i < records.size(); ++i) {
        if (i % 10 == 0) {
        log.info("  " + i);
        }
        List<ScoredObject<ParseRecord>> scored = Generics.newArrayList();
        for (int j = 0; j < records.size(); ++j) {
        if (i == j) continue;
        double score = 0.0;
        int matches = 0;
        for (Map.Entry<Tree, SimpleMatrix> first : records.get(i).nodeVectors.entrySet()) {
        for (Map.Entry<Tree, SimpleMatrix> second : records.get(j).nodeVectors.entrySet()) {
        String firstBasic = dvparser.dvModel.basicCategory(first.getKey().label().value());
        String secondBasic = dvparser.dvModel.basicCategory(second.getKey().label().value());
        if (firstBasic.equals(secondBasic)) {
        ++matches;
        double normF = first.getValue().minus(second.getValue()).normF();
        score += normF * normF;
        }
        }
        }
        if (matches == 0) {
        score = Double.POSITIVE_INFINITY;
        } else {
        score = score / matches;
        }
        //double score = records.get(i).vector.minus(records.get(j).vector).normF();
        scored.add(new ScoredObject<ParseRecord>(records.get(j), score));
        }
        Collections.sort(scored, ScoredComparator.ASCENDING_COMPARATOR);
        out.write(records.get(i).sentence.toString() + "\n");
        for (int j = 0; j < numNeighbors; ++j) {
        out.write("  " + scored.get(j).score() + ": " + scored.get(j).object().sentence + "\n");
        }
        out.write("\n\n");
        }
        log.info();
        */
    }
}
Example use of java.io.FileFilter in the CoreNLP project (stanfordnlp):
the main method of the ParseAndPrintMatrices class.
/**
 * Parses each sentence of an input text file with a DVModel-reranked
 * LexicalizedParser and writes one file per sentence (sentenceN.txt in the
 * -output directory) containing the sentence, its best parse tree, the
 * word vectors, and the node vectors of the parse.
 * <p>
 * Flags: -model (parser to load), -output (directory to create),
 * -input (text file of sentences).  -testTreebank is consumed but its
 * values are not used in this method.  Unrecognized flags are passed
 * through to {@code LexicalizedParser.loadModel}.
 *
 * @throws IOException if the input cannot be read or the output written
 */
public static void main(String[] args) throws IOException {
    String modelPath = null;
    String outputPath = null;
    String inputPath = null;
    String testTreebankPath = null;
    FileFilter testTreebankFilter = null;
    List<String> unusedArgs = Generics.newArrayList();
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-model")) {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-output")) {
            outputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-input")) {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            // consumed so the sub-arguments are not mistaken for parser flags;
            // the values themselves are currently unused here
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.first();
            testTreebankFilter = treebankDescription.second();
        } else {
            unusedArgs.add(args[argIndex++]);
        }
    }
    // fail fast with a clear message instead of an NPE later
    if (modelPath == null) {
        throw new IllegalArgumentException("Need to specify -model");
    }
    if (outputPath == null) {
        throw new IllegalArgumentException("Need to specify -output");
    }
    String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
    LexicalizedParser parser = LexicalizedParser.loadModel(modelPath, newArgs);
    DVModel model = DVParser.getModelFromLexicalizedParser(parser);
    File outputFile = new File(outputPath);
    FileSystem.checkNotExistsOrFail(outputFile);
    FileSystem.mkdirOrFail(outputFile);
    int count = 0;
    if (inputPath != null) {
        // try-with-resources: the original leaked the input Reader and closed
        // only the inner FileWriter instead of the BufferedWriter
        try (Reader input = new BufferedReader(new FileReader(inputPath))) {
            DocumentPreprocessor processor = new DocumentPreprocessor(input);
            for (List<HasWord> sentence : processor) {
                // output files are indexed from 1
                count++;
                ParserQuery pq = parser.parserQuery();
                if (!(pq instanceof RerankingParserQuery)) {
                    throw new IllegalArgumentException("Expected a RerankingParserQuery");
                }
                RerankingParserQuery rpq = (RerankingParserQuery) pq;
                if (!rpq.parse(sentence)) {
                    throw new RuntimeException("Unparsable sentence: " + sentence);
                }
                RerankerQuery reranker = rpq.rerankerQuery();
                if (!(reranker instanceof DVModelReranker.Query)) {
                    throw new IllegalArgumentException("Expected a DVModelReranker");
                }
                DeepTree deepTree = ((DVModelReranker.Query) reranker).getDeepTrees().get(0);
                IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.getVectors();
                for (Map.Entry<Tree, SimpleMatrix> entry : vectors.entrySet()) {
                    log.info(entry.getKey() + " " + entry.getValue());
                }
                try (BufferedWriter bout = new BufferedWriter(new FileWriter(outputPath + File.separator + "sentence" + count + ".txt"))) {
                    bout.write(SentenceUtils.listToString(sentence));
                    bout.newLine();
                    bout.write(deepTree.getTree().toString());
                    bout.newLine();
                    for (HasWord word : sentence) {
                        outputMatrix(bout, model.getWordVector(word.word()));
                    }
                    Tree rootTree = findRootTree(vectors);
                    outputTreeMatrices(bout, rootTree, vectors);
                }
            }
        }
    }
}
Example use of java.io.FileFilter in the CoreNLP project (stanfordnlp):
the main method of the ChineseCharacterBasedLexiconTraining class.
/**
 * Trains and/or tests a Chinese character-based lexicon and parser.
 * <p>
 * Supported multi-argument flags (see flagsToNumArgs): -parser, -lex,
 * -test, -out, -lengthPenalty, -penaltyType, -maxLength, -stats; boolean
 * flags include -eval, -norm, -combo, -rad, -annotate.  All args are also
 * offered to ChineseTreebankParserParams as option flags.
 *
 * @throws IOException on problems reading treebanks or serialized models
 */
public static void main(String[] args) throws IOException {
    Map<String, Integer> flagsToNumArgs = Generics.newHashMap();
    flagsToNumArgs.put("-parser", Integer.valueOf(3));
    flagsToNumArgs.put("-lex", Integer.valueOf(3));
    flagsToNumArgs.put("-test", Integer.valueOf(2));
    flagsToNumArgs.put("-out", Integer.valueOf(1));
    flagsToNumArgs.put("-lengthPenalty", Integer.valueOf(1));
    flagsToNumArgs.put("-penaltyType", Integer.valueOf(1));
    flagsToNumArgs.put("-maxLength", Integer.valueOf(1));
    flagsToNumArgs.put("-stats", Integer.valueOf(2));
    Map<String, String[]> argMap = StringUtils.argsToMap(args, flagsToNumArgs);
    boolean eval = argMap.containsKey("-eval");
    PrintWriter pw = null;
    if (argMap.containsKey("-out")) {
        // output uses the GB18030 encoding (Chinese treebank convention)
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap.get("-out"))[0]), "GB18030"), true);
    }
    log.info("ChineseCharacterBasedLexicon called with args:");
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    for (int i = 0; i < args.length; i++) {
        ctpp.setOptionFlag(args, i);
        log.info(" " + args[i]);
    }
    log.info();
    Options op = new Options(ctpp);
    if (argMap.containsKey("-stats")) {
        // -stats <treebankPath> <fileRange>: print treebank statistics and exit
        String[] statArgs = (argMap.get("-stats"));
        MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
        FileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
        rawTrainTreebank.loadPath(new File(statArgs[0]), trainFilt);
        log.info("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.containsKey("-annotate")) {
            trainTreebank = new MemoryTreebank();
            TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
            for (Tree tree : rawTrainTreebank) {
                trainTreebank.add(annotator.transformTree(tree));
            }
            log.info("Done annotating trees.");
        } else {
            trainTreebank = rawTrainTreebank;
        }
        printStats(trainTreebank, pw);
        System.exit(0);
    }
    int maxLength = 1000000;
    // Test.verbose = true;
    if (argMap.containsKey("-norm")) {
        op.testOptions.lengthNormalization = true;
    }
    if (argMap.containsKey("-maxLength")) {
        maxLength = Integer.parseInt((argMap.get("-maxLength"))[0]);
    }
    // NOTE(review): maxLength above only limits which test sentences are
    // attempted below, while the parser's internal limit is hard-coded to
    // 120 here -- confirm this asymmetry is intended
    op.testOptions.maxLength = 120;
    boolean combo = argMap.containsKey("-combo");
    if (combo) {
        ctpp.useCharacterBasedLexicon = true;
        op.testOptions.maxSpanForTags = 10;
        op.doDep = false;
        op.dcTags = false;
    }
    LexicalizedParser lp = null;
    Lexicon lex = null;
    if (argMap.containsKey("-parser")) {
        // -parser <treebank> <range> [<outFile>] trains; -parser <modelFile> loads
        String[] parserArgs = (argMap.get("-parser"));
        if (parserArgs.length > 1) {
            FileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
            lp = LexicalizedParser.trainFromTreebank(parserArgs[0], trainFilt, op);
            if (parserArgs.length == 3) {
                String filename = parserArgs[2];
                log.info("Writing parser in serialized format to file " + filename + " ");
                System.err.flush();
                // try-with-resources so the stream is closed even if writing fails
                try (ObjectOutputStream out = IOUtils.writeStreamFromString(filename)) {
                    out.writeObject(lp);
                }
                log.info("done.");
            }
        } else {
            String parserFile = parserArgs[0];
            lp = LexicalizedParser.loadModel(parserFile, op);
        }
        lex = lp.getLexicon();
        // adopt the options and params that were serialized with the parser
        op = lp.getOp();
        ctpp = (ChineseTreebankParserParams) op.tlpParams;
    }
    if (argMap.containsKey("-rad")) {
        ctpp.useUnknownCharacterModel = true;
    }
    if (argMap.containsKey("-lengthPenalty")) {
        ctpp.lengthPenalty = Double.parseDouble((argMap.get("-lengthPenalty"))[0]);
    }
    if (argMap.containsKey("-penaltyType")) {
        ctpp.penaltyType = Integer.parseInt((argMap.get("-penaltyType"))[0]);
    }
    if (argMap.containsKey("-lex")) {
        // -lex <treebank> <range> [<outFile>] trains a lexicon; -lex <file> loads one
        String[] lexArgs = (argMap.get("-lex"));
        if (lexArgs.length > 1) {
            Index<String> wordIndex = new HashIndex<>();
            Index<String> tagIndex = new HashIndex<>();
            lex = ctpp.lex(op, wordIndex, tagIndex);
            MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
            FileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
            rawTrainTreebank.loadPath(new File(lexArgs[0]), trainFilt);
            log.info("Done reading trees.");
            MemoryTreebank trainTreebank;
            if (argMap.containsKey("-annotate")) {
                trainTreebank = new MemoryTreebank();
                TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
                for (Tree tree : rawTrainTreebank) {
                    tree = annotator.transformTree(tree);
                    trainTreebank.add(tree);
                }
                log.info("Done annotating trees.");
            } else {
                trainTreebank = rawTrainTreebank;
            }
            lex.initializeTraining(trainTreebank.size());
            lex.train(trainTreebank);
            lex.finishTraining();
            log.info("Done training lexicon.");
            if (lexArgs.length == 3) {
                // inside this branch the filename is always lexArgs[2]
                // (the original recomputed lexArgs.length == 3 in a dead ternary)
                String filename = lexArgs[2];
                log.info("Writing lexicon in serialized format to file " + filename + " ");
                System.err.flush();
                try (ObjectOutputStream out = IOUtils.writeStreamFromString(filename)) {
                    out.writeObject(lex);
                }
                log.info("done.");
            }
        } else {
            String lexFile = lexArgs.length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
            log.info("Reading Lexicon from file " + lexFile);
            try (ObjectInputStream in = IOUtils.readStreamFromString(lexFile)) {
                lex = (Lexicon) in.readObject();
            } catch (ClassNotFoundException e) {
                // keep the original cause so the stack trace shows what failed
                throw new RuntimeException("Bad serialized file: " + lexFile, e);
            }
        }
    }
    if (argMap.containsKey("-test")) {
        boolean segmentWords = ctpp.segment;
        boolean parse = lp != null;
        assert (parse || segmentWords);
        // WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
        // WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
        WordSegmenter seg = null;
        if (segmentWords) {
            seg = (WordSegmenter) lex;
        }
        String[] testArgs = (argMap.get("-test"));
        MemoryTreebank testTreebank = op.tlpParams.memoryTreebank();
        FileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
        testTreebank.loadPath(new File(testArgs[0]), testFilt);
        TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
        TreeTransformer collinizer = ctpp.collinizer();
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
        EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
        // which bracket types get scored depends on whether we segment and/or parse
        List<String> evalTypes = new ArrayList<>(3);
        boolean goodPOS = false;
        if (segmentWords) {
            evalTypes.add(WordCatConstituent.wordType);
            if (ctpp.segmentMarkov && !parse) {
                evalTypes.add(WordCatConstituent.tagType);
                goodPOS = true;
            }
        }
        if (parse) {
            evalTypes.add(WordCatConstituent.tagType);
            evalTypes.add(WordCatConstituent.catType);
            if (combo) {
                evalTypes.add(WordCatConstituent.wordType);
                goodPOS = true;
            }
        }
        TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
        log.info("Testing...");
        for (Tree goldTop : testTreebank) {
            Tree gold = goldTop.firstChild();
            List<HasWord> goldSentence = gold.yieldHasWord();
            if (goldSentence.size() > maxLength) {
                log.info("Skipping sentence; too long: " + goldSentence.size());
                continue;
            } else {
                log.info("Processing sentence; length: " + goldSentence.size());
            }
            List<HasWord> s;
            if (segmentWords) {
                // rejoin the gold characters and let the segmenter re-split them
                StringBuilder goldCharBuf = new StringBuilder();
                for (HasWord aGoldSentence : goldSentence) {
                    StringLabel word = (StringLabel) aGoldSentence;
                    goldCharBuf.append(word.value());
                }
                String goldChars = goldCharBuf.toString();
                s = seg.segment(goldChars);
            } else {
                s = goldSentence;
            }
            Tree tree;
            if (parse) {
                tree = lp.parseTree(s);
                if (tree == null) {
                    throw new RuntimeException("PARSER RETURNED NULL!!!");
                }
            } else {
                tree = Trees.toFlatTree(s);
                tree = subcategoryStripper.transformTree(tree);
            }
            if (pw != null) {
                if (parse) {
                    tree.pennPrint(pw);
                } else {
                    // print the segmented words separated by single spaces
                    Iterator<HasWord> sentIter = s.iterator();
                    for (; ; ) {
                        Word word = (Word) sentIter.next();
                        pw.print(word.word());
                        if (sentIter.hasNext()) {
                            pw.print(" ");
                        } else {
                            break;
                        }
                    }
                }
                pw.println();
            }
            if (eval) {
                // raw Collection kept as in the original; presumably
                // allBrackets returns a raw type -- TODO confirm and genericize
                Collection ourBrackets, goldBrackets;
                ourBrackets = proc.allBrackets(tree);
                goldBrackets = proc.allBrackets(gold);
                if (goodPOS) {
                    ourBrackets.addAll(proc.commonWordTagTypeBrackets(tree, gold));
                    goldBrackets.addAll(proc.commonWordTagTypeBrackets(gold, tree));
                }
                basicEval.eval(ourBrackets, goldBrackets);
                System.out.println("\nScores:");
                basicEval.displayLast();
                Tree collinsTree = collinizer.transformTree(tree);
                Tree collinsGold = collinizer.transformTree(gold);
                ourBrackets = proc.allBrackets(collinsTree);
                goldBrackets = proc.allBrackets(collinsGold);
                if (goodPOS) {
                    ourBrackets.addAll(proc.commonWordTagTypeBrackets(collinsTree, collinsGold));
                    goldBrackets.addAll(proc.commonWordTagTypeBrackets(collinsGold, collinsTree));
                }
                collinsEval.eval(ourBrackets, goldBrackets);
                System.out.println("\nCollinized scores:");
                collinsEval.displayLast();
                System.out.println();
            }
        }
        if (eval) {
            basicEval.display();
            System.out.println();
            collinsEval.display();
        }
    }
}
Example use of java.io.FileFilter in the CoreNLP project (stanfordnlp):
the main method of the DVParser class.
/**
 * Entry point for training, gradient-checking, and evaluating a DVParser.
 * <p>
 * An example command line for training a new parser:
 * <br>
 * nohup java -mx6g edu.stanford.nlp.parser.dvparser.DVParser -cachedTrees /scr/nlp/data/dvparser/wsj/cached.wsj.train.simple.ser.gz -train -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 -debugOutputFrequency 400 -nofilter -trainingThreads 5 -parser /u/nlp/data/lexparser/wsjPCFG.nocompact.simple.ser.gz -trainingIterations 40 -batchSize 25 -model /scr/nlp/data/dvparser/wsj/wsj.combine.v2.ser.gz -unkWord "*UNK*" -dvCombineCategories > /scr/nlp/data/dvparser/wsj/wsj.combine.v2.out 2>&1 &
 *
 * @throws IOException if a model or treebank cannot be read or written
 * @throws ClassNotFoundException if a serialized model has an unknown class
 */
public static void main(String[] args) throws IOException, ClassNotFoundException {
    if (args.length == 0) {
        help();
        System.exit(2);
    }
    log.info("Running DVParser with arguments:");
    for (String arg : args) {
        log.info(" " + arg);
    }
    log.info();
    String parserPath = null;
    String trainTreebankPath = null;
    FileFilter trainTreebankFilter = null;
    String cachedTrainTreesPath = null;
    boolean runGradientCheck = false;
    boolean runTraining = false;
    String testTreebankPath = null;
    FileFilter testTreebankFilter = null;
    String initialModelPath = null;
    String modelPath = null;
    boolean filter = true;
    String resultsRecordPath = null;
    List<String> unusedArgs = new ArrayList<>();
    // These parameters can be null or 0 if the model was not
    // serialized with the new parameters.  Setting the options at the
    // command line will override these defaults.
    // TODO: if/when we integrate back into the main branch and
    // rebuild models, we can get rid of this
    List<String> argsWithDefaults = new ArrayList<>(Arrays.asList(new String[] { "-wordVectorFile", Options.LexOptions.DEFAULT_WORD_VECTOR_FILE, "-dvKBest", Integer.toString(TrainOptions.DEFAULT_K_BEST), "-batchSize", Integer.toString(TrainOptions.DEFAULT_BATCH_SIZE), "-trainingIterations", Integer.toString(TrainOptions.DEFAULT_TRAINING_ITERATIONS), "-qnIterationsPerBatch", Integer.toString(TrainOptions.DEFAULT_QN_ITERATIONS_PER_BATCH), "-regCost", Double.toString(TrainOptions.DEFAULT_REGCOST), "-learningRate", Double.toString(TrainOptions.DEFAULT_LEARNING_RATE), "-deltaMargin", Double.toString(TrainOptions.DEFAULT_DELTA_MARGIN), "-unknownNumberVector", "-unknownDashedWordVectors", "-unknownCapsVector", "-unknownchinesepercentvector", "-unknownchinesenumbervector", "-unknownchineseyearvector", "-unkWord", "*UNK*", "-transformMatrixType", "DIAGONAL", "-scalingForInit", Double.toString(TrainOptions.DEFAULT_SCALING_FOR_INIT), "-trainWordVectors" }));
    // defaults go first so later (user-supplied) flags override them
    argsWithDefaults.addAll(Arrays.asList(args));
    args = argsWithDefaults.toArray(new String[argsWithDefaults.size()]);
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-parser")) {
            parserPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.first();
            testTreebankFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-treebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-treebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            trainTreebankPath = treebankDescription.first();
            trainTreebankFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-cachedTrees")) {
            cachedTrainTreesPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-runGradientCheck")) {
            runGradientCheck = true;
            argIndex++;
        } else if (args[argIndex].equalsIgnoreCase("-train")) {
            runTraining = true;
            argIndex++;
        } else if (args[argIndex].equalsIgnoreCase("-model")) {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-nofilter")) {
            filter = false;
            argIndex++;
        } else if (args[argIndex].equalsIgnoreCase("-continueTraining")) {
            runTraining = true;
            filter = false;
            initialModelPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-resultsRecord")) {
            resultsRecordPath = args[argIndex + 1];
            argIndex += 2;
        } else {
            unusedArgs.add(args[argIndex++]);
        }
    }
    if (parserPath == null && modelPath == null) {
        throw new IllegalArgumentException("Must supply either a base parser model with -parser or a serialized DVParser with -model");
    }
    if (!runTraining && modelPath == null && !runGradientCheck) {
        throw new IllegalArgumentException("Need to either train a new model, run the gradient check or specify a model to load with -model");
    }
    String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
    DVParser dvparser = null;
    LexicalizedParser lexparser = null;
    if (initialModelPath != null) {
        // continue training from a previously saved DVParser
        lexparser = LexicalizedParser.loadModel(initialModelPath, newArgs);
        DVModel model = getModelFromLexicalizedParser(lexparser);
        dvparser = new DVParser(model, lexparser);
    } else if (runTraining || runGradientCheck) {
        // start fresh from a base parser
        lexparser = LexicalizedParser.loadModel(parserPath, newArgs);
        dvparser = new DVParser(lexparser);
    } else if (modelPath != null) {
        // evaluation only: load a previously trained DVParser
        lexparser = LexicalizedParser.loadModel(modelPath, newArgs);
        DVModel model = getModelFromLexicalizedParser(lexparser);
        dvparser = new DVParser(model, lexparser);
    }
    List<Tree> trainSentences = new ArrayList<>();
    IdentityHashMap<Tree, byte[]> trainCompressedParses = Generics.newIdentityHashMap();
    if (cachedTrainTreesPath != null) {
        for (String path : cachedTrainTreesPath.split(",")) {
            List<Pair<Tree, byte[]>> cache = IOUtils.readObjectFromFile(path);
            for (Pair<Tree, byte[]> pair : cache) {
                trainSentences.add(pair.first());
                trainCompressedParses.put(pair.first(), pair.second());
            }
            log.info("Read in " + cache.size() + " trees from " + path);
        }
    }
    if (trainTreebankPath != null) {
        // TODO: make the transformer a member of the model?
        TreeTransformer transformer = buildTrainTransformer(dvparser.getOp());
        Treebank treebank = dvparser.getOp().tlpParams.memoryTreebank();
        treebank.loadPath(trainTreebankPath, trainTreebankFilter);
        treebank = treebank.transform(transformer);
        log.info("Read in " + treebank.size() + " trees from " + trainTreebankPath);
        CacheParseHypotheses cacher = new CacheParseHypotheses(dvparser.parser);
        CacheParseHypotheses.CacheProcessor processor = new CacheParseHypotheses.CacheProcessor(cacher, lexparser, dvparser.op.trainOptions.dvKBest, transformer);
        for (Tree tree : treebank) {
            trainSentences.add(tree);
            trainCompressedParses.put(tree, processor.process(tree).second);
            //System.out.println(tree);
        }
        log.info("Finished parsing " + treebank.size() + " trees, getting " + dvparser.op.trainOptions.dvKBest + " hypotheses each");
    }
    if ((runTraining || runGradientCheck) && filter) {
        log.info("Filtering rules for the given training set");
        dvparser.dvModel.setRulesForTrainingSet(trainSentences, trainCompressedParses);
        log.info("Done filtering rules; " + dvparser.dvModel.numBinaryMatrices + " binary matrices, " + dvparser.dvModel.numUnaryMatrices + " unary matrices, " + dvparser.dvModel.wordVectors.size() + " word vectors");
    }
    //dvparser.dvModel.printAllMatrices();
    Treebank testTreebank = null;
    if (testTreebankPath != null) {
        log.info("Reading in trees from " + testTreebankPath);
        if (testTreebankFilter != null) {
            log.info("Filtering on " + testTreebankFilter);
        }
        testTreebank = dvparser.getOp().tlpParams.memoryTreebank();
        testTreebank.loadPath(testTreebankPath, testTreebankFilter);
        log.info("Read in " + testTreebank.size() + " trees for testing");
    }
    // runGradientCheck= true;
    if (runGradientCheck) {
        log.info("Running gradient check on " + trainSentences.size() + " trees");
        dvparser.runGradientCheck(trainSentences, trainCompressedParses);
    }
    if (runTraining) {
        log.info("Training the RNN parser");
        log.info("Current train options: " + dvparser.getOp().trainOptions);
        dvparser.train(trainSentences, trainCompressedParses, testTreebank, modelPath, resultsRecordPath);
        if (modelPath != null) {
            dvparser.saveModel(modelPath);
        }
    }
    if (testTreebankPath != null) {
        EvaluateTreebank evaluator = new EvaluateTreebank(dvparser.attachModelToLexicalizedParser());
        evaluator.testOnTreebank(testTreebank);
    }
    log.info("Successfully ran DVParser");
}
Example use of java.io.FileFilter in the CoreNLP project (stanfordnlp):
the run method of the TreebankStats class.
/**
 * Gathers and displays corpus statistics over the configured path names.
 * Three modes: per-split (when useSplit is set), per-file (when
 * pathsAreFiles is true), or one aggregate report over all paths.
 *
 * @param pathsAreFiles whether each path names a single file rather than a directory
 * @param displayWords whether to include word-level statistics in the display
 * @param displayOOV whether to include out-of-vocabulary statistics in the display
 */
public void run(boolean pathsAreFiles, boolean displayWords, boolean displayOOV) {
    if (useSplit) {
        // one stats object per configured split; the vocabulary is built
        // only while processing the first split (makeVocab toggles off after it)
        List<ObservedCorpusStats> allSplitStats = new ArrayList<>();
        makeVocab = true;
        for (Map.Entry<Split, Set<String>> split : splitFileLists.entrySet()) {
            FileFilter splitFilter = new SplitFilter(split.getValue());
            DiskTreebank treebank = tlpp.diskTreebank();
            for (String path : pathNames) {
                treebank.loadPath(path, splitFilter);
            }
            allSplitStats.add(gatherStats(treebank, languageName.toString() + "." + split.getKey().toString()));
            makeVocab = false;
        }
        // show the aggregate first, then each split individually
        display(aggregateStats(allSplitStats), displayWords, displayOOV);
        for (ObservedCorpusStats splitStats : allSplitStats) {
            display(splitStats, displayWords, displayOOV);
        }
    } else if (pathsAreFiles) {
        // each path is a single file; report statistics file by file
        makeVocab = true;
        for (String path : pathNames) {
            DiskTreebank treebank = tlpp.diskTreebank();
            treebank.loadPath(path, pathname -> true);
            display(gatherStats(treebank, languageName.toString() + " " + path), displayWords, displayOOV);
            makeVocab = false;
        }
    } else {
        // aggregate everything under the given paths into a single report,
        // skipping directories when loading
        trainVocab = Generics.newHashSet();
        DiskTreebank treebank = tlpp.diskTreebank();
        for (String path : pathNames) {
            treebank.loadPath(path, pathname -> !pathname.isDirectory());
        }
        display(gatherStats(treebank, languageName.toString()), displayWords, displayOOV);
    }
}
Aggregations