Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.
The class ChineseMaxentLexicon, method main.
public static void main(String[] args) {
  TreebankLangParserParams tlpParams = new ChineseTreebankParserParams();
  TreebankLanguagePack ctlp = tlpParams.treebankLanguagePack();
  Options op = new Options(tlpParams);
  TreeAnnotator ta = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
  log.info("Reading Trees...");
  FileFilter trainFilter = new NumberRangesFileFilter(args[1], true);
  Treebank trainTreebank = tlpParams.memoryTreebank();
  trainTreebank.loadPath(args[0], trainFilter);
  log.info("Annotating trees...");
  Collection<Tree> trainTrees = new ArrayList<>();
  for (Tree tree : trainTreebank) {
    trainTrees.add(ta.transformTree(tree));
  }
  // saves memory
  trainTreebank = null;
  log.info("Training lexicon...");
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  int featureLevel = DEFAULT_FEATURE_LEVEL;
  if (args.length > 3) {
    featureLevel = Integer.parseInt(args[3]);
  }
  ChineseMaxentLexicon lex = new ChineseMaxentLexicon(op, wordIndex, tagIndex, featureLevel);
  lex.initializeTraining(trainTrees.size());
  lex.train(trainTrees);
  lex.finishTraining();
  log.info("Testing");
  FileFilter testFilter = new NumberRangesFileFilter(args[2], true);
  Treebank testTreebank = tlpParams.memoryTreebank();
  testTreebank.loadPath(args[0], testFilter);
  List<TaggedWord> testWords = new ArrayList<>();
  for (Tree t : testTreebank) {
    for (TaggedWord tw : t.taggedYield()) {
      testWords.add(tw);
    }
    //testWords.addAll(t.taggedYield());
  }
  int[] totalAndCorrect = lex.testOnTreebank(testWords);
  log.info("done.");
  System.out.println(totalAndCorrect[1] + " correct out of " + totalAndCorrect[0] + " -- ACC: " + ((double) totalAndCorrect[1]) / totalAndCorrect[0]);
}
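Both filters above are built from ranges expressions given on the command line: a plausible invocation (the path, ranges, and feature level are illustrative, not from the source) is

java edu.stanford.nlp.parser.lexparser.ChineseMaxentLexicon /path/to/ctb 1-270 301-325 2

where args[0] is the treebank path, args[1] and args[2] are the train and test ranges, and the optional args[3] is the feature level. As a minimal sketch of what such a filter does, assuming numbered treebank file names like chtb_0042.mrg (the directory layout and demo class are our assumptions, not part of the code above):

import java.io.File;
import java.io.FileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;

public class RangeFilterDemo {
  public static void main(String[] args) {
    // Accepts a file if its name contains a number falling in one of the
    // comma-separated inclusive ranges; the boolean mirrors the calls above.
    FileFilter trainFilter = new NumberRangesFileFilter("1-270,301-325", true);
    File[] files = new File(args[0]).listFiles();
    if (files == null) {
      System.err.println("Not a directory: " + args[0]);
      return;
    }
    for (File f : files) {
      System.out.println(f.getName() + (trainFilter.accept(f) ? "  kept" : "  skipped"));
    }
  }
}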
Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.
The class ChineseTreebankParserParams, method main.
/**
* For testing: loads a treebank and prints the trees.
*/
public static void main(String[] args) {
  TreebankLangParserParams tlpp = new ChineseTreebankParserParams();
  System.out.println("Default encoding is: " + tlpp.diskTreebank().encoding());
  if (args.length < 2) {
    printlnErr("Usage: edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams treesPath fileRange");
  } else {
    Treebank m = tlpp.diskTreebank();
    m.loadPath(args[0], new NumberRangesFileFilter(args[1], false));
    for (Tree t : m) {
      t.pennPrint(tlpp.pw());
    }
    System.out.println("There were " + m.size() + " trees.");
  }
}
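Instantiating the usage line above (path and range illustrative):

java edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams /path/to/ctb/bracketed 1-300

This loads every treebank file whose embedded number falls in the given range (note the second constructor argument is false here, unlike the training code above) and pretty-prints each tree in the pack's default encoding.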
Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.
The class ChineseLexiconAndWordSegmenter, method main.
/** This method lets you train and test a segmenter relative to a
 * Treebank.
 * <p>
 * <i>Implementation note:</i> This method is largely cloned from
 * LexicalizedParser's main method. Should we try to have that method
 * also train segmenters, to stop the two copies going out of sync?
 */
public static void main(String[] args) {
  boolean train = false;
  boolean saveToSerializedFile = false;
  boolean saveToTextFile = false;
  String serializedInputFileOrUrl = null;
  String textInputFileOrUrl = null;
  String serializedOutputFileOrUrl = null;
  String textOutputFileOrUrl = null;
  String treebankPath = null;
  Treebank testTreebank = null;
  // Treebank tuneTreebank = null;
  String testPath = null;
  FileFilter testFilter = null;
  FileFilter trainFilter = null;
  String encoding = null;
  // variables needed to process the files to be parsed
  TokenizerFactory<Word> tokenizerFactory = null;
  // DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
  // whether or not the input file has already been tokenized
  boolean tokenized = false;
  Function<List<HasWord>, List<HasWord>> escaper = new ChineseEscaper();
  // int tagDelimiter = -1;
  // String sentenceDelimiter = "\n";
  // boolean fromXML = false;
  int argIndex = 0;
  if (args.length < 1) {
    log.info("usage: java edu.stanford.nlp.parser.lexparser." + "ChineseLexiconAndWordSegmenter parserFileOrUrl filename*");
    return;
  }
  Options op = new Options();
  op.tlpParams = new ChineseTreebankParserParams();
  // while loop through option arguments
  while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
    if (args[argIndex].equalsIgnoreCase("-train")) {
      train = true;
      saveToSerializedFile = true;
      int numSubArgs = numSubArgs(args, argIndex);
      argIndex++;
      if (numSubArgs > 1) {
        treebankPath = args[argIndex];
        argIndex++;
      } else {
        throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
      }
      if (numSubArgs == 2) {
        trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
      } else if (numSubArgs >= 3) {
        try {
          int low = Integer.parseInt(args[argIndex]);
          int high = Integer.parseInt(args[argIndex + 1]);
          trainFilter = new NumberRangeFileFilter(low, high, true);
          argIndex += 2;
        } catch (NumberFormatException e) {
          // maybe it's a ranges expression?
          trainFilter = new NumberRangesFileFilter(args[argIndex], true);
          argIndex++;
        }
      }
    } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
      // sets encoding for TreebankLangParserParams
      encoding = args[argIndex + 1];
      op.tlpParams.setInputEncoding(encoding);
      op.tlpParams.setOutputEncoding(encoding);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
      // load the parser from a binary serialized file
      // the next argument must be the path to the parser file
      serializedInputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
      // doesn't make sense to load from TextFile -pichuan
      // } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
      //   // load the parser from declarative text file
      //   // the next argument must be the path to the parser file
      //   textInputFileOrUrl = args[argIndex + 1];
      //   argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
      saveToSerializedFile = true;
      serializedOutputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
      // save the parser to declarative text file
      saveToTextFile = true;
      textOutputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-treebank")) {
      // the next argument is the treebank path and range for testing
      int numSubArgs = numSubArgs(args, argIndex);
      argIndex++;
      if (numSubArgs == 1) {
        testFilter = new NumberRangesFileFilter(args[argIndex++], true);
      } else if (numSubArgs > 1) {
        testPath = args[argIndex++];
        if (numSubArgs == 2) {
          testFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs >= 3) {
          try {
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            testFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
          }
        }
      }
    } else {
      int j = op.tlpParams.setOptionFlag(args, argIndex);
      if (j == argIndex) {
        log.info("Unknown option ignored: " + args[argIndex]);
        j++;
      }
      argIndex = j;
    }
  } // end while loop through arguments
  TreebankLangParserParams tlpParams = op.tlpParams;
  // all other arguments are order dependent and are processed in order below
  ChineseLexiconAndWordSegmenter cs = null;
  if (!train && op.testOptions.verbose) {
    System.out.println("Currently " + new Date());
    printArgs(args, System.out);
  }
  if (train) {
    printArgs(args, System.out);
    // so we train a parser using the treebank
    if (treebankPath == null) {
      // the next arg must be the treebank path, since it wasn't given earlier
      treebankPath = args[argIndex];
      argIndex++;
      if (args.length > argIndex + 1) {
        try {
          // the next two args might be the range
          int low = Integer.parseInt(args[argIndex]);
          int high = Integer.parseInt(args[argIndex + 1]);
          trainFilter = new NumberRangeFileFilter(low, high, true);
          argIndex += 2;
        } catch (NumberFormatException e) {
          // maybe it's a ranges expression?
          trainFilter = new NumberRangesFileFilter(args[argIndex], true);
          argIndex++;
        }
      }
    }
    Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
  } else if (textInputFileOrUrl != null) {
    // so we load the segmenter from a text grammar file
    // XXXXX fix later -pichuan
    //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
  } else {
    // so we load a serialized segmenter
    if (serializedInputFileOrUrl == null) {
      // the next argument must be the path to the serialized parser
      serializedInputFileOrUrl = args[argIndex];
      argIndex++;
    }
    try {
      cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
    } catch (IllegalArgumentException e) {
      log.info("Error loading segmenter, exiting...");
      System.exit(0);
    }
  }
  // the following has to go after reading parser to make sure
  // op and tlpParams are the same for train and test
  TreePrint treePrint = op.testOptions.treePrint(tlpParams);
  if (testFilter != null) {
    if (testPath == null) {
      if (treebankPath == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
        testPath = treebankPath;
      }
    }
    testTreebank = tlpParams.testMemoryTreebank();
    testTreebank.loadPath(testPath, testFilter);
  }
  op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));
  // -- Roger
  if (op.testOptions.verbose) {
    log.info("Lexicon is " + cs.getClass().getName());
  }
  PrintWriter pwOut = tlpParams.pw();
  PrintWriter pwErr = tlpParams.pw(System.err);
  // Now what do we do with the parser we've made
  if (saveToTextFile) {
    // save the parser to textGrammar format
    if (textOutputFileOrUrl != null) {
      saveSegmenterDataToText(cs, textOutputFileOrUrl);
    } else {
      log.info("Usage: must specify a text segmenter data output path");
    }
  }
  if (saveToSerializedFile) {
    if (serializedOutputFileOrUrl == null && argIndex < args.length) {
      // the next argument must be the path to serialize to
      serializedOutputFileOrUrl = args[argIndex];
      argIndex++;
    }
    if (serializedOutputFileOrUrl != null) {
      saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
    } else if (textOutputFileOrUrl == null && testTreebank == null) {
      // no saving/parsing request has been specified
      log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter " + "-train trainFilesPath [start stop] serializedParserFilename");
    }
  }
  /* --------------------- Testing part!!!! ----------------------- */
  if (op.testOptions.verbose) {
    // printOptions(false, op);
  }
  if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
    // test parser on treebank
    if (testTreebank == null) {
      // the next argument is the treebank path and range for testing
      testTreebank = tlpParams.testMemoryTreebank();
      if (args.length < argIndex + 4) {
        testTreebank.loadPath(args[argIndex + 1]);
      } else {
        int testlow = Integer.parseInt(args[argIndex + 2]);
        int testhigh = Integer.parseInt(args[argIndex + 3]);
        testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
      }
    }
    /* TODO - test segmenting on treebank. -pichuan */
    // lp.testOnTreebank(testTreebank);
    // } else if (argIndex >= args.length) {
    //   // no more arguments, so we just parse our own test sentence
    //   if (lp.parse(op.tlpParams.defaultTestSentence())) {
    //     treePrint.printTree(lp.getBestParse(), pwOut);
    //   } else {
    //     pwErr.println("Error. Can't parse test sentence: " + lp.parse(op.tlpParams.defaultTestSentence()));
    //   }
  }
  // wsg2010: This code block doesn't actually do anything. It appears to read and tokenize a file, and then just print it.
  // There are easier ways to do that. This code was copied from an old version of LexicalizedParser.
  // else {
  //   // We parse filenames given by the remaining arguments
  //   int numWords = 0;
  //   Timing timer = new Timing();
  //   // set the tokenizer
  //   if (tokenized) {
  //     tokenizerFactory = WhitespaceTokenizer.factory();
  //   }
  //   TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
  //   if (tokenizerFactory == null) {
  //     tokenizerFactory = (TokenizerFactory<Word>) tlp.getTokenizerFactory();
  //   }
  //   documentPreprocessor.setTokenizerFactory(tokenizerFactory);
  //   documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
  //   if (encoding != null) {
  //     documentPreprocessor.setEncoding(encoding);
  //   }
  //   timer.start();
  //   for (int i = argIndex; i < args.length; i++) {
  //     String filename = args[i];
  //     try {
  //       List document = null;
  //       if (fromXML) {
  //         document = documentPreprocessor.getSentencesFromXML(filename, sentenceDelimiter, tokenized);
  //       } else {
  //         document = documentPreprocessor.getSentencesFromText(filename, escaper, sentenceDelimiter, tagDelimiter);
  //       }
  //       log.info("Segmenting file: " + filename + " with " + document.size() + " sentences.");
  //       PrintWriter pwo = pwOut;
  //       if (op.testOptions.writeOutputFiles) {
  //         try {
  //           pwo = tlpParams.pw(new FileOutputStream(filename + ".stp"));
  //         } catch (IOException ioe) {
  //           ioe.printStackTrace();
  //         }
  //       }
  //       int num = 0;
  //       treePrint.printHeader(pwo, tlp.getEncoding());
  //       for (Iterator it = document.iterator(); it.hasNext(); ) {
  //         num++;
  //         List sentence = (List) it.next();
  //         int len = sentence.size();
  //         numWords += len;
  //         // pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + sentence);
  //         pwo.println(Sentence.listToString(sentence));
  //       }
  //       treePrint.printFooter(pwo);
  //       if (op.testOptions.writeOutputFiles) {
  //         pwo.close();
  //       }
  //     } catch (IOException e) {
  //       pwErr.println("Couldn't find file: " + filename);
  //     }
  //   } // end for each file
  //   long millis = timer.stop();
  //   double wordspersec = numWords / (((double) millis) / 1000);
  //   NumberFormat nf = new DecimalFormat("0.00"); // easier way!
  //   pwErr.println("Segmented " + numWords + " words at " + nf.format(wordspersec) + " words per second.");
  // }
}
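One pattern recurs three times in the method above: first try to read two integers for a NumberRangeFileFilter, and on a NumberFormatException fall back to treating the single argument as a NumberRangesFileFilter ranges expression. A hedged sketch of that logic as a standalone helper (the RangeArgs class and its parse method are our illustration, not CoreNLP API; callers must still track whether one or two arguments were consumed, as the -train and -treebank handlers above do with argIndex):

import java.io.FileFilter;
import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;

final class RangeArgs {

  /** Builds a file filter from the argument(s) starting at position i. */
  static FileFilter parse(String[] args, int i) {
    if (i + 1 < args.length) {
      try {
        // Two plain integers: an inclusive low-high range.
        int low = Integer.parseInt(args[i]);
        int high = Integer.parseInt(args[i + 1]);
        return new NumberRangeFileFilter(low, high, true);
      } catch (NumberFormatException e) {
        // fall through: maybe it's a ranges expression
      }
    }
    // A single ranges expression such as "1-270,301-325".
    return new NumberRangesFileFilter(args[i], true);
  }
}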
Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.
The class BaseLexicon, method main.
/** Provides some testing of, and opportunities for exploring, the
 * probabilities of a BaseLexicon. What's here currently probably
 * only works for the English Penn Treebank, as it uses default
 * constructors. Of the words given to test on, the first is treated
 * as sentence-initial and the rest as not sentence-initial.
 *
 * @param args The command line arguments:
 * java BaseLexicon treebankPath fileRange unknownWordModel words*
 */
public static void main(String[] args) {
  if (args.length < 3) {
    log.info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
    return;
  }
  System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
  Treebank tb = new DiskTreebank();
  tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
  // TODO: change this interface so the lexicon creates its own indices?
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  Options op = new Options();
  op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
  BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
  lex.initializeTraining(tb.size());
  lex.train(tb);
  lex.finishTraining();
  System.out.println("done.");
  System.out.println();
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(4);
  List<String> impos = new ArrayList<>();
  for (int i = 3; i < args.length; i++) {
    if (lex.isKnown(args[i])) {
      System.out.println(args[i] + " is a known word. Log probabilities [log P(w|t)] for its taggings are:");
      for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.addToIndex(args[i]), i - 3, null); it.hasNext(); ) {
        IntTaggedWord iTW = it.next();
        System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word), null)));
      }
    } else {
      String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
      System.out.println(args[i] + " is an unknown word. Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);
      impos.clear();
      List<String> lis = new ArrayList<>(tagIndex.objectsList());
      Collections.sort(lis);
      for (String tStr : lis) {
        IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
        double score = lex.score(iTW, 1, args[i], null);
        if (score == Float.NEGATIVE_INFINITY) {
          impos.add(tStr);
        } else {
          System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
        }
      }
      if (impos.size() > 0) {
        System.out.println(args[i] + " impossible tags: " + impos);
      }
    }
    System.out.println();
  }
}
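A plausible invocation, following the usage string above (the treebank path and file range are illustrative):

java edu.stanford.nlp.parser.lexparser.BaseLexicon /path/to/wsj 200-2199 5 the Claremont

The third argument becomes op.lexOptions.useUnknownWordSignatures via Integer.parseInt. For each remaining word the program prints either log P(w|t) for every observed tagging (known words) or the unknown-word signature plus per-tag scores (unknown words), treating only the first test word as sentence-initial.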
Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.
The class ArgUtils, method getWeightedTreebankDescription.
public static Triple<String, FileFilter, Double> getWeightedTreebankDescription(String[] args, int argIndex, String flag) {
  String path = null;
  FileFilter filter = null;
  Double weight = 1.0;
  // the next arguments are the treebank path and maybe the range for testing
  int numSubArgs = numSubArgs(args, argIndex);
  if (numSubArgs > 0 && numSubArgs < 4) {
    argIndex++;
    path = args[argIndex++];
    boolean hasWeight = false;
    if (numSubArgs > 1 && DOUBLE_PATTERN.matcher(args[argIndex + numSubArgs - 2]).matches()) {
      weight = Double.parseDouble(args[argIndex + numSubArgs - 2]);
      hasWeight = true;
      numSubArgs--;
    }
    if (numSubArgs == 2) {
      filter = new NumberRangesFileFilter(args[argIndex++], true);
    } else if (numSubArgs == 3) {
      try {
        int low = Integer.parseInt(args[argIndex]);
        int high = Integer.parseInt(args[argIndex + 1]);
        filter = new NumberRangeFileFilter(low, high, true);
        argIndex += 2;
      } catch (NumberFormatException e) {
        // maybe it's a ranges expression?
        filter = new NumberRangesFileFilter(args[argIndex++], true);
      }
    }
    if (hasWeight) {
      argIndex++;
    }
  } else {
    throw new IllegalArgumentException("Bad arguments after " + flag);
  }
  return Triple.makeTriple(path, filter, weight);
}
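A hedged sketch of the argument shapes this method accepts (the flag name, path, and values are illustrative, not taken from the source):

String[] a = { "-trainTreebank", "/path/to/treebank", "200-2199", "0.5" };
Triple<String, FileFilter, Double> desc =
    ArgUtils.getWeightedTreebankDescription(a, 0, "-trainTreebank");
// desc.first()  -> "/path/to/treebank"
// desc.second() -> a NumberRangesFileFilter over "200-2199"
// desc.third()  -> 0.5 (a trailing argument matching DOUBLE_PATTERN is read as the weight)

// With only a path, { "-trainTreebank", "/path/to/treebank" },
// the filter stays null and the weight defaults to 1.0.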