use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
the class CRFClassifier method serializeClassifier.
/**
* {@inheritDoc}
*/
@Override
public void serializeClassifier(String serializePath) {
  ObjectOutputStream oos = null;
  try {
    oos = IOUtils.writeStreamFromString(serializePath);
    serializeClassifier(oos);
    log.info("Serializing classifier to " + serializePath + "... done.");
  } catch (Exception e) {
    throw new RuntimeIOException("Serializing classifier to " + serializePath + "... FAILED", e);
  } finally {
    IOUtils.closeIgnoringExceptions(oos);
  }
}
use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
the class CRFClassifierNonlinear method serializeClassifier.
@Override
public void serializeClassifier(ObjectOutputStream oos) {
  try {
    super.serializeClassifier(oos);
    oos.writeObject(nodeFeatureIndicesMap);
    oos.writeObject(edgeFeatureIndicesMap);
    if (flags.secondOrderNonLinear) {
      oos.writeObject(inputLayerWeights4Edge);
      oos.writeObject(outputLayerWeights4Edge);
    } else {
      oos.writeObject(linearWeights);
    }
    oos.writeObject(inputLayerWeights);
    oos.writeObject(outputLayerWeights);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
the class SpanishTokenizer method main.
/**
* A fast, rule-based tokenizer for Spanish based on AnCora.
* Performs punctuation splitting and light tokenization by default.
* <p>
* Currently, this tokenizer does not do line splitting. It assumes that the input
* file is delimited by the system line separator. The output will be equivalently
* delimited.
*
* @param args Command-line arguments
*/
public static void main(String[] args) {
  final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
  if (options.containsKey("help")) {
    log.info(usage());
    return;
  }

  // Lexer options
  final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
  String orthoOptions = options.containsKey("ancora") ? ANCORA_OPTIONS : "";
  if (options.containsKey("options")) {
    // Append the value of the "options" property (not the Properties object itself)
    orthoOptions = orthoOptions.isEmpty() ? options.getProperty("options")
        : orthoOptions + ',' + options.getProperty("options");
  }
  final boolean tokens = PropertiesUtils.getBool(options, "tokens", false);
  if (!tokens) {
    orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
  }
  tf.setOptions(orthoOptions);

  // Other options
  final String encoding = options.getProperty("encoding", "UTF-8");
  final boolean toLower = PropertiesUtils.getBool(options, "lowerCase", false);
  final Locale es = new Locale("es");
  final boolean onePerLine = PropertiesUtils.getBool(options, "onePerLine", false);

  // Read the file from stdin
  int nLines = 0;
  int nTokens = 0;
  final long startTime = System.nanoTime();
  try {
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(
        new BufferedReader(new InputStreamReader(System.in, encoding)));
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, encoding));
    boolean printSpace = false;
    while (tokenizer.hasNext()) {
      ++nTokens;
      String word = tokenizer.next().word();
      if (word.equals(SpanishLexer.NEWLINE_TOKEN)) {
        ++nLines;
        if (!onePerLine) {
          writer.newLine();
          printSpace = false;
        }
      } else {
        String outputToken = toLower ? word.toLowerCase(es) : word;
        if (onePerLine) {
          writer.write(outputToken);
          writer.newLine();
        } else {
          if (printSpace) {
            writer.write(" ");
          }
          writer.write(outputToken);
          printSpace = true;
        }
      }
    }
    writer.flush(); // flush buffered output so the final buffer is not lost at exit
  } catch (UnsupportedEncodingException e) {
    throw new RuntimeIOException("Bad character encoding", e);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  long elapsedTime = System.nanoTime() - startTime;
  double linesPerSec = (double) nLines / (elapsedTime / 1e9);
  System.err.printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n",
      nLines, nTokens, linesPerSec);
}
use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
the class DependencyParser method writeModelFile.
public void writeModelFile(String modelFile) {
  try {
    float[][] W1 = classifier.getW1();
    float[] b1 = classifier.getb1();
    float[][] W2 = classifier.getW2();
    float[][] E = classifier.getE();

    Writer output = IOUtils.getPrintWriter(modelFile);
    output.write("language=" + language + "\n");
    output.write("tlp=" + config.tlp.getClass().getCanonicalName() + "\n");
    output.write("dict=" + knownWords.size() + "\n");
    output.write("pos=" + knownPos.size() + "\n");
    output.write("label=" + knownLabels.size() + "\n");
    output.write("embeddingSize=" + E[0].length + "\n");
    output.write("hiddenSize=" + b1.length + "\n");
    output.write("numTokens=" + (W1[0].length / E[0].length) + "\n");
    output.write("preComputed=" + preComputed.size() + "\n");

    // First write word / POS / label embeddings
    int index = 0;
    for (String word : knownWords) {
      index = writeEmbedding(E[index], output, index, word);
    }
    for (String pos : knownPos) {
      index = writeEmbedding(E[index], output, index, pos);
    }
    for (String label : knownLabels) {
      index = writeEmbedding(E[index], output, index, label);
    }

    // Now write classifier weights, one matrix column per line
    for (int j = 0; j < W1[0].length; ++j) {
      for (int i = 0; i < W1.length; ++i) {
        output.write(String.valueOf(W1[i][j]));
        output.write(i == W1.length - 1 ? "\n" : " ");
      }
    }
    for (int i = 0; i < b1.length; ++i) {
      output.write(String.valueOf(b1[i]));
      output.write(i == b1.length - 1 ? "\n" : " ");
    }
    for (int j = 0; j < W2[0].length; ++j) {
      for (int i = 0; i < W2.length; ++i) {
        output.write(String.valueOf(W2[i][j]));
        output.write(i == W2.length - 1 ? "\n" : " ");
      }
    }

    // Finish with pre-computation info, 100 entries per line
    for (int i = 0; i < preComputed.size(); ++i) {
      output.write(String.valueOf(preComputed.get(i)));
      output.write((i + 1) % 100 == 0 || i == preComputed.size() - 1 ? "\n" : " ");
    }
    output.close();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
the class DependencyParser method main.
/**
* A main program for training, testing and using the parser.
*
* <p>
* You can use this program to train new parsers from treebank data,
* evaluate on test treebank data, or parse raw text input.
*
* <p>
* Sample usages:
* <ul>
* <li>
* <strong>Train a parser with CoNLL treebank data:</strong>
* {@code java edu.stanford.nlp.parser.nndep.DependencyParser -trainFile trainPath -devFile devPath -embedFile wordEmbeddingFile -embeddingSize wordEmbeddingDimensionality -model modelOutputFile.txt.gz}
* </li>
* <li>
* <strong>Parse raw text from a file:</strong>
* {@code java edu.stanford.nlp.parser.nndep.DependencyParser -model modelOutputFile.txt.gz -textFile rawTextToParse -outFile dependenciesOutputFile.txt}
* </li>
* <li>
* <strong>Parse raw text from standard input, writing to standard output:</strong>
* {@code java edu.stanford.nlp.parser.nndep.DependencyParser -model modelOutputFile.txt.gz -textFile - -outFile -}
* </li>
* </ul>
*
* <p>
* See below for more information on all of these training / test options and more.
*
* <p>
* Input / output options:
* <table>
* <tr><th>Option</th><th>Required for training</th><th>Required for testing / parsing</th><th>Description</th></tr>
* <tr><td><tt>-devFile</tt></td><td>Optional</td><td>No</td><td>Path to a development-set treebank in <a href="http://ilk.uvt.nl/conll/#dataformat">CoNLL-X format</a>. If provided, the dev set performance is monitored during training.</td></tr>
* <tr><td><tt>-embedFile</tt></td><td>Optional (highly recommended!)</td><td>No</td><td>A word embedding file, containing distributed representations of English words. Each line of the provided file should contain a single word followed by the elements of the corresponding word embedding (space-delimited). It is not absolutely necessary that all words in the treebank be covered by this embedding file, though the parser's performance will generally improve if you are able to provide better embeddings for more words.</td></tr>
* <tr><td><tt>-model</tt></td><td>Yes</td><td>Yes</td><td>Path to a model file. If the path ends in <tt>.gz</tt>, the model will be read as a Gzipped model file. During training, we write to this path; at test time we read a pre-trained model from this path.</td></tr>
* <tr><td><tt>-textFile</tt></td><td>No</td><td>Yes (or <tt>testFile</tt>)</td><td>Path to a plaintext file containing sentences to be parsed.</td></tr>
* <tr><td><tt>-testFile</tt></td><td>No</td><td>Yes (or <tt>textFile</tt>)</td><td>Path to a test-set treebank in <a href="http://ilk.uvt.nl/conll/#dataformat">CoNLL-X format</a> for final evaluation of the parser.</td></tr>
* <tr><td><tt>-trainFile</tt></td><td>Yes</td><td>No</td><td>Path to a training treebank in <a href="http://ilk.uvt.nl/conll/#dataformat">CoNLL-X format</a>.</td></tr>
* </table>
*
* Training options:
* <table>
* <tr><th>Option</th><th>Default</th><th>Description</th></tr>
* <tr><td><tt>-adaAlpha</tt></td><td>0.01</td><td>Global learning rate for AdaGrad training</td></tr>
* <tr><td><tt>-adaEps</tt></td><td>1e-6</td><td>Epsilon value added to the denominator of AdaGrad update expression for numerical stability</td></tr>
* <tr><td><tt>-batchSize</tt></td><td>10000</td><td>Size of mini-batch used for training</td></tr>
* <tr><td><tt>-clearGradientsPerIter</tt></td><td>0</td><td>Clear AdaGrad gradient histories every <em>n</em> iterations. If zero, no gradient clearing is performed.</td></tr>
* <tr><td><tt>-dropProb</tt></td><td>0.5</td><td>Dropout probability. For each training example, we randomly disable a subset of units in the neural network classifier; this parameter controls the proportion of units "dropped out."</td></tr>
* <tr><td><tt>-embeddingSize</tt></td><td>50</td><td>Dimensionality of word embeddings provided</td></tr>
* <tr><td><tt>-evalPerIter</tt></td><td>100</td><td>Run full UAS (unlabeled attachment score) evaluation every time we finish this number of iterations. (Only valid if a development treebank is provided with <tt>-devFile</tt>.)</td></tr>
* <tr><td><tt>-hiddenSize</tt></td><td>200</td><td>Dimensionality of hidden layer in neural network classifier</td></tr>
* <tr><td><tt>-initRange</tt></td><td>0.01</td><td>Bounds of range within which weight matrix elements should be initialized. Each element is drawn from a uniform distribution over the range <tt>[-initRange, initRange]</tt>.</td></tr>
* <tr><td><tt>-maxIter</tt></td><td>20000</td><td>Number of training iterations to complete before stopping and saving the final model.</td></tr>
* <tr><td><tt>-numPreComputed</tt></td><td>100000</td><td>The parser pre-computes hidden-layer unit activations for particular input words at both training and testing time in order to speed up feedforward computation in the neural network. This parameter determines the number of words for which hidden-layer activations should be pre-computed.</td></tr>
* <tr><td><tt>-regParameter</tt></td><td>1e-8</td><td>Regularization parameter for training</td></tr>
* <tr><td><tt>-saveIntermediate</tt></td><td><tt>true</tt></td><td>If <tt>true</tt>, continually save the model version which gets the highest UAS value on the dev set. (Only valid if a development treebank is provided with <tt>-devFile</tt>.)</td></tr>
* <tr><td><tt>-trainingThreads</tt></td><td>1</td><td>Number of threads to use during training. Note that depending on training batch size, it may be unwise to simply choose the maximum number of threads for your machine. On our 16-core test machines: a batch size of 10,000 runs fastest with around 6 threads; a batch size of 100,000 runs best with around 10 threads.</td></tr>
* <tr><td><tt>-wordCutOff</tt></td><td>1</td><td>The parser can optionally ignore rare words by simply choosing an arbitrary "unknown" feature representation for words that appear with frequency less than <em>n</em> in the corpus. This <em>n</em> is controlled by the <tt>wordCutOff</tt> parameter.</td></tr>
* </table>
*
* Runtime parsing options:
* <table>
* <tr><th>Option</th><th>Default</th><th>Description</th></tr>
* <tr><td><tt>-escaper</tt></td><td>N/A</td><td>Only applicable for testing with <tt>-textFile</tt>. If provided, use this word-escaper when parsing raw sentences. Should be a fully-qualified class name like <tt>edu.stanford.nlp.trees.international.arabic.ATBEscaper</tt>.</td></tr>
* <tr><td><tt>-numPreComputed</tt></td><td>100000</td><td>The parser pre-computes hidden-layer unit activations for particular input words at both training and testing time in order to speed up feedforward computation in the neural network. This parameter determines the number of words for which hidden-layer activations should be pre-computed.</td></tr>
* <tr><td><tt>-sentenceDelimiter</tt></td><td>N/A</td><td>Only applicable for testing with <tt>-textFile</tt>. If provided, assume that the given <tt>textFile</tt> has already been sentence-split, and that sentences are separated by this delimiter.</td></tr>
* <tr><td><tt>-tagger.model</tt></td><td>edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger</td><td>Only applicable for testing with <tt>-textFile</tt>. Path to a part-of-speech tagger to use to pre-tag the raw sentences before parsing.</td></tr>
* </table>
*/
public static void main(String[] args) {
  Properties props = StringUtils.argsToProperties(args, numArgs);
  DependencyParser parser = new DependencyParser(props);

  // Train with CoNLL-X data
  if (props.containsKey("trainFile")) {
    parser.train(props.getProperty("trainFile"), props.getProperty("devFile"),
        props.getProperty("model"), props.getProperty("embedFile"),
        props.getProperty("preModel"));
  }

  boolean loaded = false;
  // Test with CoNLL-X data
  if (props.containsKey("testFile")) {
    parser.loadModelFile(props.getProperty("model"));
    loaded = true;
    parser.testCoNLL(props.getProperty("testFile"), props.getProperty("outFile"));
  }

  // Parse raw text data
  if (props.containsKey("textFile")) {
    if (!loaded) {
      parser.loadModelFile(props.getProperty("model"));
      loaded = true;
    }
    String encoding = parser.config.tlp.getEncoding();
    String inputFilename = props.getProperty("textFile");

    BufferedReader input;
    try {
      input = inputFilename.equals("-") ? IOUtils.readerFromStdin(encoding)
          : IOUtils.readerFromString(inputFilename, encoding);
    } catch (IOException e) {
      throw new RuntimeIOException("No input file provided (use -textFile)", e);
    }

    String outputFilename = props.getProperty("outFile");
    PrintWriter output;
    try {
      output = outputFilename == null || outputFilename.equals("-")
          ? IOUtils.encodedOutputStreamPrintWriter(System.out, encoding, true)
          : IOUtils.getPrintWriter(outputFilename, encoding);
    } catch (IOException e) {
      throw new RuntimeIOException("Error opening output file", e);
    }

    parser.parseTextFile(input, output);
  }
}