Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp: class DependencyParser, method setupClassifierForTraining.
/**
* Prepare a classifier for training with the given dataset.
*/
private void setupClassifierForTraining(List<CoreMap> trainSents, List<DependencyTree> trainTrees, String embedFile, String preModel) {
  float[][] E = new float[knownWords.size() + knownPos.size() + knownLabels.size()][config.embeddingSize];
  float[][] W1 = new float[config.hiddenSize][config.embeddingSize * config.numTokens];
  float[] b1 = new float[config.hiddenSize];
  float[][] W2 = new float[system.numTransitions()][config.hiddenSize];

  // Randomly initialize weight matrices / vectors
  Random random = Util.getRandom();
  for (int i = 0; i < W1.length; ++i)
    for (int j = 0; j < W1[i].length; ++j)
      W1[i][j] = (float) (random.nextDouble() * 2 * config.initRange - config.initRange);
  for (int i = 0; i < b1.length; ++i)
    b1[i] = (float) (random.nextDouble() * 2 * config.initRange - config.initRange);
  for (int i = 0; i < W2.length; ++i)
    for (int j = 0; j < W2[i].length; ++j)
      W2[i][j] = (float) (random.nextDouble() * 2 * config.initRange - config.initRange);

  // Read embeddings into `embedID`, `embeddings`
  Map<String, Integer> embedID = new HashMap<>();
  double[][] embeddings = readEmbedFile(embedFile, embedID);

  // Try to match loaded embeddings with words in the dictionary
  int foundEmbed = 0;
  for (int i = 0; i < E.length; ++i) {
    int index = -1;
    if (i < knownWords.size()) {
      String str = knownWords.get(i);
      // NOTE: try an exact match first, then fall back to lower case
      if (embedID.containsKey(str))
        index = embedID.get(str);
      else if (embedID.containsKey(str.toLowerCase()))
        index = embedID.get(str.toLowerCase());
    }
    if (index >= 0) {
      ++foundEmbed;
      for (int j = 0; j < E[i].length; ++j) {
        E[i][j] = (float) embeddings[index][j];
      }
    } else {
      for (int j = 0; j < E[i].length; ++j) {
        // E[i][j] = random.nextDouble() * config.initRange * 2 - config.initRange;
        // E[i][j] = random.nextDouble() * 0.2 - 0.1;
        // E[i][j] = random.nextGaussian() * Math.sqrt(0.1);
        E[i][j] = (float) (random.nextDouble() * 0.02 - 0.01);
      }
    }
  }
  log.info("Found embeddings: " + foundEmbed + " / " + knownWords.size());

  if (preModel != null) {
    try (BufferedReader input = IOUtils.readerFromString(preModel)) {
      log.info("Loading pre-trained model file: " + preModel + " ... ");
      String s;
      s = input.readLine();
      int nDict = Integer.parseInt(s.substring(s.indexOf('=') + 1));
      s = input.readLine();
      int nPOS = Integer.parseInt(s.substring(s.indexOf('=') + 1));
      s = input.readLine();
      int nLabel = Integer.parseInt(s.substring(s.indexOf('=') + 1));
      s = input.readLine();
      int eSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
      s = input.readLine();
      int hSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
      s = input.readLine();
      int nTokens = Integer.parseInt(s.substring(s.indexOf('=') + 1));
      s = input.readLine();
      String[] splits;
      for (int k = 0; k < nDict; ++k) {
        s = input.readLine();
        splits = s.split(" ");
        if (wordIDs.containsKey(splits[0]) && eSize == config.embeddingSize) {
          int index = getWordID(splits[0]);
          for (int i = 0; i < eSize; ++i)
            E[index][i] = Float.parseFloat(splits[i + 1]);
        }
      }
      for (int k = 0; k < nPOS; ++k) {
        s = input.readLine();
        splits = s.split(" ");
        if (posIDs.containsKey(splits[0]) && eSize == config.embeddingSize) {
          int index = getPosID(splits[0]);
          for (int i = 0; i < eSize; ++i)
            E[index][i] = Float.parseFloat(splits[i + 1]);
        }
      }
      for (int k = 0; k < nLabel; ++k) {
        s = input.readLine();
        splits = s.split(" ");
        if (labelIDs.containsKey(splits[0]) && eSize == config.embeddingSize) {
          int index = getLabelID(splits[0]);
          for (int i = 0; i < eSize; ++i)
            E[index][i] = Float.parseFloat(splits[i + 1]);
        }
      }
      boolean copyLayer1 = hSize == config.hiddenSize && config.embeddingSize == eSize && config.numTokens == nTokens;
      if (copyLayer1) {
        log.info("Copying parameters W1 && b1...");
      }
      for (int j = 0; j < eSize * nTokens; ++j) {
        s = input.readLine();
        if (copyLayer1) {
          splits = s.split(" ");
          for (int i = 0; i < hSize; ++i)
            W1[i][j] = Float.parseFloat(splits[i]);
        }
      }
      s = input.readLine();
      if (copyLayer1) {
        splits = s.split(" ");
        for (int i = 0; i < hSize; ++i)
          b1[i] = Float.parseFloat(splits[i]);
      }
      boolean copyLayer2 = (nLabel * 2 - 1 == system.numTransitions()) && hSize == config.hiddenSize;
      if (copyLayer2)
        log.info("Copying parameters W2...");
      for (int j = 0; j < hSize; ++j) {
        s = input.readLine();
        if (copyLayer2) {
          splits = s.split(" ");
          for (int i = 0; i < nLabel * 2 - 1; ++i)
            W2[i][j] = Float.parseFloat(splits[i]);
        }
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  Dataset trainSet = genTrainExamples(trainSents, trainTrees);
  classifier = new Classifier(config, trainSet, E, W1, b1, W2, preComputed);
}
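
The index arithmetic above relies on the embedding matrix E stacking word rows first, then POS rows, then label rows, which is why pre-trained embeddings are only looked up for i < knownWords.size(). A minimal illustration of that layout follows; it is a hypothetical helper written for this page, not CoreNLP code.

// Hypothetical helper illustrating the row layout that
// setupClassifierForTraining assumes for E.
class EmbeddingLayout {
  final int nWords; // knownWords.size() in the parser
  final int nPos;   // knownPos.size() in the parser

  EmbeddingLayout(int nWords, int nPos) {
    this.nWords = nWords;
    this.nPos = nPos;
  }

  int wordRow(int wordIndex) { return wordIndex; }                     // rows [0, nWords)
  int posRow(int posIndex) { return nWords + posIndex; }               // rows [nWords, nWords + nPos)
  int labelRow(int labelIndex) { return nWords + nPos + labelIndex; }  // remaining rows
}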
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp: class DependencyParser, method loadModelFile.
void loadModelFile(String modelFile, boolean verbose) {
  Timing t = new Timing();
  try (BufferedReader input = IOUtils.readerFromString(modelFile)) {
    // The first line in newer saved models is the language; legacy models don't store this
    String s = input.readLine();
    // check whether the language was stored
    if (isModelNewFormat(s)) {
      // set up language
      config.language = Config.getLanguage(s.substring(9, s.length() - 1));
      // set up tlp
      s = input.readLine();
      String tlpCanonicalName = s.substring(4);
      try {
        config.tlp = ReflectionLoading.loadByReflection(tlpCanonicalName);
        log.info("Loaded TreebankLanguagePack: " + tlpCanonicalName);
      } catch (Exception e) {
        log.warn("Error: Failed to load TreebankLanguagePack: " + tlpCanonicalName);
      }
      s = input.readLine();
    }
    int nDict = Integer.parseInt(s.substring(s.indexOf('=') + 1));
    s = input.readLine();
    int nPOS = Integer.parseInt(s.substring(s.indexOf('=') + 1));
    s = input.readLine();
    int nLabel = Integer.parseInt(s.substring(s.indexOf('=') + 1));
    s = input.readLine();
    int eSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
    s = input.readLine();
    int hSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
    s = input.readLine();
    int nTokens = Integer.parseInt(s.substring(s.indexOf('=') + 1));
    s = input.readLine();
    int nPreComputed = Integer.parseInt(s.substring(s.indexOf('=') + 1));

    knownWords = new ArrayList<>();
    knownPos = new ArrayList<>();
    knownLabels = new ArrayList<>();
    float[][] E = new float[nDict + nPOS + nLabel][eSize];
    String[] splits;
    int index = 0;
    for (int k = 0; k < nDict; ++k) {
      s = input.readLine();
      splits = s.split(" ");
      knownWords.add(splits[0]);
      for (int i = 0; i < eSize; ++i)
        E[index][i] = Float.parseFloat(splits[i + 1]);
      index = index + 1;
    }
    for (int k = 0; k < nPOS; ++k) {
      s = input.readLine();
      splits = s.split(" ");
      knownPos.add(splits[0]);
      for (int i = 0; i < eSize; ++i)
        E[index][i] = Float.parseFloat(splits[i + 1]);
      index = index + 1;
    }
    for (int k = 0; k < nLabel; ++k) {
      s = input.readLine();
      splits = s.split(" ");
      knownLabels.add(splits[0]);
      for (int i = 0; i < eSize; ++i)
        E[index][i] = Float.parseFloat(splits[i + 1]);
      index = index + 1;
    }
    generateIDs();

    float[][] W1 = new float[hSize][eSize * nTokens];
    for (int j = 0; j < W1[0].length; ++j) {
      s = input.readLine();
      splits = s.split(" ");
      for (int i = 0; i < W1.length; ++i)
        W1[i][j] = Float.parseFloat(splits[i]);
    }
    float[] b1 = new float[hSize];
    s = input.readLine();
    splits = s.split(" ");
    for (int i = 0; i < b1.length; ++i)
      b1[i] = Float.parseFloat(splits[i]);
    float[][] W2 = new float[nLabel * 2 - 1][hSize];
    for (int j = 0; j < W2[0].length; ++j) {
      s = input.readLine();
      splits = s.split(" ");
      for (int i = 0; i < W2.length; ++i)
        W2[i][j] = Float.parseFloat(splits[i]);
    }

    preComputed = new ArrayList<>();
    while (preComputed.size() < nPreComputed) {
      s = input.readLine();
      splits = s.split(" ");
      for (String split : splits) {
        preComputed.add(Integer.parseInt(split));
      }
    }

    config.hiddenSize = hSize;
    config.embeddingSize = eSize;
    log.debug("Read in dep parse matrices:");
    log.debug(" E: " + E.length * E[0].length);
    log.debug(" b1: " + b1.length);
    log.debug(" W1: " + W1.length * W1[0].length);
    log.debug(" W2: " + W2.length * W2[0].length);
    classifier = new Classifier(config, E, W1, b1, W2, preComputed);
    t.report(log, "Loading depparse model: " + modelFile);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }

  // initialize the loaded parser
  initialize(verbose);
  t.done(log, "Initializing dependency parser");
}
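
loadModelFile has package-private access; client code normally goes through the public loadFromModelFile factory, which reads the model and then initializes the parser. A minimal sketch, assuming the CoreNLP jar and its companion models jar are on the classpath:

import edu.stanford.nlp.parser.nndep.DependencyParser;

public class LoadModelExample {
  public static void main(String[] args) {
    // DEFAULT_MODEL names the English model bundled in the models jar;
    // any filesystem path or classpath resource to a saved model works too.
    DependencyParser parser = DependencyParser.loadFromModelFile(DependencyParser.DEFAULT_MODEL);
  }
}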
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp: class Util, method writeConllFile.
public static void writeConllFile(String outFile, List<CoreMap> sentences, List<DependencyTree> trees) {
  try {
    PrintWriter output = IOUtils.getPrintWriter(outFile);
    for (int i = 0; i < sentences.size(); i++) {
      CoreMap sentence = sentences.get(i);
      DependencyTree tree = trees.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 1, size = tokens.size(); j <= size; ++j) {
        CoreLabel token = tokens.get(j - 1);
        output.printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.word(), token.tag(), token.tag(), tree.getHead(j), tree.getLabel(j));
      }
      output.println();
    }
    output.close();
  } catch (Exception e) {
    throw new RuntimeIOException(e);
  }
}
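
Each printf above emits one tab-separated CoNLL-X row (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL), with the lemma, feature, and projective columns left as underscores and the same tag written to both tag columns. A hedged usage sketch; it assumes Util is importable from edu.stanford.nlp.parser.nndep and that the two lists are parallel, a precondition the method itself never checks:

import edu.stanford.nlp.parser.nndep.DependencyTree;
import edu.stanford.nlp.parser.nndep.Util;
import edu.stanford.nlp.util.CoreMap;
import java.util.List;

public class WriteConllExample {
  // trees.get(i) must be the parse of sentences.get(i); a sample output
  // row looks like: 1\tHello\t_\tUH\tUH\t_\t2\tdiscourse\t_\t_
  static void dump(String outFile, List<CoreMap> sentences, List<DependencyTree> trees) {
    Util.writeConllFile(outFile, sentences, trees);
  }
}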
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp: class PTBTokenizer, method ptb2Text.
/**
* Returns a presentable version of the given PTB-tokenized text.
* PTB tokenization splits up punctuation and does various other things
* that makes simply joining the tokens with spaces look bad. So join
* the tokens with space and run it through this method to produce nice
* looking text. It's not perfect, but it works pretty well.
* <p>
* <b>Note:</b> If your tokens have maintained the OriginalTextAnnotation and
* the BeforeAnnotation and the AfterAnnotation, then rather than doing
* this you can actually precisely reconstruct the text they were made
* from!
*
* @param ptbText A String in PTB3-escaped form
* @return An approximation to the original String
*/
public static String ptb2Text(String ptbText) {
  // probably an overestimate
  StringBuilder sb = new StringBuilder(ptbText.length());
  PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
  try {
    for (String token; (token = lexer.next()) != null; ) {
      sb.append(token);
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  return sb.toString();
}
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp: class ReadSentimentDataset, method writeTrees.
private static void writeTrees(String filename, List<Tree> trees, List<Integer> treeIds) {
  try {
    FileOutputStream fos = new FileOutputStream(filename);
    BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(fos));
    for (Integer id : treeIds) {
      bout.write(trees.get(id).toString());
      bout.write("\n");
    }
    bout.flush();
    fos.close();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
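
writeTrees emits one S-expression tree per line (for the sentiment treebank, node labels are sentiment classes), so the file can be read back line by line. A minimal sketch using Tree.valueOf, which parses a single PTB-style S-expression:

import edu.stanford.nlp.trees.Tree;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReadTreesExample {
  // Reads back a file written by writeTrees, one parsed tree per
  // non-empty line.
  static List<Tree> readTrees(String filename) throws IOException {
    List<Tree> trees = new ArrayList<>();
    try (BufferedReader in = new BufferedReader(new FileReader(filename))) {
      for (String line; (line = in.readLine()) != null; ) {
        if (!line.isEmpty()) trees.add(Tree.valueOf(line));
      }
    }
    return trees;
  }
}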