Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
The class CorpusDictionary, method readDict:
private static Set<String> readDict(String filename, boolean normalize) {
  Set<String> word = Generics.newHashSet();
  logger.info("Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);
  try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename)) {
    BufferedReader wordDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
    int i = 0;
    for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) {
      i++;
      // String[] fields = wordDetectorLine.split(" ");
      // logger.debug("DEBUG: "+filename+" "+wordDetectorLine);
      int origLeng = wordDetectorLine.length();
      wordDetectorLine = wordDetectorLine.trim();
      int newLeng = wordDetectorLine.length();
      if (newLeng != origLeng) {
        EncodingPrintWriter.err.println("Line " + i + " of " + filename + " has leading/trailing whitespace: |" + wordDetectorLine + "|", "UTF-8");
      }
      if (newLeng == 0) {
        EncodingPrintWriter.err.println("Line " + i + " of " + filename + " is empty", "UTF-8");
      } else {
        if (normalize) {
          wordDetectorLine = ChineseUtils.normalize(wordDetectorLine, ChineseUtils.ASCII, ChineseUtils.ASCII, ChineseUtils.NORMALIZE);
        }
        word.add(wordDetectorLine);
      }
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  return word;
}
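readDict above, like every snippet on this page, ends the same way: the checked IOException is caught and rethrown as the unchecked RuntimeIOException, so dictionary loading can be called from constructors and static initializers without checked-exception plumbing. A minimal self-contained sketch of the idiom (the LineSetLoader class and its names are hypothetical, not CoreNLP source):

import edu.stanford.nlp.io.RuntimeIOException;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

final class LineSetLoader { // hypothetical helper, for illustration only
  static Set<String> load(String filename) {
    Set<String> lines = new HashSet<>();
    // try-with-resources closes the reader even if readLine() throws
    try (BufferedReader r = Files.newBufferedReader(Paths.get(filename))) {
      for (String line; (line = r.readLine()) != null; ) {
        line = line.trim();
        if (!line.isEmpty()) {
          lines.add(line);
        }
      }
    } catch (IOException e) {
      // convert the checked exception into CoreNLP's unchecked wrapper
      throw new RuntimeIOException(e);
    }
    return lines;
  }
}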
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
The class CorpusChar, method readDict:
private static Map<String, Set<String>> readDict(String filename) {
  Map<String, Set<String>> char_dict;
  try {
    BufferedReader detectorReader = IOUtils.readerFromString(filename, "UTF-8");
    char_dict = Generics.newHashMap();
    // logger.debug("DEBUG: in CorpusChar readDict");
    for (String detectorLine; (detectorLine = detectorReader.readLine()) != null; ) {
      String[] fields = detectorLine.split(" ");
      String tag = fields[0];
      Set<String> chars = char_dict.get(tag);
      if (chars == null) {
        chars = Generics.newHashSet();
        char_dict.put(tag, chars);
      }
      // logger.debug("DEBUG: CorpusChar: "+filename+" "+fields[1]);
      chars.add(fields[1]);
    }
    detectorReader.close();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  logger.info("Loading character dictionary file from " + filename + " [done].");
  return char_dict;
}
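The chars == null block above is the pre-Java-8 get-or-create idiom for a multimap. On Java 8 and later the same behavior fits in one statement; a sketch of the equivalent, reusing the snippet's own variables:

// Java 8+ equivalent of the null-check-and-put block, same behavior:
char_dict.computeIfAbsent(tag, k -> Generics.newHashSet()).add(fields[1]);

computeIfAbsent only creates the set on a miss, so no empty sets are allocated for tags that already have an entry.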
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
The class MaxentTagger, method readModelAndInit:
/**
 * This reads the complete tagger from a single model file, and initializes
 * the tagger using a combination of the properties passed in and
 * parameters from the file.
 * <br>
 * <i>Note for the future: This assumes that the TaggerConfig in the file
 * has already been read and used. It might be better to refactor
 * things so that is all done inside this method, but for the moment
 * it seemed better to leave working code alone [cdm 2008].</i>
 *
 * @param config The tagger config
 * @param rf DataInputStream to read from. It's the caller's job to open and close this stream.
 * @param printLoading Whether to print a message saying which model file is being loaded, and how long loading took once finished.
 * @throws RuntimeIOException if there are I/O or serialization errors
 */
protected void readModelAndInit(Properties config, DataInputStream rf, boolean printLoading) {
  try {
    Timing t = new Timing();
    String source = null;
    if (printLoading) {
      if (config != null) {
        // TODO: "model"
        source = config.getProperty("model");
      }
      if (source == null) {
        source = "data stream";
      }
    }
    TaggerConfig taggerConfig = TaggerConfig.readConfig(rf);
    if (config != null) {
      taggerConfig.setProperties(config);
    }
    // then init tagger
    init(taggerConfig);
    xSize = rf.readInt();
    ySize = rf.readInt();
    // dict = new Dictionary(); // this method is called in constructor, and it's initialized as empty already
    dict.read(rf);
    if (VERBOSE) {
      log.info("Tagger dictionary read.");
    }
    tags.read(rf);
    readExtractors(rf);
    dict.setAmbClasses(ambClasses, veryCommonWordThresh, tags);
    int[] numFA = new int[extractors.size() + extractorsRare.size()];
    int sizeAssoc = rf.readInt();
    fAssociations = Generics.newArrayList();
    for (int i = 0; i < extractors.size() + extractorsRare.size(); ++i) {
      fAssociations.add(Generics.newHashMap());
    }
    if (VERBOSE) {
      log.infof("Reading %d feature keys...%n", sizeAssoc);
    }
    PrintFile pfVP = null;
    if (VERBOSE) {
      pfVP = new PrintFile("pairs.txt");
    }
    // reused in for loop but not stored. just a temp variable
    FeatureKey fK = new FeatureKey();
    for (int i = 0; i < sizeAssoc; i++) {
      int numF = rf.readInt();
      fK.read(rf);
      numFA[fK.num]++;
      if (VERBOSE) {
        String eName = (fK.num < extractors.size() ? extractors.get(fK.num) : extractorsRare.get(fK.num - extractors.size())).toString();
        Map<String, int[]> valFeats = fAssociations.get(fK.num);
        pfVP.print(eName);
        pfVP.print(' ');
        pfVP.print(fK);
        pfVP.print(' ');
        if (valFeats != null) {
          pfVP.print(valFeats.keySet());
        }
        pfVP.println();
      }
      // TODO: rewrite the writing / reading code to store fAssociations in a cleaner manner?
      // Only do this when rebuilding all the tagger models anyway. When we do that, we can get rid of FeatureKey
      Map<String, int[]> fValueAssociations = fAssociations.get(fK.num);
      int[] fTagAssociations = fValueAssociations.get(fK.val);
      if (fTagAssociations == null) {
        fTagAssociations = new int[ySize];
        for (int j = 0; j < ySize; ++j) {
          fTagAssociations[j] = -1;
        }
        fValueAssociations.put(fK.val, fTagAssociations);
      }
      fTagAssociations[tags.getIndex(fK.tag)] = numF;
    }
    if (VERBOSE) {
      IOUtils.closeIgnoringExceptions(pfVP);
    }
    if (VERBOSE) {
      for (int k = 0; k < numFA.length; k++) {
        log.info("Number of features of kind " + k + ' ' + (k < extractors.size() ? extractors.get(k) : extractorsRare.get(k - extractors.size())) + ": " + numFA[k]);
      }
    }
    prob = new LambdaSolveTagger(rf);
    if (VERBOSE) {
      log.info("prob read ");
    }
    if (printLoading) {
      t.done(log, "Loading POS tagger from " + source);
    }
  } catch (IOException | ClassNotFoundException e) {
    throw new RuntimeIOException("Error while loading a tagger model (probably missing model file)", e);
  }
}
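Callers normally never invoke readModelAndInit directly; the MaxentTagger constructors load the model for them, and the only failure mode a caller sees is the unchecked RuntimeIOException. A usage sketch (the demo class is ours, and the model path is illustrative; point it at whatever trained .tagger file is on your classpath or filesystem):

import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TaggerLoadDemo {
  public static void main(String[] args) {
    // illustrative path; substitute any trained tagger model you have
    String model = "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger";
    try {
      // the constructor reads and initializes the model (via readModelAndInit)
      MaxentTagger tagger = new MaxentTagger(model);
      System.out.println(tagger.tagString("The tagger loaded correctly ."));
    } catch (RuntimeIOException e) {
      // thrown if the model file is missing, unreadable, or fails to deserialize
      System.err.println("Could not load tagger model: " + e.getMessage());
    }
  }
}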
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
The class TagCount, method readTagCount:
/**
 * A TagCount object's fields are read from the file. They are read from
 * the current position and the file is not closed afterwards.
 */
public static TagCount readTagCount(DataInputStream rf, Interner<String> interner) {
  try {
    TagCount tc = new TagCount();
    int numTags = rf.readInt();
    tc.map = Generics.newHashMap(numTags);
    for (int i = 0; i < numTags; i++) {
      String tag = rf.readUTF();
      int count = rf.readInt();
      if (tag.equals(NULL_SYMBOL)) {
        tag = null;
      } else {
        tag = interner.intern(tag);
      }
      tc.map.put(tag, count);
    }
    tc.getTagsCache = tc.map.keySet().toArray(new String[tc.map.keySet().size()]);
    tc.sumCache = tc.calculateSumCache();
    return tc;
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
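readTagCount runs every tag through the supplied Interner so that the same tag string, read thousands of times across a dictionary's TagCount entries, is stored as a single shared String rather than one fresh object per readUTF call. A small sketch of what interning guarantees (the demo class name is ours):

import edu.stanford.nlp.util.Interner;

public class InternDemo {
  public static void main(String[] args) {
    Interner<String> interner = new Interner<>();
    // two distinct String objects with equal contents
    String a = interner.intern(new String("NN"));
    String b = interner.intern(new String("NN"));
    System.out.println(a == b); // true: every "NN" shares one instance
  }
}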
Use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.
The class TTags, method save:
protected void save(String filename, Map<String, Set<String>> tagTokens) {
  try {
    DataOutputStream out = IOUtils.getDataOutputStream(filename);
    save(out, tagTokens);
    out.close();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
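One detail worth noting: out.close() is only reached on the success path, so the stream leaks if the inner save(out, tagTokens) call throws. A hedged sketch of the same method rewritten with try-with-resources, which closes the stream on all paths (DataOutputStream.close() also flushes):

protected void save(String filename, Map<String, Set<String>> tagTokens) {
  // try-with-resources guarantees out is closed even if save(out, ...) throws
  try (DataOutputStream out = IOUtils.getDataOutputStream(filename)) {
    save(out, tagTokens);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}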