use of org.cogcomp.DatastoreException in project cogcomp-nlp by CogComp.
the class ModelLoader method load.
/**
* Load the models wherever they are found. Check the file system first, then the classpath, and
* finally fetch them from the Minio datastore.
* @param rm the resource manager.
* @param viewName the name of the view; this identifies the model.
* @param training true if we are training a new model.
* @param cp the parameters for the calling model.
*/
public static void load(ResourceManager rm, String viewName, boolean training, ParametersForLbjCode cp) {
// the loader built into the model will check the local file system and the jar files in the classpath.
String modelPath = cp.pathToModelFile;
String modelFilePath = modelPath + ".level1";
java.io.File modelFile = new File(modelFilePath);
NETaggerLevel1 tagger1 = null;
NETaggerLevel2 tagger2 = null;
if (modelFile.exists()) {
tagger1 = new NETaggerLevel1(modelPath + ".level1", modelPath + ".level1.lex");
logger.info("Reading L1 model from file : " + modelPath + ".level2");
if (cp.featuresToUse.containsKey("PredictionsLevel1")) {
tagger2 = new NETaggerLevel2(modelPath + ".level2", modelPath + ".level2.lex");
logger.info("Reading L2 model from file : " + modelPath + ".level2");
} else {
logger.info("L2 model not required.");
}
} else if (IOUtilities.existsInClasspath(NETaggerLevel1.class, modelFilePath)) {
tagger1 = new NETaggerLevel1(modelPath + ".level1", modelPath + ".level1.lex");
logger.info("Reading L1 model from classpath : " + modelPath + ".level2");
if (cp.featuresToUse.containsKey("PredictionsLevel1")) {
tagger2 = new NETaggerLevel2(modelPath + ".level2", modelPath + ".level2.lex");
logger.info("Reading L2 model from classpath : " + modelPath + ".level2");
} else {
logger.info("L2 model not required.");
}
} else if (training) {
// we are training a new model, so if it doesn't exist, we don't care, just create a
// container.
tagger1 = new NETaggerLevel1(modelPath + ".level1", modelPath + ".level1.lex");
logger.info("Reading L1 model from file : " + modelPath + ".level2");
if (cp.featuresToUse.containsKey("PredictionsLevel1")) {
tagger2 = new NETaggerLevel2(modelPath + ".level2", modelPath + ".level2.lex");
logger.info("Reading L2 model from file : " + modelPath + ".level2");
} else {
logger.info("L2 model not required.");
}
} else {
// all else has failed, load from the datastore, create artifact ids based on the view
// name and training data designation.
String dataset;
String lowercaseViewName = viewName.toLowerCase();
if (lowercaseViewName.contains(ViewNames.NER_CONLL.toLowerCase())) {
dataset = "enron-conll";
} else if (lowercaseViewName.contains(ViewNames.NER_ONTONOTES.toLowerCase())) {
dataset = "ontonotes";
} else {
// not a standard model, and we can't find it at the configured location.
throw new IllegalArgumentException("The NER models could not be found at \"" + modelPath + "\", and there is no default model for view name " + viewName);
}
String data_split;
if (!rm.containsKey(NerBaseConfigurator.TRAINED_ON))
data_split = NerBaseConfigurator.TRAINED_ON_ALL_DATA;
else
data_split = rm.getString(NerBaseConfigurator.TRAINED_ON);
try {
Datastore ds = new Datastore(new ResourceConfigurator().getConfig(rm));
String artifact_id = "ner-model-" + dataset + "-" + data_split;
File modelDir = ds.getDirectory("edu.illinois.cs.cogcomp.ner", artifact_id, 4.0, false);
String model = "";
if (modelDir.getPath().contains("conll")) {
model = modelDir.getPath() + "/model/EnronCoNLL.model";
} else {
model = modelDir.getPath() + "/model/OntoNotes.model";
}
tagger1 = new NETaggerLevel1(model + ".level1", model + ".level1.lex");
if (cp.featuresToUse.containsKey("PredictionsLevel1")) {
tagger2 = new NETaggerLevel2(model + ".level2", model + ".level2.lex");
}
} catch (InvalidPortException | DatastoreException | InvalidEndpointException e) {
e.printStackTrace();
}
}
cp.taggerLevel1 = tagger1;
cp.taggerLevel2 = tagger2;
}
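A hypothetical call-site sketch; the configurator, path, and flags below are illustrative assumptions, not taken from the snippet above:
// hypothetical usage: load the CoNLL NER models for inference (not training)
ResourceManager rm = new NerBaseConfigurator().getDefaultConfig();
ParametersForLbjCode cp = new ParametersForLbjCode();
cp.pathToModelFile = "/tmp/ner-models/conll"; // hypothetical path; file system and classpath are checked first
// in real use, cp.featuresToUse is populated by the NER configuration reader
ModelLoader.load(rm, ViewNames.NER_CONLL, false, cp);
// on return, cp.taggerLevel1 (and cp.taggerLevel2, if "PredictionsLevel1" is enabled) hold the models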
use of org.cogcomp.DatastoreException in project cogcomp-nlp by CogComp.
the class BrownClusters method init.
/**
* Initialize the brown cluster data. This is a singleton, so this process is synchronized and
* atomic with respect to the <code>get()</code> method above.
* @param pathsToClusterFiles the files containing the cluster data.
* @param thresholds the minimum occurrence count a word must have to be included, one per resource.
* @param isLowercaseBrownClusters indicates, per resource, whether the cluster data is lowercased.
*/
public static void init(Vector<String> pathsToClusterFiles, Vector<Integer> thresholds, Vector<Boolean> isLowercaseBrownClusters) {
try {
Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.brown-clusters", "brown-clusters", 1.5, false);
synchronized (INIT_SYNC) {
brownclusters = new BrownClusters();
brownclusters.isLowercaseBrownClustersByResource = new boolean[isLowercaseBrownClusters.size()];
brownclusters.wordToPathByResource = new ArrayList<>();
brownclusters.resources = new ArrayList<>();
for (int i = 0; i < pathsToClusterFiles.size(); i++) {
THashMap<String, String> h = new THashMap<>();
// We used to access the files as resources. Now we are accessing them programmatically.
// InFile in = new InFile(ResourceUtilities.loadResource(pathsToClusterFiles.elementAt(i)));
InputStream is = new FileInputStream(gazDirectory.getPath() + File.separator + pathsToClusterFiles.elementAt(i));
InFile in = new InFile(is);
String line = in.readLine();
int wordsAdded = 0;
while (line != null) {
StringTokenizer st = new StringTokenizer(line);
String path = st.nextToken();
String word = st.nextToken();
int occ = Integer.parseInt(st.nextToken());
if (occ >= thresholds.elementAt(i)) {
h.put(word, path);
wordsAdded++;
}
line = in.readLine();
}
if (ParametersForLbjCode.currentParameters.debug) {
logger.info(wordsAdded + " words added");
}
brownclusters.wordToPathByResource.add(h);
brownclusters.isLowercaseBrownClustersByResource[i] = isLowercaseBrownClusters.elementAt(i);
brownclusters.resources.add(pathsToClusterFiles.elementAt(i));
in.close();
}
}
} catch (InvalidPortException | InvalidEndpointException | DatastoreException | FileNotFoundException e) {
e.printStackTrace();
}
}
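A hypothetical initialization sketch; the resource name and threshold are illustrative assumptions, not taken from the project configuration:
Vector<String> paths = new Vector<>(Collections.singletonList("brownBllipClusters")); // hypothetical resource name
Vector<Integer> thresholds = new Vector<>(Collections.singletonList(5)); // keep words occurring at least 5 times
Vector<Boolean> lowercase = new Vector<>(Collections.singletonList(false));
BrownClusters.init(paths, thresholds, lowercase);
BrownClusters bc = BrownClusters.get(); // the singleton accessor referenced in the javadoc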
use of org.cogcomp.DatastoreException in project cogcomp-nlp by CogComp.
the class TreeGazetteers method init.
/**
* Initialize all the gazetteers; each term is mangled in a variety of ways.
*
* @param phrase_length the max length of the phrases we will consider.
* @param pathToDictionaries the path to the dictionary files.
* @throws IOException
*/
private void init(int phrase_length, String pathToDictionaries) throws IOException {
try {
ArrayList<String> filenames = new ArrayList<>();
Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.5, false);
// We are not loading the resources from classpath anymore. Instead we are calling them programmatically
// InputStream stream = ResourceUtilities.loadResource(pathToDictionaries + "/gazetteers-list.txt");
InputStream stream = new FileInputStream(gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt");
BufferedReader br = new BufferedReader(new InputStreamReader(stream));
String line;
while ((line = br.readLine()) != null) filenames.add(line);
// init the dictionaries.
dictionaries = new ArrayList<>(filenames.size());
dictionariesIgnoreCase = new ArrayList<>(filenames.size());
GazetteerTree gaz = new GazetteerTree(phrase_length);
GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {
@Override
public String[] split(String line) {
String tmp = line.toLowerCase();
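// these lowercased tokens collide with common English words ("in", "on",
// "us", "or", "am"), so skip the whole term to avoid spurious matches.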
if (tmp.equals("in") || tmp.equals("on") || tmp.equals("us") || tmp.equals("or") || tmp.equals("am"))
return new String[0];
else {
// character tokenization for Chinese
if (ParametersForLbjCode.currentParameters.language == Language.Chinese) {
String[] chars = new String[line.length()];
for (int i = 0; i < line.length(); i++) chars[i] = String.valueOf(line.charAt(i));
return chars;
} else
return normalize(line).split("[\\s]+");
}
}
@Override
public String normalize(String term) {
return term.toLowerCase();
}
});
// for each dictionary, compile each of the gaz trees for each phrase permutation.
for (String file : filenames) {
String fileName = gazDirectory.getAbsolutePath() + File.separator + file;
gaz.readDictionary(file, "", ResourceUtilities.loadResource(fileName));
gazIC.readDictionary(file, "(IC)", ResourceUtilities.loadResource(fileName));
}
gaz.trimToSize();
gazIC.trimToSize();
dictionaries.add(gaz);
dictionariesIgnoreCase.add(gazIC);
if (ParametersForLbjCode.currentParameters.debug) {
logger.info("found " + dictionaries.size() + " gazetteers");
}
} catch (InvalidPortException | InvalidEndpointException e) {
e.printStackTrace();
} catch (DatastoreException e) {
e.printStackTrace();
}
}
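To make the case-insensitive splitter's behavior concrete, here is what its split method returns for a few made-up inputs, derived directly from the code above:
// split("New York") returns ["new", "york"] (normalized, then whitespace-split)
// split("US") returns [] (ambiguous term; the entry is skipped entirely)
// split("北京") returns ["北", "京"] when the language is Chinese (one token per character)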
use of org.cogcomp.DatastoreException in project cogcomp-nlp by CogComp.
the class TransliterationAnnotator method initialize.
@Override
public void initialize(ResourceManager rm) {
try {
Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
File f = dsNoCredentials.getDirectory("org.cogcomp.transliteration", "transliteration-models", 1.3, false);
String modelPath = f.getAbsolutePath() + File.separator + "transliteration-models-oct-2017" + File.separator + "probs-" + lang.getCode() + ".txt";
if (new File(modelPath).exists()) {
logger.info("Loading transliteration models for language: " + lang + " from " + modelPath);
model = new SPModel(modelPath);
model.setMaxCandidates(1);
} else {
logger.error("Model for language: " + lang + " don't exist: " + modelPath);
}
} catch (IOException | InvalidEndpointException | DatastoreException | InvalidPortException e) {
e.printStackTrace();
}
}
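A minimal call-site sketch; the no-argument constructor and the empty ResourceManager are assumptions for illustration, not confirmed by the snippet above:
// hypothetical usage: construct the annotator and trigger model loading
TransliterationAnnotator annotator = new TransliterationAnnotator(); // assumed constructor
annotator.initialize(new ResourceManager(new Properties())); // loads the SPModel for the configured language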
use of org.cogcomp.DatastoreException in project cogcomp-nlp by CogComp.
the class WordSim method getFile.
public File getFile(String method) {
try {
ResourceManager rm = new ResourceConfigurator().getDefaultConfig();
ds = new Datastore(rm.getString("datastoreEndpoint"));
} catch (DatastoreException e1) {
e1.printStackTrace();
}
// map the embedding method to the file name it is stored under in the datastore.
String fileName = null;
if (method.equals(EmbeddingConstant.word2vec)) {
fileName = "word2vec.txt";
} else if (method.equals(EmbeddingConstant.glove)) {
fileName = "glove.txt";
} else if (method.equals(EmbeddingConstant.phrase2vec)) {
fileName = "phrase2vec.txt";
} else if (method.equals(EmbeddingConstant.memorybasedESA)) {
fileName = "memorybasedESA.txt";
} else if (method.equals(EmbeddingConstant.pageIDMapping)) {
fileName = "pageIDMapping.txt";
}
File f = null;
if (fileName != null) {
try {
f = ds.getFile("org.cogcomp.wordembedding", fileName, 1.5);
} catch (DatastoreException e) {
e.printStackTrace();
}
}
return f;
}
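For reference, the fetch that getFile performs for word2vec reduces to the following direct datastore calls, shown standalone using the same calls as above:
try {
ResourceManager rm = new ResourceConfigurator().getDefaultConfig();
Datastore ds = new Datastore(rm.getString("datastoreEndpoint"));
File word2vec = ds.getFile("org.cogcomp.wordembedding", "word2vec.txt", 1.5);
} catch (DatastoreException e) {
e.printStackTrace();
}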