Use of edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator in project cogcomp-nlp by CogComp.
Class BrownClusters, method init.
/**
 * Initialize the Brown cluster data. The cluster files are read from the directory fetched
 * from the datastore, and the resulting singleton is built and published while holding
 * {@code INIT_SYNC}.
 * <p>
 * The new instance is assembled in a local variable and only assigned to the static
 * {@code brownclusters} field once every file has been read, so a failure part-way through
 * leaves the previously published value untouched instead of exposing a half-loaded instance.
 * <p>
 * Each line of a cluster file is expected to hold three whitespace-separated tokens:
 * the cluster path, the word, and an integer occurrence count.
 * Datastore and file-not-found failures are logged and not rethrown.
 *
 * @param pathsToClusterFiles names of the cluster files, resolved relative to the datastore
 *        directory.
 * @param thresholds minimum occurrence count per file; entry {@code i} applies to file
 *        {@code i}, and words whose count is below it are skipped.
 * @param isLowercaseBrownClusters per-file flag, stored alongside each loaded resource.
 */
public static void init(Vector<String> pathsToClusterFiles, Vector<Integer> thresholds, Vector<Boolean> isLowercaseBrownClusters) {
    try {
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.brown-clusters", "brown-clusters", 1.5, false);
        synchronized (INIT_SYNC) {
            // Build into a local first; publish only after every file loaded successfully.
            BrownClusters loaded = new BrownClusters();
            loaded.isLowercaseBrownClustersByResource = new boolean[isLowercaseBrownClusters.size()];
            loaded.wordToPathByResource = new ArrayList<>();
            loaded.resources = new ArrayList<>();
            for (int i = 0; i < pathsToClusterFiles.size(); i++) {
                THashMap<String, String> h = new THashMap<>();
                // We used to access the files as resources. Now we are accessing them programmatically.
                InputStream is = new FileInputStream(gazDirectory.getPath() + File.separator + pathsToClusterFiles.elementAt(i));
                InFile in = new InFile(is);
                int wordsAdded = 0;
                try {
                    String line = in.readLine();
                    while (line != null) {
                        StringTokenizer st = new StringTokenizer(line);
                        String path = st.nextToken();
                        String word = st.nextToken();
                        int occ = Integer.parseInt(st.nextToken());
                        if (occ >= thresholds.elementAt(i)) {
                            h.put(word, path);
                            wordsAdded++;
                        }
                        line = in.readLine();
                    }
                } finally {
                    // Close even when a malformed line makes parsing throw, so the file
                    // handle is not leaked. NOTE(review): presumably InFile.close() also
                    // closes the wrapped stream - confirm.
                    in.close();
                }
                if (ParametersForLbjCode.currentParameters.debug) {
                    logger.info(wordsAdded + " words added");
                }
                loaded.wordToPathByResource.add(h);
                loaded.isLowercaseBrownClustersByResource[i] = isLowercaseBrownClusters.elementAt(i);
                loaded.resources.add(pathsToClusterFiles.elementAt(i));
            }
            brownclusters = loaded;
        }
    } catch (InvalidPortException | InvalidEndpointException | DatastoreException | FileNotFoundException e) {
        // Best-effort load: report through the logger instead of stderr and keep going.
        logger.error("Failed to load the brown clusters", e);
    }
}
Use of edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator in project cogcomp-nlp by CogComp.
Class TreeGazetteers, method init.
/**
 * Initialize all the gazetteers. The list of dictionary files is read from
 * {@code gazetteers/gazetteers-list.txt} inside the directory fetched from the datastore, and
 * every listed file is loaded into two trees: a case-sensitive one and a case-insensitive one
 * (entries labelled with the {@code "(IC)"} suffix).
 * <p>
 * Datastore failures are logged and not rethrown; in that case the dictionaries are not
 * populated by this call.
 *
 * @param phrase_length the max length of the phrases we will consider.
 * @param pathToDictionaries not used by this method any more; the gazetteers are located via
 *        the datastore instead of the classpath.
 * @throws IOException if the gazetteer list cannot be opened or read.
 */
private void init(int phrase_length, String pathToDictionaries) throws IOException {
    try {
        ArrayList<String> filenames = new ArrayList<>();
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.5, false);
        // We are not loading the list from the classpath anymore; it is read from the datastore directory.
        String listPath = gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt";
        // try-with-resources so the list file is closed even if reading fails.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(listPath)))) {
            String line;
            while ((line = br.readLine()) != null) {
                filenames.add(line);
            }
        }
        // init the dictionaries.
        dictionaries = new ArrayList<>(filenames.size());
        dictionariesIgnoreCase = new ArrayList<>(filenames.size());
        GazetteerTree gaz = new GazetteerTree(phrase_length);
        GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {
            @Override
            public String[] split(String line) {
                String tmp = line.toLowerCase();
                // These short words are dropped entirely from the case-insensitive tree.
                if (tmp.equals("in") || tmp.equals("on") || tmp.equals("us") || tmp.equals("or") || tmp.equals("am")) {
                    return new String[0];
                }
                // character tokenization for Chinese
                if (ParametersForLbjCode.currentParameters.language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                return normalize(line).split("[\\s]+");
            }

            @Override
            public String normalize(String term) {
                return term.toLowerCase();
            }
        });
        // for each dictionary, compile each of the gaz trees for each phrase permutation.
        for (String file : filenames) {
            String fileName = gazDirectory.getAbsolutePath() + File.separator + file;
            gaz.readDictionary(file, "", ResourceUtilities.loadResource(fileName));
            gazIC.readDictionary(file, "(IC)", ResourceUtilities.loadResource(fileName));
        }
        gaz.trimToSize();
        gazIC.trimToSize();
        // All files are merged into one tree per flavour, so each list receives a single entry.
        dictionaries.add(gaz);
        dictionariesIgnoreCase.add(gazIC);
        if (ParametersForLbjCode.currentParameters.debug) {
            logger.info("found " + dictionaries.size() + " gazetteers");
        }
    } catch (InvalidPortException | InvalidEndpointException | DatastoreException e) {
        // Best-effort load: report through the logger instead of stderr.
        logger.error("Failed to load the gazetteers", e);
    }
}
Aggregations