use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface in project cogcomp-nlp by CogComp.
the class TreeGazetteers method init.
/**
 * Initializes the gazetteers. The list of gazetteer files is read from the
 * {@code org.cogcomp.gazetteers} datastore directory, and every listed dictionary is compiled
 * into two {@link GazetteerTree}s: a case-sensitive one (added to {@code dictionaries}) and a
 * lower-cased one (added to {@code dictionariesIgnoreCase}).
 *
 * @param phrase_length the max length of the phrases we will consider.
 * @param pathToDictionaries not used by this implementation; the gazetteers are fetched
 *        through the datastore instead of being resolved from this path.
 * @throws IOException if the gazetteer list or a dictionary cannot be read, or if the
 *         datastore cannot be reached or queried (the original failure is kept as the cause).
 */
private void init(int phrase_length, String pathToDictionaries) throws IOException {
    try {
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.5, false);

        // The list of dictionaries is read from the datastore directory, not from the classpath.
        String pathToLists = gazDirectory.getPath() + File.separator + "gazetteers" + File.separator
                + "gazetteers-list.txt";
        ArrayList<String> filenames = new ArrayList<>();
        // try-with-resources guarantees the list file is closed even when reading fails.
        try (InputStream stream = new FileInputStream(pathToLists);
             BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
            String line;
            while ((line = br.readLine()) != null) {
                filenames.add(line);
            }
        }

        // init the dictionaries.
        dictionaries = new ArrayList<>(filenames.size());
        dictionariesIgnoreCase = new ArrayList<>(filenames.size());
        GazetteerTree gaz = new GazetteerTree(phrase_length);
        GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {
            @Override
            public String[] split(String line) {
                String tmp = line.toLowerCase();
                // These short words yield no tokens for the case-insensitive tree
                // (presumably so that the entry is skipped -- confirm in GazetteerTree).
                if (tmp.equals("in") || tmp.equals("on") || tmp.equals("us") || tmp.equals("or")
                        || tmp.equals("am")) {
                    return new String[0];
                }
                // character tokenization for Chinese
                if (ParametersForLbjCode.currentParameters.language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                return normalize(line).split("[\\s]+");
            }

            @Override
            public String normalize(String term) {
                return term.toLowerCase();
            }
        });

        // for each dictionary, compile each of the gaz trees for each phrase permutation.
        for (String file : filenames) {
            String fileName = gazDirectory.getAbsolutePath() + File.separator + file;
            // Each tree gets its own freshly opened stream over the same dictionary file.
            gaz.readDictionary(file, "", ResourceUtilities.loadResource(fileName));
            gazIC.readDictionary(file, "(IC)", ResourceUtilities.loadResource(fileName));
        }
        gaz.trimToSize();
        gazIC.trimToSize();
        dictionaries.add(gaz);
        dictionariesIgnoreCase.add(gazIC);
        if (ParametersForLbjCode.currentParameters.debug) {
            logger.info("found " + dictionaries.size() + " gazetteers");
        }
    } catch (InvalidPortException | InvalidEndpointException e) {
        // Do not swallow: without the datastore no gazetteer was loaded at all.
        throw new IOException("Could not connect to the datastore holding the gazetteers", e);
    } catch (DatastoreException e) {
        throw new IOException("Could not fetch the gazetteers from the datastore", e);
    }
}
use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface in project cogcomp-nlp by CogComp.
the class TreeGazetteers method init.
/**
 * Initializes the gazetteers. The gazetteer list is first looked up under
 * {@code pathToDictionaries} via {@code ResourceUtilities.loadResource}; when that yields
 * nothing, the gazetteers are fetched from the {@code org.cogcomp.gazetteers} datastore
 * instead. Every listed dictionary is compiled into two {@link GazetteerTree}s: a
 * case-sensitive one (added to {@code dictionaries}) and a lower-cased one (added to
 * {@code dictionariesIgnoreCase}).
 *
 * @param phrase_length the max length of the phrases we will consider.
 * @param pathToDictionaries the path to the gazetteers.
 * @param language the language of the gazetteers; for {@code Language.Chinese} the terms are
 *        split into single characters instead of whitespace-separated tokens.
 * @throws IOException if the gazetteer list or a dictionary cannot be read, or if the
 *         datastore cannot be reached or queried (the original failure is kept as the cause).
 */
private void init(int phrase_length, String pathToDictionaries, final Language language) throws IOException {
    try {
        // check the local file system for it.
        File gazDirectory = new File(pathToDictionaries);
        String pathToLists = gazDirectory.getPath() + File.separator + "gazetteers" + File.separator
                + "gazetteers-list.txt";
        InputStream stream = ResourceUtilities.loadResource(pathToLists);
        if (stream == null) {
            logger.info("Loading gazetteers from \"" + pathToLists + "\" using the Minio cache.");
            // not in file system or classpath, try Minio.
            Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
            gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.6, false);
            stream = new FileInputStream(gazDirectory.getPath() + File.separator + "gazetteers"
                    + File.separator + "gazetteers-list.txt");
        } else {
            logger.info("Loading gazetteers from \"" + pathToLists + "\" from the local file system.");
        }

        ArrayList<String> filenames = new ArrayList<>();
        // try-with-resources closes the reader (and the underlying stream) even when reading fails.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
            String line;
            while ((line = br.readLine()) != null) {
                filenames.add(line);
            }
        }

        // init the dictionaries.
        dictionaries = new ArrayList<>(filenames.size());
        dictionariesIgnoreCase = new ArrayList<>(filenames.size());

        // Case-sensitive tree: terms are kept exactly as written.
        GazetteerTree gaz = new GazetteerTree(phrase_length, new StringSplitterInterface() {
            @Override
            public String[] split(String line) {
                // character tokenization for Chinese
                if (language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                return line.split("[\\s]+");
            }

            @Override
            public final String normalize(String term) {
                return term;
            }
        });

        // Case-insensitive tree: terms are lower-cased before being split.
        GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {
            @Override
            public String[] split(String line) {
                String tmp = line.toLowerCase();
                // These short words yield no tokens for the case-insensitive tree
                // (presumably so that the entry is skipped -- confirm in GazetteerTree).
                if (tmp.equals("in") || tmp.equals("on") || tmp.equals("us") || tmp.equals("or")
                        || tmp.equals("am")) {
                    return new String[0];
                }
                // character tokenization for Chinese
                if (language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                return normalize(line).split("[\\s]+");
            }

            @Override
            public String normalize(String term) {
                return term.toLowerCase();
            }
        });

        // for each dictionary, compile each of the gaz trees for each phrase permutation.
        for (String file : filenames) {
            String fileName = gazDirectory.getAbsolutePath() + File.separator + file;
            // Each tree gets its own freshly opened stream over the same dictionary file.
            gaz.readDictionary(file, "", ResourceUtilities.loadResource(fileName));
            gazIC.readDictionary(file, "(IC)", ResourceUtilities.loadResource(fileName));
        }
        gaz.trimToSize();
        gazIC.trimToSize();
        dictionaries.add(gaz);
        dictionariesIgnoreCase.add(gazIC);
        logger.info("Gazetteers from \"" + pathToLists + "\" are loaded.");
    } catch (InvalidPortException | InvalidEndpointException e) {
        // Do not swallow: when the Minio fallback fails no gazetteer was loaded at all.
        throw new IOException("Could not connect to the datastore holding the gazetteers", e);
    } catch (DatastoreException e) {
        throw new IOException("Could not fetch the gazetteers from the datastore", e);
    }
}
Aggregations