Use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.
The class BrownClusters, method init.
/**
 * Initialize the Brown cluster data. This is a singleton, so this process is synchronized and
 * atomic with respect to the <code>get()</code> method above.
 * @param pathsToClusterFiles the files containing the data.
 * @param thresholds the minimum occurrence count a word needs in order to be included, per resource.
 * @param isLowercaseBrownClusters whether each cluster resource is lowercased.
 */
public static void init(Vector<String> pathsToClusterFiles, Vector<Integer> thresholds,
        Vector<Boolean> isLowercaseBrownClusters) {
    try {
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.brown-clusters",
                "brown-clusters", 1.5, false);
        synchronized (INIT_SYNC) {
            brownclusters = new BrownClusters();
            brownclusters.isLowercaseBrownClustersByResource = new boolean[isLowercaseBrownClusters.size()];
            brownclusters.wordToPathByResource = new ArrayList<>();
            brownclusters.resources = new ArrayList<>();
            for (int i = 0; i < pathsToClusterFiles.size(); i++) {
                THashMap<String, String> h = new THashMap<>();
                // We used to access the files as resources. Now we access them programmatically.
                // InFile in = new InFile(ResourceUtilities.loadResource(pathsToClusterFiles.elementAt(i)));
                InputStream is = new FileInputStream(gazDirectory.getPath() + File.separator
                        + pathsToClusterFiles.elementAt(i));
                InFile in = new InFile(is);
                String line = in.readLine();
                int wordsAdded = 0;
                while (line != null) {
                    // Each line is "<bit-string cluster path> <word> <occurrence count>".
                    StringTokenizer st = new StringTokenizer(line);
                    String path = st.nextToken();
                    String word = st.nextToken();
                    int occ = Integer.parseInt(st.nextToken());
                    // Keep only words frequent enough for their cluster paths to be reliable.
                    if (occ >= thresholds.elementAt(i)) {
                        h.put(word, path);
                        wordsAdded++;
                    }
                    line = in.readLine();
                }
                if (ParametersForLbjCode.currentParameters.debug) {
                    logger.info(wordsAdded + " words added");
                }
                brownclusters.wordToPathByResource.add(h);
                brownclusters.isLowercaseBrownClustersByResource[i] = isLowercaseBrownClusters.elementAt(i);
                brownclusters.resources.add(pathsToClusterFiles.elementAt(i));
                in.close();
            }
        }
    } catch (InvalidPortException | InvalidEndpointException | DatastoreException
            | FileNotFoundException e) {
        e.printStackTrace();
    }
}
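For context, a minimal sketch of how this initializer might be invoked. The file name, threshold, and case setting below are illustrative assumptions, not the project's shipped configuration:

import java.util.Vector;

public class BrownClustersSetup {
    public static void main(String[] args) {
        Vector<String> paths = new Vector<>();
        paths.add("brownBllipClusters"); // hypothetical cluster file name
        Vector<Integer> thresholds = new Vector<>();
        thresholds.add(5); // keep only words seen at least 5 times
        Vector<Boolean> lowercase = new Vector<>();
        lowercase.add(false); // this resource preserves case
        BrownClusters.init(paths, thresholds, lowercase);
    }
}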
Use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.
The class TitleTextNormalizer, method init.
public static void init() {
    if (!ParametersForLbjCode.currentParameters.normalizeTitleText)
        return;
    InFile in = new InFile(pathToBrownClusterForWordFrequencies);
    String line = in.readLine();
    lowercasedToNormalizedTokensMap = new HashMap<>();
    HashMap<String, Integer> normalizedTokenCounts = new HashMap<>();
    while (line != null) {
        // Each line is "<cluster path> <word> <occurrence count>"; the path is unused here.
        StringTokenizer st = new StringTokenizer(line);
        String path = st.nextToken();
        String word = st.nextToken();
        int occ = Integer.parseInt(st.nextToken());
        if (lowercasedToNormalizedTokensMap.containsKey(word.toLowerCase())) {
            // Keep whichever surface form of this word is most frequent.
            String normalizedWord = lowercasedToNormalizedTokensMap.get(word.toLowerCase());
            int prevCount = normalizedTokenCounts.get(normalizedWord);
            if (prevCount < occ) {
                lowercasedToNormalizedTokensMap.put(word.toLowerCase(), word);
                normalizedTokenCounts.put(word, occ);
            }
        } else {
            lowercasedToNormalizedTokensMap.put(word.toLowerCase(), word);
            normalizedTokenCounts.put(word, occ);
        }
        line = in.readLine();
    }
    // Close the reader once the frequency map is built.
    in.close();
}
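The effect of this map is that each lowercased token resolves to its most frequent surface form. A standalone sketch of that "keep the most frequent casing" rule, with invented counts:

import java.util.HashMap;

public class CasingDemo {
    public static void main(String[] args) {
        HashMap<String, String> lowerToNormalized = new HashMap<>();
        HashMap<String, Integer> counts = new HashMap<>();
        // (word, count) pairs as they might appear in the frequency file; values are invented.
        String[][] rows = {{"apple", "120"}, {"Apple", "300"}, {"APPLE", "40"}};
        for (String[] row : rows) {
            String word = row[0];
            int occ = Integer.parseInt(row[1]);
            String key = word.toLowerCase();
            String current = lowerToNormalized.get(key);
            // Adopt this surface form if it is the first seen or more frequent than the current one.
            if (current == null || counts.get(current) < occ) {
                lowerToNormalized.put(key, word);
                counts.put(word, occ);
            }
        }
        System.out.println(lowerToNormalized.get("apple")); // prints "Apple"
    }
}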
Use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.
The class WordEmbeddings, method init.
/*
 * For now, the parameter minWordAppearanceThres is not used, but I'm planning to use it the
 * same way the word-appearance thresholds are used for the Brown clusters.
 */
public static void init(Vector<String> filenames, Vector<Integer> embeddingDimensionality,
        Vector<Integer> minWordAppearanceThres, Vector<Boolean> isLowecasedEmbedding,
        Vector<Double> normalizationConstant, Vector<NormalizationMethod> methods) {
    dimensionalitiesSum = 0;
    dimensionalities = new Vector<>();
    resources = new Vector<>();
    embeddingByResource = new Vector<>();
    isLowecasedEmbeddingByResource = new Vector<>();
    for (int resourceId = 0; resourceId < filenames.size(); resourceId++) {
        HashMap<String, double[]> embedding = new HashMap<>();
        InFile in = new InFile(ResourceUtilities.loadResource(filenames.elementAt(resourceId)));
        String line = in.readLine();
        double maxAbsValueInAnyDimension = 0;
        while (line != null) {
            // Each line is "<token> <v1> <v2> ... <vD>".
            StringTokenizer st = new StringTokenizer(line, " ");
            String token = st.nextToken();
            Vector<String> v = new Vector<>();
            while (st.hasMoreTokens())
                v.addElement(st.nextToken());
            if (v.size() != embeddingDimensionality.elementAt(resourceId))
                throw new IllegalArgumentException("Unexpected dimensionality of " + v.size()
                        + " for token " + token);
            double[] arr = new double[v.size()];
            double maxInThisDimension = 0;
            for (int i = 0; i < arr.length; i++) {
                arr[i] = Double.parseDouble(v.elementAt(i));
                if (maxAbsValueInAnyDimension < Math.abs(arr[i]))
                    maxAbsValueInAnyDimension = Math.abs(arr[i]);
                if (maxInThisDimension < Math.abs(arr[i]))
                    maxInThisDimension = Math.abs(arr[i]);
            }
            // INDEPENDENT normalization: scale each vector by its own largest absolute value.
            if (maxInThisDimension > 0
                    && methods.elementAt(resourceId).equals(NormalizationMethod.INDEPENDENT))
                for (int i = 0; i < arr.length; i++)
                    arr[i] = arr[i] / (normalizationConstant.elementAt(resourceId) * maxInThisDimension);
            embedding.put(token, arr);
            line = in.readLine();
        }
        in.close();
        // OVERALL normalization: scale every vector by the largest absolute value in the resource.
        if (maxAbsValueInAnyDimension > 0
                && methods.elementAt(resourceId).equals(NormalizationMethod.OVERALL))
            for (String s : embedding.keySet()) {
                double[] arr = embedding.get(s);
                for (int j = 0; j < arr.length; j++)
                    arr[j] = arr[j] / (normalizationConstant.elementAt(resourceId) * maxAbsValueInAnyDimension);
            }
        embeddingByResource.addElement(embedding);
        dimensionalitiesSum += embeddingDimensionality.elementAt(resourceId);
        dimensionalities.addElement(embeddingDimensionality.elementAt(resourceId));
        resources.addElement(filenames.elementAt(resourceId));
        isLowecasedEmbeddingByResource.addElement(isLowecasedEmbedding.elementAt(resourceId));
    }
}
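A minimal, hypothetical call, assuming a 50-dimensional embedding file in the expected "<token> v1 ... v50" format; the file name and all settings below are illustrative only, and NormalizationMethod is the enum consumed by init above:

import java.util.Vector;

public class WordEmbeddingsSetup {
    public static void main(String[] args) {
        Vector<String> files = new Vector<>();
        files.add("myEmbeddings50d.txt"); // hypothetical file name
        Vector<Integer> dims = new Vector<>();
        dims.add(50);
        Vector<Integer> minAppearance = new Vector<>();
        minAppearance.add(0); // currently unused, per the comment above
        Vector<Boolean> lowercased = new Vector<>();
        lowercased.add(true);
        Vector<Double> normConstants = new Vector<>();
        normConstants.add(1.0);
        Vector<NormalizationMethod> methods = new Vector<>();
        methods.add(NormalizationMethod.OVERALL);
        WordEmbeddings.init(files, dims, minAppearance, lowercased, normConstants, methods);
    }
}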
Use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.
The class DocumentCollection, method addDocuments.
/*
 * This code assumes that each line in the file contains a new document.
 */
public void addDocuments(String filename, int classID, StopWords stops, boolean discardFirstToken,
        String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    Vector<String> words = in.readLineTokens(tokenizationDelimiters);
    // The first token on a line may be a label or id rather than document text.
    if ((discardFirstToken) && (words != null) && (words.size() > 0))
        words.removeElementAt(0);
    if (stops != null)
        words = stops.filterStopWords(words);
    while (words != null) {
        // Skip blank lines so they do not become empty documents.
        if (words.size() > 0)
            docs.addElement(new Document(words, classID));
        words = in.readLineTokens(tokenizationDelimiters);
        if ((discardFirstToken) && (words != null) && (words.size() > 0))
            words.removeElementAt(0);
        if (stops != null)
            words = stops.filterStopWords(words);
    }
}
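A sketch of how this loader might build a two-class collection. The file names and the one-document-per-line convention are assumptions for the example; passing null for stops skips stop-word filtering, as the null check above shows:

DocumentCollection collection = new DocumentCollection();
// Discard the first token of each line (e.g., a document id); no stop-word filtering.
collection.addDocuments("positiveDocs.txt", 1, null, true, " \t");
collection.addDocuments("negativeDocs.txt", 0, null, true, " \t");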