use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class WordVectorSerializer method writeVocabCache.
/**
 * Saves the vocab cache to the provided OutputStream as UTF-8 text, one JSON-serialized word per line.
 * Please note: it saves only vocab content, so it's suitable mostly for BagOfWords/TF-IDF vectorizers.
 * The stream is flushed and closed by this method, even when serialization fails midway.
 *
 * @param vocabCache vocabulary whose elements at indices 0 .. numWords()-1 are written in order
 * @param stream destination stream; it is closed before this method returns
 * @throws IOException if the writer reported a failure while writing to the stream
 */
public static void writeVocabCache(@NonNull VocabCache<VocabWord> vocabCache, @NonNull OutputStream stream) throws IOException {
    // try-with-resources guarantees the writer (and the wrapped stream) is closed on every exit path
    try (PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(stream, "UTF-8")))) {
        for (int x = 0; x < vocabCache.numWords(); x++) {
            VocabWord word = vocabCache.elementAtIndex(x);
            writer.println(word.toJSON());
        }
        // PrintWriter swallows IOExceptions internally; checkError() flushes and reports whether any occurred
        if (writer.checkError()) {
            throw new IOException("Failed to write vocab cache to the provided stream");
        }
    }
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class WordVectorSerializer method readVocabCache.
/**
 * Reads a vocab cache from the provided InputStream, decoding it as UTF-8 and
 * deserializing every line into a VocabWord.
 * Please note: it reads only vocab content, so it's suitable mostly for BagOfWords/TF-IDF vectorizers.
 * The stream is not closed by this method.
 *
 * @param stream source of the line-oriented vocab data
 * @return a cache holding every deserialized word, each registered under its own index and label
 * @throws IOException if reading from the stream fails
 */
public static VocabCache<VocabWord> readVocabCache(@NonNull InputStream stream) throws IOException {
    BufferedReader in = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    VocabWordFactory wordFactory = new VocabWordFactory();

    // One serialized word per line; readLine() returns null at end of stream.
    for (String serialized = in.readLine(); serialized != null; serialized = in.readLine()) {
        VocabWord element = wordFactory.deserialize(serialized);
        cache.addToken(element);
        cache.addWordToIndex(element.getIndex(), element.getLabel());
    }

    return cache;
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class InMemoryLookupCache method importVocabulary.
/**
 * Merges every word of the given vocab cache into this cache.
 * Words not yet present in the vocab map are registered as both token and vocab word;
 * in every case the word's element frequency is added to its frequency counter and
 * to the running total of word occurrences.
 *
 * @param vocabCache cache whose vocab words are imported
 */
@Override
public void importVocabulary(VocabCache<VocabWord> vocabCache) {
    for (VocabWord element : vocabCache.vocabWords()) {
        String label = element.getLabel();

        // First sighting of this label: register it before counting.
        if (!vocabs.containsKey(label)) {
            tokens.put(label, element);
            vocabs.put(label, element);
        }

        wordFrequencies.incrementCount(label, element.getElementFrequency());
        totalWordOccurrences.addAndGet((long) element.getElementFrequency());
    }
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class InMemoryLookupCache method putVocabWord.
/**
 * Registers a word that already exists as a token into the vocab map and the word index,
 * using the index stored on its token. The words "STOP" and "UNK" are ignored.
 *
 * @param word label of the token to register; must be non-null and non-empty
 * @throws IllegalArgumentException if the word is null or empty
 * @throws IllegalStateException if no token is found for the word
 */
@Override
@Deprecated
public synchronized void putVocabWord(String word) {
    if (word == null || word.isEmpty()) {
        throw new IllegalArgumentException("Word can't be empty or null");
    }

    // STOP and UNK are not added as tokens
    if ("STOP".equals(word) || "UNK".equals(word)) {
        return;
    }

    VocabWord existing = tokenFor(word);
    if (existing == null) {
        throw new IllegalStateException("Word " + word + " not found as token in vocab");
    }

    addWordToIndex(existing.getIndex(), word);

    if (!hasToken(word)) {
        throw new IllegalStateException("Unable to add token " + word + " when not already a token");
    }

    vocabs.put(word, existing);
    wordIndex.add(word, existing.getIndex());
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class InMemoryLookupCache method addWordToIndex.
/**
 * Adds a word to the word index at the given position.
 * If the word has no token yet, one is created with frequency 1.0 and counted once.
 * If the word is not yet a vocab word, its token is assigned the given index and stored in the vocab map.
 *
 * @param index position to register the word under
 * @param word label to add; must be non-null and non-empty
 * @throws IllegalArgumentException if the word is null or empty
 */
@Override
public synchronized void addWordToIndex(int index, String word) {
    if (word == null || word.isEmpty()) {
        throw new IllegalArgumentException("Word can't be empty or null");
    }

    if (!tokens.containsKey(word)) {
        VocabWord token = new VocabWord(1.0, word);
        tokens.put(word, token);
        wordFrequencies.incrementCount(word, 1.0);
    }

    // Adding a word to the index directly means it is going to be a vocab word, not just a token.
    if (!vocabs.containsKey(word)) {
        VocabWord vw = tokenFor(word);
        // The index is assigned once; the original repeated this call after the put with the same value.
        vw.setIndex(index);
        vocabs.put(word, vw);
    }

    if (!wordFrequencies.containsKey(word)) {
        wordFrequencies.incrementCount(word, 1);
    }

    wordIndex.add(word, index);
}
Aggregations