Use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.
The class WordVectorSerializer, method writeTsneFormat (Glove overload).
/**
 * Write the t-SNE format: one CSV row per vocabulary word, containing the
 * word's coordinates followed by the word itself.
 *
 * @param vec  the word vectors to use for labeling
 * @param tsne the t-SNE array to write
 * @param csv  the file to write to
 * @throws Exception if the file cannot be written
 */
public static void writeTsneFormat(Glove vec, INDArray tsne, File csv) throws Exception {
    BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), "UTF-8"));
    int words = 0;
    InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab();
    for (String word : vec.vocab().words()) {
        if (word == null) {
            continue;
        }
        StringBuilder sb = new StringBuilder();
        // Look up the t-SNE row for this word via its vocabulary index.
        INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex());
        for (int j = 0; j < wordVector.length(); j++) {
            sb.append(wordVector.getDouble(j));
            if (j < wordVector.length() - 1) {
                sb.append(",");
            }
        }
        sb.append(",");
        sb.append(word.replaceAll(" ", whitespaceReplacement));
        sb.append(" ");
        sb.append("\n");
        write.write(sb.toString());
        words++;
    }
    log.info("Wrote " + words + " words with size of " + vec.lookupTable().layerSize());
    write.flush();
    write.close();
}
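A minimal usage sketch for the Glove overload above, assuming a previously trained Glove model is available and that a 2-D coordinate matrix has already been produced by a t-SNE run. The variable names and the Nd4j.rand stand-in for real t-SNE output are illustrative only, not part of the original source:

// Hedged sketch: `glove` is assumed to be a trained org.deeplearning4j.models.glove.Glove model.
Glove glove = /* previously trained model */ null;
// Stand-in for real t-SNE output: one 2-D coordinate row per vocabulary word.
INDArray coords = Nd4j.rand(glove.vocab().numWords(), 2);
WordVectorSerializer.writeTsneFormat(glove, coords, new File("tsne.csv"));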
Use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.
The class WordVectorSerializer, method readTextModel.
/**
 * Read a Word2Vec model from a plain-text (optionally gzipped) file in the
 * classic word2vec text format.
 *
 * @param modelFile the text model file to read
 * @return the reconstructed Word2Vec model
 * @throws FileNotFoundException if the model file does not exist
 * @throws IOException if the file cannot be read
 * @throws NumberFormatException if the header or a vector component cannot be parsed
 */
private static Word2Vec readTextModel(File modelFile) throws IOException, NumberFormatException {
    InMemoryLookupTable lookupTable;
    VocabCache cache;
    INDArray syn0;
    Word2Vec ret = new Word2Vec();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    GzipUtils.isCompressedFilename(modelFile.getName())
                                    ? new GZIPInputStream(new FileInputStream(modelFile))
                                    : new FileInputStream(modelFile),
                    "UTF-8"))) {
        // Header line: "<number of words> <layer size>"
        String line = reader.readLine();
        String[] initial = line.split(" ");
        int words = Integer.parseInt(initial[0]);
        int layerSize = Integer.parseInt(initial[1]);
        syn0 = Nd4j.create(words, layerSize);
        cache = new InMemoryLookupCache(false);
        int currLine = 0;
        while ((line = reader.readLine()) != null) {
            // Each line: "<word> <v1> <v2> ... <vLayerSize>"
            String[] split = line.split(" ");
            assert split.length == layerSize + 1;
            String word = split[0].replaceAll(whitespaceReplacement, " ");
            float[] vector = new float[split.length - 1];
            for (int i = 1; i < split.length; i++) {
                vector[i - 1] = Float.parseFloat(split[i]);
            }
            syn0.putRow(currLine, Nd4j.create(vector));
            cache.addWordToIndex(cache.numWords(), word);
            cache.addToken(new VocabWord(1, word));
            cache.putVocabWord(word);
            currLine++;
        }
        lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().cache(cache)
                        .vectorLength(layerSize).build();
        lookupTable.setSyn0(syn0);
        ret.setVocab(cache);
        ret.setLookupTable(lookupTable);
    }
    return ret;
}
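The reader above expects the classic word2vec text format: a header line with the vocabulary size and the layer size, then one line per word with its space-separated vector components. Below is a minimal sketch of producing such a file by hand; readTextModel itself is private, so in practice one of the public WordVectorSerializer loaders is the entry point (which one depends on the deeplearning4j version), and the file name and values here are illustrative:

// Write a tiny model in the text format parsed above:
// header "<numWords> <layerSize>", then "<word> <v1> <v2> ..." per line.
File tiny = new File("tiny-vectors.txt");
try (BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tiny), "UTF-8"))) {
    w.write("2 3\n");
    w.write("day 0.10 0.20 0.30\n");
    w.write("night 0.40 0.50 0.60\n");
}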
Use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.
The class WordVectorSerializer, method writeTsneFormat (Word2Vec overload).
/**
 * Write the t-SNE format: one CSV row per vocabulary word, containing the
 * word's coordinates followed by the word itself.
 *
 * @param vec  the word vectors to use for labeling
 * @param tsne the t-SNE array to write
 * @param csv  the file to write to
 * @throws Exception if the file cannot be written
 */
public static void writeTsneFormat(Word2Vec vec, INDArray tsne, File csv) throws Exception {
    BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), "UTF-8"));
    int words = 0;
    InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab();
    for (String word : vec.vocab().words()) {
        if (word == null) {
            continue;
        }
        StringBuilder sb = new StringBuilder();
        // Look up the t-SNE row for this word via its vocabulary index.
        INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex());
        for (int j = 0; j < wordVector.length(); j++) {
            sb.append(wordVector.getDouble(j));
            if (j < wordVector.length() - 1) {
                sb.append(",");
            }
        }
        sb.append(",");
        sb.append(word.replaceAll(" ", whitespaceReplacement));
        sb.append(" ");
        sb.append("\n");
        write.write(sb.toString());
        words++;
    }
    log.info("Wrote " + words + " words with size of " + vec.lookupTable().layerSize());
    write.flush();
    write.close();
}
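Each row produced above is the comma-separated coordinates of one word followed by the word itself (spaces inside the word replaced by whitespaceReplacement), so a row can be split back apart easily. A small illustrative sketch of parsing one such row back into coordinates and a label, where `line` holds one row read from the CSV and the replacement string is assumed unchanged:

// Parse one CSV row of the form "<c1>,<c2>,...,<word> "
String[] parts = line.trim().split(",");
String label = parts[parts.length - 1];
double[] coords = new double[parts.length - 1];
for (int i = 0; i < coords.length; i++) {
    coords[i] = Double.parseDouble(parts[i]);
}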
Use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.
The class WordVectorSerializerTest, method testFromTableAndVocab.
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {
    WordVectors vec = WordVectorSerializer.loadGoogleModel(textFile, false);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
Use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.
The class VocabularyHolderTest, method testConstructor.
@Test
public void testConstructor() throws Exception {
    InMemoryLookupCache cache = new InMemoryLookupCache(true);
    VocabularyHolder holder = new VocabularyHolder(cache, false);
    // no more UNK token here
    assertEquals(0, holder.numWords());
}