Search in sources :

Example 1 with InMemoryLookupCache

use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method writeTsneFormat.

/**
 * Write the tsne format: one CSV line per non-null vocabulary word.
 * <p>
 * Each line holds the comma-separated values of the word's row in
 * {@code tsne}, then a comma, the word label (with spaces replaced by
 * {@code whitespaceReplacement}), a trailing space and a newline.
 *
 * @param vec
 *            the word vectors to use for labeling; its vocab is cast to
 *            {@link InMemoryLookupCache}
 * @param tsne
 *            the tsne array to write; the row read for a word is the
 *            index stored for that word in the vocab
 * @param csv
 *            the file to use (written as UTF-8)
 * @throws Exception
 *             if the file cannot be opened or written
 */
public static void writeTsneFormat(Glove vec, INDArray tsne, File csv) throws Exception {
    InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab();
    int words = 0;
    // try-with-resources closes (and therefore flushes) the writer even if a lookup or write fails
    try (BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), "UTF-8"))) {
        for (String word : vec.vocab().words()) {
            if (word == null) {
                continue;
            }
            StringBuilder sb = new StringBuilder();
            INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex());
            for (int j = 0; j < wordVector.length(); j++) {
                sb.append(wordVector.getDouble(j));
                if (j < wordVector.length() - 1) {
                    sb.append(",");
                }
            }
            sb.append(",");
            sb.append(word.replaceAll(" ", whitespaceReplacement));
            sb.append(" ");
            sb.append("\n");
            write.write(sb.toString());
            // count every line actually written so the log below reports the real total
            words++;
        }
    }
    log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize());
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache)

Example 2 with InMemoryLookupCache

use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method readTextModel.

/**
 * Reads a word2vec model stored as text.
 * <p>
 * The first line is split on single spaces into the number of words and the
 * layer size. Every following line is split on single spaces into a word
 * followed by its vector components; occurrences of
 * {@code whitespaceReplacement} in the word are turned back into spaces.
 * The file is read through a {@link GZIPInputStream} when
 * {@code GzipUtils.isCompressedFilename} accepts its name.
 *
 * @param modelFile
 *            the text (optionally gzip-compressed) model file, read as UTF-8
 * @return a {@link Word2Vec} whose vocab and lookup table were filled from the file
 * @throws FileNotFoundException
 *             if the model file cannot be opened
 * @throws IOException
 *             if reading fails or the file is empty (no header line)
 * @throws NumberFormatException
 *             if the header or a vector component is not a valid number
 */
private static Word2Vec readTextModel(File modelFile) throws IOException, NumberFormatException {
    InMemoryLookupTable lookupTable;
    VocabCache cache;
    INDArray syn0;
    Word2Vec ret = new Word2Vec();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(GzipUtils.isCompressedFilename(modelFile.getName()) ? new GZIPInputStream(new FileInputStream(modelFile)) : new FileInputStream(modelFile), "UTF-8"))) {
        String line = reader.readLine();
        if (line == null) {
            // an empty file has no header; fail with a clear message instead of an NPE on split()
            throw new IOException("Model file " + modelFile + " is empty: expected a '<words> <layerSize>' header line");
        }
        String[] initial = line.split(" ");
        int words = Integer.parseInt(initial[0]);
        int layerSize = Integer.parseInt(initial[1]);
        syn0 = Nd4j.create(words, layerSize);
        cache = new InMemoryLookupCache(false);
        int currLine = 0;
        while ((line = reader.readLine()) != null) {
            String[] split = line.split(" ");
            assert split.length == layerSize + 1;
            String word = split[0].replaceAll(whitespaceReplacement, " ");
            // everything after the word is the vector
            float[] vector = new float[split.length - 1];
            for (int i = 1; i < split.length; i++) {
                vector[i - 1] = Float.parseFloat(split[i]);
            }
            syn0.putRow(currLine, Nd4j.create(vector));
            // the index is taken before the word is registered, so it equals the current vocab size
            cache.addWordToIndex(cache.numWords(), word);
            cache.addToken(new VocabWord(1, word));
            cache.putVocabWord(word);
            currLine++;
        }
        lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().cache(cache).vectorLength(layerSize).build();
        lookupTable.setSyn0(syn0);
        ret.setVocab(cache);
        ret.setLookupTable(lookupTable);
    }
    return ret;
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) GZIPInputStream(java.util.zip.GZIPInputStream) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) VocabCache(org.deeplearning4j.models.word2vec.wordstore.VocabCache) StaticWord2Vec(org.deeplearning4j.models.word2vec.StaticWord2Vec) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec)

Example 3 with InMemoryLookupCache

use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method writeTsneFormat.

/**
 * Write the tsne format: one CSV line per non-null vocabulary word.
 * <p>
 * Each line holds the comma-separated values of the word's row in
 * {@code tsne}, then a comma, the word label (with spaces replaced by
 * {@code whitespaceReplacement}), a trailing space and a newline.
 *
 * @param vec
 *            the word vectors to use for labeling; its vocab is cast to
 *            {@link InMemoryLookupCache}
 * @param tsne
 *            the tsne array to write; the row read for a word is the
 *            index stored for that word in the vocab
 * @param csv
 *            the file to use (written as UTF-8)
 * @throws Exception
 *             if the file cannot be opened or written
 */
public static void writeTsneFormat(Word2Vec vec, INDArray tsne, File csv) throws Exception {
    InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab();
    int words = 0;
    // try-with-resources closes (and therefore flushes) the writer even if a lookup or write fails
    try (BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), "UTF-8"))) {
        for (String word : vec.vocab().words()) {
            if (word == null) {
                continue;
            }
            StringBuilder sb = new StringBuilder();
            INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex());
            for (int j = 0; j < wordVector.length(); j++) {
                sb.append(wordVector.getDouble(j));
                if (j < wordVector.length() - 1) {
                    sb.append(",");
                }
            }
            sb.append(",");
            sb.append(word.replaceAll(" ", whitespaceReplacement));
            sb.append(" ");
            sb.append("\n");
            write.write(sb.toString());
            // count every line actually written so the log below reports the real total
            words++;
        }
    }
    log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize());
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache)

Example 4 with InMemoryLookupCache

use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testFromTableAndVocab.

/**
 * Loads a model, rebuilds word vectors from its lookup table and vocab via
 * {@code WordVectorSerializer.fromTableAndVocab}, and checks the length and
 * first component of the vectors of two known words.
 */
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {
    WordVectors vec = WordVectorSerializer.loadGoogleModel(textFile, false);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    // assertEquals reports the actual length on failure, unlike assertTrue on a comparison
    assertEquals(300, wordVector1.length);
    assertEquals(300, wordVector2.length);
    // expected value goes first, actual second, so failure messages read correctly
    assertEquals(0.044423, wordVector1[0], 1e-3);
    assertEquals(0.051964, wordVector2[0], 1e-3);
}
Also used : InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 5 with InMemoryLookupCache

use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.

the class VocabularyHolderTest method testConstructor.

/**
 * A {@code VocabularyHolder} constructed from a newly created
 * {@code InMemoryLookupCache} must report zero words.
 */
@Test
public void testConstructor() throws Exception {
    final InMemoryLookupCache sourceCache = new InMemoryLookupCache(true);
    final VocabularyHolder vocabularyHolder = new VocabularyHolder(sourceCache, false);

    // no more UNK token here
    assertEquals(0, vocabularyHolder.numWords());
}
Also used : InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) Test(org.junit.Test)

Aggregations

- InMemoryLookupCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache): 11
- Test (org.junit.Test): 7
- INDArray (org.nd4j.linalg.api.ndarray.INDArray): 6
- File (java.io.File): 4
- InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable): 4
- ClassPathResource (org.datavec.api.util.ClassPathResource): 3
- WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors): 3
- VocabWord (org.deeplearning4j.models.word2vec.VocabWord): 3
- Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec): 3
- SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator): 3
- CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor): 3
- DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory): 3
- TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory): 3
- WeightLookupTable (org.deeplearning4j.models.embeddings.WeightLookupTable): 2
- BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator): 2
- UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator): 2
- Ignore (org.junit.Ignore): 2
- FileOutputStream (java.io.FileOutputStream): 1
- GZIPInputStream (java.util.zip.GZIPInputStream): 1
- StaticWord2Vec (org.deeplearning4j.models.word2vec.StaticWord2Vec): 1