Search in sources:

Example 6 with WordVectors

use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testUnifiedLoaderText.

/**
 * Loads the same text (CSV-style) vectors file through the dedicated text loader and through
 * the unified loader, then checks that both return the same vector for a known word.
 *
 * @throws Exception propagated from the loaders
 */
@Test
public void testUnifiedLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors textLoaded = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors unifiedLoaded = WordVectorSerializer.readWord2VecModel(textFile, true);

    INDArray expectedRow = textLoaded.getWordVectorMatrix("Morgan_Freeman");
    INDArray unifiedRow = unifiedLoaded.getWordVectorMatrix("Morgan_Freeman");

    // The text loader's result is the reference; it must exist before the comparison means anything.
    assertNotEquals(null, expectedRow);
    assertEquals(expectedRow, unifiedRow);

    // The EXTENDED model is requested above, but the file carries no syn1/huffman info,
    // so the loader is expected to fall back silently to the simplified model (no syn1).
    InMemoryLookupTable unifiedTable = (InMemoryLookupTable) unifiedLoaded.lookupTable();
    assertEquals(null, unifiedTable.getSyn1());
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) Test(org.junit.Test)

Example 7 with WordVectors

use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testFromTableAndVocab.

/**
 * Rebuilds a {@code WordVectors} instance from the lookup table and vocab cache of a loaded
 * model and checks the dimensionality and first component of two known word vectors.
 *
 * @throws IOException propagated from loading the model file
 */
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {
    WordVectors vec = WordVectorSerializer.loadGoogleModel(textFile, false);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    // assertEquals (expected first) reports the actual length on failure, unlike assertTrue(a == b).
    assertEquals(300, wordVector1.length);
    assertEquals(300, wordVector2.length);
    // Expected value goes first so a failure message labels the two numbers correctly;
    // direct array access avoids boxing the whole vector just to read element 0.
    assertEquals(0.044423, wordVector1[0], 1e-3);
    assertEquals(0.051964, wordVector2[0], 1e-3);
}
Also used : InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 8 with WordVectors

use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testStaticLoaderGoogleModel.

/**
 * Manual timing check: loads the real Google News model as a static model and logs how long
 * the load took.
 * Keep it ignored, since it requires the full Google model (1.6 GB compressed) to be present
 * at the hard-coded, machine-specific path below.
 *
 * @throws Exception propagated from the loader
 */
@Test
@Ignore
public void testStaticLoaderGoogleModel() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    long startedAt = System.currentTimeMillis();
    File modelFile = new File("C:\\Users\\raver\\develop\\GoogleNews-vectors-negative300.bin.gz");
    WordVectors staticModel = WordVectorSerializer.loadStaticModel(modelFile);
    long elapsedMs = System.currentTimeMillis() - startedAt;

    logger.info("Loading time: {} ms", elapsedMs);
}
Also used : WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) File(java.io.File) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 9 with WordVectors

use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testStaticLoaderText.

/**
 * Loads the same text (CSV-style) vectors file through the text loader and through the static
 * model loader, then checks that both return the same vector for a known word.
 *
 * @throws Exception propagated from the loaders
 */
@Test
public void testStaticLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors textLoaded = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors staticLoaded = WordVectorSerializer.loadStaticModel(textFile);

    INDArray expectedRow = textLoaded.getWordVectorMatrix("Morgan_Freeman");
    INDArray staticRow = staticLoaded.getWordVectorMatrix("Morgan_Freeman");

    // The text loader's result is the reference; it must exist before the comparison means anything.
    assertNotEquals(null, expectedRow);
    assertEquals(expectedRow, staticRow);
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) Test(org.junit.Test)

Example 10 with WordVectors

use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testLoaderTextSmall.

/**
 * Loads the small sample model from its text and binary forms and checks that both report
 * roughly the same similarity (tolerance 1e-1) between "DBMS" and "DBMS's".
 *
 * @throws Exception propagated from resource resolution or model loading
 */
@Test
@Ignore
public void testLoaderTextSmall() throws Exception {
    // Reference vector kept from the original test; no assertion below reads it.
    INDArray referenceVector = Nd4j.create(new double[] {
        0.002001, 0.002210, -0.001915, -0.001639, 0.000683, 0.001511, 0.000470, 0.000106, -0.001802, 0.001109,
        -0.002178, 0.000625, -0.000376, -0.000479, -0.001658, -0.000941, 0.001290, 0.001513, 0.001485, 0.000799,
        0.000772, -0.001901, -0.002048, 0.002485, 0.001901, 0.001545, -0.000302, 0.002008, -0.000247, 0.000367,
        -0.000075, -0.001492, 0.000656, -0.000669, -0.001913, 0.002377, 0.002190, -0.000548, -0.000113, 0.000255,
        -0.001819, -0.002004, 0.002277, 0.000032, -0.001291, -0.001521, -0.001538, 0.000848, 0.000101, 0.000666,
        -0.002107, -0.001904, -0.000065, 0.000572, 0.001275, -0.001585, 0.002040, 0.000463, 0.000560, -0.000304,
        0.001493, -0.001144, -0.001049, 0.001079, -0.000377, 0.000515, 0.000902, -0.002044, -0.000992, 0.001457,
        0.002116, 0.001966, -0.001523, -0.001054, -0.000455, 0.001001, -0.001894, 0.001499, 0.001394, -0.000799,
        -0.000776, -0.001119, 0.002114, 0.001956, -0.000590, 0.002107, 0.002410, 0.000908, 0.002491, -0.001556,
        -0.000766, -0.001054, -0.001454, 0.001407, 0.000790, 0.000212, -0.001097, 0.000762, 0.001530, 0.000097,
        0.001140, -0.002476, 0.002157, 0.000240, -0.000916, -0.001042, -0.000374, -0.001468, -0.002185, -0.001419,
        0.002139, -0.000885, -0.001340, 0.001159, -0.000852, 0.002378, -0.000802, -0.002294, 0.001358, -0.000037,
        -0.001744, 0.000488, 0.000721, -0.000241, 0.000912, -0.001979, 0.000441, 0.000908, -0.001505, 0.000071,
        -0.000030, -0.001200, -0.001416, -0.002347, 0.000011, 0.000076, 0.000005, -0.001967, -0.002481, -0.002373,
        -0.002163, -0.000274, 0.000696, 0.000592, -0.001591, 0.002499, -0.001006, -0.000637, -0.000702, 0.002366,
        -0.001882, 0.000581, -0.000668, 0.001594, 0.000020, 0.002135, -0.001410, -0.001303, -0.002096, -0.001833,
        -0.001600, -0.001557, 0.001222, -0.000933, 0.001340, 0.001845, 0.000678, 0.001475, 0.001238, 0.001170,
        -0.001775, -0.001717, -0.001828, -0.000066, 0.002065, -0.001368, -0.001530, -0.002098, 0.001653, -0.002089,
        -0.000290, 0.001089, -0.002309, -0.002239, 0.000721,
        0.001762, 0.002132, 0.001073, 0.001581, -0.001564, -0.001820, 0.001987, -0.001382, 0.000877, 0.000287,
        0.000895, -0.000591, 0.000099, -0.000843, -0.000563
    });
    String queryWord = "database";
    String comparedWord = "DBMS";

    // Same sample model in two serializations: text first, then binary.
    ClassPathResource textResource = new ClassPathResource("word2vec/googleload/sample_vec.txt");
    WordVectors textModel = WordVectorSerializer.loadGoogleModel(textResource.getFile(), false, true);
    ClassPathResource binaryResource = new ClassPathResource("word2vec/googleload/sample_vec.bin");
    WordVectors binaryModel = WordVectorSerializer.loadGoogleModel(binaryResource.getFile(), true, true);

    // Weights and nearest-word lists are fetched as in the original; only the binary list is printed.
    INDArray weightsFromText = textModel.lookupTable().getWeights();
    INDArray weightsFromBinary = binaryModel.lookupTable().getWeights();
    Collection<String> neighboursFromText = textModel.wordsNearest(queryWord, 10);
    Collection<String> neighboursFromBinary = binaryModel.wordsNearest(queryWord, 10);
    System.out.println(neighboursFromBinary);

    double similarityFromText = textModel.similarity(comparedWord, "DBMS's");
    double similarityFromBinary = binaryModel.similarity(comparedWord, "DBMS's");
    assertEquals(similarityFromText, similarityFromBinary, 1e-1);
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) ClassPathResource(org.datavec.api.util.ClassPathResource) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors)26 Test (org.junit.Test)26 File (java.io.File)15 INDArray (org.nd4j.linalg.api.ndarray.INDArray)15 ClassPathResource (org.datavec.api.util.ClassPathResource)10 Ignore (org.junit.Ignore)9 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)6 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)6 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)6 InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable)4 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)4 ArrayList (java.util.ArrayList)3 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)3 VocabCache (org.deeplearning4j.models.word2vec.wordstore.VocabCache)3 InMemoryLookupCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache)3 UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator)3 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)2 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)2 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1