Use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.
From class ParagraphVectorsTest, method testGoogleModelForInference.
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.loadGoogleModelNonNormalized(
                    new File("/ext/GoogleNews-vectors-negative300.bin.gz"), true, false);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    ParagraphVectors pv = new ParagraphVectors.Builder()
                    .tokenizerFactory(t)
                    .iterations(10)  // the original chain set iterations(10) twice; once is enough
                    .useHierarchicSoftmax(false)
                    .trainWordVectors(false)
                    .useExistingWordVectors(googleVectors)
                    .negativeSample(10)
                    .sequenceLearningAlgorithm(new DM<VocabWord>())
                    .build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
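Since this test only logs the similarity, here is a minimal sketch of how the inferred vectors could actually be asserted on; the sentence text and the 0.5 threshold are illustrative assumptions, not from the test:

    // Sketch only: assumes the ParagraphVectors instance `pv` built above.
    // Inference is stochastic, so identical text yields similar, not identical, vectors.
    INDArray a = pv.inferVector("the quick brown fox jumps over the lazy dog");
    INDArray b = pv.inferVector("the quick brown fox jumps over the lazy dog");
    double sim = Transforms.cosineSim(a, b);
    assertTrue("repeated inference of the same text should be similar", sim > 0.5); // threshold is an assumption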
Use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.
From class TestCnnSentenceDataSetIterator, method testSentenceIterator.
@Test
public void testSentenceIterator() throws Exception {
    WordVectors w2v = WordVectorSerializer
                    .readWord2VecModel(new ClassPathResource("word2vec/googleload/sample_vec.bin").getFile());

    int vectorSize = w2v.lookupTable().layerSize();

    List<String> sentences = new ArrayList<>();
    //First sentence: every word is present in the vocabulary
    sentences.add("these balance Database model");
    //Second sentence: THISWORDDOESNTEXIST is out of vocabulary and is skipped
    sentences.add("into same THISWORDDOESNTEXIST are");
    int maxLength = 4;
    List<String> s1 = Arrays.asList("these", "balance", "Database", "model");
    List<String> s2 = Arrays.asList("into", "same", "are");

    List<String> labelsForSentences = Arrays.asList("Positive", "Negative");

    //Order of labels: alphabetic. Positive -> [0,1]
    INDArray expLabels = Nd4j.create(new double[][] { { 0, 1 }, { 1, 0 } });

    boolean[] alongHeightVals = new boolean[] { true, false };
    for (boolean alongHeight : alongHeightVals) {
        INDArray expectedFeatures;
        if (alongHeight) {
            expectedFeatures = Nd4j.create(2, 1, maxLength, vectorSize);
        } else {
            expectedFeatures = Nd4j.create(2, 1, vectorSize, maxLength);
        }

        //Mask: the second sentence has only 3 in-vocabulary words, so its last step is masked out
        INDArray expectedFeatureMask = Nd4j.create(new double[][] { { 1, 1, 1, 1 }, { 1, 1, 1, 0 } });

        for (int i = 0; i < 4; i++) {
            if (alongHeight) {
                expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0),
                                NDArrayIndex.point(i), NDArrayIndex.all())
                                .assign(w2v.getWordVectorMatrix(s1.get(i)));
            } else {
                expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0),
                                NDArrayIndex.all(), NDArrayIndex.point(i))
                                .assign(w2v.getWordVectorMatrix(s1.get(i)));
            }
        }

        for (int i = 0; i < 3; i++) {
            if (alongHeight) {
                expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0),
                                NDArrayIndex.point(i), NDArrayIndex.all())
                                .assign(w2v.getWordVectorMatrix(s2.get(i)));
            } else {
                expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0),
                                NDArrayIndex.all(), NDArrayIndex.point(i))
                                .assign(w2v.getWordVectorMatrix(s2.get(i)));
            }
        }

        LabeledSentenceProvider p = new CollectionLabeledSentenceProvider(sentences, labelsForSentences, null);
        CnnSentenceDataSetIterator dsi = new CnnSentenceDataSetIterator.Builder()
                        .sentenceProvider(p)
                        .wordVectors(w2v)
                        .maxSentenceLength(256)
                        .minibatchSize(32)
                        .sentencesAlongHeight(alongHeight)
                        .build();

        DataSet ds = dsi.next();
        assertArrayEquals(expectedFeatures.shape(), ds.getFeatures().shape());
        assertEquals(expectedFeatures, ds.getFeatures());
        assertEquals(expLabels, ds.getLabels());
        assertEquals(expectedFeatureMask, ds.getFeaturesMaskArray());
        assertNull(ds.getLabelsMaskArray());

        //Single-sentence loading should match the corresponding rows of the batch features
        INDArray s1F = dsi.loadSingleSentence(sentences.get(0));
        INDArray s2F = dsi.loadSingleSentence(sentences.get(1));
        INDArray sub1 = ds.getFeatures().get(NDArrayIndex.interval(0, 0, true), NDArrayIndex.all(),
                        NDArrayIndex.all(), NDArrayIndex.all());
        INDArray sub2;
        if (alongHeight) {
            sub2 = ds.getFeatures().get(NDArrayIndex.interval(1, 1, true), NDArrayIndex.all(),
                            NDArrayIndex.interval(0, 3), NDArrayIndex.all());
        } else {
            sub2 = ds.getFeatures().get(NDArrayIndex.interval(1, 1, true), NDArrayIndex.all(),
                            NDArrayIndex.all(), NDArrayIndex.interval(0, 3));
        }
        assertArrayEquals(sub1.shape(), s1F.shape());
        assertArrayEquals(sub2.shape(), s2F.shape());
        assertEquals(sub1, s1F);
        assertEquals(sub2, s2F);
    }
}
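As a usage note, a minimal sketch of wiring the same iterator as a training-data source; the sentences, labels, and builder values below are illustrative assumptions, not from the test:

    // Sketch only: assumes the WordVectors instance `w2v` loaded above.
    LabeledSentenceProvider provider = new CollectionLabeledSentenceProvider(
                    Arrays.asList("this movie was great", "this movie was terrible"),
                    Arrays.asList("Positive", "Negative"), null);

    CnnSentenceDataSetIterator trainIter = new CnnSentenceDataSetIterator.Builder()
                    .sentenceProvider(provider)
                    .wordVectors(w2v)
                    .maxSentenceLength(64)       // illustrative cap
                    .minibatchSize(8)
                    .sentencesAlongHeight(true)  // features shaped [mb, 1, sentenceLength, vectorSize]
                    .build();

    while (trainIter.hasNext()) {
        DataSet batch = trainIter.next();
        // feed `batch` to a MultiLayerNetwork/ComputationGraph fit() call here
    }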
Use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.
From class UITest, method testPosting.
@Test
public void testPosting() throws Exception {
    // File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    File inputFile = new ClassPathResource("/basic/word2vec_advance.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder()
                    .minWordFrequency(1)
                    .iterations(1)
                    .epochs(1)
                    .layerSize(20)
                    .stopWords(new ArrayList<String>())
                    .useAdaGrad(false)
                    .negativeSample(5)
                    .seed(42)
                    .windowSize(5)
                    .iterate(iter)
                    .tokenizerFactory(t)
                    .build();
    vec.fit();

    //Round-trip the model through a temp file so the plot runs on deserialized vectors
    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeWordVectors(vec, tempFile);
    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);

    //Initialize the UI server and post a t-SNE plot of the vocabulary to it
    UIServer.getInstance();
    UiConnectionInfo uiConnectionInfo = new UiConnectionInfo.Builder().setAddress("localhost").setPort(9000).build();
    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                    .normalize(false)
                    .setFinalMomentum(0.8f)
                    .numDimension(2)
                    .setMaxIter(10)
                    .build();
    vectors.lookupTable().plotVocab(tsne, vectors.lookupTable().getVocabCache().numWords(), uiConnectionInfo);

    Thread.sleep(100000);  // keep the JVM alive long enough to inspect the UI
}
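After fit() completes, the trained model can also be queried directly; a minimal sketch, where the query words are illustrative assumptions and must occur in the training corpus:

    // Sketch only: assumes the trained Word2Vec instance `vec` from above.
    Collection<String> nearest = vec.wordsNearest("day", 5); // "day" is an assumed in-vocabulary word
    System.out.println("Nearest to 'day': " + nearest);
    double sim = vec.similarity("day", "night"); // cosine similarity; both words assumed in vocab
    System.out.println("day/night similarity: " + sim);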
Use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.
From class WordVectorSerializerTest, method testWriteWordVectorsFromWord2Vec.
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    WordVectors vec = WordVectorSerializer.loadGoogleModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
    INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");

    assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
    assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);
    assertTrue(wordVector1.length() == 300);
    assertTrue(wordVector2.length() == 300);
    assertEquals(wordVector1.getDouble(0), 0.044423, 1e-3);
    assertEquals(wordVector2.getDouble(0), 0.051964, 1e-3);
}
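Note that these serializer tests exercise two lookup APIs: getWordVectorMatrix returns an INDArray, while getWordVector (used in the binary-loader test below) returns a plain double[]. A minimal sketch contrasting them, assuming the word is in the vocabulary:

    // Sketch only: both calls return the same embedding, differing only in type.
    INDArray asNdArray = wordVectors.getWordVectorMatrix("Morgan_Freeman"); // row vector of length 300
    double[] asDoubles = wordVectors.getWordVector("Morgan_Freeman");       // same values as a double[]
    assertEquals(asNdArray.getDouble(0), asDoubles[0], 1e-6);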
Use of org.deeplearning4j.models.embeddings.wordvectors.WordVectors in project deeplearning4j by deeplearning4j.
From class WordVectorSerializerTest, method testLoaderBinary.
@Test
public void testLoaderBinary() throws IOException {
    WordVectors vec = WordVectorSerializer.loadGoogleModel(binaryFile, true);
    assertEquals(vec.vocab().numWords(), 30);
    assertTrue(vec.vocab().hasToken("Morgan_Freeman"));
    assertTrue(vec.vocab().hasToken("JA_Montalbano"));

    double[] wordVector1 = vec.getWordVector("Morgan_Freeman");
    double[] wordVector2 = vec.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
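For completeness, a minimal loading sketch outside the test harness; the file path is an illustrative assumption, and the boolean flag selects the binary rather than text format:

    // Sketch only: path and vocabulary query are assumptions.
    File modelFile = new File("/path/to/GoogleNews-vectors-negative300.bin.gz");
    WordVectors vectors = WordVectorSerializer.loadGoogleModel(modelFile, true); // true => binary format
    System.out.println("Vocab size: " + vectors.vocab().numWords());
    System.out.println("Has 'Morgan_Freeman': " + vectors.vocab().hasToken("Morgan_Freeman"));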