Usage of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project:
class ParagraphVectors, method inferVector.
/**
 * Calculates an inferred paragraph vector for the given raw text.
 *
 * <p>The text is tokenized with the configured {@code TokenizerFactory}; tokens that are
 * not present in the model vocabulary are silently dropped before inference.
 *
 * @param text raw text to infer a vector for
 * @param learningRate initial learning rate used for the inference iterations
 * @param minLearningRate lower bound the learning rate decays to
 * @param iterations number of inference passes over the document
 * @return the inferred paragraph vector for the given text
 * @throws IllegalStateException if no TokenizerFactory was configured
 * @throws ND4JIllegalStateException if none of the tokens exist in the model vocabulary
 */
public INDArray inferVector(String text, double learningRate, double minLearningRate, int iterations) {
    if (tokenizerFactory == null)
        throw new IllegalStateException("TokenizerFactory should be defined, prior to inferVector() call");
    // Lazily rebuild vocab/weights from an externally loaded model if needed
    if (this.vocab == null || this.vocab.numWords() == 0)
        reassignExistingModel();
    List<String> tokens = tokenizerFactory.create(text).getTokens();
    // Keep only tokens known to the vocabulary; unknown words cannot contribute to inference
    List<VocabWord> document = new ArrayList<>();
    for (String token : tokens) {
        if (vocab.containsWord(token)) {
            document.add(vocab.wordFor(token));
        }
    }
    if (document.isEmpty())
        throw new ND4JIllegalStateException("Text passed for inference has no matches in model vocabulary.");
    return inferVector(document, learningRate, minLearningRate, iterations);
}
Usage of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project:
class BasicTransformerIterator, method next.
@Override
public Sequence<VocabWord> next() {
    // Pull the next document from the underlying iterator; an absent or
    // empty document yields an empty sequence rather than null.
    LabelledDocument doc = iterator.nextDocument();
    if (doc == null || doc.getContent() == null) {
        return new Sequence<>();
    }

    Sequence<VocabWord> seq = sentenceTransformer.transformToSequence(doc.getContent());

    // Attach each non-blank document label as a sequence label.
    if (doc.getLabels() != null) {
        for (String label : doc.getLabels()) {
            if (label == null || label.isEmpty())
                continue;
            seq.addSequenceLabel(new VocabWord(1.0, label));
        }
    }
    return seq;
}
Usage of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project:
class BinaryCoOccurrenceReaderTest, method testHasMoreObjects2.
@Test
public void testHasMoreObjects2() throws Exception {
    // Round-trip test: write three co-occurrence records to a temp file,
    // then verify the reader yields all three back.
    File tempFile = File.createTempFile("tmp", "tmp");
    tempFile.deleteOnExit();

    // Build a minimal vocabulary and assign Huffman indexes, as the
    // binary reader/writer rely on indexed vocab entries.
    VocabCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
    VocabWord word1 = new VocabWord(1.0, "human");
    VocabWord word2 = new VocabWord(2.0, "animal");
    VocabWord word3 = new VocabWord(3.0, "unknown");
    vocabCache.addToken(word1);
    vocabCache.addToken(word2);
    vocabCache.addToken(word3);
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();
    huffman.applyIndexes(vocabCache);

    BinaryCoOccurrenceWriter<VocabWord> writer = new BinaryCoOccurrenceWriter<>(tempFile);

    CoOccurrenceWeight<VocabWord> object1 = new CoOccurrenceWeight<>();
    object1.setElement1(word1);
    object1.setElement2(word2);
    object1.setWeight(3.14159265);
    writer.writeObject(object1);

    CoOccurrenceWeight<VocabWord> object2 = new CoOccurrenceWeight<>();
    object2.setElement1(word2);
    object2.setElement2(word3);
    object2.setWeight(0.197);
    writer.writeObject(object2);

    CoOccurrenceWeight<VocabWord> object3 = new CoOccurrenceWeight<>();
    object3.setElement1(word1);
    object3.setElement2(word3);
    object3.setWeight(0.001);
    writer.writeObject(object3);

    writer.finish();

    BinaryCoOccurrenceReader<VocabWord> reader = new BinaryCoOccurrenceReader<>(tempFile, vocabCache, null);

    // Exactly three records were written; each read must return a non-null object.
    for (int i = 0; i < 3; i++) {
        CoOccurrenceWeight<VocabWord> received = reader.nextObject();
        log.info("Object received: {}", received);
        assertNotEquals(null, received);
    }
}
Usage of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project:
class ParagraphVectorsTest, method testParagraphVectorsDM.
@Test
public void testParagraphVectorsDM() throws Exception {
// Train ParagraphVectors with the DM (distributed memory) algorithm on the
// bundled raw_sentences corpus, then sanity-check word frequencies,
// document similarities, and DM inference.
ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
File file = resource.getFile();
SentenceIterator iter = new BasicLineIterator(file);
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
// Auto-generated labels of the form DOC_<n>, one per input line
LabelsSource source = new LabelsSource("DOC_");
// Fixed seed + single worker + precise weight init keep the run deterministic;
// hierarchic softmax with no negative sampling/subsampling, DM learning algorithm
ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(2).seed(119).epochs(3).layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter).trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).negativeSample(0).useHierarchicSoftmax(true).sampling(0).workers(1).usePreciseWeightInit(true).sequenceLearningAlgorithm(new DM<VocabWord>()).build();
vec.fit();
// Common corpus words must appear more than once, and with distinct counts
int cnt1 = cache.wordFrequency("day");
int cnt2 = cache.wordFrequency("me");
assertNotEquals(1, cnt1);
assertNotEquals(1, cnt2);
assertNotEquals(cnt1, cnt2);
double simDN = vec.similarity("day", "night");
log.info("day/night similariry: {}", simDN);
// Similarities between related documents are logged only (asserts disabled:
// small corpus makes thresholds flaky)
double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
log.info("9835/12492 similarity: " + similarity1);
// assertTrue(similarity1 > 0.2d);
double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
log.info("3720/16392 similarity: " + similarity2);
// assertTrue(similarity2 > 0.2d);
double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
log.info("6347/3720 similarity: " + similarity3);
// assertTrue(similarity3 > 0.6d);
// Unrelated documents should stay clearly dissimilar
double similarityX = vec.similarity("DOC_3720", "DOC_9852");
log.info("3720/9852 similarity: " + similarityX);
assertTrue(similarityX < 0.5d);
// testing DM inference now
INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
// Two near-identical texts; cosine similarities are logged for inspection
INDArray inferredA1 = vec.inferVector("This is my work");
INDArray inferredB1 = vec.inferVector("This is my work .");
double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
log.info("Cos O/A: {}", cosAO1);
log.info("Cos A/B: {}", cosAB1);
}
Usage of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project:
class ParagraphVectorsTest, method testParagraphVectorsWithWordVectorsModelling1.
@Test
public void testParagraphVectorsWithWordVectorsModelling1() throws Exception {
    // Train ParagraphVectors (with word vectors) on the bundled raw_sentences
    // corpus, then check word frequencies and document similarities.
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    // InMemoryLookupCache cache = new InMemoryLookupCache(false);
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // Auto-generated labels of the form DOC_<n>, one per input line
    LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(3).epochs(1).layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter).trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).sampling(0).build();
    vec.fit();
    // Common corpus words must appear more than once, and with distinct counts
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);
    /*
        We have few lines that contain pretty close words involved.
        These sentences should be pretty close to each other in vector space
    */
    // line 3721: This is my way .
    // line 6348: This is my case .
    // line 9836: This is my house .
    // line 12493: This is my world .
    // line 16393: This is my work .
    // this is special sentence, that has nothing common with previous sentences
    // line 9853: We now have one .
    assertTrue(vec.hasWord("DOC_3720"));
    // Word-level similarities are logged for inspection only
    double similarityD = vec.similarity("day", "night");
    log.info("day/night similarity: {}", similarityD);
    double similarityW = vec.similarity("way", "work");
    log.info("way/work similarity: {}", similarityW);
    double similarityH = vec.similarity("house", "world");
    log.info("house/world similarity: {}", similarityH);
    double similarityC = vec.similarity("case", "way");
    log.info("case/way similarity: {}", similarityC);
    // Document-level similarities (asserts disabled: small corpus makes thresholds flaky)
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: {}", similarity1);
    // assertTrue(similarity1 > 0.7d);
    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: {}", similarity2);
    // assertTrue(similarity2 > 0.7d);
    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: {}", similarity3);
    // assertTrue(similarity2 > 0.7d);
    // likelihood in this case should be significantly lower
    // however, since corpus is small, and weight initialization is random-based, sometimes this test CAN fail
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: {}", similarityX);
    assertTrue(similarityX < 0.5d);
    double sim119 = vec.similarityToLabel("This is my case .", "DOC_6347");
    double sim120 = vec.similarityToLabel("This is my case .", "DOC_3720");
    log.info("1/2: {}/{}", sim119, sim120);
    //assertEquals(similarity3, sim119, 0.001);
}
Aggregations