Search in sources :

Example 16 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

The class ParagraphVectorsTest, method testParagraphVectorsWithWordVectorsModelling1.

@Test
public void testParagraphVectorsWithWordVectorsModelling1() throws Exception {
    // Train ParagraphVectors jointly with word vectors on the big sentences corpus,
    // then sanity-check both the vocabulary counts and doc/word similarities.
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator lineIterator = new BasicLineIterator(corpus);

    AbstractCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource labelsSource = new LabelsSource("DOC_");

    ParagraphVectors vectors = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .iterations(3)
                    .epochs(1)
                    .layerSize(100)
                    .learningRate(0.025)
                    .labelsSource(labelsSource)
                    .windowSize(5)
                    .iterate(lineIterator)
                    .trainWordVectors(true)
                    .vocabCache(vocabCache)
                    .tokenizerFactory(tokenizer)
                    .sampling(0)
                    .build();
    vectors.fit();

    // Common words must occur more than once, and with distinct counts.
    int dayCount = vocabCache.wordFrequency("day");
    int meCount = vocabCache.wordFrequency("me");
    assertNotEquals(1, dayCount);
    assertNotEquals(1, meCount);
    assertNotEquals(dayCount, meCount);

    /*
     * A few corpus lines contain closely related words involved, so their
     * document vectors should end up pretty close to each other in vector space:
     *   line 3721:  This is my way .
     *   line 6348:  This is my case .
     *   line 9836:  This is my house .
     *   line 12493: This is my world .
     *   line 16393: This is my work .
     * Plus one control sentence that has nothing in common with the above:
     *   line 9853:  We now have one .
     * (Document labels are zero-based, hence DOC_3720 for line 3721, etc.)
     */
    assertTrue(vectors.hasWord("DOC_3720"));

    double dayNightSim = vectors.similarity("day", "night");
    log.info("day/night similarity: " + dayNightSim);
    double wayWorkSim = vectors.similarity("way", "work");
    log.info("way/work similarity: " + wayWorkSim);
    double houseWorldSim = vectors.similarity("house", "world");
    log.info("house/world similarity: " + houseWorldSim);
    double caseWaySim = vectors.similarity("case", "way");
    log.info("case/way similarity: " + caseWaySim);

    // Related documents: logged only — thresholds are too flaky on this small corpus.
    double docSim1 = vectors.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + docSim1);
    double docSim2 = vectors.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + docSim2);
    double docSim3 = vectors.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + docSim3);

    // The unrelated pair should score significantly lower. However, since the
    // corpus is small and weight initialization is random-based, this CAN fail.
    double unrelatedSim = vectors.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + unrelatedSim);
    assertTrue(unrelatedSim < 0.5d);

    double inferredToOwnDoc = vectors.similarityToLabel("This is my case .", "DOC_6347");
    double inferredToOtherDoc = vectors.similarityToLabel("This is my case .", "DOC_3720");
    log.info("1/2: " + inferredToOwnDoc + "/" + inferredToOtherDoc);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) FileSentenceIterator(org.deeplearning4j.text.sentenceiterator.FileSentenceIterator) AggregatingSentenceIterator(org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) LabelsSource(org.deeplearning4j.text.documentiterator.LabelsSource) File(java.io.File) Test(org.junit.Test)

Example 17 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

The class ParagraphVectorsTest, method testParagraphVectorsReducedLabels1.

/**
     * This test is not indicative.
     * there's no need in this test within travis, use it manually only for problems detection
     *
     * @throws Exception
     */
@Test
@Ignore
public void testParagraphVectorsReducedLabels1() throws Exception {
    // Train on the small labeled-folder corpus and log how close the mean of a few
    // word vectors lands to each of the three label vectors.
    File labeledDir = new ClassPathResource("/labeled").getFile();
    LabelAwareIterator documentIterator = new FileLabelAwareIterator.Builder()
                    .addSourceFolder(labeledDir)
                    .build();
    TokenizerFactory tokenizer = new DefaultTokenizerFactory();

    // Please note: the text corpus is REALLY small; some kind of "results" would need
    // a HIGH epochs number (like 30), but there's no reason to keep it that high here.
    ParagraphVectors vectors = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .epochs(3)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .windowSize(5)
                    .iterate(documentIterator)
                    .tokenizerFactory(tokenizer)
                    .build();
    vectors.fit();

    // Average the vectors of three words into a single query vector.
    INDArray wordI = vectors.lookupTable().vector("I");
    INDArray wordAm = vectors.lookupTable().vector("am");
    INDArray wordSad = vectors.lookupTable().vector("sad.");

    INDArray stacked = Nd4j.create(3, vectors.lookupTable().layerSize());
    stacked.putRow(0, wordI);
    stacked.putRow(1, wordAm);
    stacked.putRow(2, wordSad);
    INDArray mean = stacked.isMatrix() ? stacked.mean(0) : stacked;

    log.info("Mean" + Arrays.toString(mean.dup().data().asDouble()));
    log.info("Array" + Arrays.toString(vectors.lookupTable().vector("negative").dup().data().asDouble()));

    // This test is not indicative; similarities are logged for manual inspection only.
    double simN = Transforms.cosineSim(mean, vectors.lookupTable().vector("negative"));
    log.info("Similarity negative: " + simN);
    double simP = Transforms.cosineSim(mean, vectors.lookupTable().vector("neutral"));
    log.info("Similarity neutral: " + simP);
    double simV = Transforms.cosineSim(mean, vectors.lookupTable().vector("positive"));
    log.info("Similarity positive: " + simV);
}
Also used : DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) INDArray(org.nd4j.linalg.api.ndarray.INDArray) LabelAwareIterator(org.deeplearning4j.text.documentiterator.LabelAwareIterator) FileLabelAwareIterator(org.deeplearning4j.text.documentiterator.FileLabelAwareIterator) File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 18 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

The class ParallelTransformerIteratorTest, method testSpeedComparison1.

/**
 * Compares single- vs multi-threaded SentenceTransformer throughput over the same
 * corpus (25 epochs worth of lines), both directly and through a prefetching
 * {@link BasicLabelAwareIterator} wrapper. Times are logged, not asserted — the
 * assertions only verify that every produced sequence is non-null and non-empty.
 *
 * Refactored: the four identical drain/validate/time loops are now a single
 * private helper, and the redundant {@code baseIterator} alias is gone.
 */
@Test
public void testSpeedComparison1() throws Exception {
    SentenceIterator iterator = new MutipleEpochsSentenceIterator(
                    new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile()), 25);

    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator)
                    .allowMultithreading(false).tokenizerFactory(factory).build();
    int cnt = drainAndValidate(transformer.iterator(), 0, "Single-threaded time: {} ms");

    iterator.reset();
    transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
                    .tokenizerFactory(factory).build();
    cnt = drainAndValidate(transformer.iterator(), cnt, "Multi-threaded time: {} ms");

    iterator.reset();
    // Same corpus again, but behind a label-aware (prefetching) wrapper.
    LabelAwareIterator lai = new BasicLabelAwareIterator.Builder(new MutipleEpochsSentenceIterator(
                    new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile()), 25)).build();
    transformer = new SentenceTransformer.Builder().iterator(lai).allowMultithreading(false)
                    .tokenizerFactory(factory).build();
    cnt = drainAndValidate(transformer.iterator(), cnt, "Prefetched Single-threaded time: {} ms");

    lai.reset();
    transformer = new SentenceTransformer.Builder().iterator(lai).allowMultithreading(true)
                    .tokenizerFactory(factory).build();
    drainAndValidate(transformer.iterator(), cnt, "Prefetched Multi-threaded time: {} ms");
}

/**
 * Drains {@code iter} completely, asserting each sequence is non-null and non-empty,
 * then logs the elapsed wall-clock time using the given SLF4J message pattern.
 *
 * @param iter    sequence iterator to exhaust
 * @param cnt     running iteration counter (continued across phases, as in the original test)
 * @param message SLF4J format string with one {} placeholder for the elapsed millis
 * @return the updated iteration counter
 */
private int drainAndValidate(Iterator<Sequence<VocabWord>> iter, int cnt, String message) {
    long time1 = System.currentTimeMillis();
    while (iter.hasNext()) {
        Sequence<VocabWord> sequence = iter.next();
        assertNotEquals("Failed on [" + cnt + "] iteration", null, sequence);
        assertNotEquals("Failed on [" + cnt + "] iteration", 0, sequence.size());
        cnt++;
    }
    long time2 = System.currentTimeMillis();
    log.info(message, time2 - time1);
    return cnt;
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) MutipleEpochsSentenceIterator(org.deeplearning4j.text.sentenceiterator.MutipleEpochsSentenceIterator) BasicLabelAwareIterator(org.deeplearning4j.text.documentiterator.BasicLabelAwareIterator) AsyncLabelAwareIterator(org.deeplearning4j.text.documentiterator.AsyncLabelAwareIterator) BasicLabelAwareIterator(org.deeplearning4j.text.documentiterator.BasicLabelAwareIterator) LabelAwareIterator(org.deeplearning4j.text.documentiterator.LabelAwareIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) Sequence(org.deeplearning4j.models.sequencevectors.sequence.Sequence) PrefetchingSentenceIterator(org.deeplearning4j.text.sentenceiterator.PrefetchingSentenceIterator) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) MutipleEpochsSentenceIterator(org.deeplearning4j.text.sentenceiterator.MutipleEpochsSentenceIterator) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 19 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

The class VocabConstructorTest, method testBuildJointVocabulary2.

@Test
public void testBuildJointVocabulary2() throws Exception {
    // Build a vocabulary WITH occurrence counts from the big sentences corpus,
    // keeping only words seen at least 5 times, and pin the expected totals.
    SentenceIterator sentences = new BasicLineIterator(
                    new ClassPathResource("big/raw_sentences.txt").getFile());
    VocabCache<VocabWord> vocab = new AbstractCache.Builder<VocabWord>().build();

    SentenceTransformer transformer = new SentenceTransformer.Builder()
                    .iterator(sentences)
                    .tokenizerFactory(t)
                    .build();
    AbstractSequenceIterator<VocabWord> sequences =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequences, 5)
                    .useAdaGrad(false)
                    .setTargetVocabCache(vocab)
                    .build();
    // resetCounters=false, buildHuffmanTree=true
    constructor.buildJointVocabulary(false, true);

    assertEquals(242, vocab.numWords());
    // Most frequent words get the lowest indices.
    assertEquals("i", vocab.wordAtIndex(1));
    assertEquals("it", vocab.wordAtIndex(0));
    assertEquals(634303, vocab.totalWordOccurrences());
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) File(java.io.File) Test(org.junit.Test)

Example 20 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

The class VocabConstructorTest, method testBuildJointVocabulary1.

@Test
public void testBuildJointVocabulary1() throws Exception {
    // Build a vocabulary from the big sentences corpus with no frequency cutoff
    // and counters reset: word set is populated, but occurrence totals stay zero.
    SentenceIterator sentences = new BasicLineIterator(
                    new ClassPathResource("big/raw_sentences.txt").getFile());
    VocabCache<VocabWord> vocab = new AbstractCache.Builder<VocabWord>().build();

    SentenceTransformer transformer = new SentenceTransformer.Builder()
                    .iterator(sentences)
                    .tokenizerFactory(t)
                    .build();
    // Pack the transformer into an AbstractSequenceIterator.
    AbstractSequenceIterator<VocabWord> sequences =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequences, 0)
                    .useAdaGrad(false)
                    .setTargetVocabCache(vocab)
                    .build();
    // resetCounters=true, buildHuffmanTree=false
    constructor.buildJointVocabulary(true, false);

    assertEquals(244, vocab.numWords());
    assertEquals(0, vocab.totalWordOccurrences());
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) File(java.io.File) Test(org.junit.Test)

Aggregations

ClassPathResource (org.datavec.api.util.ClassPathResource)72 Test (org.junit.Test)63 File (java.io.File)45 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)28 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)27 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)27 INDArray (org.nd4j.linalg.api.ndarray.INDArray)24 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)23 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)23 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)20 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)12 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)11 WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors)10 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)10 ArrayList (java.util.ArrayList)9 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)8 DataSet (org.nd4j.linalg.dataset.DataSet)8 AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator)7 FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator)7 InputStream (java.io.InputStream)6