Search in sources :

Example 21 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class VocabWordFactoryTest method testSerialize.

/**
 * Serializes a {@link VocabWord} with an {@link AbstractElementFactory}, deserializes the
 * result, and checks that the restored element equals the original.
 */
@Test
public void testSerialize() throws Exception {
    final AbstractElementFactory<VocabWord> elementFactory = new AbstractElementFactory<>(VocabWord.class);
    final VocabWord original = new VocabWord(1, "word");

    // Dump the serialized form to stdout for manual inspection.
    System.out.println("VocabWord JSON: " + elementFactory.serialize(original));

    // Round trip: serialize again and feed the result straight back into the factory.
    final VocabWord restored = elementFactory.deserialize(elementFactory.serialize(original));
    assertEquals(original, restored);
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) Test(org.junit.Test)

Example 22 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class GraphTransformerTest method setUp.

/**
 * Builds the {@code graph} fixture once; later invocations return immediately because the
 * field is already populated.
 *
 * <p>Each of the ten vertices receives a {@link VocabWord} constructed from its index and the
 * index's string form, and is connected to vertex {@code i + 3}, falling back to vertex 0
 * whenever {@code i + 3} reaches 10 or more.
 */
@Before
public void setUp() throws Exception {
    if (graph != null) {
        return;
    }

    graph = new Graph<>(10, false, new AbstractVertexFactory<VocabWord>());
    for (int vertex = 0; vertex < 10; vertex++) {
        graph.getVertex(vertex).setValue(new VocabWord(vertex, String.valueOf(vertex)));

        // Targets past the last vertex wrap to vertex 0 (not modulo 10).
        final int shifted = vertex + 3;
        final int target = shifted >= 10 ? 0 : shifted;
        graph.addEdge(vertex, target, 1.0, false);
    }
}
Also used : AbstractVertexFactory(org.deeplearning4j.models.sequencevectors.graph.vertex.AbstractVertexFactory) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) Before(org.junit.Before)

Example 23 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class ParallelTransformerIteratorTest method testSpeedComparison1.

/**
 * Runs the same corpus through {@link SentenceTransformer} four times and logs the wall-clock
 * time of each pass: single- and multi-threaded over a plain {@link SentenceIterator}, then
 * single- and multi-threaded over a {@link LabelAwareIterator} wrapping an equivalent source.
 *
 * <p>Every produced sequence must be non-null and non-empty. The iteration counter reported in
 * assertion messages is cumulative across all four passes. Timings are only logged, never
 * asserted on.
 */
@Test
public void testSpeedComparison1() throws Exception {
    SentenceIterator iterator = new MutipleEpochsSentenceIterator(new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile()), 25);
    int cnt = 0;

    // Pass 1: plain sentence iterator, multithreading disabled.
    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(false).tokenizerFactory(factory).build();
    cnt = consumeAndLogElapsed(transformer.iterator(), cnt, "Single-threaded time: {} ms");

    // Pass 2: same source after a reset, multithreading enabled.
    iterator.reset();
    transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true).tokenizerFactory(factory).build();
    cnt = consumeAndLogElapsed(transformer.iterator(), cnt, "Multi-threaded time: {} ms");

    // The plain iterator is reset once more before switching to the label-aware source.
    iterator.reset();
    LabelAwareIterator lai = new BasicLabelAwareIterator.Builder(new MutipleEpochsSentenceIterator(new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile()), 25)).build();

    // Pass 3: label-aware iterator, multithreading disabled.
    transformer = new SentenceTransformer.Builder().iterator(lai).allowMultithreading(false).tokenizerFactory(factory).build();
    cnt = consumeAndLogElapsed(transformer.iterator(), cnt, "Prefetched Single-threaded time: {} ms");

    // Pass 4: label-aware iterator after a reset, multithreading enabled.
    lai.reset();
    transformer = new SentenceTransformer.Builder().iterator(lai).allowMultithreading(true).tokenizerFactory(factory).build();
    consumeAndLogElapsed(transformer.iterator(), cnt, "Prefetched Multi-threaded time: {} ms");
}

/**
 * Drains {@code sequences}, asserting that each element is non-null and non-empty, then logs
 * the elapsed wall-clock time.
 *
 * <p>The timer starts after the iterator has already been obtained by the caller, so iterator
 * construction is excluded from the measurement.
 *
 * @param sequences iterator to exhaust
 * @param cnt running iteration index used in assertion messages
 * @param logFormat log message with a single {@code {}} placeholder for the elapsed milliseconds
 * @return {@code cnt} advanced by the number of sequences consumed
 */
private int consumeAndLogElapsed(Iterator<Sequence<VocabWord>> sequences, int cnt, String logFormat) {
    long started = System.currentTimeMillis();
    while (sequences.hasNext()) {
        Sequence<VocabWord> sequence = sequences.next();
        assertNotEquals("Failed on [" + cnt + "] iteration", null, sequence);
        assertNotEquals("Failed on [" + cnt + "] iteration", 0, sequence.size());
        cnt++;
    }
    long finished = System.currentTimeMillis();
    log.info(logFormat, finished - started);
    return cnt;
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) MutipleEpochsSentenceIterator(org.deeplearning4j.text.sentenceiterator.MutipleEpochsSentenceIterator) BasicLabelAwareIterator(org.deeplearning4j.text.documentiterator.BasicLabelAwareIterator) AsyncLabelAwareIterator(org.deeplearning4j.text.documentiterator.AsyncLabelAwareIterator) BasicLabelAwareIterator(org.deeplearning4j.text.documentiterator.BasicLabelAwareIterator) LabelAwareIterator(org.deeplearning4j.text.documentiterator.LabelAwareIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) Sequence(org.deeplearning4j.models.sequencevectors.sequence.Sequence) PrefetchingSentenceIterator(org.deeplearning4j.text.sentenceiterator.PrefetchingSentenceIterator) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) MutipleEpochsSentenceIterator(org.deeplearning4j.text.sentenceiterator.MutipleEpochsSentenceIterator) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 24 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class VocabConstructorTest method testBuildJointVocabulary2.

/**
 * Builds a joint vocabulary from {@code big/raw_sentences.txt} with a source argument of 5
 * (presumably the minimum word frequency; confirm against {@code VocabConstructor}) and
 * verifies the vocabulary size, the words at indices 0 and 1, and the total occurrence count.
 */
@Test
public void testBuildJointVocabulary2() throws Exception {
    final VocabCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    // Corpus lines -> tokenized sequences.
    final File corpus = new ClassPathResource("big/raw_sentences.txt").getFile();
    final SentenceIterator sentences = new BasicLineIterator(corpus);
    final SentenceTransformer transformer = new SentenceTransformer.Builder()
            .iterator(sentences)
            .tokenizerFactory(t)
            .build();
    final AbstractSequenceIterator<VocabWord> sequences =
            new AbstractSequenceIterator.Builder<>(transformer).build();

    final VocabConstructor<VocabWord> vocabBuilder = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequences, 5)
            .useAdaGrad(false)
            .setTargetVocabCache(cache)
            .build();
    vocabBuilder.buildJointVocabulary(false, true);

    // Disabled check carried over from the original test:
    //        assertFalse(cache.hasToken("including"));
    assertEquals(242, cache.numWords());
    assertEquals("i", cache.wordAtIndex(1));
    assertEquals("it", cache.wordAtIndex(0));
    assertEquals(634303, cache.totalWordOccurrences());
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) File(java.io.File) Test(org.junit.Test)

Example 25 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class VocabConstructorTest method testBuildJointVocabulary1.

/**
 * Builds a joint vocabulary from {@code big/raw_sentences.txt} with a source argument of 0 and
 * {@code buildJointVocabulary(true, false)}, then verifies that the cache holds 244 words and
 * reports zero total word occurrences.
 */
@Test
public void testBuildJointVocabulary1() throws Exception {
    final VocabCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    final File corpus = new ClassPathResource("big/raw_sentences.txt").getFile();
    final SentenceIterator sentences = new BasicLineIterator(corpus);
    final SentenceTransformer transformer = new SentenceTransformer.Builder()
            .iterator(sentences)
            .tokenizerFactory(t)
            .build();

    // The transformer is wrapped in an AbstractSequenceIterator so it can serve as a vocabulary source.
    final AbstractSequenceIterator<VocabWord> sequences =
            new AbstractSequenceIterator.Builder<>(transformer).build();

    final VocabConstructor<VocabWord> vocabBuilder = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequences, 0)
            .useAdaGrad(false)
            .setTargetVocabCache(cache)
            .build();
    vocabBuilder.buildJointVocabulary(true, false);

    assertEquals(244, cache.numWords());
    assertEquals(0, cache.totalWordOccurrences());
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) File(java.io.File) Test(org.junit.Test)

Aggregations

VocabWord (org.deeplearning4j.models.word2vec.VocabWord) 110
Test (org.junit.Test) 54
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 31
AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) 26
ClassPathResource (org.datavec.api.util.ClassPathResource) 23
BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator) 22
File (java.io.File) 20
InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) 19
TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) 19
ArrayList (java.util.ArrayList) 17
DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) 17
CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) 15
SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator) 14
AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) 13
SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) 13
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 12
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException) 12
Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence) 11
Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec) 11
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline) 10