Search in sources :

Example 1 with CBOW

use of org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW in project deeplearning4j by deeplearning4j.

the class Word2VecTests method testWord2VecCBOW.

@Test
public void testWord2VecCBOW() throws Exception {
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
    vec.fit();
    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));
    //   assertEquals(10, lst.size());
    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);
    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
    assertTrue(sim > 0.65f);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) CBOW(org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW) Test(org.junit.Test)

Example 2 with CBOW

use of org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW in project deeplearning4j by deeplearning4j.

the class PerformanceTests method testWord2VecCBOWBig.

@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");
    TokenizerFactory t = new KoreanTokenizerFactory();
    //t = new DefaultTokenizerFactory();
    //t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).allowParallelTokenization(true).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
    long time1 = System.currentTimeMillis();
    vec.fit();
    long time2 = System.currentTimeMillis();
    log.info("Total execution time: {}", (time2 - time1));
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) KoreanTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.KoreanTokenizerFactory) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) BasicModelUtils(org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) CBOW(org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW) KoreanTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.KoreanTokenizerFactory) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 3 with CBOW

use of org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW in project deeplearning4j by deeplearning4j.

the class Word2VecDataSetIteratorTest method testIterator1.

/**
     * Basically all we want from this test - being able to finish without exceptions.
     */
@Test
public void testIterator1() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = // we make sure we'll have some missing words
    new Word2Vec.Builder().minWordFrequency(10).iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
    vec.fit();
    List<String> labels = new ArrayList<>();
    labels.add("positive");
    labels.add("negative");
    Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
    INDArray array = iterator.next().getFeatures();
    while (iterator.hasNext()) {
        DataSet ds = iterator.next();
        assertArrayEquals(array.shape(), ds.getFeatureMatrix().shape());
    }
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) DataSet(org.nd4j.linalg.dataset.DataSet) ArrayList(java.util.ArrayList) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) LabelAwareSentenceIterator(org.deeplearning4j.text.sentenceiterator.labelaware.LabelAwareSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) CBOW(org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW) File(java.io.File) Test(org.junit.Test)

Aggregations

CBOW (org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW)3 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)3 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)3 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)3 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)3 Test (org.junit.Test)3 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)2 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)2 File (java.io.File)1 ArrayList (java.util.ArrayList)1 ClassPathResource (org.datavec.api.util.ClassPathResource)1 BasicModelUtils (org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils)1 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)1 UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator)1 LabelAwareSentenceIterator (org.deeplearning4j.text.sentenceiterator.labelaware.LabelAwareSentenceIterator)1 KoreanTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.KoreanTokenizerFactory)1 Ignore (org.junit.Ignore)1 INDArray (org.nd4j.linalg.api.ndarray.INDArray)1 DataSet (org.nd4j.linalg.dataset.DataSet)1