Search in sources :

Example 1 with SentenceIterator

use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.

the class UITest method testPosting.

@Test
public void testPosting() throws Exception {
    //        File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    File inputFile = new ClassPathResource("/basic/word2vec_advance.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).epochs(1).layerSize(20).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5).iterate(iter).tokenizerFactory(t).build();
    vec.fit();
    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeWordVectors(vec, tempFile);
    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);
    //Initialize
    UIServer.getInstance();
    UiConnectionInfo uiConnectionInfo = new UiConnectionInfo.Builder().setAddress("localhost").setPort(9000).build();
    BarnesHutTsne tsne = new BarnesHutTsne.Builder().normalize(false).setFinalMomentum(0.8f).numDimension(2).setMaxIter(10).build();
    vectors.lookupTable().plotVocab(tsne, vectors.lookupTable().getVocabCache().numWords(), uiConnectionInfo);
    Thread.sleep(100000);
}
Also used : BarnesHutTsne(org.deeplearning4j.plot.BarnesHutTsne) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) UiConnectionInfo(org.deeplearning4j.ui.UiConnectionInfo) ArrayList(java.util.ArrayList) ClassPathResource(org.deeplearning4j.ui.standalone.ClassPathResource) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) File(java.io.File) Test(org.junit.Test)

Example 2 with SentenceIterator

use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.

the class VectorsConfigurationTest method testFromW2V.

@Test
public void testFromW2V() throws Exception {
    VectorsConfiguration configuration = new VectorsConfiguration();
    configuration.setHugeModelExpected(true);
    configuration.setWindow(5);
    configuration.setIterations(3);
    configuration.setLayersSize(200);
    configuration.setLearningRate(1.4d);
    configuration.setSampling(0.0005d);
    configuration.setMinLearningRate(0.25d);
    configuration.setEpochs(1);
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    Word2Vec vec = new Word2Vec.Builder(configuration).iterate(iter).build();
    VectorsConfiguration configuration2 = vec.getConfiguration();
    assertEquals(configuration, configuration2);
}
Also used : Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) Test(org.junit.Test)

Example 3 with SentenceIterator

use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.

the class Word2VecTests method testUIMAIterator.

@Test
public void testUIMAIterator() throws Exception {
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    assertEquals(iter.nextSentence(), "No ,  he says now .");
}
Also used : SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) Test(org.junit.Test)

Example 4 with SentenceIterator

use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.

the class Word2VecTests method testWord2VecCBOW.

@Test
public void testWord2VecCBOW() throws Exception {
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
    vec.fit();
    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));
    //   assertEquals(10, lst.size());
    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);
    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
    assertTrue(sim > 0.65f);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) CBOW(org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW) Test(org.junit.Test)

Example 5 with SentenceIterator

use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.

the class Word2VecTests method testW2VnegativeOnRestore.

@Test
public void testW2VnegativeOnRestore() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100).stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001).sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()).negativeSample(10).epochs(1).windowSize(5).useHierarchicSoftmax(false).allowParallelTokenization(true).modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
    assertEquals(false, vec.getConfiguration().isUseHierarchicSoftmax());
    log.info("Fit 1");
    vec.fit();
    File tmpFile = File.createTempFile("temp", "file");
    tmpFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, tmpFile);
    iter.reset();
    Word2Vec restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, true);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);
    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof FlatModelUtils);
    assertTrue(restoredVec.getConfiguration().isAllowParallelTokenization());
    log.info("Fit 2");
    restoredVec.fit();
    iter.reset();
    restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, false);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);
    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof BasicModelUtils);
    log.info("Fit 3");
    restoredVec.fit();
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) SkipGram(org.deeplearning4j.models.embeddings.learning.impl.elements.SkipGram) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) BasicModelUtils(org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils) FlatModelUtils(org.deeplearning4j.models.embeddings.reader.impl.FlatModelUtils) File(java.io.File) Test(org.junit.Test)

Aggregations

SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)33 Test (org.junit.Test)31 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)27 File (java.io.File)23 ClassPathResource (org.datavec.api.util.ClassPathResource)23 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)22 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)21 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)20 UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator)13 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)12 INDArray (org.nd4j.linalg.api.ndarray.INDArray)12 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)10 ArrayList (java.util.ArrayList)7 AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator)7 FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator)7 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)5 LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource)5 SkipGram (org.deeplearning4j.models.embeddings.learning.impl.elements.SkipGram)4 BasicModelUtils (org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils)4 WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors)4