use of org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW in project deeplearning4j by deeplearning4j.
the class Word2VecTests method testWord2VecCBOW.
@Test
public void testWord2VecCBOW() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertTrue(sim > 0.65f);
}
use of org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW in project deeplearning4j by deeplearning4j.
the class PerformanceTests method testWord2VecCBOWBig.
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
//iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
//SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");
TokenizerFactory t = new KoreanTokenizerFactory();
//t = new DefaultTokenizerFactory();
//t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).allowParallelTokenization(true).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
long time1 = System.currentTimeMillis();
vec.fit();
long time2 = System.currentTimeMillis();
log.info("Total execution time: {}", (time2 - time1));
}
use of org.deeplearning4j.models.embeddings.learning.impl.elements.CBOW in project deeplearning4j by deeplearning4j.
the class Word2VecDataSetIteratorTest method testIterator1.
/**
* Basically all we want from this test - being able to finish without exceptions.
*/
@Test
public void testIterator1() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = // we make sure we'll have some missing words
new Word2Vec.Builder().minWordFrequency(10).iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
List<String> labels = new ArrayList<>();
labels.add("positive");
labels.add("negative");
Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
INDArray array = iterator.next().getFeatures();
while (iterator.hasNext()) {
DataSet ds = iterator.next();
assertArrayEquals(array.shape(), ds.getFeatureMatrix().shape());
}
}
Aggregations