use of org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor in project deeplearning4j by deeplearning4j.
the class UITest method testPosting.
@Test
public void testPosting() throws Exception {
// File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
File inputFile = new ClassPathResource("/basic/word2vec_advance.txt").getFile();
SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).epochs(1).layerSize(20).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5).iterate(iter).tokenizerFactory(t).build();
vec.fit();
File tempFile = File.createTempFile("temp", "w2v");
tempFile.deleteOnExit();
WordVectorSerializer.writeWordVectors(vec, tempFile);
WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);
//Initialize
UIServer.getInstance();
UiConnectionInfo uiConnectionInfo = new UiConnectionInfo.Builder().setAddress("localhost").setPort(9000).build();
BarnesHutTsne tsne = new BarnesHutTsne.Builder().normalize(false).setFinalMomentum(0.8f).numDimension(2).setMaxIter(10).build();
vectors.lookupTable().plotVocab(tsne, vectors.lookupTable().getVocabCache().numWords(), uiConnectionInfo);
Thread.sleep(100000);
}
use of org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor in project deeplearning4j by deeplearning4j.
the class Word2VecTests method testWord2VecCBOW.
@Test
public void testWord2VecCBOW() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertTrue(sim > 0.65f);
}
use of org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor in project deeplearning4j by deeplearning4j.
the class Word2VecTests method testW2VnegativeOnRestore.
@Test
public void testW2VnegativeOnRestore() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100).stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001).sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()).negativeSample(10).epochs(1).windowSize(5).useHierarchicSoftmax(false).allowParallelTokenization(true).modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
assertEquals(false, vec.getConfiguration().isUseHierarchicSoftmax());
log.info("Fit 1");
vec.fit();
File tmpFile = File.createTempFile("temp", "file");
tmpFile.deleteOnExit();
WordVectorSerializer.writeWord2VecModel(vec, tmpFile);
iter.reset();
Word2Vec restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, true);
restoredVec.setTokenizerFactory(t);
restoredVec.setSentenceIterator(iter);
assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
assertTrue(restoredVec.getModelUtils() instanceof FlatModelUtils);
assertTrue(restoredVec.getConfiguration().isAllowParallelTokenization());
log.info("Fit 2");
restoredVec.fit();
iter.reset();
restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, false);
restoredVec.setTokenizerFactory(t);
restoredVec.setSentenceIterator(iter);
assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
assertTrue(restoredVec.getModelUtils() instanceof BasicModelUtils);
log.info("Fit 3");
restoredVec.fit();
}
use of org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor in project deeplearning4j by deeplearning4j.
the class Word2VecTests method testUnknown1.
@Test
public void testUnknown1() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true).unknownElement(new VocabWord(1.0, "PEWPEW")).iterations(1).layerSize(100).stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001).sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5).useHierarchicSoftmax(true).allowParallelTokenization(true).modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
vec.fit();
assertTrue(vec.hasWord("PEWPEW"));
assertTrue(vec.getVocab().containsWord("PEWPEW"));
INDArray unk = vec.getWordVectorMatrix("PEWPEW");
assertNotEquals(null, unk);
File tempFile = File.createTempFile("temp", "file");
tempFile.deleteOnExit();
WordVectorSerializer.writeWord2VecModel(vec, tempFile);
log.info("Original configuration: {}", vec.getConfiguration());
Word2Vec restored = WordVectorSerializer.readWord2VecModel(tempFile);
assertTrue(restored.hasWord("PEWPEW"));
assertTrue(restored.getVocab().containsWord("PEWPEW"));
INDArray unk_restored = restored.getWordVectorMatrix("PEWPEW");
assertEquals(unk, unk_restored);
// now we're getting some junk word
INDArray random = vec.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
INDArray randomRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
log.info("Restored configuration: {}", restored.getConfiguration());
assertEquals(unk, random);
assertEquals(unk, randomRestored);
}
use of org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor in project deeplearning4j by deeplearning4j.
the class WordVectorSerializerTest method testIndexPersistence.
@Test
public void testIndexPersistence() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5).iterate(iter).tokenizerFactory(t).build();
vec.fit();
VocabCache orig = vec.getVocab();
File tempFile = File.createTempFile("temp", "w2v");
tempFile.deleteOnExit();
WordVectorSerializer.writeWordVectors(vec, tempFile);
WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);
VocabCache rest = vec2.vocab();
assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());
for (VocabWord word : vec.getVocab().vocabWords()) {
INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());
assertEquals(array1, array2);
}
}
Aggregations