Use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.
In the class Word2VecTests, method testUnknown1.
/**
 * Trains a CBOW Word2Vec model with the unknown-word element "PEWPEW" enabled,
 * round-trips the model through {@code WordVectorSerializer}, and checks that
 * both the original and the restored model expose the unknown element and
 * return its vector for a token that is not in the corpus.
 */
@Test
public void testUnknown1() throws Exception {
    // Sentence source: one sentence per line of the test input file
    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());

    // Tokenizer with the common pre-processor applied to every token
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(10)
            .useUnknown(true)
            .unknownElement(new VocabWord(1.0, "PEWPEW"))
            .iterations(1)
            .layerSize(100)
            .stopWords(new ArrayList<String>())
            .seed(42)
            .learningRate(0.025)
            .minLearningRate(0.001)
            .sampling(0)
            .elementsLearningAlgorithm(new CBOW<VocabWord>())
            .epochs(1)
            .windowSize(5)
            .useHierarchicSoftmax(true)
            .allowParallelTokenization(true)
            .modelUtils(new FlatModelUtils<VocabWord>())
            .iterate(sentences)
            .tokenizerFactory(tokenizerFactory)
            .build();
    vec.fit();

    // The unknown element must be visible through the model and its vocabulary
    assertTrue(vec.hasWord("PEWPEW"));
    assertTrue(vec.getVocab().containsWord("PEWPEW"));
    INDArray unknownVector = vec.getWordVectorMatrix("PEWPEW");
    assertNotEquals(null, unknownVector);

    // Serialize to a temporary file and read the model back
    File modelFile = File.createTempFile("temp", "file");
    modelFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, modelFile);
    log.info("Original configuration: {}", vec.getConfiguration());
    Word2Vec restored = WordVectorSerializer.readWord2VecModel(modelFile);

    // The restored model must carry the same unknown element and vector
    assertTrue(restored.hasWord("PEWPEW"));
    assertTrue(restored.getVocab().containsWord("PEWPEW"));
    INDArray restoredUnknownVector = restored.getWordVectorMatrix("PEWPEW");
    assertEquals(unknownVector, restoredUnknownVector);

    // A junk token should resolve to the unknown vector in both models
    INDArray junkVector = vec.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    INDArray restoredJunkVector = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    log.info("Restored configuration: {}", restored.getConfiguration());
    assertEquals(unknownVector, junkVector);
    assertEquals(unknownVector, restoredJunkVector);
}
Use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.
In the class Word2VecIteratorTest, method before.
/**
 * Builds and fits the shared Word2Vec model from the "/labeled/" classpath
 * directory the first time it is needed; does nothing when {@code vec} is
 * already set.
 */
@Before
public void before() throws Exception {
    if (vec != null) {
        // Model already built by an earlier invocation
        return;
    }

    File labeledDir = new ClassPathResource("/labeled/").getFile();
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(labeledDir.getAbsolutePath());

    // Best-effort removal of a cache file in the working directory; result is ignored
    new File("cache.ser").delete();

    TokenizerFactory tokenizerFactory = new UimaTokenizerFactory();
    vec = new Word2Vec.Builder()
            .minWordFrequency(1)
            .iterations(5)
            .layerSize(100)
            .stopWords(new ArrayList<String>())
            .useUnknown(true)
            .windowSize(5)
            .iterate(sentences)
            .tokenizerFactory(tokenizerFactory)
            .build();
    vec.fit();
}
Use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.
In the class WordVectorSerializerTest, method testIndexPersistence.
/**
 * Fits a Word2Vec model on "/big/raw_sentences.txt", writes its word vectors
 * to a temporary file, reloads them as text vectors, and asserts that the
 * document count and every vocabulary word's vector match the original.
 */
@Test
public void testIndexPersistence() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(corpus.getAbsolutePath());

    // Tokenizer with the common pre-processor applied to every token
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .epochs(1)
            .layerSize(100)
            .stopWords(new ArrayList<String>())
            .useAdaGrad(false)
            .negativeSample(5)
            .seed(42)
            .windowSize(5)
            .iterate(sentences)
            .tokenizerFactory(tokenizerFactory)
            .build();
    vec.fit();
    VocabCache originalVocab = vec.getVocab();

    // Persist the vectors and load them back from the same file
    File vectorsFile = File.createTempFile("temp", "w2v");
    vectorsFile.deleteOnExit();
    WordVectorSerializer.writeWordVectors(vec, vectorsFile);
    WordVectors reloaded = WordVectorSerializer.loadTxtVectors(vectorsFile);
    VocabCache reloadedVocab = reloaded.vocab();

    assertEquals(originalVocab.totalNumberOfDocs(), reloadedVocab.totalNumberOfDocs());

    // Every word known to the original model must map to an equal vector after reload
    for (VocabWord entry : vec.getVocab().vocabWords()) {
        String label = entry.getLabel();
        INDArray expected = vec.getWordVectorMatrix(label);
        INDArray actual = reloaded.getWordVectorMatrix(label);
        assertEquals(expected, actual);
    }
}
Use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.
In the class PerformanceTests, method testWord2VecCBOWBig.
/**
 * Timing run: fits a CBOW Word2Vec model (hierarchic softmax, 8 workers,
 * parallel tokenization) over a Korean corpus and logs the wall-clock
 * duration of {@code fit()} in milliseconds.
 *
 * The corpus path is a hard-coded absolute path on a developer machine and
 * the test is annotated with {@code @Ignore}.
 */
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    SentenceIterator sentences = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    TokenizerFactory tokenizerFactory = new KoreanTokenizerFactory();

    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(1)
            .iterations(5)
            .learningRate(0.025)
            .layerSize(150)
            .seed(42)
            .sampling(0)
            .negativeSample(0)
            .useHierarchicSoftmax(true)
            .windowSize(5)
            .modelUtils(new BasicModelUtils<VocabWord>())
            .useAdaGrad(false)
            .iterate(sentences)
            .workers(8)
            .allowParallelTokenization(true)
            .tokenizerFactory(tokenizerFactory)
            .elementsLearningAlgorithm(new CBOW<VocabWord>())
            .build();

    // Measure only the training call
    long startMillis = System.currentTimeMillis();
    vec.fit();
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    log.info("Total execution time: {}", elapsedMillis);
}
Use of org.deeplearning4j.text.sentenceiterator.SentenceIterator in project deeplearning4j by deeplearning4j.
In the class ParagraphVectorsTest, method testParagraphVectorsDM.
/**
 * Trains a ParagraphVectors model with the DM sequence learning algorithm on
 * "/big/raw_sentences.txt" (documents labelled with the "DOC_" prefix), then:
 * <ul>
 *   <li>checks that the word frequencies of "day" and "me" are not 1 and differ
 *       from each other;</li>
 *   <li>logs several word and document similarities, asserting only that
 *       DOC_3720 / DOC_9852 similarity is below 0.5;</li>
 *   <li>infers vectors for two near-identical sentences and logs their cosine
 *       similarity to a stored document vector and to each other.</li>
 * </ul>
 */
@Test
public void testParagraphVectorsDM() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);

    // Explicit vocab cache so word frequencies can be inspected after training
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1)
            .iterations(2)
            .seed(119)
            .epochs(3)
            .layerSize(100)
            .learningRate(0.025)
            .labelsSource(source)
            .windowSize(5)
            .iterate(iter)
            .trainWordVectors(true)
            .vocabCache(cache)
            .tokenizerFactory(t)
            .negativeSample(0)
            .useHierarchicSoftmax(true)
            .sampling(0)
            .workers(1)
            .usePreciseWeightInit(true)
            .sequenceLearningAlgorithm(new DM<VocabWord>())
            .build();
    vec.fit();

    // Sanity checks on the populated vocabulary
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN);

    // The lower-bound checks below were already disabled in the original test;
    // the similarities are logged for inspection only.
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: {}", similarity1);
    // assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: {}", similarity2);
    // assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: {}", similarity3);
    // assertTrue(similarity3 > 0.6d);

    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: {}", similarityX);
    assertTrue(similarityX < 0.5d);

    // testing DM inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");

    // Copies are passed to cosineSim, as in the original code
    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
}
Aggregations