Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
The class BinaryCoOccurrenceReaderTest, method testHasMoreObjects1.
@Test
public void testHasMoreObjects1() throws Exception {
    File tempFile = File.createTempFile("tmp", "tmp");
    tempFile.deleteOnExit();

    // build a small vocabulary of three tokens
    VocabCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
    VocabWord word1 = new VocabWord(1.0, "human");
    VocabWord word2 = new VocabWord(2.0, "animal");
    VocabWord word3 = new VocabWord(3.0, "unknown");
    vocabCache.addToken(word1);
    vocabCache.addToken(word2);
    vocabCache.addToken(word3);

    // assign Huffman-tree indexes to the vocabulary
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();
    huffman.applyIndexes(vocabCache);

    // write two weighted co-occurrence pairs to the temp file
    BinaryCoOccurrenceWriter<VocabWord> writer = new BinaryCoOccurrenceWriter<>(tempFile);

    CoOccurrenceWeight<VocabWord> object1 = new CoOccurrenceWeight<>();
    object1.setElement1(word1);
    object1.setElement2(word2);
    object1.setWeight(3.14159265);
    writer.writeObject(object1);

    CoOccurrenceWeight<VocabWord> object2 = new CoOccurrenceWeight<>();
    object2.setElement1(word2);
    object2.setElement2(word3);
    object2.setWeight(0.197);
    writer.writeObject(object2);

    writer.finish();

    // read both objects back; neither should be null
    BinaryCoOccurrenceReader<VocabWord> reader = new BinaryCoOccurrenceReader<>(tempFile, vocabCache, null);

    CoOccurrenceWeight<VocabWord> r1 = reader.nextObject();
    log.info("Object received: " + r1);
    assertNotEquals(null, r1);

    r1 = reader.nextObject();
    log.info("Object received: " + r1);
    assertNotEquals(null, r1);
}
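Although the test name references hasMoreObjects, the body above never calls it; it simply calls nextObject() a fixed number of times. A minimal sketch of the guarded read loop the reader is presumably meant to support, assuming BinaryCoOccurrenceReader exposes a boolean hasMoreObjects() accessor (as the test name suggests) and that CoOccurrenceWeight has the usual getters matching its setters:

    // usage sketch, not part of the test: drain the reader until it is exhausted
    BinaryCoOccurrenceReader<VocabWord> loopReader = new BinaryCoOccurrenceReader<>(tempFile, vocabCache, null);
    while (loopReader.hasMoreObjects()) {   // assumption: accessor exists, per the test name
        CoOccurrenceWeight<VocabWord> pair = loopReader.nextObject();
        if (pair != null) {
            // getElement1/getElement2/getWeight assumed to mirror the setters used above
            log.info("Pair {} / {} -> {}", pair.getElement1().getLabel(),
                            pair.getElement2().getLabel(), pair.getWeight());
        }
    }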
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
The class ParagraphVectorsTest, method testHash.
@Test
public void testHash() {
    // "D1" and "Bo" are chosen to collide on String.hashCode() (both hash to 2157),
    // while the long storage IDs must still differ
    VocabWord w1 = new VocabWord(1.0, "D1");
    VocabWord w2 = new VocabWord(1.0, "Bo");
    log.info("W1 > Short hash: {}; Long hash: {}", w1.getLabel().hashCode(), w1.getStorageId());
    log.info("W2 > Short hash: {}; Long hash: {}", w2.getLabel().hashCode(), w2.getStorageId());
    assertNotEquals(w1.getStorageId(), w2.getStorageId());
}
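The label pair is not arbitrary: both strings produce the same 32-bit String.hashCode(), which is exactly the collision the 64-bit storage ID is meant to survive. A self-contained check of that premise in plain Java, no DL4J needed:

    public class HashCollisionCheck {
        public static void main(String[] args) {
            // for a 2-char string, String.hashCode() is s[0]*31 + s[1]:
            // "D1" -> 68*31 + 49 = 2157, "Bo" -> 66*31 + 111 = 2157
            System.out.println("D1".hashCode());   // prints 2157
            System.out.println("Bo".hashCode());   // prints 2157
        }
    }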
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
The class ParagraphVectorsTest, method testParagraphVectorsDBOW.
@Test
public void testParagraphVectorsDBOW() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .iterations(5)
                    .seed(119)
                    .epochs(1)
                    .layerSize(100)
                    .learningRate(0.025)
                    .labelsSource(source)
                    .windowSize(5)
                    .iterate(iter)
                    .trainWordVectors(true)
                    .vocabCache(cache)
                    .tokenizerFactory(t)
                    .negativeSample(0)
                    .allowParallelTokenization(true)
                    .useHierarchicSoftmax(true)
                    .sampling(0)
                    .workers(2)
                    .usePreciseWeightInit(true)
                    .sequenceLearningAlgorithm(new DBOW<VocabWord>())
                    .build();

    vec.fit();

    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    // assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    // assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    // assertTrue(similarity3 > 0.6d);

    // unrelated documents should score noticeably lower
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);

    // testing inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");

    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));
    assertNotEquals(inferredA1, inferredC1);

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());
    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);
}
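The inference block above repeats the same infer-then-cosine pattern four times. A small helper makes the intent explicit; this is only a readability sketch over the same inferVector and Transforms.cosineSim calls used in the test (the name sentenceSimilarity is ours, not a DL4J API):

    // hypothetical helper: cosine similarity between two inferred sentence vectors
    private static double sentenceSimilarity(ParagraphVectors vec, String a, String b) {
        INDArray va = vec.inferVector(a);
        INDArray vb = vec.inferVector(b);
        return Transforms.cosineSim(va, vb);
    }

    // near-paraphrases should score well above sentences that differ in a content word
    double close = sentenceSimilarity(vec, "This is my work", "This is my work .");
    double far   = sentenceSimilarity(vec, "This is my work", "This is my night");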
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
The class ParagraphVectorsTest, method testParagraphVectorsModelling1.
/**
 * This test doesn't really care about actual results; we only care about equality
 * between the live model and the restored models.
 *
 * @throws Exception
 */
@Test
public void testParagraphVectorsModelling1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .iterations(5)
                    .seed(119)
                    .epochs(1)
                    .layerSize(150)
                    .learningRate(0.025)
                    .labelsSource(source)
                    .windowSize(5)
                    .sequenceLearningAlgorithm(new DM<VocabWord>())
                    .iterate(iter)
                    .trainWordVectors(true)
                    .tokenizerFactory(t)
                    .workers(4)
                    .sampling(0)
                    .build();

    vec.fit();

    VocabCache<VocabWord> cache = vec.getVocab();

    File fullFile = File.createTempFile("paravec", "tests");
    fullFile.deleteOnExit();

    // keep a copy of one syn1 row so we can verify the full-model round trip later
    INDArray originalSyn1_17 = ((InMemoryLookupTable) vec.getLookupTable()).getSyn1().getRow(17).dup();
    WordVectorSerializer.writeParagraphVectors(vec, fullFile);

    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);
    assertEquals(97406, cache.numWords());

    assertTrue(vec.hasWord("DOC_16392"));
    assertTrue(vec.hasWord("DOC_3720"));

    List<String> result = new ArrayList<>(vec.nearestLabels(vec.getWordVectorMatrix("DOC_16392"), 10));
    System.out.println("nearest labels: " + result);
    for (String label : result) {
        System.out.println(label + "/DOC_16392: " + vec.similarity(label, "DOC_16392"));
    }
    assertTrue(result.contains("DOC_16392"));
    //assertTrue(result.contains("DOC_21383"));

    /*
        We have a few lines that contain closely related words.
        These sentences should be close to each other in vector space.
    */
    // line 3721: This is my way .
    // line 6348: This is my case .
    // line 9836: This is my house .
    // line 12493: This is my world .
    // line 16393: This is my work .

    // this is a special sentence that has nothing in common with the previous sentences
    // line 9853: We now have one .

    double similarityD = vec.similarity("day", "night");
    log.info("day/night similarity: " + similarityD);
    if (similarityD < 0.0) {
        log.info("Day: " + Arrays.toString(vec.getWordVectorMatrix("day").dup().data().asDouble()));
        log.info("Night: " + Arrays.toString(vec.getWordVectorMatrix("night").dup().data().asDouble()));
    }

    List<String> labelsOriginal = vec.labelsSource.getLabels();

    double similarityW = vec.similarity("way", "work");
    log.info("way/work similarity: " + similarityW);
    double similarityH = vec.similarity("house", "world");
    log.info("house/world similarity: " + similarityH);
    double similarityC = vec.similarity("case", "way");
    log.info("case/way similarity: " + similarityC);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    // assertTrue(similarity1 > 0.7d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    // assertTrue(similarity2 > 0.7d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    // assertTrue(similarity3 > 0.7d);

    // likelihood in this case should be significantly lower
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);

    File tempFile = File.createTempFile("paravec", "ser");
    tempFile.deleteOnExit();

    INDArray day = vec.getWordVectorMatrix("day").dup();

    /*
        Testing txt serialization
    */
    File tempFile2 = File.createTempFile("paravec", "ser");
    tempFile2.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile2);
    ParagraphVectors vec3 = WordVectorSerializer.readParagraphVectorsFromText(tempFile2);
    INDArray day3 = vec3.getWordVectorMatrix("day").dup();
    List<String> labelsRestored = vec3.labelsSource.getLabels();
    assertEquals(day, day3);
    assertEquals(labelsOriginal.size(), labelsRestored.size());

    /*
        Testing binary serialization
    */
    SerializationUtils.saveObject(vec, tempFile);
    ParagraphVectors vec2 = (ParagraphVectors) SerializationUtils.readObject(tempFile);
    INDArray day2 = vec2.getWordVectorMatrix("day").dup();
    List<String> labelsBinary = vec2.labelsSource.getLabels();
    assertEquals(day, day2);
    tempFile.delete();
    assertEquals(labelsOriginal.size(), labelsBinary.size());

    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray originalPreserved = original.dup();
    INDArray inferredA1 = vec.inferVector("This is my work .");
    INDArray inferredB1 = vec.inferVector("This is my work .");

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    // assertTrue(cosAO1 > 0.45);
    assertTrue(cosAB1 > 0.95);
    //assertArrayEquals(inferredA.data().asDouble(), inferredB.data().asDouble(), 0.01);

    // restore the full model written earlier and verify the weights survived the round trip
    ParagraphVectors restoredVectors = WordVectorSerializer.readParagraphVectors(fullFile);
    restoredVectors.setTokenizerFactory(t);

    INDArray restoredSyn1_17 = ((InMemoryLookupTable) restoredVectors.getLookupTable()).getSyn1().getRow(17).dup();
    assertEquals(originalSyn1_17, restoredSyn1_17);

    INDArray originalRestored = vec.getWordVectorMatrix("DOC_16392").dup();
    assertEquals(originalPreserved, originalRestored);

    INDArray inferredA2 = restoredVectors.inferVector("This is my work .");
    INDArray inferredB2 = restoredVectors.inferVector("This is my work .");
    INDArray inferredC2 = restoredVectors.inferVector("world way case .");

    double cosAO2 = Transforms.cosineSim(inferredA2.dup(), original.dup());
    double cosAB2 = Transforms.cosineSim(inferredA2.dup(), inferredB2.dup());
    double cosAAX = Transforms.cosineSim(inferredA1.dup(), inferredA2.dup());
    double cosAC2 = Transforms.cosineSim(inferredC2.dup(), inferredA2.dup());
    log.info("Cos A2/B2: {}", cosAB2);
    log.info("Cos A1/A2: {}", cosAAX);
    log.info("Cos O/A2: {}", cosAO2);
    log.info("Cos C2/A2: {}", cosAC2);
    log.info("Vector: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("cosAO2: {}", cosAO2);
    // assertTrue(cosAO2 > 0.45);
    assertTrue(cosAB2 > 0.95);
    assertTrue(cosAAX > 0.95);
}
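Stripped of assertions, the persistence pattern this test exercises reduces to a few lines. The sketch below reuses only calls that appear above (writeParagraphVectors, readParagraphVectors, setTokenizerFactory, inferVector, Transforms.cosineSim); the file path is illustrative:

    // minimal save/restore round trip, distilled from the test above
    File modelFile = new File("paravec-model.bin");   // illustrative path
    WordVectorSerializer.writeParagraphVectors(vec, modelFile);

    ParagraphVectors restored = WordVectorSerializer.readParagraphVectors(modelFile);
    restored.setTokenizerFactory(t);   // re-attach the tokenizer, as the test does, before inference

    INDArray v1 = vec.inferVector("This is my work .");
    INDArray v2 = restored.inferVector("This is my work .");
    // vectors inferred by the live and restored models should be nearly identical
    double cos = Transforms.cosineSim(v1, v2);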
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
The class ParagraphVectorsTest, method testParagraphVectorsOverExistingWordVectorsModel.
/*
    In this test we'll build a w2v model and use its vocab and weights for ParagraphVectors.
    There's no need to run this test on Travis; use it manually, only to detect problems.
*/
@Test
public void testParagraphVectorsOverExistingWordVectorsModel() throws Exception {
    // we build w2v from multiple sources, to cover everything
    ClassPathResource resource_sentences = new ClassPathResource("/big/raw_sentences.txt");
    ClassPathResource resource_mixed = new ClassPathResource("/paravec");
    SentenceIterator iter = new AggregatingSentenceIterator.Builder()
                    .addSentenceIterator(new BasicLineIterator(resource_sentences.getFile()))
                    .addSentenceIterator(new FileSentenceIterator(resource_mixed.getFile()))
                    .build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec wordVectors = new Word2Vec.Builder()
                    .minWordFrequency(1)
                    .batchSize(250)
                    .iterations(1)
                    .epochs(3)
                    .learningRate(0.025)
                    .layerSize(150)
                    .minLearningRate(0.001)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    .useHierarchicSoftmax(true)
                    .windowSize(5)
                    .iterate(iter)
                    .tokenizerFactory(t)
                    .build();

    wordVectors.fit();

    VocabWord day_A = wordVectors.getVocab().tokenFor("day");
    INDArray vector_day1 = wordVectors.getWordVectorMatrix("day").dup();

    // at this point we have a ready w2v model; it's time to use it for ParagraphVectors
    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
                    .addSourceFolder(new ClassPathResource("/paravec/labeled").getFile())
                    .build();

    // documents from this iterator will be used for classification
    FileLabelAwareIterator unlabeledIterator = new FileLabelAwareIterator.Builder()
                    .addSourceFolder(new ClassPathResource("/paravec/unlabeled").getFile())
                    .build();

    // we're building the classifier now, with the pre-built w2v model passed in
    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
                    .iterate(labelAwareIterator)
                    .learningRate(0.025)
                    .minLearningRate(0.001)
                    .iterations(5)
                    .epochs(1)
                    .layerSize(150)
                    .tokenizerFactory(t)
                    .sequenceLearningAlgorithm(new DBOW<VocabWord>())
                    .useHierarchicSoftmax(true)
                    .trainWordVectors(false)
                    .useExistingWordVectors(wordVectors)
                    .build();

    paragraphVectors.fit();

    // the shared vocabulary should keep the same index for "day" in both models
    VocabWord day_B = paragraphVectors.getVocab().tokenFor("day");
    assertEquals(day_A.getIndex(), day_B.getIndex());

    /*
    double similarityD = wordVectors.similarity("day", "night");
    log.info("day/night similarity: " + similarityD);
    assertTrue(similarityD > 0.5d);
    */

    INDArray vector_day2 = paragraphVectors.getWordVectorMatrix("day").dup();
    double crossDay = arraysSimilarity(vector_day1, vector_day2);

    log.info("Day1: " + vector_day1);
    log.info("Day2: " + vector_day2);
    log.info("Cross-Day similarity: " + crossDay);
    log.info("Cross-Day similarity 2: " + Transforms.cosineSim(vector_day1, vector_day2));

    assertTrue(crossDay > 0.9d);

    /*
        Here we're checking cross-vocabulary equality
    */
    /*
    Random rnd = new Random();
    VocabCache<VocabWord> cacheP = paragraphVectors.getVocab();
    VocabCache<VocabWord> cacheW = wordVectors.getVocab();
    for (int x = 0; x < 1000; x++) {
        int idx = rnd.nextInt(cacheW.numWords());
        String wordW = cacheW.wordAtIndex(idx);
        String wordP = cacheP.wordAtIndex(idx);
        assertEquals(wordW, wordP);

        INDArray arrayW = wordVectors.getWordVectorMatrix(wordW);
        INDArray arrayP = paragraphVectors.getWordVectorMatrix(wordP);
        double simWP = Transforms.cosineSim(arrayW, arrayP);
        assertTrue(simWP >= 0.9);
    }
    */

    log.info("Zfinance: " + paragraphVectors.getWordVectorMatrix("Zfinance"));
    log.info("Zhealth: " + paragraphVectors.getWordVectorMatrix("Zhealth"));
    log.info("Zscience: " + paragraphVectors.getWordVectorMatrix("Zscience"));

    LabelledDocument document = unlabeledIterator.nextDocument();
    log.info("Results for document '" + document.getLabel() + "'");

    List<String> results = new ArrayList<>(paragraphVectors.predictSeveral(document, 3));
    for (String result : results) {
        double sim = paragraphVectors.similarityToLabel(document, result);
        log.info("Similarity to [" + result + "] is [" + sim + "]");
    }

    String topPrediction = paragraphVectors.predict(document);
    assertEquals("Zfinance", topPrediction);
}
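The test calls arraysSimilarity, a private helper of the test class that is not shown in this excerpt. A plausible minimal reconstruction, assuming it is simply cosine similarity over normalized copies of the two vectors (the real implementation may differ):

    // hypothetical reconstruction of the test's private helper
    private double arraysSimilarity(INDArray array1, INDArray array2) {
        if (array1.equals(array2)) {
            return 1.0;
        }
        // work on normalized copies so the originals stay untouched
        INDArray n1 = Transforms.unitVec(array1.dup());
        INDArray n2 = Transforms.unitVec(array2.dup());
        return Transforms.cosineSim(n1, n2);
    }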