Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.
From class ParagraphVectorsTest, method testParagraphVectorsDM.
@Test
public void testParagraphVectorsDM() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    // PV-DM (distributed memory) configuration. Fixed seed, precise weight init
    // and workers(1) keep the run reproducible for the assertions below.
    ParagraphVectors vec = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .iterations(2)
                    .seed(119)
                    .epochs(3)
                    .layerSize(100)
                    .learningRate(0.025)
                    .labelsSource(source)
                    .windowSize(5)
                    .iterate(iter)
                    .trainWordVectors(true)
                    .vocabCache(cache)
                    .tokenizerFactory(t)
                    .negativeSample(0)
                    .useHierarchicSoftmax(true)
                    .sampling(0)
                    .workers(1)
                    .usePreciseWeightInit(true)
                    .sequenceLearningAlgorithm(new DM<VocabWord>())
                    .build();

    vec.fit();

    // Vocabulary sanity checks: both words occur more than once, with distinct counts.
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN); // fixed typo: was "similariry"

    // Document-to-document similarities are logged only; on this small corpus with
    // random-influenced training they are too noisy for hard thresholds.
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //        assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.6d);

    // Unrelated documents should stay clearly dissimilar.
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);

    // DM inference: infer vectors for unseen variations of a known sentence
    // and compare against the trained document vector.
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());

    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
}
Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.
From class ParagraphVectorsTest, method testParagraphVectorsWithWordVectorsModelling1.
@Test
public void testParagraphVectorsWithWordVectorsModelling1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File corpusFile = resource.getFile();
    SentenceIterator sentenceIterator = new BasicLineIterator(corpusFile);

    //  InMemoryLookupCache cache = new InMemoryLookupCache(false);
    AbstractCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource labelsSource = new LabelsSource("DOC_");

    // Train document vectors together with word vectors on the same corpus.
    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .iterations(3)
                    .epochs(1)
                    .layerSize(100)
                    .learningRate(0.025)
                    .labelsSource(labelsSource)
                    .windowSize(5)
                    .iterate(sentenceIterator)
                    .trainWordVectors(true)
                    .vocabCache(vocabCache)
                    .tokenizerFactory(tokenizerFactory)
                    .sampling(0)
                    .build();

    paragraphVectors.fit();

    // Both probe words must appear more than once, and with different counts.
    int freqDay = vocabCache.wordFrequency("day");
    int freqMe = vocabCache.wordFrequency("me");
    assertNotEquals(1, freqDay);
    assertNotEquals(1, freqMe);
    assertNotEquals(freqDay, freqMe);

    /*
        A few corpus lines contain closely related words, so the corresponding
        sentences should end up near each other in vector space:

          line 3721:  This is my way .
          line 6348:  This is my case .
          line 9836:  This is my house .
          line 12493: This is my world .
          line 16393: This is my work .

        One special sentence has nothing in common with the ones above:

          line 9853:  We now have one .
    */
    assertTrue(paragraphVectors.hasWord("DOC_3720"));

    double simDayNight = paragraphVectors.similarity("day", "night");
    log.info("day/night similarity: " + simDayNight);

    double simWayWork = paragraphVectors.similarity("way", "work");
    log.info("way/work similarity: " + simWayWork);

    double simHouseWorld = paragraphVectors.similarity("house", "world");
    log.info("house/world similarity: " + simHouseWorld);

    double simCaseWay = paragraphVectors.similarity("case", "way");
    log.info("case/way similarity: " + simCaseWay);

    double simDocs1 = paragraphVectors.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + simDocs1);
    //        assertTrue(similarity1 > 0.7d);

    double simDocs2 = paragraphVectors.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + simDocs2);
    //        assertTrue(similarity2 > 0.7d);

    double simDocs3 = paragraphVectors.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + simDocs3);
    //        assertTrue(similarity2 > 0.7d);

    // The unrelated pair should be significantly less similar. The corpus is
    // small and weight init is random-based, so this CAN occasionally fail.
    double simUnrelated = paragraphVectors.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + simUnrelated);
    assertTrue(simUnrelated < 0.5d);

    double simToLabelA = paragraphVectors.similarityToLabel("This is my case .", "DOC_6347");
    double simToLabelB = paragraphVectors.similarityToLabel("This is my case .", "DOC_3720");
    log.info("1/2: " + simToLabelA + "/" + simToLabelB);
    //assertEquals(similarity3, sim119, 0.001);
}
Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.
From class WordVectorSerializer, method readParagraphVectorsFromText.
/**
 * Restores a previously serialized ParagraphVectors model from its text representation.
 *
 * <p>Each input line is expected to look like {@code <type> <token> <v0> <v1> ...},
 * where {@code <type>} is "L" for a document label or "E" for an ordinary word
 * element, and the remaining fields are the float components of that row's vector.</p>
 *
 * Deprecation note: Please, consider using readParagraphVectors() method instead
 *
 * @param stream InputStream that contains previously serialized model
 * @return the restored ParagraphVectors model
 * @throws RuntimeException wrapping any I/O or parse failure, including malformed input
 */
@Deprecated
public static ParagraphVectors readParagraphVectorsFromText(@NonNull InputStream stream) {
    // try-with-resources guarantees the reader is closed even when parsing fails;
    // the previous manual close() was skipped on any exception path.
    try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8))) {
        List<String> labels = new ArrayList<>();
        List<INDArray> arrays = new ArrayList<>();
        VocabCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
        String line;
        while ((line = reader.readLine()) != null) {
            String[] split = line.split(" ");
            // Whitespace inside tokens was escaped during serialization; restore it.
            split[1] = split[1].replaceAll(whitespaceReplacement, " ");
            VocabWord word = new VocabWord(1.0, split[1]);
            if (split[0].equals("L")) {
                // we have label element here
                word.setSpecial(true);
                word.markAsLabel(true);
                labels.add(word.getLabel());
            } else if (split[0].equals("E")) {
                // we have usual element, aka word here
                word.setSpecial(false);
                word.markAsLabel(false);
            } else {
                throw new IllegalStateException(
                                "Source stream doesn't look like ParagraphVectors serialized model");
            }

            // this particular line is just for backward compatibility with InMemoryLookupCache
            word.setIndex(vocabCache.numWords());

            vocabCache.addToken(word);
            vocabCache.addWordToIndex(word.getIndex(), word.getLabel());

            // backward compatibility code
            vocabCache.putVocabWord(word.getLabel());

            float[] vector = new float[split.length - 2];
            for (int i = 2; i < split.length; i++) {
                vector[i - 2] = Float.parseFloat(split[i]);
            }
            arrays.add(Nd4j.create(vector));
        }

        // Guard before arrays.get(0)/vstack: an empty stream is not a valid model.
        if (arrays.isEmpty()) {
            throw new IllegalStateException(
                            "Source stream doesn't look like ParagraphVectors serialized model");
        }

        // now we create syn0 matrix, using previously fetched rows
        INDArray syn = Nd4j.vstack(arrays);

        InMemoryLookupTable<VocabWord> lookupTable =
                        (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                                        .vectorLength(arrays.get(0).columns())
                                        .useAdaGrad(false)
                                        .cache(vocabCache)
                                        .build();
        Nd4j.clearNans(syn);
        lookupTable.setSyn0(syn);

        LabelsSource source = new LabelsSource(labels);

        ParagraphVectors vectors = new ParagraphVectors.Builder()
                        .labelsSource(source)
                        .vocabCache(vocabCache)
                        .lookupTable(lookupTable)
                        .modelUtils(new BasicModelUtils<VocabWord>())
                        .build();

        vectors.extractLabels();

        return vectors;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.
From class ParagraphVectorsTest, method testParagraphVectorsDBOW.
@Test
public void testParagraphVectorsDBOW() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    // PV-DBOW (distributed bag of words) configuration; fixed seed + precise
    // weight init keep results reproducible despite workers(2).
    ParagraphVectors vec = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .iterations(5)
                    .seed(119)
                    .epochs(1)
                    .layerSize(100)
                    .learningRate(0.025)
                    .labelsSource(source)
                    .windowSize(5)
                    .iterate(iter)
                    .trainWordVectors(true)
                    .vocabCache(cache)
                    .tokenizerFactory(t)
                    .negativeSample(0)
                    .allowParallelTokenization(true)
                    .useHierarchicSoftmax(true)
                    .sampling(0)
                    .workers(2)
                    .usePreciseWeightInit(true)
                    .sequenceLearningAlgorithm(new DBOW<VocabWord>())
                    .build();

    vec.fit();

    // Vocabulary sanity checks: both words occur more than once, with distinct counts.
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN); // fixed typo: was "similariry"

    // Document-to-document similarities are logged only; thresholds are too
    // unstable on this small corpus.
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //        assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.6d);

    // Unrelated documents should stay clearly dissimilar.
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);

    // Testing DBOW inference now (fixed comment: this is the DBOW test, not DM).
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");

    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));

    // Different sentences must not infer to identical vectors.
    assertNotEquals(inferredA1, inferredC1);

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());

    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);
}
Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.
From class ParagraphVectorsTest, method testParagraphVectorsVocabBuilding1.
/*
@Test
public void testWord2VecRunThroughVectors() throws Exception {
ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
File file = resource.getFile().getParentFile();
LabelAwareSentenceIterator iter = LabelAwareUimaSentenceIterator.createWithPath(file.getAbsolutePath());
TokenizerFactory t = new UimaTokenizerFactory();
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1).iterations(5).labels(Arrays.asList("label1", "deeple"))
.layerSize(100)
.stopWords(new ArrayList<String>())
.windowSize(5).iterate(iter).tokenizerFactory(t).build();
assertEquals(new ArrayList<String>(), vec.getStopWords());
vec.fit();
double sim = vec.similarity("day","night");
log.info("day/night similarity: " + sim);
new File("cache.ser").delete();
}
*/
/**
 * This test checks how the vocabulary is built from a SentenceIterator alone,
 * without any explicit labels (labels are auto-generated per line).
 *
 * @throws Exception
 */
@Test
public void testParagraphVectorsVocabBuilding1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt"); //.getParentFile();
    File corpusFile = resource.getFile(); //UimaSentenceIterator.createWithPath(file.getAbsolutePath());
    SentenceIterator sentenceIterator = new BasicLineIterator(corpusFile);

    // Count corpus lines up front so we can compare against generated labels.
    int lineCount = 0;
    while (sentenceIterator.hasNext()) {
        sentenceIterator.nextSentence();
        lineCount++;
    }
    sentenceIterator.reset();

    InMemoryLookupCache lookupCache = new InMemoryLookupCache(false);

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    //        LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
                    .minWordFrequency(1)
                    .iterations(5)
                    .layerSize(100)
                    .windowSize(5)
                    .iterate(sentenceIterator)
                    .vocabCache(lookupCache)
                    .tokenizerFactory(tokenizerFactory)
                    .build();

    paragraphVectors.buildVocab();

    // One auto-generated label per corpus line is expected.
    LabelsSource generatedLabels = paragraphVectors.getLabelsSource(); //VocabCache cache = vec.getVocab();
    log.info("Number of lines in corpus: " + lineCount);
    assertEquals(lineCount, generatedLabels.getLabels().size());
    assertEquals(97162, generatedLabels.getLabels().size());

    assertNotEquals(null, lookupCache);
    assertEquals(97406, lookupCache.numWords());

    // proper number of words for minWordsFrequency = 1 is 244
    assertEquals(244, lookupCache.numWords() - generatedLabels.getLabels().size());
}
Aggregations