Usage example of org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache in the deeplearning4j project.
From the class WordVectorSerializerTest, method testParaVecSerialization1:
@Test
public void testParaVecSerialization1() throws Exception {
// Configuration with distinctive values so a failed round trip is easy to spot.
VectorsConfiguration configuration = new VectorsConfiguration();
configuration.setIterations(14123);
configuration.setLayersSize(156);
// Random weight tables for a 100-word vocabulary.
INDArray syn0 = Nd4j.rand(100, configuration.getLayersSize());
INDArray syn1 = Nd4j.rand(100, configuration.getLayersSize());
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
// Populate the vocab with synthetic words carrying random HS points/codes.
for (int idx = 0; idx < 100; idx++) {
VocabWord token = new VocabWord((float) idx, "word_" + idx);
List<Integer> points = new ArrayList<>();
List<Byte> codes = new ArrayList<>();
int num = org.apache.commons.lang3.RandomUtils.nextInt(1, 20);
for (int j = 0; j < num; j++) {
points.add(org.apache.commons.lang3.RandomUtils.nextInt(1, 100000));
codes.add(org.apache.commons.lang3.RandomUtils.nextBytes(10)[0]);
}
// Roughly 30% of the tokens get flagged as document labels.
if (RandomUtils.nextInt(10) < 3) {
token.markAsLabel(true);
}
token.setIndex(idx);
token.setPoints(points);
token.setCodes(codes);
cache.addToken(token);
cache.addWordToIndex(idx, token.getLabel());
}
// Assemble a lookup table backed by the vocab and the random weights.
InMemoryLookupTable<VocabWord> lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(configuration.getLayersSize()).cache(cache).build();
lookupTable.setSyn0(syn0);
lookupTable.setSyn1(syn1);
ParagraphVectors originalVectors = new ParagraphVectors.Builder(configuration).vocabCache(cache).lookupTable(lookupTable).build();
// Round-trip the model through a temp file.
File serialized = File.createTempFile("paravec", "tests");
serialized.deleteOnExit();
WordVectorSerializer.writeParagraphVectors(originalVectors, serialized);
ParagraphVectors restoredVectors = WordVectorSerializer.readParagraphVectors(serialized);
InMemoryLookupTable<VocabWord> restoredLookupTable = (InMemoryLookupTable<VocabWord>) restoredVectors.getLookupTable();
AbstractCache<VocabWord> restoredVocab = (AbstractCache<VocabWord>) restoredVectors.getVocab();
// Weight matrices must survive the round trip exactly.
assertEquals(restoredLookupTable.getSyn0(), lookupTable.getSyn0());
assertEquals(restoredLookupTable.getSyn1(), lookupTable.getSyn1());
// Per-word comparison: label flag, spelling, frequency, points and codes.
for (int idx = 0; idx < cache.numWords(); idx++) {
assertEquals(cache.elementAtIndex(idx).isLabel(), restoredVocab.elementAtIndex(idx).isLabel());
assertEquals(cache.wordAtIndex(idx), restoredVocab.wordAtIndex(idx));
assertEquals(cache.elementAtIndex(idx).getElementFrequency(), restoredVocab.elementAtIndex(idx).getElementFrequency(), 0.1f);
List<Integer> originalPoints = cache.elementAtIndex(idx).getPoints();
List<Integer> restoredPoints = restoredVocab.elementAtIndex(idx).getPoints();
assertEquals(originalPoints.size(), restoredPoints.size());
for (int j = 0; j < originalPoints.size(); j++) {
assertEquals(originalPoints.get(j), restoredPoints.get(j));
}
List<Byte> originalCodes = cache.elementAtIndex(idx).getCodes();
List<Byte> restoredCodes = restoredVocab.elementAtIndex(idx).getCodes();
assertEquals(originalCodes.size(), restoredCodes.size());
for (int j = 0; j < originalCodes.size(); j++) {
assertEquals(originalCodes.get(j), restoredCodes.get(j));
}
}
}
Usage example of org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache in the deeplearning4j project.
From the class WordVectorSerializer, method loadTxtVectors:
/**
 * This method can be used to load previously saved model from InputStream (like a HDFS-stream)
 *
 * Deprecation note: Please, consider using readWord2VecModel() or loadStaticModel() method instead
 *
 * @param stream InputStream that contains previously serialized model
 * @param skipFirstLine Set this TRUE if first line contains csv header, FALSE otherwise
 * @return deserialized model wrapped as WordVectors
 * @throws IOException if the stream cannot be read, or if it contains no vectors at all
 */
@Deprecated
public static WordVectors loadTxtVectors(@NonNull InputStream stream, boolean skipFirstLine) throws IOException {
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
List<INDArray> arrays = new ArrayList<>();
// try-with-resources: the reader was previously never closed (leak on any exit path).
// Charset is pinned to UTF-8 to match readVocabCache() instead of relying on the
// platform default. NOTE(review): this also closes the caller-supplied stream once
// parsing finishes, which is the usual contract for one-shot loaders.
try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
if (skipFirstLine)
reader.readLine();
String line;
// Each line is: <word> <float> <float> ... ; whitespace inside the word itself
// was escaped by the serializer and is restored here.
while ((line = reader.readLine()) != null) {
String[] split = line.split(" ");
String word = split[0].replaceAll(whitespaceReplacement, " ");
VocabWord word1 = new VocabWord(1.0, word);
word1.setIndex(cache.numWords());
cache.addToken(word1);
cache.addWordToIndex(word1.getIndex(), word);
cache.putVocabWord(word);
float[] vector = new float[split.length - 1];
for (int i = 1; i < split.length; i++) {
vector[i - 1] = Float.parseFloat(split[i]);
}
arrays.add(Nd4j.create(vector));
}
}
// Fail with a clear message instead of an IndexOutOfBoundsException from arrays.get(0).
if (arrays.isEmpty())
throw new IOException("Stream contains no word vectors");
InMemoryLookupTable<VocabWord> lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(arrays.get(0).columns()).cache(cache).build();
// Stack the per-word rows into the syn0 weight matrix.
INDArray syn = Nd4j.vstack(arrays);
Nd4j.clearNans(syn);
lookupTable.setSyn0(syn);
return fromPair(Pair.makePair((InMemoryLookupTable) lookupTable, (VocabCache) cache));
}
Usage example of org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache in the deeplearning4j project.
From the class WordVectorSerializer, method readVocabCache:
/**
 * This method reads vocab cache from provided InputStream.
 * Please note: it reads only vocab content, so it's suitable mostly for BagOfWords/TF-IDF vectorizers
 *
 * @param stream InputStream with one serialized VocabWord per line (UTF-8)
 * @return vocab cache populated from the stream
 * @throws IOException if the stream cannot be read
 */
public static VocabCache<VocabWord> readVocabCache(@NonNull InputStream stream) throws IOException {
AbstractCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
VocabWordFactory factory = new VocabWordFactory();
// try-with-resources: the reader (and underlying stream) was previously never closed.
try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
String line;
while ((line = reader.readLine()) != null) {
// Each line deserializes into one VocabWord; index comes from the serialized form.
VocabWord word = factory.deserialize(line);
vocabCache.addToken(word);
vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
}
}
return vocabCache;
}
Usage example of org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache in the deeplearning4j project.
From the class ParagraphVectorsTest, method testParagraphVectorsDBOW:
@Test
public void testParagraphVectorsDBOW() throws Exception {
// Train ParagraphVectors with the DBOW learning algorithm on the raw_sentences corpus,
// then sanity-check word frequencies, document similarities, and DM-style inference.
ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
File file = resource.getFile();
SentenceIterator iter = new BasicLineIterator(file);
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
// Documents are auto-labeled DOC_0, DOC_1, ... in iteration order.
LabelsSource source = new LabelsSource("DOC_");
// Fixed seed + usePreciseWeightInit keep the run deterministic enough for the
// similarity threshold below; DBOW is selected explicitly as the learning algorithm.
ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).seed(119).epochs(1).layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter).trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).negativeSample(0).allowParallelTokenization(true).useHierarchicSoftmax(true).sampling(0).workers(2).usePreciseWeightInit(true).sequenceLearningAlgorithm(new DBOW<VocabWord>()).build();
vec.fit();
// Common words must appear more than once, and with differing counts.
int cnt1 = cache.wordFrequency("day");
int cnt2 = cache.wordFrequency("me");
assertNotEquals(1, cnt1);
assertNotEquals(1, cnt2);
assertNotEquals(cnt1, cnt2);
double simDN = vec.similarity("day", "night");
log.info("day/night similariry: {}", simDN);
// Document-pair similarities are logged for inspection; most hard assertions are
// disabled (commented) — presumably too flaky across platforms. TODO confirm.
double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
log.info("9835/12492 similarity: " + similarity1);
// assertTrue(similarity1 > 0.2d);
double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
log.info("3720/16392 similarity: " + similarity2);
// assertTrue(similarity2 > 0.2d);
double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
log.info("6347/3720 similarity: " + similarity3);
// assertTrue(similarity3 > 0.6d);
// The only enforced bound: an unrelated pair must stay dissimilar.
double similarityX = vec.similarity("DOC_3720", "DOC_9852");
log.info("3720/9852 similarity: " + similarityX);
assertTrue(similarityX < 0.5d);
// testing DM inference now
INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
// Inference on near-identical vs. different sentences; A vs C must diverge.
INDArray inferredA1 = vec.inferVector("This is my work");
INDArray inferredB1 = vec.inferVector("This is my work .");
INDArray inferredC1 = vec.inferVector("This is my day");
INDArray inferredD1 = vec.inferVector("This is my night");
log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));
assertNotEquals(inferredA1, inferredC1);
// Cosine similarities are logged only — no assertions on the inferred geometry.
double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());
log.info("Cos O/A: {}", cosAO1);
log.info("Cos A/B: {}", cosAB1);
log.info("Cos A/C: {}", cosAC1);
log.info("Cos C/D: {}", cosCD1);
}
Usage example of org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache in the deeplearning4j project.
From the class InMemoryLookupTableTest, method testConsumeOnNonEqualVocabs:
@Test
public void testConsumeOnNonEqualVocabs() throws Exception {
TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
// Step 1: build the source vocabulary over the raw sentences corpus.
AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();
ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
BasicLineIterator underlyingIterator = new BasicLineIterator(resource.getFile());
SentenceTransformer sourceTransformer = new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(tokenizerFactory).build();
AbstractSequenceIterator<VocabWord> sourceSequences = new AbstractSequenceIterator.Builder<>(sourceTransformer).build();
VocabConstructor<VocabWord> sourceConstructor = new VocabConstructor.Builder<VocabWord>().addSource(sourceSequences, 1).setTargetVocabCache(cacheSource).build();
sourceConstructor.buildJointVocabulary(false, true);
assertEquals(244, cacheSource.numWords());
// Step 2: lookup table over the source vocab, with weights initialized.
InMemoryLookupTable<VocabWord> mem1 = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100).cache(cacheSource).build();
mem1.resetWeights(true);
// Step 3: target vocab = merged source vocab plus the labeled documents.
AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();
FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder().addSourceFolder(new ClassPathResource("/paravec/labeled").getFile()).build();
SentenceTransformer targetTransformer = new SentenceTransformer.Builder().iterator(labelAwareIterator).tokenizerFactory(tokenizerFactory).build();
AbstractSequenceIterator<VocabWord> targetSequences = new AbstractSequenceIterator.Builder<>(targetTransformer).build();
VocabConstructor<VocabWord> vocabTransfer = new VocabConstructor.Builder<VocabWord>().addSource(targetSequences, 1).setTargetVocabCache(cacheTarget).build();
vocabTransfer.buildMergedVocabulary(cacheSource, true);
// those +3 go for 3 additional entries in target VocabCache: labels
assertEquals(cacheSource.numWords() + 3, cacheTarget.numWords());
// Step 4: a second table over the larger vocab, seeded differently so its
// initial weights differ from mem1's.
InMemoryLookupTable<VocabWord> mem2 = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100).cache(cacheTarget).seed(18).build();
mem2.resetWeights(true);
assertNotEquals(mem1.vector("day"), mem2.vector("day"));
// consume() must copy mem1's rows into the matching rows of the bigger table.
mem2.consume(mem1);
assertEquals(mem1.vector("day"), mem2.vector("day"));
assertTrue(mem1.syn0.rows() < mem2.syn0.rows());
assertEquals(mem1.syn0.rows() + 3, mem2.syn0.rows());
}
End of aggregated usage examples.