use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class TextPipelineTest method testZipFunction2.
/**
 * Zips the per-sentence vocab-word lists with the cumulative sentence-size RDD
 * and checks that each sentence is paired with the running word count.
 *
 * <p>The tokenizer configuration comes from {@code word2vecNoStop}; the expected
 * token lists below therefore still contain words such as "this", "is" and "a".
 */
@Test
public void testZipFunction2() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap =
                sc.broadcast(word2vecNoStop.getTokenizerVarMap());

        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        pipeline.buildVocabWordListRDD();

        JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
        JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

        CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
        JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

        JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
                vocabWordListRDD.zip(sentenceCountCumSumRDD);
        List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

        // First sentence: six tokens, cumulative count 6.
        String[] expectedFirst = {"this", "is", "a", "strange", "strange", "world"};
        List<VocabWord> vocabWordsList1 = lst.get(0)._1();
        Long cumSumSize1 = lst.get(0)._2();
        assertEquals(expectedFirst.length, vocabWordsList1.size());
        for (int i = 0; i < expectedFirst.length; i++) {
            assertEquals(expectedFirst[i], vocabWordsList1.get(i).getWord());
        }
        // Counts are integral, so compare them exactly rather than through a float delta.
        assertEquals(6L, cumSumSize1.longValue());

        // Second sentence: three tokens, cumulative count 6 + 3 = 9.
        String[] expectedSecond = {"flowers", "are", "red"};
        List<VocabWord> vocabWordsList2 = lst.get(1)._1();
        Long cumSumSize2 = lst.get(1)._2();
        assertEquals(expectedSecond.length, vocabWordsList2.size());
        for (int i = 0; i < expectedSecond.length; i++) {
            assertEquals(expectedSecond[i], vocabWordsList2.get(i).getWord());
        }
        assertEquals(9L, cumSumSize2.longValue());
    } finally {
        // Stop the context even when an assertion fails so it is not left
        // running for the tests that follow.
        sc.stop();
    }
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class TextPipelineTest method testHuffman.
/**
 * Smoke test: builds the vocabulary from the corpus, runs Huffman coding over it
 * and prints every word with its codes and points. There are no assertions; the
 * test only fails if one of the calls throws.
 */
@Test
public void testHuffman() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap =
                sc.broadcast(word2vec.getTokenizerVarMap());

        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();

        Huffman huffman = new Huffman(vocabCache.vocabWords());
        huffman.build();
        huffman.applyIndexes(vocabCache);

        Collection<VocabWord> vocabWords = vocabCache.vocabWords();
        System.out.println("Huffman Test:");
        for (VocabWord vocabWord : vocabWords) {
            System.out.println("Word: " + vocabWord);
            System.out.println(vocabWord.getCodes());
            System.out.println(vocabWord.getPoints());
        }
    } finally {
        // Stop the context even if building or applying the tree throws,
        // so it is not left running for the tests that follow.
        sc.stop();
    }
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class TextPipelineTest method testBuildVocabCache.
/**
 * Builds the vocab cache from the corpus with the {@code word2vec} tokenizer
 * configuration and checks the vocabulary size (5) and the frequencies of
 * "red", "flowers", "world" (1 each) and "strange" (2).
 */
@Test
public void testBuildVocabCache() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap =
                sc.broadcast(word2vec.getTokenizerVarMap());

        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();

        assertTrue(vocabCache != null);
        log.info("VocabWords: " + vocabCache.words());
        assertEquals(5, vocabCache.numWords());

        VocabWord redVocab = vocabCache.tokenFor("red");
        VocabWord flowerVocab = vocabCache.tokenFor("flowers");
        VocabWord worldVocab = vocabCache.tokenFor("world");
        VocabWord strangeVocab = vocabCache.tokenFor("strange");

        log.info("Red word: " + redVocab);
        log.info("Flower word: " + flowerVocab);
        log.info("World word: " + worldVocab);
        log.info("Strange word: " + strangeVocab);

        // Expected value first, so a failure message reads "expected X but was Y" correctly.
        assertEquals("red", redVocab.getWord());
        assertEquals(1, redVocab.getElementFrequency(), 0);
        assertEquals("flowers", flowerVocab.getWord());
        assertEquals(1, flowerVocab.getElementFrequency(), 0);
        assertEquals("world", worldVocab.getWord());
        assertEquals(1, worldVocab.getElementFrequency(), 0);
        assertEquals("strange", strangeVocab.getWord());
        assertEquals(2, strangeVocab.getElementFrequency(), 0);
    } finally {
        // Stop the context even when an assertion fails so it is not left
        // running for the tests that follow.
        sc.stop();
    }
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class WordVectorSerializerTest method testParaVecSerialization1.
/**
 * Round-trips a randomly populated {@code ParagraphVectors} model through
 * {@code WordVectorSerializer.writeParagraphVectors}/{@code readParagraphVectors}
 * and checks that syn0, syn1 and, for every vocab element, the label flag, word,
 * frequency, Huffman points and codes are restored.
 */
@Test
public void testParaVecSerialization1() throws Exception {
    VectorsConfiguration configuration = new VectorsConfiguration();
    configuration.setIterations(14123);
    configuration.setLayersSize(156);

    INDArray syn0 = Nd4j.rand(100, configuration.getLayersSize());
    INDArray syn1 = Nd4j.rand(100, configuration.getLayersSize());

    // Populate a 100-word vocabulary with random points/codes; some words are marked as labels.
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    for (int i = 0; i < 100; i++) {
        VocabWord word = new VocabWord((float) i, "word_" + i);
        List<Integer> points = new ArrayList<>();
        List<Byte> codes = new ArrayList<>();
        int num = org.apache.commons.lang3.RandomUtils.nextInt(1, 20);
        for (int x = 0; x < num; x++) {
            points.add(org.apache.commons.lang3.RandomUtils.nextInt(1, 100000));
            // Only one random byte is needed per code entry.
            codes.add(org.apache.commons.lang3.RandomUtils.nextBytes(1)[0]);
        }
        if (RandomUtils.nextInt(10) < 3) {
            word.markAsLabel(true);
        }
        word.setIndex(i);
        word.setPoints(points);
        word.setCodes(codes);
        cache.addToken(word);
        cache.addWordToIndex(i, word.getLabel());
    }

    InMemoryLookupTable<VocabWord> lookupTable =
            (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                    .vectorLength(configuration.getLayersSize())
                    .cache(cache)
                    .build();
    lookupTable.setSyn0(syn0);
    lookupTable.setSyn1(syn1);

    ParagraphVectors originalVectors = new ParagraphVectors.Builder(configuration)
            .vocabCache(cache)
            .lookupTable(lookupTable)
            .build();

    File tempFile = File.createTempFile("paravec", "tests");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeParagraphVectors(originalVectors, tempFile);
    ParagraphVectors restoredVectors = WordVectorSerializer.readParagraphVectors(tempFile);

    InMemoryLookupTable<VocabWord> restoredLookupTable =
            (InMemoryLookupTable<VocabWord>) restoredVectors.getLookupTable();
    AbstractCache<VocabWord> restoredVocab = (AbstractCache<VocabWord>) restoredVectors.getVocab();

    // The original model is the expected value; the restored model is the actual one.
    assertEquals(lookupTable.getSyn0(), restoredLookupTable.getSyn0());
    assertEquals(lookupTable.getSyn1(), restoredLookupTable.getSyn1());

    for (int i = 0; i < cache.numWords(); i++) {
        VocabWord original = cache.elementAtIndex(i);
        VocabWord restored = restoredVocab.elementAtIndex(i);

        assertEquals(original.isLabel(), restored.isLabel());
        assertEquals(cache.wordAtIndex(i), restoredVocab.wordAtIndex(i));
        assertEquals(original.getElementFrequency(), restored.getElementFrequency(), 0.1f);

        List<Integer> originalPoints = original.getPoints();
        List<Integer> restoredPoints = restored.getPoints();
        assertEquals(originalPoints.size(), restoredPoints.size());
        for (int x = 0; x < originalPoints.size(); x++) {
            assertEquals(originalPoints.get(x), restoredPoints.get(x));
        }

        List<Byte> originalCodes = original.getCodes();
        List<Byte> restoredCodes = restored.getCodes();
        assertEquals(originalCodes.size(), restoredCodes.size());
        for (int x = 0; x < originalCodes.size(); x++) {
            assertEquals(originalCodes.get(x), restoredCodes.get(x));
        }
    }
}
use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
the class WordVectorSerializerTest method testFullModelSerialization.
/**
 * End-to-end check of full-model serialization for Word2Vec.
 *
 * Steps, in order:
 * 1. Train a Word2Vec model on the classpath resource "/big/raw_sentences.txt".
 * 2. Write it with WordVectorSerializer.writeFullModel to "tempModel.txt" and
 *    load it back with WordVectorSerializer.loadFullModel.
 * 3. Compare configuration, exp table, selected vocab indices, and per-word
 *    syn0 / syn1 / syn1Neg slices and historical gradients between the
 *    original and restored models.
 * 4. Call fit() on the restored model with a fresh sentence iterator and check
 *    that the "day" and "night" vectors differ from the original model's but
 *    still have similarity above 0.70.
 */
@Test
public void testFullModelSerialization() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
// The same cache instance is handed to both the lookup table and the Word2Vec builder below.
InMemoryLookupCache cache = new InMemoryLookupCache(false);
WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0).cache(cache).lr(0.025f).build();
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100).lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).vocabCache(cache).seed(42).windowSize(5).iterate(iter).tokenizerFactory(t).build();
// The builder was given an empty stop-word list; make sure it was kept as-is.
assertEquals(new ArrayList<String>(), vec.getStopWords());
vec.fit();
//logger.info("Original word 0: " + cache.wordFor(cache.wordAtIndex(0)));
//logger.info("Closest Words:");
Collection<String> lst = vec.wordsNearest("day", 10);
System.out.println(lst);
// NOTE(review): "tempModel.txt" is a relative path, so the file is created relative to
// the process working directory rather than in a temp directory.
WordVectorSerializer.writeFullModel(vec, "tempModel.txt");
File modelFile = new File("tempModel.txt");
// Fallback cleanup: the explicit delete() at the end is skipped if an assertion fails first.
modelFile.deleteOnExit();
assertTrue(modelFile.exists());
assertTrue(modelFile.length() > 0);
Word2Vec vec2 = WordVectorSerializer.loadFullModel("tempModel.txt");
assertNotEquals(null, vec2);
assertEquals(vec.getConfiguration(), vec2.getConfiguration());
//logger.info("Source ExpTable: " + ArrayUtils.toString(((InMemoryLookupTable) table).getExpTable()));
//logger.info("Dest ExpTable: " + ArrayUtils.toString(((InMemoryLookupTable) vec2.getLookupTable()).getExpTable()));
assertTrue(ArrayUtils.isEquals(((InMemoryLookupTable) table).getExpTable(), ((InMemoryLookupTable) vec2.getLookupTable()).getExpTable()));
InMemoryLookupTable restoredTable = (InMemoryLookupTable) vec2.lookupTable();
/*
logger.info("Restored word 1: " + restoredTable.getVocab().wordFor(restoredTable.getVocab().wordAtIndex(1)));
logger.info("Restored word 'it': " + restoredTable.getVocab().wordFor("it"));
logger.info("Original word 1: " + cache.wordFor(cache.wordAtIndex(1)));
logger.info("Original word 'i': " + cache.wordFor("i"));
logger.info("Original word 0: " + cache.wordFor(cache.wordAtIndex(0)));
logger.info("Restored word 0: " + restoredTable.getVocab().wordFor(restoredTable.getVocab().wordAtIndex(0)));
*/
// Spot-check that word-to-index assignment survived the round trip.
assertEquals(cache.wordAtIndex(1), restoredTable.getVocab().wordAtIndex(1));
assertEquals(cache.wordAtIndex(7), restoredTable.getVocab().wordAtIndex(7));
assertEquals(cache.wordAtIndex(15), restoredTable.getVocab().wordAtIndex(15));
/*
these tests needed only to make sure INDArray equality is working properly
*/
double[] array1 = new double[] { 0.323232325, 0.65756575, 0.12315, 0.12312315, 0.1232135, 0.12312315, 0.4343423425, 0.15 };
double[] array2 = new double[] { 0.423232325, 0.25756575, 0.12375, 0.12311315, 0.1232035, 0.12318315, 0.4343493425, 0.25 };
assertNotEquals(Nd4j.create(array1), Nd4j.create(array2));
assertEquals(Nd4j.create(array1), Nd4j.create(array1));
INDArray rSyn0_1 = restoredTable.getSyn0().slice(1);
INDArray oSyn0_1 = ((InMemoryLookupTable) table).getSyn0().slice(1);
//logger.info("Restored syn0: " + rSyn0_1);
//logger.info("Original syn0: " + oSyn0_1);
assertEquals(oSyn0_1, rSyn0_1);
// Check that syn0/syn1 rows line up between original and restored tables for every vocab word.
// cnt counts loop iterations over cache.vocabWords(); it is not the word's own index.
int cnt = 0;
for (VocabWord word : cache.vocabWords()) {
INDArray rSyn0 = restoredTable.getSyn0().slice(word.getIndex());
INDArray oSyn0 = ((InMemoryLookupTable) table).getSyn0().slice(word.getIndex());
assertEquals(rSyn0, oSyn0);
assertEquals(1.0, arraysSimilarity(rSyn0, oSyn0), 0.001);
INDArray rSyn1 = restoredTable.getSyn1().slice(word.getIndex());
INDArray oSyn1 = ((InMemoryLookupTable) table).getSyn1().slice(word.getIndex());
assertEquals(rSyn1, oSyn1);
// NOTE(review): this branch only holds commented-out logging, so it currently does nothing
// beyond evaluating arraysSimilarity; candidate for removal.
if (arraysSimilarity(rSyn1, oSyn1) < 0.98) {
// logger.info("Restored syn1: " + rSyn1);
// logger.info("Original syn1: " + oSyn1);
}
// we exclude word 222 since it has syn1 full of zeroes
// NOTE(review): the guard tests the iteration counter, not word.getIndex(); it skips the
// intended word only if vocabWords() iterates in a stable order - confirm.
if (cnt != 222)
assertEquals(1.0, arraysSimilarity(rSyn1, oSyn1), 0.001);
// syn1Neg is compared only when the original table has one.
if (((InMemoryLookupTable) table).getSyn1Neg() != null) {
INDArray rSyn1Neg = restoredTable.getSyn1Neg().slice(word.getIndex());
INDArray oSyn1Neg = ((InMemoryLookupTable) table).getSyn1Neg().slice(word.getIndex());
assertEquals(rSyn1Neg, oSyn1Neg);
// assertEquals(1.0, arraysSimilarity(rSyn1Neg, oSyn1Neg), 0.001);
}
assertEquals(word.getHistoricalGradient(), restoredTable.getVocab().wordFor(word.getWord()).getHistoricalGradient());
cnt++;
}
// at this moment we can assume that whole model is transferred, and we can call fit over new model
// iter.reset();
// A new iterator is created for the second fit instead of resetting the first one.
iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
vec2.setTokenizerFactory(t);
vec2.setSentenceIterator(iter);
vec2.fit();
INDArray day1 = vec.getWordVectorMatrix("day");
INDArray day2 = vec2.getWordVectorMatrix("day");
INDArray night1 = vec.getWordVectorMatrix("night");
INDArray night2 = vec2.getWordVectorMatrix("night");
double simD = arraysSimilarity(day1, day2);
double simN = arraysSimilarity(night1, night2);
logger.info("Vec1 day: " + day1);
logger.info("Vec2 day: " + day2);
logger.info("Vec1 night: " + night1);
logger.info("Vec2 night: " + night2);
logger.info("Day/day cross-model similarity: " + simD);
logger.info("Night/night cross-model similarity: " + simN);
logger.info("Vec1 day/night similiraty: " + vec.similarity("day", "night"));
logger.info("Vec2 day/night similiraty: " + vec2.similarity("day", "night"));
// check if cross-model values are not the same
assertNotEquals(1.0, simD, 0.001);
assertNotEquals(1.0, simN, 0.001);
// check if cross-model values are still close to each other
assertTrue(simD > 0.70);
assertTrue(simN > 0.70);
// Eager cleanup on success; deleteOnExit() above covers the failure paths.
modelFile.delete();
}
Aggregations