Search in sources :

Example 56 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testZipFunction2.

/**
 * Checks that zipping the per-sentence {@code VocabWord} lists with the cumulative
 * sentence-count RDD pairs every sentence with the running token total.
 *
 * <p>The expectations below are for the first two collected entries: a 6-token sentence
 * (cumulative total 6) followed by a 3-token sentence (cumulative total 9).
 *
 * @throws Exception if building or collecting the pipeline fails
 */
@Test
public void testZipFunction2() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        // Uses the word2vecNoStop settings; the expected tokens below include "this", "is" and "a"
        // (presumably stop-word removal is disabled in that configuration - confirm against the fixture).
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        pipeline.buildVocabWordListRDD();

        JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
        JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
        CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
        JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
        JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
        List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

        // First sentence: six tokens, cumulative count 6.
        List<VocabWord> vocabWordsList1 = lst.get(0)._1();
        long cumSumSize1 = lst.get(0)._2();
        assertEquals(6, vocabWordsList1.size());
        assertEquals("this", vocabWordsList1.get(0).getWord());
        assertEquals("is", vocabWordsList1.get(1).getWord());
        assertEquals("a", vocabWordsList1.get(2).getWord());
        assertEquals("strange", vocabWordsList1.get(3).getWord());
        assertEquals("strange", vocabWordsList1.get(4).getWord());
        assertEquals("world", vocabWordsList1.get(5).getWord());
        assertEquals(6L, cumSumSize1);

        // Second sentence: three tokens, cumulative count 6 + 3 = 9.
        List<VocabWord> vocabWordsList2 = lst.get(1)._1();
        long cumSumSize2 = lst.get(1)._2();
        assertEquals(3, vocabWordsList2.size());
        assertEquals("flowers", vocabWordsList2.get(0).getWord());
        assertEquals("are", vocabWordsList2.get(1).getWord());
        assertEquals("red", vocabWordsList2.get(2).getWord());
        assertEquals(9L, cumSumSize2);
    } finally {
        // Always release the Spark context, even when an assertion above fails.
        sc.stop();
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) AtomicLong(java.util.concurrent.atomic.AtomicLong) Tuple2(scala.Tuple2) CountCumSum(org.deeplearning4j.spark.text.functions.CountCumSum) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Test(org.junit.Test)

Example 57 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testHuffman.

/**
 * Builds the vocabulary for the test corpus, runs {@code Huffman.build()} and
 * {@code Huffman.applyIndexes(...)} over it, and prints every word with its codes and points.
 *
 * <p>There are no assertions: the test only fails if one of these steps throws.
 *
 * @throws Exception if the pipeline or the Huffman build fails
 */
@Test
public void testHuffman() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();

        Huffman huffman = new Huffman(vocabCache.vocabWords());
        huffman.build();
        huffman.applyIndexes(vocabCache);

        // Dump the resulting codes/points for manual inspection.
        Collection<VocabWord> vocabWords = vocabCache.vocabWords();
        System.out.println("Huffman Test:");
        for (VocabWord vocabWord : vocabWords) {
            System.out.println("Word: " + vocabWord);
            System.out.println(vocabWord.getCodes());
            System.out.println(vocabWord.getPoints());
        }
    } finally {
        // Always release the Spark context, even when a step above throws.
        sc.stop();
    }
}
Also used : Huffman(org.deeplearning4j.models.word2vec.Huffman) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) Test(org.junit.Test)

Example 58 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testBuildVocabCache.

/**
 * Checks the vocabulary built by {@code TextPipeline.buildVocabCache()} for the test corpus:
 * five distinct words are expected, with "strange" counted twice and "red", "flowers" and
 * "world" counted once each.
 *
 * @throws Exception if building the vocabulary fails
 */
@Test
public void testBuildVocabCache() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
        assertTrue(vocabCache != null);
        log.info("VocabWords: " + vocabCache.words());
        assertEquals(5, vocabCache.numWords());

        VocabWord redVocab = vocabCache.tokenFor("red");
        VocabWord flowerVocab = vocabCache.tokenFor("flowers");
        VocabWord worldVocab = vocabCache.tokenFor("world");
        VocabWord strangeVocab = vocabCache.tokenFor("strange");
        log.info("Red word: " + redVocab);
        log.info("Flower word: " + flowerVocab);
        log.info("World word: " + worldVocab);
        log.info("Strange word: " + strangeVocab);

        // Expected value first, so that JUnit failure messages read correctly.
        assertEquals("red", redVocab.getWord());
        assertEquals(1, redVocab.getElementFrequency(), 0);
        assertEquals("flowers", flowerVocab.getWord());
        assertEquals(1, flowerVocab.getElementFrequency(), 0);
        assertEquals("world", worldVocab.getWord());
        assertEquals(1, worldVocab.getElementFrequency(), 0);
        assertEquals("strange", strangeVocab.getWord());
        assertEquals(2, strangeVocab.getElementFrequency(), 0);
    } finally {
        // Always release the Spark context, even when an assertion above fails.
        sc.stop();
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) Test(org.junit.Test)

Example 59 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testParaVecSerialization1.

/**
 * Round-trips a {@code ParagraphVectors} model through
 * {@code WordVectorSerializer.writeParagraphVectors} / {@code readParagraphVectors} and checks
 * that syn0, syn1 and every vocabulary element (label flag, word, frequency, points, codes)
 * survive unchanged.
 *
 * <p>The vocabulary is synthetic: 100 words named {@code word_<i>} with random points and codes,
 * some of them randomly marked as labels.
 *
 * @throws Exception if the temp file cannot be created or (de)serialization fails
 */
@Test
public void testParaVecSerialization1() throws Exception {
    final int vocabSize = 100;

    VectorsConfiguration configuration = new VectorsConfiguration();
    configuration.setIterations(14123);
    configuration.setLayersSize(156);
    INDArray syn0 = Nd4j.rand(vocabSize, configuration.getLayersSize());
    INDArray syn1 = Nd4j.rand(vocabSize, configuration.getLayersSize());

    // Build a synthetic vocabulary with random points/codes per word.
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    for (int i = 0; i < vocabSize; i++) {
        VocabWord word = new VocabWord((float) i, "word_" + i);
        List<Integer> points = new ArrayList<>();
        List<Byte> codes = new ArrayList<>();
        int num = org.apache.commons.lang3.RandomUtils.nextInt(1, 20);
        for (int x = 0; x < num; x++) {
            points.add(org.apache.commons.lang3.RandomUtils.nextInt(1, 100000));
            codes.add(org.apache.commons.lang3.RandomUtils.nextBytes(10)[0]);
        }
        // Randomly flag some of the words as labels.
        if (RandomUtils.nextInt(10) < 3) {
            word.markAsLabel(true);
        }
        word.setIndex(i);
        word.setPoints(points);
        word.setCodes(codes);
        cache.addToken(word);
        cache.addWordToIndex(i, word.getLabel());
    }

    InMemoryLookupTable<VocabWord> lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(configuration.getLayersSize()).cache(cache).build();
    lookupTable.setSyn0(syn0);
    lookupTable.setSyn1(syn1);
    ParagraphVectors originalVectors = new ParagraphVectors.Builder(configuration).vocabCache(cache).lookupTable(lookupTable).build();

    // Serialize to a temp file and read the model back.
    File tempFile = File.createTempFile("paravec", "tests");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeParagraphVectors(originalVectors, tempFile);
    ParagraphVectors restoredVectors = WordVectorSerializer.readParagraphVectors(tempFile);

    InMemoryLookupTable<VocabWord> restoredLookupTable = (InMemoryLookupTable<VocabWord>) restoredVectors.getLookupTable();
    AbstractCache<VocabWord> restoredVocab = (AbstractCache<VocabWord>) restoredVectors.getVocab();

    // Original values are the expectation; restored values are what is under test.
    assertEquals(lookupTable.getSyn0(), restoredLookupTable.getSyn0());
    assertEquals(lookupTable.getSyn1(), restoredLookupTable.getSyn1());

    for (int i = 0; i < cache.numWords(); i++) {
        VocabWord originalWord = cache.elementAtIndex(i);
        VocabWord restoredWord = restoredVocab.elementAtIndex(i);

        assertEquals(originalWord.isLabel(), restoredWord.isLabel());
        assertEquals(cache.wordAtIndex(i), restoredVocab.wordAtIndex(i));
        assertEquals(originalWord.getElementFrequency(), restoredWord.getElementFrequency(), 0.1f);

        List<Integer> originalPoints = originalWord.getPoints();
        List<Integer> restoredPoints = restoredWord.getPoints();
        assertEquals(originalPoints.size(), restoredPoints.size());
        for (int x = 0; x < originalPoints.size(); x++) {
            assertEquals(originalPoints.get(x), restoredPoints.get(x));
        }

        List<Byte> originalCodes = originalWord.getCodes();
        List<Byte> restoredCodes = restoredWord.getCodes();
        assertEquals(originalCodes.size(), restoredCodes.size());
        for (int x = 0; x < originalCodes.size(); x++) {
            assertEquals(originalCodes.get(x), restoredCodes.get(x));
        }
    }
}
Also used : VectorsConfiguration(org.deeplearning4j.models.embeddings.loader.VectorsConfiguration) ArrayList(java.util.ArrayList) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ParagraphVectors(org.deeplearning4j.models.paragraphvectors.ParagraphVectors) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) File(java.io.File) Test(org.junit.Test)

Example 60 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testFullModelSerialization.

/**
 * Trains a small {@code Word2Vec} model, writes it with
 * {@code WordVectorSerializer.writeFullModel}, loads it back with {@code loadFullModel}, and checks
 * that configuration, exp table, vocabulary order and the syn0/syn1/syn1Neg rows match the
 * original. The restored model is then fitted again on the same corpus and its "day"/"night"
 * vectors are compared with the original model's: they must differ, yet stay similar (> 0.70).
 *
 * @throws Exception if the corpus cannot be read or (de)serialization fails
 */
@Test
public void testFullModelSerialization() throws Exception {
    final String modelPath = "tempModel.txt";

    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0).cache(cache).lr(0.025f).build();
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100).lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).vocabCache(cache).seed(42).windowSize(5).iterate(iter).tokenizerFactory(t).build();
    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    System.out.println(lst);

    // Serialize the trained model and load it back.
    WordVectorSerializer.writeFullModel(vec, modelPath);
    File modelFile = new File(modelPath);
    modelFile.deleteOnExit();
    assertTrue(modelFile.exists());
    assertTrue(modelFile.length() > 0);
    Word2Vec vec2 = WordVectorSerializer.loadFullModel(modelPath);
    assertNotEquals(null, vec2);
    assertEquals(vec.getConfiguration(), vec2.getConfiguration());

    // The lookup table was built through the InMemoryLookupTable builder; cast once and reuse.
    InMemoryLookupTable originalTable = (InMemoryLookupTable) table;
    assertTrue(ArrayUtils.isEquals(originalTable.getExpTable(), ((InMemoryLookupTable) vec2.getLookupTable()).getExpTable()));
    InMemoryLookupTable restoredTable = (InMemoryLookupTable) vec2.lookupTable();

    // Spot-check that vocabulary indexes map to the same words after restoring.
    assertEquals(cache.wordAtIndex(1), restoredTable.getVocab().wordAtIndex(1));
    assertEquals(cache.wordAtIndex(7), restoredTable.getVocab().wordAtIndex(7));
    assertEquals(cache.wordAtIndex(15), restoredTable.getVocab().wordAtIndex(15));

    /*
        these tests needed only to make sure INDArray equality is working properly
     */
    double[] array1 = new double[] { 0.323232325, 0.65756575, 0.12315, 0.12312315, 0.1232135, 0.12312315, 0.4343423425, 0.15 };
    double[] array2 = new double[] { 0.423232325, 0.25756575, 0.12375, 0.12311315, 0.1232035, 0.12318315, 0.4343493425, 0.25 };
    assertNotEquals(Nd4j.create(array1), Nd4j.create(array2));
    assertEquals(Nd4j.create(array1), Nd4j.create(array1));

    INDArray rSyn0_1 = restoredTable.getSyn0().slice(1);
    INDArray oSyn0_1 = originalTable.getSyn0().slice(1);
    assertEquals(oSyn0_1, rSyn0_1);

    // Verify syn0/syn1 (and syn1Neg when present) row by row, in vocabulary order.
    int cnt = 0;
    for (VocabWord word : cache.vocabWords()) {
        INDArray rSyn0 = restoredTable.getSyn0().slice(word.getIndex());
        INDArray oSyn0 = originalTable.getSyn0().slice(word.getIndex());
        assertEquals(oSyn0, rSyn0);
        assertEquals(1.0, arraysSimilarity(rSyn0, oSyn0), 0.001);

        INDArray rSyn1 = restoredTable.getSyn1().slice(word.getIndex());
        INDArray oSyn1 = originalTable.getSyn1().slice(word.getIndex());
        assertEquals(oSyn1, rSyn1);
        // we exclude word 222 since it has syn1 full of zeroes
        if (cnt != 222) {
            assertEquals(1.0, arraysSimilarity(rSyn1, oSyn1), 0.001);
        }

        if (originalTable.getSyn1Neg() != null) {
            INDArray rSyn1Neg = restoredTable.getSyn1Neg().slice(word.getIndex());
            INDArray oSyn1Neg = originalTable.getSyn1Neg().slice(word.getIndex());
            assertEquals(oSyn1Neg, rSyn1Neg);
        }

        assertEquals(word.getHistoricalGradient(), restoredTable.getVocab().wordFor(word.getWord()).getHistoricalGradient());
        cnt++;
    }

    // at this moment we can assume that whole model is transferred, and we can call fit over new model.
    // A fresh iterator over the same corpus is created for the second fit instead of resetting the first one.
    iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    vec2.setTokenizerFactory(t);
    vec2.setSentenceIterator(iter);
    vec2.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");
    INDArray day2 = vec2.getWordVectorMatrix("day");
    INDArray night1 = vec.getWordVectorMatrix("night");
    INDArray night2 = vec2.getWordVectorMatrix("night");
    double simD = arraysSimilarity(day1, day2);
    double simN = arraysSimilarity(night1, night2);
    logger.info("Vec1 day: " + day1);
    logger.info("Vec2 day: " + day2);
    logger.info("Vec1 night: " + night1);
    logger.info("Vec2 night: " + night2);
    logger.info("Day/day cross-model similarity: " + simD);
    logger.info("Night/night cross-model similarity: " + simN);
    logger.info("Vec1 day/night similiraty: " + vec.similarity("day", "night"));
    logger.info("Vec2 day/night similiraty: " + vec2.similarity("day", "night"));

    // check if cross-model values are not the same
    assertNotEquals(1.0, simD, 0.001);
    assertNotEquals(1.0, simN, 0.001);
    // check if cross-model values are still close to each other
    assertTrue(simD > 0.70);
    assertTrue(simN > 0.70);
    modelFile.delete();
}
Also used : TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) WeightLookupTable(org.deeplearning4j.models.embeddings.WeightLookupTable) File(java.io.File) Test(org.junit.Test)

Aggregations

VocabWord (org.deeplearning4j.models.word2vec.VocabWord)110 Test (org.junit.Test)54 INDArray (org.nd4j.linalg.api.ndarray.INDArray)31 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)26 ClassPathResource (org.datavec.api.util.ClassPathResource)23 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)22 File (java.io.File)20 InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable)19 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)19 ArrayList (java.util.ArrayList)17 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)17 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)15 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)14 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)13 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)13 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)12 ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException)12 Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence)11 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)11 TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline)10