Example 21 with TokenizerFactory

Use of org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory in project deeplearning4j by deeplearning4j.

From class Word2VecTest, method testSparkW2VonBiggerCorpus.

@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest")
                    .set("spark.driver.maxResultSize", "4g")
                    .set("spark.driver.memory", "8g")
                    .set("spark.executor.memory", "8g");
    // Create the Spark context
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Path to the test corpus (the commented-out alternatives point at larger local corpora)
    //String dataPath = new ClassPathResource("/big/raw_sentences.txt").getFile().getAbsolutePath();
    //String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
    String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();
    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new LowCasePreProcessor());
    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1).tokenizerFactory(t).seed(42L)
                    .negative(3).useAdaGrad(false).layerSize(100).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1)
                    .batchSize(100).minWordFrequency(5).useUnknown(true).build();
    word2Vec.train(corpus);
    sc.stop();
    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
Also used : TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), LowCasePreProcessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.LowCasePreProcessor), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), SparkConf (org.apache.spark.SparkConf), ClassPathResource (org.datavec.api.util.ClassPathResource), Ignore (org.junit.Ignore), Test (org.junit.Test)
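
For readers new to the API, the tokenizer factory configured above can be exercised on its own, outside Spark. A minimal standalone sketch (not part of the test class) of what DefaultTokenizerFactory with a LowCasePreProcessor produces for one sentence:

import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.LowCasePreProcessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import java.util.List;

public class TokenizerSketch {
    public static void main(String[] args) {
        TokenizerFactory t = new DefaultTokenizerFactory();
        // every token is lower-cased before it reaches the vocabulary
        t.setTokenPreProcessor(new LowCasePreProcessor());
        Tokenizer tokenizer = t.create("Day and Night are NOT the same");
        List<String> tokens = tokenizer.getTokens();
        System.out.println(tokens); // expected: [day, and, night, are, not, the, same]
    }
}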

Example 22 with TokenizerFactory

Use of org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory in project deeplearning4j by deeplearning4j.

From class Word2VecTest, method testConcepts.

@Test
public void testConcepts() throws Exception {
    // These are all default values for word2vec
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest");
    // Create the Spark context
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Path to the test corpus (commented-out alternatives left over from local experiments)
    String dataPath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
    //        dataPath = "/ext/Temp/part-00000";
    //        String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();
    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1).tokenizerFactory(t).seed(42L)
                    .negative(10).useAdaGrad(false).layerSize(150).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100)
                    .minWordFrequency(5).stopWords(Arrays.asList("three")).useUnknown(true).build();
    word2Vec.train(corpus);
    //word2Vec.setModelUtils(new FlatModelUtils());
    System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK"));
    InMemoryLookupTable<VocabWord> table = (InMemoryLookupTable<VocabWord>) word2Vec.lookupTable();
    double sim = word2Vec.similarity("day", "night");
    System.out.println("day/night similarity: " + sim);
    /*
        System.out.println("Hornjo: " + word2Vec.getWordVectorMatrix("hornjoserbsce"));
        System.out.println("carro: " + word2Vec.getWordVectorMatrix("carro"));
        
        Collection<String> portu = word2Vec.wordsNearest("carro", 10);
        printWords("carro", portu, word2Vec);
        
        portu = word2Vec.wordsNearest("davi", 10);
        printWords("davi", portu, word2Vec);
        
        System.out.println("---------------------------------------");
        */
    Collection<String> words = word2Vec.wordsNearest("day", 10);
    printWords("day", words, word2Vec);
    assertTrue(words.contains("night"));
    assertTrue(words.contains("week"));
    assertTrue(words.contains("year"));
    sim = word2Vec.similarity("two", "four");
    System.out.println("two/four similarity: " + sim);
    words = word2Vec.wordsNearest("two", 10);
    printWords("two", words, word2Vec);
    // three should be absent due to stopWords
    assertFalse(words.contains("three"));
    assertTrue(words.contains("five"));
    assertTrue(words.contains("four"));
    sc.stop();
    // test serialization
    File tempFile = File.createTempFile("temp", "tmp");
    tempFile.deleteOnExit();
    int idx1 = word2Vec.vocab().wordFor("day").getIndex();
    INDArray array1 = word2Vec.getWordVectorMatrix("day").dup();
    VocabWord word1 = word2Vec.vocab().elementAtIndex(0);
    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), tempFile);
    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);
    VocabWord word2 = ((VocabCache<VocabWord>) vectors.vocab()).elementAtIndex(0);
    VocabWord wordIT = ((VocabCache<VocabWord>) vectors.vocab()).wordFor("it");
    int idx2 = vectors.vocab().wordFor("day").getIndex();
    INDArray array2 = vectors.getWordVectorMatrix("day").dup();
    System.out.println("word 'i': " + word2);
    System.out.println("word 'it': " + wordIT);
    assertEquals(idx1, idx2);
    assertEquals(word1, word2);
    assertEquals(array1, array2);
}
Also used : TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), ClassPathResource (org.datavec.api.util.ClassPathResource), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable), INDArray (org.nd4j.linalg.api.ndarray.INDArray), VocabCache (org.deeplearning4j.models.word2vec.wordstore.VocabCache), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors), SparkConf (org.apache.spark.SparkConf), File (java.io.File), Test (org.junit.Test)
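
printWords is a helper defined elsewhere in Word2VecTest and not shown in this excerpt. A plausible reconstruction, assuming it simply prints each neighbour together with its similarity to the target word (the exact body in the project may differ):

    // Hypothetical reconstruction of the printWords helper used above
    private static void printWords(String target, Collection<String> list, Word2Vec vec) {
        System.out.println("Words close to [" + target + "]:");
        for (String word : list) {
            double sim = vec.similarity(target, word);
            System.out.println("  '" + word + "': [" + sim + "]");
        }
    }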

Example 23 with TokenizerFactory

Use of org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory in project deeplearning4j by deeplearning4j.

From class WordVectorSerializerTest, method testFullModelSerialization.

@Test
public void testFullModelSerialization() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100)
                    .useAdaGrad(false).negative(5.0).cache(cache).lr(0.025f).build();
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1)
                    .layerSize(100).lookupTable(table).stopWords(new ArrayList<String>())
                    .useAdaGrad(false).negativeSample(5).vocabCache(cache).seed(42)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();
    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    //logger.info("Original word 0: " + cache.wordFor(cache.wordAtIndex(0)));
    //logger.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("day", 10);
    System.out.println(lst);
    WordVectorSerializer.writeFullModel(vec, "tempModel.txt");
    File modelFile = new File("tempModel.txt");
    modelFile.deleteOnExit();
    assertTrue(modelFile.exists());
    assertTrue(modelFile.length() > 0);
    Word2Vec vec2 = WordVectorSerializer.loadFullModel("tempModel.txt");
    assertNotEquals(null, vec2);
    assertEquals(vec.getConfiguration(), vec2.getConfiguration());
    //logger.info("Source ExpTable: " + ArrayUtils.toString(((InMemoryLookupTable) table).getExpTable()));
    //logger.info("Dest  ExpTable: " + ArrayUtils.toString(((InMemoryLookupTable)  vec2.getLookupTable()).getExpTable()));
    assertTrue(ArrayUtils.isEquals(((InMemoryLookupTable) table).getExpTable(), ((InMemoryLookupTable) vec2.getLookupTable()).getExpTable()));
    InMemoryLookupTable restoredTable = (InMemoryLookupTable) vec2.lookupTable();
    /*
        logger.info("Restored word 1: " + restoredTable.getVocab().wordFor(restoredTable.getVocab().wordAtIndex(1)));
        logger.info("Restored word 'it': " + restoredTable.getVocab().wordFor("it"));
        logger.info("Original word 1: " + cache.wordFor(cache.wordAtIndex(1)));
        logger.info("Original word 'i': " + cache.wordFor("i"));
        logger.info("Original word 0: " + cache.wordFor(cache.wordAtIndex(0)));
        logger.info("Restored word 0: " + restoredTable.getVocab().wordFor(restoredTable.getVocab().wordAtIndex(0)));
        */
    assertEquals(cache.wordAtIndex(1), restoredTable.getVocab().wordAtIndex(1));
    assertEquals(cache.wordAtIndex(7), restoredTable.getVocab().wordAtIndex(7));
    assertEquals(cache.wordAtIndex(15), restoredTable.getVocab().wordAtIndex(15));
    /*
            these tests needed only to make sure INDArray equality is working properly
         */
    double[] array1 = new double[] { 0.323232325, 0.65756575, 0.12315, 0.12312315, 0.1232135, 0.12312315, 0.4343423425, 0.15 };
    double[] array2 = new double[] { 0.423232325, 0.25756575, 0.12375, 0.12311315, 0.1232035, 0.12318315, 0.4343493425, 0.25 };
    assertNotEquals(Nd4j.create(array1), Nd4j.create(array2));
    assertEquals(Nd4j.create(array1), Nd4j.create(array1));
    INDArray rSyn0_1 = restoredTable.getSyn0().slice(1);
    INDArray oSyn0_1 = ((InMemoryLookupTable) table).getSyn0().slice(1);
    //logger.info("Restored syn0: " + rSyn0_1);
    //logger.info("Original syn0: " + oSyn0_1);
    assertEquals(oSyn0_1, rSyn0_1);
    // sanity check on syn0/syn1 ordering
    int cnt = 0;
    for (VocabWord word : cache.vocabWords()) {
        INDArray rSyn0 = restoredTable.getSyn0().slice(word.getIndex());
        INDArray oSyn0 = ((InMemoryLookupTable) table).getSyn0().slice(word.getIndex());
        assertEquals(rSyn0, oSyn0);
        assertEquals(1.0, arraysSimilarity(rSyn0, oSyn0), 0.001);
        INDArray rSyn1 = restoredTable.getSyn1().slice(word.getIndex());
        INDArray oSyn1 = ((InMemoryLookupTable) table).getSyn1().slice(word.getIndex());
        assertEquals(rSyn1, oSyn1);
        if (arraysSimilarity(rSyn1, oSyn1) < 0.98) {
        //   logger.info("Restored syn1: " + rSyn1);
        //   logger.info("Original  syn1: " + oSyn1);
        }
        // we exclude word 222 since it has syn1 full of zeroes
        if (cnt != 222)
            assertEquals(1.0, arraysSimilarity(rSyn1, oSyn1), 0.001);
        if (((InMemoryLookupTable) table).getSyn1Neg() != null) {
            INDArray rSyn1Neg = restoredTable.getSyn1Neg().slice(word.getIndex());
            INDArray oSyn1Neg = ((InMemoryLookupTable) table).getSyn1Neg().slice(word.getIndex());
            assertEquals(rSyn1Neg, oSyn1Neg);
        //                assertEquals(1.0, arraysSimilarity(rSyn1Neg, oSyn1Neg), 0.001);
        }
        assertEquals(word.getHistoricalGradient(), restoredTable.getVocab().wordFor(word.getWord()).getHistoricalGradient());
        cnt++;
    }
    // at this moment we can assume that whole model is transferred, and we can call fit over new model
    //        iter.reset();
    iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    vec2.setTokenizerFactory(t);
    vec2.setSentenceIterator(iter);
    vec2.fit();
    INDArray day1 = vec.getWordVectorMatrix("day");
    INDArray day2 = vec2.getWordVectorMatrix("day");
    INDArray night1 = vec.getWordVectorMatrix("night");
    INDArray night2 = vec2.getWordVectorMatrix("night");
    double simD = arraysSimilarity(day1, day2);
    double simN = arraysSimilarity(night1, night2);
    logger.info("Vec1 day: " + day1);
    logger.info("Vec2 day: " + day2);
    logger.info("Vec1 night: " + night1);
    logger.info("Vec2 night: " + night2);
    logger.info("Day/day cross-model similarity: " + simD);
    logger.info("Night/night cross-model similarity: " + simN);
    logger.info("Vec1 day/night similiraty: " + vec.similarity("day", "night"));
    logger.info("Vec2 day/night similiraty: " + vec2.similarity("day", "night"));
    // check if cross-model values are not the same
    assertNotEquals(1.0, simD, 0.001);
    assertNotEquals(1.0, simN, 0.001);
    // check if cross-model values are still close to each other
    assertTrue(simD > 0.70);
    assertTrue(simN > 0.70);
    modelFile.delete();
}
Also used : TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), ClassPathResource (org.datavec.api.util.ClassPathResource), SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator), UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator), InMemoryLookupCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable), INDArray (org.nd4j.linalg.api.ndarray.INDArray), Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec), WeightLookupTable (org.deeplearning4j.models.embeddings.WeightLookupTable), File (java.io.File), Test (org.junit.Test)
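
arraysSimilarity is a private helper of WordVectorSerializerTest, not shown in this excerpt. It is understood to compute the cosine similarity between two vectors; a sketch of that computation with ND4J (an assumed implementation, not copied from the project):

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;
    import org.nd4j.linalg.ops.transforms.Transforms;

    private static double arraysSimilarity(INDArray array1, INDArray array2) {
        if (array1.equals(array2))
            return 1.0;
        // normalize both vectors to unit length, then take their dot product
        INDArray vector1 = Transforms.unitVec(array1.dup());
        INDArray vector2 = Transforms.unitVec(array2.dup());
        return Nd4j.getBlasWrapper().dot(vector1, vector2);
    }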

Example 24 with TokenizerFactory

Use of org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory in project deeplearning4j by deeplearning4j.

From class WordVectorSerializerTest, method testOutputStream.

@Test
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100)
                    .useAdaGrad(false).negative(5.0).cache(cache).lr(0.025f).build();
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1)
                    .layerSize(100).lookupTable(table).stopWords(new ArrayList<String>())
                    .useAdaGrad(false).negativeSample(5).vocabCache(cache).seed(42)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();
    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    INDArray day1 = vec.getWordVectorMatrix("day");
    WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file));
    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);
    INDArray day2 = vec2.getWordVectorMatrix("day");
    assertEquals(day1, day2);
    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();
    // round-trip through the full-model format; a successful read is the implicit assertion here
    WordVectorSerializer.writeWord2VecModel(vec, tempFile);
    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
Also used : BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator), TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), ClassPathResource (org.datavec.api.util.ClassPathResource), SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator), UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator), InMemoryLookupCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), INDArray (org.nd4j.linalg.api.ndarray.INDArray), Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec), FileOutputStream (java.io.FileOutputStream), WeightLookupTable (org.deeplearning4j.models.embeddings.WeightLookupTable), WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors), File (java.io.File), Test (org.junit.Test)
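
The same stream-based round trip works outside a test. A minimal sketch using only the serializer calls shown above, assuming an already-fitted Word2Vec model vec and a method that declares throws Exception:

    // write the vectors in text format to a stream, then load them back
    File out = File.createTempFile("w2v", ".txt");
    out.deleteOnExit();
    try (FileOutputStream fos = new FileOutputStream(out)) {
        WordVectorSerializer.writeWordVectors(vec, fos);
    }
    WordVectors restored = WordVectorSerializer.loadTxtVectors(out);
    System.out.println(restored.wordsNearest("day", 5));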

Example 25 with TokenizerFactory

Use of org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory in project deeplearning4j by deeplearning4j.

From class Word2VecTests, method testWord2VecGoogleModelUptraining.

@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    // inputFile is a field of the Word2VecTests class (not shown in this excerpt)
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());
    vec.fit();
}
Also used : DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator), TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), File (java.io.File), SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator), UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator), Ignore (org.junit.Ignore), Test (org.junit.Test)
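
The second argument to readWord2VecModel above appears to toggle loading of the extended model representation; the test loads the lighter form and then re-attaches a tokenizer and sentence iterator before continuing training. For lookup-only use, a minimal sketch (the path is a placeholder):

    // load a binary Google-format model for inference only
    Word2Vec googleModel = WordVectorSerializer.readWord2VecModel(
            new File("/path/to/GoogleNews-vectors-negative300.bin.gz"), false);
    System.out.println(googleModel.wordsNearest("day", 10));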

Aggregations

TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory): 47 usages
Test (org.junit.Test): 42 usages
DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory): 40 usages
CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor): 29 usages
File (java.io.File): 28 usages
ClassPathResource (org.datavec.api.util.ClassPathResource): 28 usages
BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator): 24 usages
SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator): 22 usages
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 20 usages
VocabWord (org.deeplearning4j.models.word2vec.VocabWord): 19 usages
Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec): 12 usages
UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator): 11 usages
ArrayList (java.util.ArrayList): 10 usages
AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache): 8 usages
Ignore (org.junit.Ignore): 8 usages
AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator): 7 usages
FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator): 7 usages
InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable): 6 usages
WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors): 6 usages
AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator): 6 usages