Search in sources :

Example 11 with InMemoryLookupCache

use of org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache in project deeplearning4j by deeplearning4j.

the class ParagraphVectorsTest method testParagraphVectorsVocabBuilding1.

/*
    @Test
    public void testWord2VecRunThroughVectors() throws Exception {
        ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
        File file = resource.getFile().getParentFile();
        LabelAwareSentenceIterator iter = LabelAwareUimaSentenceIterator.createWithPath(file.getAbsolutePath());
    
    
        TokenizerFactory t = new UimaTokenizerFactory();
    
    
        ParagraphVectors vec = new ParagraphVectors.Builder()
                .minWordFrequency(1).iterations(5).labels(Arrays.asList("label1", "deeple"))
                .layerSize(100)
                .stopWords(new ArrayList<String>())
                .windowSize(5).iterate(iter).tokenizerFactory(t).build();
    
        assertEquals(new ArrayList<String>(), vec.getStopWords());
    
    
        vec.fit();
        double sim = vec.similarity("day","night");
        log.info("day/night similarity: " + sim);
        new File("cache.ser").delete();
    
    }
    */
/**
     * This test checks, how vocab is built using SentenceIterator provided, without labels.
     *
     * @throws Exception
     */
@Test
public void testParagraphVectorsVocabBuilding1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    //.getParentFile();
    File file = resource.getFile();
    //UimaSentenceIterator.createWithPath(file.getAbsolutePath());
    SentenceIterator iter = new BasicLineIterator(file);
    int numberOfLines = 0;
    while (iter.hasNext()) {
        iter.nextSentence();
        numberOfLines++;
    }
    iter.reset();
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).layerSize(100).windowSize(5).iterate(iter).vocabCache(cache).tokenizerFactory(t).build();
    vec.buildVocab();
    LabelsSource source = vec.getLabelsSource();
    //VocabCache cache = vec.getVocab();
    log.info("Number of lines in corpus: " + numberOfLines);
    assertEquals(numberOfLines, source.getLabels().size());
    assertEquals(97162, source.getLabels().size());
    assertNotEquals(null, cache);
    assertEquals(97406, cache.numWords());
    // proper number of words for minWordsFrequency = 1 is 244
    assertEquals(244, cache.numWords() - source.getLabels().size());
}
Also used : DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) LabelsSource(org.deeplearning4j.text.documentiterator.LabelsSource) File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) FileSentenceIterator(org.deeplearning4j.text.sentenceiterator.FileSentenceIterator) AggregatingSentenceIterator(org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) Test(org.junit.Test)

Aggregations

InMemoryLookupCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache)11 Test (org.junit.Test)7 INDArray (org.nd4j.linalg.api.ndarray.INDArray)6 File (java.io.File)4 InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable)4 ClassPathResource (org.datavec.api.util.ClassPathResource)3 WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors)3 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)3 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)3 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)3 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)3 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)3 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)3 WeightLookupTable (org.deeplearning4j.models.embeddings.WeightLookupTable)2 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)2 UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator)2 Ignore (org.junit.Ignore)2 FileOutputStream (java.io.FileOutputStream)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 StaticWord2Vec (org.deeplearning4j.models.word2vec.StaticWord2Vec)1