Search in sources:

Example 1 with LabelsSource

Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.

From class ParagraphVectorsTest, method testParagraphVectorsDM.

@Test
public void testParagraphVectorsDM() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1).iterations(2).seed(119).epochs(3)
            .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5)
            .iterate(iter).trainWordVectors(true).vocabCache(cache).tokenizerFactory(t)
            .negativeSample(0).useHierarchicSoftmax(true).sampling(0).workers(1)
            .usePreciseWeightInit(true).sequenceLearningAlgorithm(new DM<VocabWord>())
            .build();
    vec.fit();
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);
    double simDN = vec.similarity("day", "night");
    log.info("day/night similariry: {}", simDN);
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.2d);
    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //      assertTrue(similarity2 > 0.2d);
    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.6d);
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);
    // testing DM inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
}
Also used: BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator), TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), DM (org.deeplearning4j.models.embeddings.learning.impl.sequence.DM), AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache), ClassPathResource (org.datavec.api.util.ClassPathResource), SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator), FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator), AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), INDArray (org.nd4j.linalg.api.ndarray.INDArray), LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource), File (java.io.File), Test (org.junit.Test)
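
For orientation: the LabelsSource built with the "DOC_" prefix above is what generates the DOC_9835-style labels queried later in the test. A minimal sketch of that behavior, assuming (as the line-number mapping in Example 2 implies) that the prefix constructor appends a 0-based counter, so the first document becomes DOC_0:

import org.deeplearning4j.text.documentiterator.LabelsSource;

LabelsSource source = new LabelsSource("DOC_");
// Each nextLabel() call returns the prefix plus an incrementing index.
String first = source.nextLabel();  // "DOC_0" (assumed 0-based)
String second = source.nextLabel(); // "DOC_1"
// getLabels() returns every label handed out so far, in order.
System.out.println(source.getLabels()); // [DOC_0, DOC_1]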

Example 2 with LabelsSource

Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.

From class ParagraphVectorsTest, method testParagraphVectorsWithWordVectorsModelling1.

@Test
public void testParagraphVectorsWithWordVectorsModelling1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    //        InMemoryLookupCache cache = new InMemoryLookupCache(false);
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1).iterations(3).epochs(1)
            .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5)
            .iterate(iter).trainWordVectors(true).vocabCache(cache)
            .tokenizerFactory(t).sampling(0).build();
    vec.fit();
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);
    /*
        We have a few lines whose words are pretty close to each other.
        These sentences should end up close to each other in vector space.
     */
    // line 3721: This is my way .
    // line 6348: This is my case .
    // line 9836: This is my house .
    // line 12493: This is my world .
    // line 16393: This is my work .
    // this special sentence has nothing in common with the sentences above
    // line 9853: We now have one .
    assertTrue(vec.hasWord("DOC_3720"));
    double similarityD = vec.similarity("day", "night");
    log.info("day/night similarity: " + similarityD);
    double similarityW = vec.similarity("way", "work");
    log.info("way/work similarity: " + similarityW);
    double similarityH = vec.similarity("house", "world");
    log.info("house/world similarity: " + similarityH);
    double similarityC = vec.similarity("case", "way");
    log.info("case/way similarity: " + similarityC);
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.7d);
    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //        assertTrue(similarity2 > 0.7d);
    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.7d);
    // similarity in this case should be significantly lower;
    // however, since the corpus is small and weight initialization is random, this test CAN occasionally fail
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);
    double sim119 = vec.similarityToLabel("This is my case .", "DOC_6347");
    double sim120 = vec.similarityToLabel("This is my case .", "DOC_3720");
    log.info("1/2: " + sim119 + "/" + sim120);
    //        assertEquals(similarity3, sim119, 0.001);
}
Also used: BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator), TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache), ClassPathResource (org.datavec.api.util.ClassPathResource), SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator), FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator), AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource), File (java.io.File), Test (org.junit.Test)
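
A model fitted as above can also be persisted and restored. Example 3 below walks through the internals of the deprecated text reader, whose Javadoc points at readParagraphVectors() as the replacement; a minimal round-trip sketch, under the assumption that the matching writeParagraphVectors()/readParagraphVectors() pair is available in this version (the file name is hypothetical):

import java.io.File;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;

// Hypothetical path, for illustration only.
File modelFile = new File("paragraph_vectors_model.zip");
// Persist the fitted model: vocab, lookup table and DOC_ labels included.
WordVectorSerializer.writeParagraphVectors(vec, modelFile);
// Restore it later and query with the same labels as before.
ParagraphVectors restored = WordVectorSerializer.readParagraphVectors(modelFile);
log.info("restored 3720/16392 similarity: {}", restored.similarity("DOC_3720", "DOC_16392"));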

Example 3 with LabelsSource

Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.

From class WordVectorSerializer, method readParagraphVectorsFromText.

/**
     * Restores a previously serialized ParagraphVectors model.
     *
     * Deprecation note: please consider using the readParagraphVectors() method instead.
     *
     * @param stream InputStream that contains the previously serialized model
     * @return the restored ParagraphVectors model
     */
@Deprecated
public static ParagraphVectors readParagraphVectorsFromText(@NonNull InputStream stream) {
    try {
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        ArrayList<String> labels = new ArrayList<>();
        ArrayList<INDArray> arrays = new ArrayList<>();
        VocabCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
        String line = "";
        while ((line = reader.readLine()) != null) {
            String[] split = line.split(" ");
            split[1] = split[1].replaceAll(whitespaceReplacement, " ");
            VocabWord word = new VocabWord(1.0, split[1]);
            if (split[0].equals("L")) {
                // we have label element here
                word.setSpecial(true);
                word.markAsLabel(true);
                labels.add(word.getLabel());
            } else if (split[0].equals("E")) {
                // we have usual element, aka word here
                word.setSpecial(false);
                word.markAsLabel(false);
            } else
                throw new IllegalStateException("Source stream doesn't look like a serialized ParagraphVectors model");
            // this particular line is just for backward compatibility with InMemoryLookupCache
            word.setIndex(vocabCache.numWords());
            vocabCache.addToken(word);
            vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
            // backward compatibility code
            vocabCache.putVocabWord(word.getLabel());
            float[] vector = new float[split.length - 2];
            for (int i = 2; i < split.length; i++) {
                vector[i - 2] = Float.parseFloat(split[i]);
            }
            INDArray row = Nd4j.create(vector);
            arrays.add(row);
        }
        // now we create the syn0 matrix by stacking the previously fetched rows
        INDArray syn = Nd4j.vstack(arrays);
        InMemoryLookupTable<VocabWord> lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                .vectorLength(arrays.get(0).columns())
                .useAdaGrad(false).cache(vocabCache).build();
        Nd4j.clearNans(syn);
        lookupTable.setSyn0(syn);
        LabelsSource source = new LabelsSource(labels);
        ParagraphVectors vectors = new ParagraphVectors.Builder()
                .labelsSource(source).vocabCache(vocabCache)
                .lookupTable(lookupTable)
                .modelUtils(new BasicModelUtils<VocabWord>()).build();
        try {
            reader.close();
        } catch (Exception e) {
            // intentionally ignore failures while closing the reader
        }
        vectors.extractLabels();
        return vectors;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException), ArrayList (java.util.ArrayList), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache), ParagraphVectors (org.deeplearning4j.models.paragraphvectors.ParagraphVectors), DL4JInvalidInputException (org.deeplearning4j.exception.DL4JInvalidInputException), InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable), INDArray (org.nd4j.linalg.api.ndarray.INDArray), BasicModelUtils (org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils), LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource)
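
For reference, the reader above implies a simple line-oriented format: each row carries a type flag ("L" for a label entry, "E" for an ordinary word), then the token itself (whitespace escaped via whitespaceReplacement), then the vector components. Two illustrative rows (the tokens match this page's examples; the numbers are invented):

L DOC_0 0.0173 -0.2331 0.4158
E day 0.1024 0.0071 -0.3112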

Example 4 with LabelsSource

Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.

From class ParagraphVectorsTest, method testParagraphVectorsDBOW.

@Test
public void testParagraphVectorsDBOW() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1).iterations(5).seed(119).epochs(1)
            .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5)
            .iterate(iter).trainWordVectors(true).vocabCache(cache).tokenizerFactory(t)
            .negativeSample(0).allowParallelTokenization(true).useHierarchicSoftmax(true)
            .sampling(0).workers(2).usePreciseWeightInit(true)
            .sequenceLearningAlgorithm(new DBOW<VocabWord>()).build();
    vec.fit();
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);
    double simDN = vec.similarity("day", "night");
    log.info("day/night similariry: {}", simDN);
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.2d);
    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //      assertTrue(similarity2 > 0.2d);
    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.6d);
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);
    // testing DBOW inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");
    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));
    assertNotEquals(inferredA1, inferredC1);
    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());
    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);
}
Also used: BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator), TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache), ClassPathResource (org.datavec.api.util.ClassPathResource), SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator), FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator), AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), INDArray (org.nd4j.linalg.api.ndarray.INDArray), LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource), DBOW (org.deeplearning4j.models.embeddings.learning.impl.sequence.DBOW), File (java.io.File), Test (org.junit.Test)
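
The inference block above only logs pairwise cosines; a natural extension is to scan every trained document vector for the label closest to an inferred probe. A minimal sketch reusing only calls already present in these tests (inferVector, getWordVectorMatrix, Transforms.cosineSim, LabelsSource.getLabels()); on a corpus this small the winner is not guaranteed to be the intuitive DOC_16392:

// Find the trained document label closest to the inferred probe vector.
INDArray probe = vec.inferVector("This is my work .");
String bestLabel = null;
double bestSim = -1.0;
for (String label : source.getLabels()) {
    // dup() before cosineSim, following the convention in the tests above
    double sim = Transforms.cosineSim(probe.dup(), vec.getWordVectorMatrix(label).dup());
    if (sim > bestSim) {
        bestSim = sim;
        bestLabel = label;
    }
}
log.info("closest label to probe: {} (cos = {})", bestLabel, bestSim);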

Example 5 with LabelsSource

Use of org.deeplearning4j.text.documentiterator.LabelsSource in the deeplearning4j project.

From class ParagraphVectorsTest, method testParagraphVectorsVocabBuilding1.

/*
    @Test
    public void testWord2VecRunThroughVectors() throws Exception {
        ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
        File file = resource.getFile().getParentFile();
        LabelAwareSentenceIterator iter = LabelAwareUimaSentenceIterator.createWithPath(file.getAbsolutePath());

        TokenizerFactory t = new UimaTokenizerFactory();

        ParagraphVectors vec = new ParagraphVectors.Builder()
                .minWordFrequency(1).iterations(5).labels(Arrays.asList("label1", "deeple"))
                .layerSize(100)
                .stopWords(new ArrayList<String>())
                .windowSize(5).iterate(iter).tokenizerFactory(t).build();

        assertEquals(new ArrayList<String>(), vec.getStopWords());

        vec.fit();
        double sim = vec.similarity("day", "night");
        log.info("day/night similarity: " + sim);
        new File("cache.ser").delete();
    }
    */
/**
     * This test checks how the vocabulary is built from the provided SentenceIterator, without labels.
     *
     * @throws Exception
     */
@Test
public void testParagraphVectorsVocabBuilding1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    int numberOfLines = 0;
    while (iter.hasNext()) {
        iter.nextSentence();
        numberOfLines++;
    }
    iter.reset();
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1).iterations(5).layerSize(100).windowSize(5)
            .iterate(iter).vocabCache(cache).tokenizerFactory(t).build();
    vec.buildVocab();
    LabelsSource source = vec.getLabelsSource();
    log.info("Number of lines in corpus: " + numberOfLines);
    assertEquals(numberOfLines, source.getLabels().size());
    assertEquals(97162, source.getLabels().size());
    assertNotEquals(null, cache);
    assertEquals(97406, cache.numWords());
    // the proper number of words for minWordFrequency = 1 is 244
    assertEquals(244, cache.numWords() - source.getLabels().size());
}
Also used: DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory), CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor), BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator), TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory), LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource), File (java.io.File), ClassPathResource (org.datavec.api.util.ClassPathResource), SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator), FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator), AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator), InMemoryLookupCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache), Test (org.junit.Test)
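
The closing asserts encode simple arithmetic: the cache holds one entry per auto-generated sentence label plus one per unique word, so 97406 total entries minus 97162 labels leaves the 244 real words at minWordFrequency = 1. The same breakdown written out as a sketch, using only objects already in scope in the test above:

// Entries in the vocab cache are either auto-generated labels or real words.
int totalEntries = cache.numWords();           // 97406
int labelEntries = source.getLabels().size();  // 97162, one per corpus line
int wordEntries = totalEntries - labelEntries; // 244 at minWordFrequency = 1
log.info("labels: {}, words: {}", labelEntries, wordEntries);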

Aggregations

LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource): 6
File (java.io.File): 5
ClassPathResource (org.datavec.api.util.ClassPathResource): 5
VocabWord (org.deeplearning4j.models.word2vec.VocabWord): 5
AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator): 5
BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator): 5
FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator): 5
SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator): 5
CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor): 5
DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory): 5
TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory): 5
Test (org.junit.Test): 5
AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache): 4
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 4
ArrayList (java.util.ArrayList): 2
InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable): 2
DL4JInvalidInputException (org.deeplearning4j.exception.DL4JInvalidInputException): 1
DBOW (org.deeplearning4j.models.embeddings.learning.impl.sequence.DBOW): 1
DM (org.deeplearning4j.models.embeddings.learning.impl.sequence.DM): 1
BasicModelUtils (org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils): 1