Search in sources :

Example 26 with BasicLineIterator

use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

From the class ParagraphVectorsTest, method testParagraphVectorsOverExistingWordVectorsModel.

/*
        In this test we build a w2v model, and then use its vocab and weights for ParagraphVectors.
        There is no need to run this test on Travis; use it manually, and only for problem detection.
    */
@Test
public void testParagraphVectorsOverExistingWordVectorsModel() throws Exception {
    // we build w2v from multiple sources, to cover everything
    ClassPathResource resource_sentences = new ClassPathResource("/big/raw_sentences.txt");
    ClassPathResource resource_mixed = new ClassPathResource("/paravec");
    SentenceIterator iter = new AggregatingSentenceIterator.Builder().addSentenceIterator(new BasicLineIterator(resource_sentences.getFile())).addSentenceIterator(new FileSentenceIterator(resource_mixed.getFile())).build();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // SkipGram + hierarchic softmax; note layerSize(150) here matches the
    // ParagraphVectors builder below, so the learned weights can be reused directly.
    Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(3).learningRate(0.025).layerSize(150).minLearningRate(0.001).elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true).windowSize(5).iterate(iter).tokenizerFactory(t).build();
    wordVectors.fit();
    // Snapshot the vocab entry and a *copy* (dup) of the vector for "day",
    // so we can verify later that ParagraphVectors kept them aligned.
    VocabWord day_A = wordVectors.getVocab().tokenFor("day");
    INDArray vector_day1 = wordVectors.getWordVectorMatrix("day").dup();
    // At this moment we have ready w2v model. It's time to use it for ParagraphVectors
    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder().addSourceFolder(new ClassPathResource("/paravec/labeled").getFile()).build();
    // documents from this iterator will be used for classification
    FileLabelAwareIterator unlabeledIterator = new FileLabelAwareIterator.Builder().addSourceFolder(new ClassPathResource("/paravec/unlabeled").getFile()).build();
    // we're building classifier now, with pre-built w2v model passed in;
    // trainWordVectors(false) + useExistingWordVectors(wordVectors) means the PV model
    // starts from (and should stay close to) the w2v word representations.
    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder().iterate(labelAwareIterator).learningRate(0.025).minLearningRate(0.001).iterations(5).epochs(1).layerSize(150).tokenizerFactory(t).sequenceLearningAlgorithm(new DBOW<VocabWord>()).useHierarchicSoftmax(true).trainWordVectors(false).useExistingWordVectors(wordVectors).build();
    paragraphVectors.fit();
    // The PV vocab is built on top of the w2v vocab, so "day" must keep its index.
    VocabWord day_B = paragraphVectors.getVocab().tokenFor("day");
    assertEquals(day_A.getIndex(), day_B.getIndex());
    /*
        double similarityD = wordVectors.similarity("day", "night");
        log.info("day/night similarity: " + similarityD);
        assertTrue(similarityD > 0.5d);
        */
    // Cross-model check: the word vector for "day" after PV training should still be
    // nearly identical to the original w2v vector (arraysSimilarity is a helper
    // defined elsewhere in this test class).
    INDArray vector_day2 = paragraphVectors.getWordVectorMatrix("day").dup();
    double crossDay = arraysSimilarity(vector_day1, vector_day2);
    log.info("Day1: " + vector_day1);
    log.info("Day2: " + vector_day2);
    log.info("Cross-Day similarity: " + crossDay);
    log.info("Cross-Day similiarity 2: " + Transforms.cosineSim(vector_day1, vector_day2));
    assertTrue(crossDay > 0.9d);
    /*
     * Here we're checking cross-vocabulary equality (disabled spot-check below).
     */
    /*
        Random rnd = new Random();
        VocabCache<VocabWord> cacheP = paragraphVectors.getVocab();
        VocabCache<VocabWord> cacheW = wordVectors.getVocab();
        for (int x = 0; x < 1000; x++) {
            int idx = rnd.nextInt(cacheW.numWords());
        
            String wordW = cacheW.wordAtIndex(idx);
            String wordP = cacheP.wordAtIndex(idx);
        
            assertEquals(wordW, wordP);
        
            INDArray arrayW = wordVectors.getWordVectorMatrix(wordW);
            INDArray arrayP = paragraphVectors.getWordVectorMatrix(wordP);
        
            double simWP = Transforms.cosineSim(arrayW, arrayP);
            assertTrue(simWP >= 0.9);
        }
        */
    // The Z* tokens are the document labels; their vectors exist only in the PV model.
    log.info("Zfinance: " + paragraphVectors.getWordVectorMatrix("Zfinance"));
    log.info("Zhealth: " + paragraphVectors.getWordVectorMatrix("Zhealth"));
    log.info("Zscience: " + paragraphVectors.getWordVectorMatrix("Zscience"));
    // Classify the first unlabeled document and log per-label similarities.
    LabelledDocument document = unlabeledIterator.nextDocument();
    log.info("Results for document '" + document.getLabel() + "'");
    List<String> results = new ArrayList<>(paragraphVectors.predictSeveral(document, 3));
    for (String result : results) {
        double sim = paragraphVectors.similarityToLabel(document, result);
        log.info("Similarity to [" + result + "] is [" + sim + "]");
    }
    // The first unlabeled document is expected to be finance-related — TODO confirm
    // against the /paravec/unlabeled fixture ordering.
    String topPrediction = paragraphVectors.predict(document);
    assertEquals("Zfinance", topPrediction);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) SkipGram(org.deeplearning4j.models.embeddings.learning.impl.elements.SkipGram) FileLabelAwareIterator(org.deeplearning4j.text.documentiterator.FileLabelAwareIterator) ArrayList(java.util.ArrayList) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) FileSentenceIterator(org.deeplearning4j.text.sentenceiterator.FileSentenceIterator) AggregatingSentenceIterator(org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator) LabelledDocument(org.deeplearning4j.text.documentiterator.LabelledDocument) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) AggregatingSentenceIterator(org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) FileSentenceIterator(org.deeplearning4j.text.sentenceiterator.FileSentenceIterator) Test(org.junit.Test)

Example 27 with BasicLineIterator

use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

the class InMemoryLookupTableTest method testConsumeOnNonEqualVocabs.

@Test
public void testConsumeOnNonEqualVocabs() throws Exception {
    // Tokenization setup shared by both vocabulary builds.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // Source vocabulary: built from the big raw-sentences corpus.
    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();
    ClassPathResource corpus = new ClassPathResource("big/raw_sentences.txt");
    BasicLineIterator lineIterator = new BasicLineIterator(corpus.getFile());
    SentenceTransformer sourceTransformer = new SentenceTransformer.Builder()
                    .iterator(lineIterator)
                    .tokenizerFactory(tokenizerFactory)
                    .build();
    AbstractSequenceIterator<VocabWord> sourceSequences =
                    new AbstractSequenceIterator.Builder<>(sourceTransformer).build();
    VocabConstructor<VocabWord> sourceConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sourceSequences, 1)
                    .setTargetVocabCache(cacheSource)
                    .build();
    sourceConstructor.buildJointVocabulary(false, true);
    assertEquals(244, cacheSource.numWords());

    InMemoryLookupTable<VocabWord> sourceTable =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                                    .vectorLength(100)
                                    .cache(cacheSource)
                                    .build();
    sourceTable.resetWeights(true);

    // Target vocabulary: merged from the source vocab plus the labeled documents.
    AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();
    FileLabelAwareIterator labeledDocs = new FileLabelAwareIterator.Builder()
                    .addSourceFolder(new ClassPathResource("/paravec/labeled").getFile())
                    .build();
    SentenceTransformer targetTransformer = new SentenceTransformer.Builder()
                    .iterator(labeledDocs)
                    .tokenizerFactory(tokenizerFactory)
                    .build();
    AbstractSequenceIterator<VocabWord> targetSequences =
                    new AbstractSequenceIterator.Builder<>(targetTransformer).build();
    VocabConstructor<VocabWord> mergingConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(targetSequences, 1)
                    .setTargetVocabCache(cacheTarget)
                    .build();
    mergingConstructor.buildMergedVocabulary(cacheSource, true);
    // those +3 go for 3 additional entries in target VocabCache: labels
    assertEquals(cacheSource.numWords() + 3, cacheTarget.numWords());

    InMemoryLookupTable<VocabWord> targetTable =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                                    .vectorLength(100)
                                    .cache(cacheTarget)
                                    .seed(18)
                                    .build();
    targetTable.resetWeights(true);

    // Before consume() the independently-initialized tables must differ for "day";
    // after consume() the source weights should have been copied over.
    assertNotEquals(sourceTable.vector("day"), targetTable.vector("day"));
    targetTable.consume(sourceTable);
    assertEquals(sourceTable.vector("day"), targetTable.vector("day"));
    // Target table is strictly larger: exactly the 3 label rows on top of the source rows.
    assertTrue(sourceTable.syn0.rows() < targetTable.syn0.rows());
    assertEquals(sourceTable.syn0.rows() + 3, targetTable.syn0.rows());
}
Also used : TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabConstructor(org.deeplearning4j.models.word2vec.wordstore.VocabConstructor) FileLabelAwareIterator(org.deeplearning4j.text.documentiterator.FileLabelAwareIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) Test(org.junit.Test)

Example 28 with BasicLineIterator

use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

the class InMemoryLookupTableTest method testConsumeOnEqualVocabs.

@Test
public void testConsumeOnEqualVocabs() throws Exception {
    // Build one shared vocabulary from the big raw-sentences corpus.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();
    ClassPathResource corpus = new ClassPathResource("big/raw_sentences.txt");
    BasicLineIterator lineIterator = new BasicLineIterator(corpus.getFile());
    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
                    .iterator(lineIterator)
                    .tokenizerFactory(tokenizerFactory)
                    .build();
    AbstractSequenceIterator<VocabWord> sequences =
                    new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();
    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequences, 1)
                    .setTargetVocabCache(cacheSource)
                    .build();
    constructor.buildJointVocabulary(false, true);
    assertEquals(244, cacheSource.numWords());

    // Two lookup tables over the SAME vocab, initialized with different seeds.
    InMemoryLookupTable<VocabWord> firstTable =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                                    .vectorLength(100)
                                    .cache(cacheSource)
                                    .seed(17)
                                    .build();
    firstTable.resetWeights(true);
    InMemoryLookupTable<VocabWord> secondTable =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                                    .vectorLength(100)
                                    .cache(cacheSource)
                                    .seed(15)
                                    .build();
    secondTable.resetWeights(true);

    // Different seeds => different initial vectors; consume() must make them equal.
    assertNotEquals(firstTable.vector("day"), secondTable.vector("day"));
    secondTable.consume(firstTable);
    assertEquals(firstTable.vector("day"), secondTable.vector("day"));
}
Also used : TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabConstructor(org.deeplearning4j.models.word2vec.wordstore.VocabConstructor) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) Test(org.junit.Test)

Example 29 with BasicLineIterator

use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

the class AbstractCoOccurrencesTest method testFit1.

@Test
public void testFit1() throws Exception {
    // Build a tiny vocabulary from a one-line corpus.
    ClassPathResource resource = new ClassPathResource("other/oneline.txt");
    File corpusFile = resource.getFile();
    AbstractCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
    BasicLineIterator lineIterator = new BasicLineIterator(corpusFile);
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
                    .iterator(lineIterator)
                    .tokenizerFactory(tokenizerFactory)
                    .build();
    AbstractSequenceIterator<VocabWord> sequences =
                    new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();
    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequences, 1)
                    .setTargetVocabCache(vocabCache)
                    .build();
    constructor.buildJointVocabulary(false, true);

    // Fit asymmetric co-occurrence counts with a wide window over the same sequences.
    AbstractCoOccurrences<VocabWord> coOccurrences = new AbstractCoOccurrences.Builder<VocabWord>()
                    .iterate(sequences)
                    .vocabCache(vocabCache)
                    .symmetric(false)
                    .windowSize(15)
                    .build();
    coOccurrences.fit();

    Iterator<Pair<Pair<VocabWord, VocabWord>, Double>> pairIterator = coOccurrences.iterator();
    assertNotEquals(null, pairIterator);
    // Drain the iterator, collecting the word pairs and counting the entries.
    int pairCount = 0;
    List<Pair<VocabWord, VocabWord>> observedPairs = new ArrayList<>();
    while (pairIterator.hasNext()) {
        Pair<Pair<VocabWord, VocabWord>, Double> entry = pairIterator.next();
        observedPairs.add(entry.getFirst());
        pairCount++;
    }
    log.info("CoOccurrences: " + observedPairs);
    // The one-line fixture is expected to yield exactly 16 co-occurrence pairs.
    assertEquals(16, observedPairs.size());
    assertEquals(16, pairCount);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) ArrayList(java.util.ArrayList) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) Pair(org.deeplearning4j.berkeley.Pair) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) VocabConstructor(org.deeplearning4j.models.word2vec.wordstore.VocabConstructor) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) ClassPathResource(org.datavec.api.util.ClassPathResource) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) File(java.io.File) Test(org.junit.Test)

Example 30 with BasicLineIterator

use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

the class ParallelTransformerIteratorTest method hasNext.

@Test
public void hasNext() throws Exception {
    // Stream the big corpus through a multithreaded sentence transformer and
    // verify every produced sequence is non-null and non-empty.
    SentenceIterator lines = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
    SentenceTransformer transformer = new SentenceTransformer.Builder()
                    .iterator(lines)
                    .allowMultithreading(true)
                    .tokenizerFactory(factory)
                    .build();
    Iterator<Sequence<VocabWord>> sequences = transformer.iterator();
    int seen = 0;
    while (sequences.hasNext()) {
        Sequence<VocabWord> sequence = sequences.next();
        assertNotEquals("Failed on [" + seen + "] iteration", null, sequence);
        assertNotEquals("Failed on [" + seen + "] iteration", 0, sequence.size());
        seen++;
    }
    // The corpus is known to contain exactly this many sentences.
    assertEquals(97162, seen);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) Sequence(org.deeplearning4j.models.sequencevectors.sequence.Sequence) PrefetchingSentenceIterator(org.deeplearning4j.text.sentenceiterator.PrefetchingSentenceIterator) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) MutipleEpochsSentenceIterator(org.deeplearning4j.text.sentenceiterator.MutipleEpochsSentenceIterator) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Aggregations

BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)36 Test (org.junit.Test)34 ClassPathResource (org.datavec.api.util.ClassPathResource)27 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)27 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)24 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)24 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)23 File (java.io.File)22 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)19 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)12 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)12 INDArray (org.nd4j.linalg.api.ndarray.INDArray)11 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)10 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)7 AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator)7 FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator)7 UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator)7 ArrayList (java.util.ArrayList)6 BasicModelUtils (org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils)5 LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource)5