Search in sources :

Example 11 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class PrefetchingSentenceIteratorTest method testLoadedIterator1.

/**
 * Smoke test: drains a {@code PrefetchingSentenceIterator} (fetch size 1000) built over
 * {@code /big/raw_sentences.txt}, logging progress every 10000 sentences.
 * It makes no assertions; it passes as long as iteration finishes without throwing.
 */
@Test
public void testLoadedIterator1() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    PrefetchingSentenceIterator fetcher =
            new PrefetchingSentenceIterator.Builder(new BasicLineIterator(corpus))
                    .setFetchSize(1000)
                    .build();

    log.info("Phase 1 starting");
    int processed = 0;
    while (fetcher.hasNext()) {
        // The sentence itself is not inspected; only the number of sentences matters here.
        fetcher.nextSentence();
        // To imitate per-sentence work on the consuming thread, temporarily enable the
        // sleep below while experimenting with this iterator; keep it disabled otherwise.
        //    Thread.sleep(0, 10);
        processed++;
        if (processed % 10000 == 0) {
            log.info("Line processed: " + processed);
        }
    }
}
Also used : File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 12 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class PrefetchingSentenceIteratorTest method testHasMoreLinesFile.

/**
 * Verifies that a {@code PrefetchingSentenceIterator} over {@code /big/raw_sentences.txt}
 * yields 97162 sentences on the first pass and the same number again after
 * {@code reset()}.
 */
@Test
public void testHasMoreLinesFile() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    PrefetchingSentenceIterator fetcher =
            new PrefetchingSentenceIterator.Builder(new BasicLineIterator(corpus))
                    .setFetchSize(1000)
                    .build();

    // First pass: consume everything and count.
    log.info("Phase 1 starting");
    int firstPass = 0;
    for (; fetcher.hasNext(); firstPass++) {
        fetcher.nextSentence();
    }
    assertEquals(97162, firstPass);

    // Second pass: after a reset the iterator must deliver the same number of sentences.
    log.info("Phase 2 starting");
    fetcher.reset();
    int secondPass = 0;
    for (; fetcher.hasNext(); secondPass++) {
        fetcher.nextSentence();
    }
    assertEquals(97162, secondPass);
}
Also used : File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 13 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class StreamLineIteratorTest method testHasNext.

/**
 * Reads the {@code /reuters/5250} resource through a {@code StreamLineIterator}
 * (fetch size 100) and checks that it yields exactly 24 non-null lines.
 */
@Test
public void testHasNext() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();
    int cnt = 0;
    // The stream is opened by this test, so it is closed here as well; try-with-resources
    // releases the file handle even when an assertion inside the loop fails.
    try (FileInputStream stream = new FileInputStream(f)) {
        StreamLineIterator iterator = new StreamLineIterator.Builder(stream).setFetchSize(100).build();
        while (iterator.hasNext()) {
            String line = iterator.nextSentence();
            assertNotEquals(null, line);
            logger.info("Line: " + line);
            cnt++;
        }
    }
    assertEquals(24, cnt);
}
Also used : File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) FileInputStream(java.io.FileInputStream) Test(org.junit.Test)

Example 14 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class GloveTest method before.

/**
 * Test fixture: points the {@code iter} field at a {@code LineSentenceIterator} over
 * {@code /raw_sentences.txt} and installs a pre-processor that lower-cases each sentence.
 */
@Before
public void before() throws Exception {
    // Pre-processor applied to every sentence the iterator hands out.
    SentencePreProcessor lowerCaser = new SentencePreProcessor() {

        @Override
        public String preProcess(String sentence) {
            return sentence.toLowerCase();
        }
    };

    File sentences = new ClassPathResource("/raw_sentences.txt").getFile();
    iter = new LineSentenceIterator(sentences);
    iter.setPreProcessor(lowerCaser);
}
Also used : LineSentenceIterator(org.deeplearning4j.text.sentenceiterator.LineSentenceIterator) SentencePreProcessor(org.deeplearning4j.text.sentenceiterator.SentencePreProcessor) File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) Before(org.junit.Before)

Example 15 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class ParagraphVectorsTest method testParagraphVectorsDM.

/**
 * Trains {@code ParagraphVectors} with the DM sequence learning algorithm on
 * {@code /big/raw_sentences.txt} and sanity-checks the result:
 * <ul>
 *   <li>the vocabulary counts for "day" and "me" are not 1 and differ from each other;</li>
 *   <li>the similarity between documents DOC_3720 and DOC_9852 is below 0.5.</li>
 * </ul>
 * Other similarities and the inference cosines are only logged, not asserted.
 */
@Test
public void testParagraphVectorsDM() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // Every document gets a generated label with the "DOC_" prefix.
    LabelsSource source = new LabelsSource("DOC_");
    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1)
            .iterations(2)
            .seed(119)
            .epochs(3)
            .layerSize(100)
            .learningRate(0.025)
            .labelsSource(source)
            .windowSize(5)
            .iterate(iter)
            .trainWordVectors(true)
            .vocabCache(cache)
            .tokenizerFactory(t)
            .negativeSample(0)
            .useHierarchicSoftmax(true)
            .sampling(0)
            .workers(1)
            .usePreciseWeightInit(true)
            .sequenceLearningAlgorithm(new DM<VocabWord>())
            .build();
    vec.fit();

    // Vocabulary sanity checks: both words occur more than once, with different counts.
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN);

    // The thresholds below are left disabled, as in the original test; the values are only logged.
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: {}", similarity1);
    //        assertTrue(similarity1 > 0.2d);
    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: {}", similarity2);
    //      assertTrue(similarity2 > 0.2d);
    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: {}", similarity3);
    //        assertTrue(similarity3 > 0.6d);
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: {}", similarityX);
    assertTrue(similarityX < 0.5d);

    // testing DM inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    // Copies are passed to cosineSim so the vectors held here are not touched by the call.
    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) DM(org.deeplearning4j.models.embeddings.learning.impl.sequence.DM) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) FileSentenceIterator(org.deeplearning4j.text.sentenceiterator.FileSentenceIterator) AggregatingSentenceIterator(org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) INDArray(org.nd4j.linalg.api.ndarray.INDArray) LabelsSource(org.deeplearning4j.text.documentiterator.LabelsSource) File(java.io.File) Test(org.junit.Test)

Aggregations

ClassPathResource (org.datavec.api.util.ClassPathResource)72 Test (org.junit.Test)63 File (java.io.File)45 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)28 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)27 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)27 INDArray (org.nd4j.linalg.api.ndarray.INDArray)24 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)23 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)23 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)20 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)12 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)11 WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors)10 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)10 ArrayList (java.util.ArrayList)9 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)8 DataSet (org.nd4j.linalg.dataset.DataSet)8 AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator)7 FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator)7 InputStream (java.io.InputStream)6