Search in sources :

Example 66 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class BasicLineIteratorTest method testHasMoreLinesStream.

/**
 * Verifies that a stream-backed BasicLineIterator returns 97162 sentences on the first pass
 * and the same number again after reset().
 */
@Test
public void testHasMoreLinesStream() throws Exception {
    File file = new ClassPathResource("/big/raw_sentences.txt").getFile();
    // The test opens this stream itself, so it also closes it, even when an assertion fails.
    try (FileInputStream stream = new FileInputStream(file)) {
        BasicLineIterator iterator = new BasicLineIterator(stream);

        // First pass over the corpus.
        int cnt = 0;
        while (iterator.hasNext()) {
            iterator.nextSentence();
            cnt++;
        }
        assertEquals(97162, cnt);

        // Second pass: after reset() the iterator must yield the same number of sentences.
        iterator.reset();
        cnt = 0;
        while (iterator.hasNext()) {
            iterator.nextSentence();
            cnt++;
        }
        assertEquals(97162, cnt);
    }
}
Also used : File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) FileInputStream(java.io.FileInputStream) Test(org.junit.Test)

Example 67 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class BasicLineIteratorTest method testHasMoreLinesFile.

/**
 * Walks a file-backed BasicLineIterator twice, calling reset() between the two passes,
 * and expects 97162 sentences on each pass.
 */
@Test
public void testHasMoreLinesFile() throws Exception {
    final File sentencesFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    final BasicLineIterator lines = new BasicLineIterator(sentencesFile);

    for (int pass = 0; pass < 2; pass++) {
        if (pass > 0) {
            // Rewind only before the second pass; the first pass starts on a fresh iterator.
            lines.reset();
        }
        int seen = 0;
        for (; lines.hasNext(); seen++) {
            lines.nextSentence();
        }
        assertEquals(97162, seen);
    }
}
Also used : File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 68 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class MutipleEpochsSentenceIteratorTest method hasNext.

/**
 * Drains a MutipleEpochsSentenceIterator built over the raw-sentences corpus with a second
 * constructor argument of 100, and checks the total number of sentences returned.
 */
@Test
public void hasNext() throws Exception {
    ClassPathResource corpus = new ClassPathResource("/big/raw_sentences.txt");
    BasicLineIterator singlePass = new BasicLineIterator(corpus.getFile());
    // NOTE(review): the 100 is presumably the number of epochs to replay - confirm against the class.
    SentenceIterator iterator = new MutipleEpochsSentenceIterator(singlePass, 100);

    int total = 0;
    for (; iterator.hasNext(); total++) {
        iterator.nextSentence();
    }

    // 9716200 == 97162 * 100
    assertEquals(9716200, total);
}
Also used : ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 69 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class PrefetchingSentenceIteratorTest method testPerformance1.

/**
 * Compares the time needed to drain a plain BasicLineIterator with the time needed to drain
 * a PrefetchingSentenceIterator over the same file, and fails if the prefetching variant is
 * 150 ms or more slower.
 *
 * Elapsed time is taken from System.nanoTime(), which is intended for measuring intervals;
 * System.currentTimeMillis() follows the wall clock and can jump while the test is running.
 */
@Test
public void testPerformance1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    BasicLineIterator iterator = new BasicLineIterator(file);
    PrefetchingSentenceIterator fetcher = new PrefetchingSentenceIterator.Builder(new BasicLineIterator(file)).setFetchSize(500000).build();

    // Time the plain iterator.
    long basicStart = System.nanoTime();
    while (iterator.hasNext()) {
        iterator.nextSentence();
    }
    long basicMillis = (System.nanoTime() - basicStart) / 1000000L;

    // Time the prefetching iterator.
    long prefetchStart = System.nanoTime();
    while (fetcher.hasNext()) {
        fetcher.nextSentence();
    }
    long prefetchMillis = (System.nanoTime() - prefetchStart) / 1000000L;

    log.info("Basic iterator: " + basicMillis);
    log.info("Prefetched iterator: " + prefetchMillis);
    long difference = prefetchMillis - basicMillis;
    log.info("Difference: " + difference);
    // On a small corpus the time difference can fluctuate a lot,
    // but it can still be used as a rough measure of effectiveness.
    assertTrue(difference < 150);
}
Also used : File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 70 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class DefaulTokenizerTests method testDefaultTokenizer2.

/**
 * Checks that DefaultTokenizerFactory produces the same tokens from a String and from an
 * InputStream: token by token for a short sentence, and by token count (difference below 2)
 * for the "reuters/5250" classpath resource.
 */
@Test
public void testDefaultTokenizer2() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    // NOTE(review): the result is discarded; presumably this call forces the stream-backed
    // tokenizer to read its input before the comparison below - confirm.
    tokenizer2.countTokens();
    while (tokenizer.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        assertEquals(tok1, tok2);
    }
    System.out.println("-----------------------------------------------");
    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2;
    // Close the resource stream once its tokens have been counted; it was previously left open.
    // The type is fully qualified so that no additional import is required.
    try (java.io.InputStream stream = resource.getInputStream()) {
        stringCount2 = t.create(stream).countTokens();
    }
    int delta = Math.abs(stringCount - stringCount2);
    log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: " + delta);
    assertTrue(delta < 2);
}
Also used : DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) ByteArrayInputStream(java.io.ByteArrayInputStream) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Aggregations

- ClassPathResource (org.datavec.api.util.ClassPathResource): 71
- Test (org.junit.Test): 63
- File (java.io.File): 45
- TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory): 28
- BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator): 27
- DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory): 27
- VocabWord (org.deeplearning4j.models.word2vec.VocabWord): 23
- SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator): 23
- INDArray (org.nd4j.linalg.api.ndarray.INDArray): 23
- CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor): 20
- SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer): 12
- AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache): 11
- WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors): 10
- AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator): 10
- ArrayList (java.util.ArrayList): 9
- Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec): 8
- AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator): 7
- FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator): 7
- DataSet (org.nd4j.linalg.dataset.DataSet): 7
- InputStream (java.io.InputStream): 6