Search in sources :

Example 6 with BasicLineIterator

Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

From the class VocabConstructorTest, method testVocab.

/**
 * Reads the {@code big/raw_sentences.txt} classpath resource line by line, tokenizes each
 * line, and logs three figures: the number of non-blank tokens, the number of lines read,
 * and the set of distinct tokens.
 *
 * <p>The tokenizer factory {@code t} and the logger {@code log} are declared outside this
 * method. The test makes no assertions; it only logs.
 *
 * @throws Exception if the resource cannot be resolved to a file or read
 */
@Test
public void testVocab() throws Exception {
    File corpus = new ClassPathResource("big/raw_sentences.txt").getFile();
    SentenceIterator sentences = new BasicLineIterator(corpus);
    Set<String> distinctTokens = new HashSet<>();
    int tokenCount = 0;
    int lineCount = 0;
    for (; sentences.hasNext(); lineCount++) {
        Tokenizer tokenizer = t.create(sentences.nextSentence());
        for (String token : tokenizer.getTokens()) {
            // Null and whitespace-only tokens are neither counted nor stored.
            boolean blank = token == null || token.trim().isEmpty();
            if (!blank) {
                tokenCount++;
                // Set.add leaves the set unchanged when the token is already present.
                distinctTokens.add(token);
            }
        }
    }
    log.info("Total number of tokens: [" + tokenCount + "], lines: [" + lineCount + "], set size: [" + distinctTokens.size() + "]");
    log.info("Set:\n" + distinctTokens);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) File(java.io.File) Tokenizer(org.deeplearning4j.text.tokenization.tokenizer.Tokenizer) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) Test(org.junit.Test)

Example 7 with BasicLineIterator

Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

From the class SequenceVectorsTest, method testInternalVocabConstruction.

/**
 * Builds a {@link SequenceVectors} model over the {@code big/raw_sentences.txt} resource,
 * fits it, and asserts that the similarity between "day" and "night" exceeds 0.6.
 *
 * <p>Lines are read with a {@link BasicLineIterator}, tokenized through a
 * {@link DefaultTokenizerFactory} with a {@link CommonPreprocessor}, and handed to the model
 * via a {@link SentenceTransformer} wrapped in an {@link AbstractSequenceIterator}. The ten
 * nearest labels to "day" are logged after the assertion.
 *
 * @throws Exception if the resource cannot be loaded
 */
@Test
public void testInternalVocabConstruction() throws Exception {
    ClassPathResource corpusResource = new ClassPathResource("big/raw_sentences.txt");
    File corpusFile = corpusResource.getFile();
    BasicLineIterator lineIterator = new BasicLineIterator(corpusFile);

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
            .iterator(lineIterator)
            .tokenizerFactory(tokenizerFactory)
            .build();

    AbstractSequenceIterator<VocabWord> sequences =
            new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    SequenceVectors<VocabWord> model = new SequenceVectors.Builder<VocabWord>(new VectorsConfiguration())
            .minWordFrequency(5)
            .iterate(sequences)
            .batchSize(250)
            .iterations(1)
            .epochs(1)
            .resetModel(false)
            .trainElementsRepresentation(true)
            .build();

    logger.info("Fitting model...");
    model.fit();
    logger.info("Model ready...");

    double dayNightSimilarity = model.similarity("day", "night");
    logger.info("Day/night similarity: " + dayNightSimilarity);
    assertTrue(dayNightSimilarity > 0.6d);

    Collection<String> nearestToDay = model.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': " + nearestToDay);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) VectorsConfiguration(org.deeplearning4j.models.embeddings.loader.VectorsConfiguration) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) SentenceTransformer(org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer) ClassPathResource(org.datavec.api.util.ClassPathResource) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) AbstractSequenceIterator(org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator) File(java.io.File) Test(org.junit.Test)

Example 8 with BasicLineIterator

Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

From the class AsyncLabelAwareIteratorTest, method nextDocument.

/**
 * Checks document counts for a {@link BasicLabelAwareIterator} and for an
 * {@link AsyncLabelAwareIterator} wrapped around it.
 *
 * <p>The backing iterator is first drained and expected to yield 97162 documents. After a
 * reset it is wrapped in an async iterator (constructed with the argument 64), which is
 * reset once after its tenth document. The total number of {@code next()} calls is then
 * expected to be the ten documents before the reset plus 97162, i.e. 97172.
 *
 * @throws Exception if the resource cannot be loaded
 */
@Test
public void nextDocument() throws Exception {
    final int expectedDocuments = 97162;
    final int resetAfter = 10;

    SentenceIterator lines = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
    BasicLabelAwareIterator backed = new BasicLabelAwareIterator.Builder(lines).build();

    int drained = 0;
    for (; backed.hasNextDocument(); drained++) {
        backed.nextDocument();
    }
    assertEquals(expectedDocuments, drained);

    backed.reset();
    AsyncLabelAwareIterator async = new AsyncLabelAwareIterator(backed, 64);

    int consumed = 0;
    while (async.hasNext()) {
        async.next();
        consumed++;
        // Restart the async iterator once, mid-stream.
        if (consumed == resetAfter) {
            async.reset();
        }
    }
    assertEquals(expectedDocuments + resetAfter, consumed);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 9 with BasicLineIterator

Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

From the class BasicLabelAwareIteratorTest, method testHasNextDocument2.

/**
 * Checks that a {@link BasicLabelAwareIterator} using the label template {@code "DOCZ_"}
 * yields 97162 documents on a first pass and again after a reset.
 *
 * <p>It then checks that the labels source still holds exactly 97162 labels, so the second
 * pass did not add any, and that the first label is {@code "DOCZ_0"}.
 *
 * @throws Exception if the resource cannot be loaded
 */
@Test
public void testHasNextDocument2() throws Exception {
    final int expectedDocuments = 97162;

    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator lines = new BasicLineIterator(corpus.getAbsolutePath());
    BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(lines)
            .setLabelTemplate("DOCZ_")
            .build();

    // Two full passes over the corpus, with a reset in between.
    for (int pass = 0; pass < 2; pass++) {
        if (pass > 0) {
            iterator.reset();
        }
        int seen = 0;
        for (; iterator.hasNextDocument(); seen++) {
            iterator.nextDocument();
        }
        assertEquals(expectedDocuments, seen);
    }

    LabelsSource labels = iterator.getLabelsSource();
    // Important: iterating again after a reset must not increase the number of labels.
    assertEquals(expectedDocuments, labels.getLabels().size());
    assertEquals("DOCZ_0", labels.getLabels().get(0));
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) Test(org.junit.Test)

Example 10 with BasicLineIterator

Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.

From the class ManualTests, method testWord2VecPlot.

/**
 * Manual test: trains a {@link Word2Vec} model on the {@code /big/raw_sentences.txt}
 * resource, then blocks the thread.
 *
 * <p>The vocabulary plotting calls are commented out. After fitting, the method sleeps for
 * 10,000,000,000 ms (roughly 115 days) and then fails unconditionally with
 * "Not implemented".
 *
 * @throws Exception if the resource cannot be loaded or the sleep is interrupted
 */
@Test
public void testWord2VecPlot() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator sentences = new BasicLineIterator(corpus.getAbsolutePath());

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec word2Vec = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(2)
            .batchSize(1000)
            .learningRate(0.025)
            .layerSize(100)
            .seed(42)
            .sampling(0)
            .negativeSample(0)
            .windowSize(5)
            .modelUtils(new BasicModelUtils<VocabWord>())
            .useAdaGrad(false)
            .iterate(sentences)
            .workers(10)
            .tokenizerFactory(tokenizerFactory)
            .build();
    word2Vec.fit();

    // Plotting is currently disabled:
    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    // NOTE(review): presumably this sleep keeps the process alive for manual inspection; confirm.
    Thread.sleep(10000000000L);
    fail("Not implemented");
}
Also used : DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) BasicModelUtils(org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) Test(org.junit.Test)

Aggregations

BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator): 36; Test (org.junit.Test): 34; ClassPathResource (org.datavec.api.util.ClassPathResource): 27; SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator): 27; DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory): 24; TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory): 24; CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor): 23; File (java.io.File): 22; VocabWord (org.deeplearning4j.models.word2vec.VocabWord): 19; SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer): 12; AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache): 12; INDArray (org.nd4j.linalg.api.ndarray.INDArray): 11; AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator): 10; Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec): 7; AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator): 7; FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator): 7; UimaSentenceIterator (org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator): 7; ArrayList (java.util.ArrayList): 6; BasicModelUtils (org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils): 5; LabelsSource (org.deeplearning4j.text.documentiterator.LabelsSource): 5