Search in sources:

Example 41 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class WordVectorSerializerTest method testOutputStream.

/**
 * Trains a Word2Vec model on the raw-sentences corpus and checks that the vector for "day"
 * is unchanged after being written to an {@link java.io.OutputStream} and read back with
 * {@code loadTxtVectors}. The complete model is then written with {@code writeWord2VecModel}
 * and read back once more as a smoke check.
 *
 * @throws Exception if the corpus resource cannot be resolved or any (de)serialization step fails
 */
@Test
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // The same cache instance is shared by the lookup table and the Word2Vec builder below.
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0).cache(cache).lr(0.025f).build();
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100).lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).vocabCache(cache).seed(42).windowSize(5).iterate(iter).tokenizerFactory(t).build();
    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    INDArray day1 = vec.getWordVectorMatrix("day");
    // The test opens the stream, so the test closes it - also when serialization throws.
    // NOTE(review): if writeWordVectors already closes the stream, the second close is a no-op.
    try (FileOutputStream out = new FileOutputStream(file)) {
        WordVectorSerializer.writeWordVectors(vec, out);
    }
    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);
    INDArray day2 = vec2.getWordVectorMatrix("day");
    assertEquals(day1, day2);
    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, tempFile);
    // Smoke check only: reading the model back must not throw; the restored model is not inspected.
    WordVectorSerializer.readWord2VecModel(tempFile);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) UimaSentenceIterator(org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) FileOutputStream(java.io.FileOutputStream) WeightLookupTable(org.deeplearning4j.models.embeddings.WeightLookupTable) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) File(java.io.File) Test(org.junit.Test)

Example 42 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class Word2VecIteratorTest method testLabeledExample.

/**
 * Checks that the shared {@code vec} fixture returns a non-null vector both for
 * {@code Word2Vec.DEFAULT_UNK} and for a made-up token, then builds a
 * {@code Word2VecDataSetIterator} over the "labeled/" classpath directory and pulls
 * one {@code DataSet} from it.
 *
 * @throws Exception if the "labeled/" resource cannot be resolved or iteration fails
 */
@Test
public void testLabeledExample() throws Exception {
    INDArray unknownTokenVector = vec.getWordVectorMatrix(Word2Vec.DEFAULT_UNK);
    assertNotEquals(null, unknownTokenVector);

    // A token that is presumably absent from the vocabulary must still yield a vector.
    INDArray unseenWordVector = vec.getWordVectorMatrix("2131241sdasdas");
    assertNotEquals(null, unseenWordVector);

    LabelAwareFileSentenceIterator labeledSentences =
            new LabelAwareFileSentenceIterator(null, new ClassPathResource("labeled/").getFile());
    Word2VecDataSetIterator dataSets = new Word2VecDataSetIterator(
            vec,
            labeledSentences,
            Arrays.asList("negative", "positive", "neutral"));

    // Fetching the first data set is the check: it only has to complete without throwing.
    DataSet firstBatch = dataSets.next();
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) DataSet(org.nd4j.linalg.dataset.DataSet) ClassPathResource(org.datavec.api.util.ClassPathResource) LabelAwareFileSentenceIterator(org.deeplearning4j.text.sentenceiterator.labelaware.LabelAwareFileSentenceIterator) Test(org.junit.Test)

Example 43 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class SentenceIteratorTest method testLabelAware.

/**
 * Exercises {@code LabelAwareListSentenceIterator} on two inputs: an in-memory line
 * ("1; hello") whose label must be "1", and the classpath resource
 * "labelawaresentenceiterator.txt", which must yield exactly four sentences labelled
 * SENT37, SENT38, SENT39 and SENT42, in that order.
 *
 * @throws Exception if the classpath resource cannot be opened or read
 */
@Test
public void testLabelAware() throws Exception {
    String s = "1; hello";
    ByteArrayInputStream bis = new ByteArrayInputStream(s.getBytes());
    LabelAwareSentenceIterator labelAwareSentenceIterator = new LabelAwareListSentenceIterator(bis, ";", 0, 1);
    assertTrue(labelAwareSentenceIterator.hasNext());
    labelAwareSentenceIterator.nextSentence();
    assertEquals("1", labelAwareSentenceIterator.currentLabel());

    int count = 0;
    Map<Integer, String> labels = new HashMap<>();
    // The stream stays open for the whole iteration and is closed afterwards, so this is
    // correct whether the iterator reads the resource eagerly or lazily.
    try (InputStream is2 = new ClassPathResource("labelawaresentenceiterator.txt").getInputStream()) {
        LabelAwareSentenceIterator labelAwareSentenceIterator2 = new LabelAwareListSentenceIterator(is2, ";", 0, 1);
        while (labelAwareSentenceIterator2.hasNext()) {
            // Advance the iterator; only the label of the current sentence is of interest.
            labelAwareSentenceIterator2.nextSentence();
            labels.put(count, labelAwareSentenceIterator2.currentLabel());
            count++;
        }
    }
    assertEquals("SENT37", labels.get(0));
    assertEquals("SENT38", labels.get(1));
    assertEquals("SENT39", labels.get(2));
    assertEquals("SENT42", labels.get(3));
    assertEquals(4, count);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) HashMap(java.util.HashMap) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) LabelAwareSentenceIterator(org.deeplearning4j.text.sentenceiterator.labelaware.LabelAwareSentenceIterator) LabelAwareListSentenceIterator(org.deeplearning4j.text.sentenceiterator.labelaware.LabelAwareListSentenceIterator) ClassPathResource(org.datavec.api.util.ClassPathResource) Test(org.junit.Test)

Example 44 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class GloveTest method testGloVe1.

/**
 * Fits a GloVe model on the raw-sentences corpus, checks a few similarity and
 * nearest-word expectations around "day", and verifies that the "day" vector is
 * unchanged after a write/read round trip through a temporary text file.
 * The test is currently disabled via {@code @Ignore}.
 *
 * @throws Exception if the corpus cannot be resolved, or training or serialization fails
 */
@Ignore
@Test
public void testGloVe1() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator sentences = new BasicLineIterator(corpus.getAbsolutePath());

    // Tokens are produced by the default tokenizer and normalised by CommonPreprocessor.
    TokenizerFactory tokenizers = new DefaultTokenizerFactory();
    tokenizers.setTokenPreProcessor(new CommonPreprocessor());

    Glove glove = new Glove.Builder()
            .iterate(sentences)
            .tokenizerFactory(tokenizers)
            .alpha(0.75)
            .learningRate(0.1)
            .epochs(45)
            .xMax(100)
            .shuffle(true)
            .symmetric(true)
            .build();
    glove.fit();

    double dayNightSimilarity = glove.similarity("day", "night");
    double bestPoliceSimilarity = glove.similarity("best", "police");
    log.info("Day/night similarity: " + dayNightSimilarity);
    log.info("Best/police similarity: " + bestPoliceSimilarity);

    Collection<String> nearestToDay = glove.wordsNearest("day", 10);
    log.info("Nearest words to 'day': " + nearestToDay);

    assertTrue(dayNightSimilarity > 0.7);
    // Unrelated words: the similarity is expected to be near 0, the bound is kept loose.
    assertTrue(bestPoliceSimilarity < 0.5);
    assertTrue(nearestToDay.contains("night"));
    assertTrue(nearestToDay.contains("year"));
    assertTrue(nearestToDay.contains("week"));

    // Round trip: the vector read back from disk must equal the one held in memory.
    File serialized = File.createTempFile("glove", "temp");
    serialized.deleteOnExit();
    INDArray before = glove.getWordVectorMatrix("day").dup();
    WordVectorSerializer.writeWordVectors(glove, serialized);
    WordVectors restored = WordVectorSerializer.loadTxtVectors(serialized);
    INDArray after = restored.getWordVectorMatrix("day").dup();
    assertEquals(before, after);
    serialized.delete();
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) LineSentenceIterator(org.deeplearning4j.text.sentenceiterator.LineSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) INDArray(org.nd4j.linalg.api.ndarray.INDArray) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) File(java.io.File) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 45 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class ParagraphVectorsTest method testDirectInference.

/**
 * Trains Word2Vec on the raw-sentences corpus combined with the "/paravec" files, builds a
 * ParagraphVectors model on top of those word vectors, infers vectors for two free-text
 * sentences and logs their cosine similarity. Nothing is asserted; the test only has to
 * run to completion.
 *
 * @throws Exception if a resource cannot be resolved, or training or inference fails
 */
@Test
public void testDirectInference() throws Exception {
    ClassPathResource sentencesResource = new ClassPathResource("/big/raw_sentences.txt");
    ClassPathResource mixedResource = new ClassPathResource("/paravec");

    // Both sources are consumed through a single aggregated sentence iterator.
    SentenceIterator sentences = new AggregatingSentenceIterator.Builder()
            .addSentenceIterator(new BasicLineIterator(sentencesResource.getFile()))
            .addSentenceIterator(new FileSentenceIterator(mixedResource.getFile()))
            .build();

    TokenizerFactory tokenizers = new DefaultTokenizerFactory();
    tokenizers.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec wordVectors = new Word2Vec.Builder()
            .minWordFrequency(1)
            .batchSize(250)
            .iterations(1)
            .epochs(3)
            .learningRate(0.025)
            .layerSize(150)
            .minLearningRate(0.001)
            .elementsLearningAlgorithm(new SkipGram<VocabWord>())
            .useHierarchicSoftmax(true)
            .windowSize(5)
            .iterate(sentences)
            .tokenizerFactory(tokenizers)
            .build();
    wordVectors.fit();

    // No fit() call on the paragraph model here: it is handed the trained word vectors.
    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
            .tokenizerFactory(tokenizers)
            .iterations(10)
            .useHierarchicSoftmax(true)
            .trainWordVectors(true)
            .useExistingWordVectors(wordVectors)
            .negativeSample(0)
            .sequenceLearningAlgorithm(new DM<VocabWord>())
            .build();

    INDArray firstInferred = paragraphVectors.inferVector("This text is pretty awesome");
    INDArray secondInferred = paragraphVectors.inferVector("Fantastic process of crazy things happening inside just for history purposes");
    log.info("vec1/vec2: {}", Transforms.cosineSim(firstInferred, secondInferred));
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) SkipGram(org.deeplearning4j.models.embeddings.learning.impl.elements.SkipGram) DM(org.deeplearning4j.models.embeddings.learning.impl.sequence.DM) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) FileSentenceIterator(org.deeplearning4j.text.sentenceiterator.FileSentenceIterator) AggregatingSentenceIterator(org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) AggregatingSentenceIterator(org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) FileSentenceIterator(org.deeplearning4j.text.sentenceiterator.FileSentenceIterator) Test(org.junit.Test)

Aggregations

ClassPathResource (org.datavec.api.util.ClassPathResource)72 Test (org.junit.Test)63 File (java.io.File)45 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)28 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)27 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)27 INDArray (org.nd4j.linalg.api.ndarray.INDArray)24 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)23 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)23 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)20 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)12 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)11 WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors)10 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)10 ArrayList (java.util.ArrayList)9 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)8 DataSet (org.nd4j.linalg.dataset.DataSet)8 AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator)7 FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator)7 InputStream (java.io.InputStream)6