Search in sources :

Example 31 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class ManualTests method testWord2VecPlot.

@Test
public void testWord2VecPlot() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025).layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10).tokenizerFactory(t).build();
    vec.fit();
    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
    //        vec.getLookupTable().plotVocab(100, connectionInfo);
    Thread.sleep(10000000000L);
    fail("Not implemented");
}
Also used : DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) BasicModelUtils(org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) File(java.io.File) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) Test(org.junit.Test)

Example 32 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class GloveTest method testGlove.

@Test
public void testGlove() throws Exception {
    Glove glove = new Glove(true, 5, 100);
    JavaRDD<String> corpus = sc.textFile(new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath()).map(new Function<String, String>() {

        @Override
        public String call(String s) throws Exception {
            return s.toLowerCase();
        }
    });
    Pair<VocabCache<VocabWord>, GloveWeightLookupTable> table = glove.train(corpus);
    WordVectors vectors = WordVectorSerializer.fromPair(new Pair<>((InMemoryLookupTable) table.getSecond(), (VocabCache) table.getFirst()));
    Collection<String> words = vectors.wordsNearest("day", 20);
    assertTrue(words.contains("week"));
}
Also used : GloveWeightLookupTable(org.deeplearning4j.models.glove.GloveWeightLookupTable) ClassPathResource(org.datavec.api.util.ClassPathResource) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) VocabCache(org.deeplearning4j.models.word2vec.wordstore.VocabCache) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) Test(org.junit.Test) BaseSparkTest(org.deeplearning4j.spark.text.BaseSparkTest)

Example 33 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class Word2VecTest method testSparkW2VonBiggerCorpus.

@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest").set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g").set("spark.executor.memory", "8g");
    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Path of data part-00000
    //String dataPath = new ClassPathResource("/big/raw_sentences.txt").getFile().getAbsolutePath();
    //        String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
    String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();
    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new LowCasePreProcessor());
    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1).tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5).learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5).useUnknown(true).build();
    word2Vec.train(corpus);
    sc.stop();
    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
Also used : DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) LowCasePreProcessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.LowCasePreProcessor) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf) ClassPathResource(org.datavec.api.util.ClassPathResource) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 34 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.

the class Word2VecTest method testConcepts.

@Test
public void testConcepts() throws Exception {
    // These are all default values for word2vec
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest");
    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Path of data part-00000
    String dataPath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
    //        dataPath = "/ext/Temp/part-00000";
    //        String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();
    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1).tokenizerFactory(t).seed(42L).negative(10).useAdaGrad(false).layerSize(150).windowSize(5).learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5).stopWords(Arrays.asList("three")).useUnknown(true).build();
    word2Vec.train(corpus);
    //word2Vec.setModelUtils(new FlatModelUtils());
    System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK"));
    InMemoryLookupTable<VocabWord> table = (InMemoryLookupTable<VocabWord>) word2Vec.lookupTable();
    double sim = word2Vec.similarity("day", "night");
    System.out.println("day/night similarity: " + sim);
    /*
        System.out.println("Hornjo: " + word2Vec.getWordVectorMatrix("hornjoserbsce"));
        System.out.println("carro: " + word2Vec.getWordVectorMatrix("carro"));
        
        Collection<String> portu = word2Vec.wordsNearest("carro", 10);
        printWords("carro", portu, word2Vec);
        
        portu = word2Vec.wordsNearest("davi", 10);
        printWords("davi", portu, word2Vec);
        
        System.out.println("---------------------------------------");
        */
    Collection<String> words = word2Vec.wordsNearest("day", 10);
    printWords("day", words, word2Vec);
    assertTrue(words.contains("night"));
    assertTrue(words.contains("week"));
    assertTrue(words.contains("year"));
    sim = word2Vec.similarity("two", "four");
    System.out.println("two/four similarity: " + sim);
    words = word2Vec.wordsNearest("two", 10);
    printWords("two", words, word2Vec);
    // three should be absent due to stopWords
    assertFalse(words.contains("three"));
    assertTrue(words.contains("five"));
    assertTrue(words.contains("four"));
    sc.stop();
    // test serialization
    File tempFile = File.createTempFile("temp", "tmp");
    tempFile.deleteOnExit();
    int idx1 = word2Vec.vocab().wordFor("day").getIndex();
    INDArray array1 = word2Vec.getWordVectorMatrix("day").dup();
    VocabWord word1 = word2Vec.vocab().elementAtIndex(0);
    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), tempFile);
    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);
    VocabWord word2 = ((VocabCache<VocabWord>) vectors.vocab()).elementAtIndex(0);
    VocabWord wordIT = ((VocabCache<VocabWord>) vectors.vocab()).wordFor("it");
    int idx2 = vectors.vocab().wordFor("day").getIndex();
    INDArray array2 = vectors.getWordVectorMatrix("day").dup();
    System.out.println("word 'i': " + word2);
    System.out.println("word 'it': " + wordIT);
    assertEquals(idx1, idx2);
    assertEquals(word1, word2);
    assertEquals(array1, array2);
}
Also used : TokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) ClassPathResource(org.datavec.api.util.ClassPathResource) DefaultTokenizerFactory(org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) CommonPreprocessor(org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) VocabCache(org.deeplearning4j.models.word2vec.wordstore.VocabCache) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) WordVectors(org.deeplearning4j.models.embeddings.wordvectors.WordVectors) SparkConf(org.apache.spark.SparkConf) File(java.io.File) Test(org.junit.Test)

Example 35 with ClassPathResource

use of org.datavec.api.util.ClassPathResource in project tutorials by eugenp.

the class IrisClassifier method main.

public static void main(String[] args) throws IOException, InterruptedException {
    DataSet allData;
    try (RecordReader recordReader = new CSVRecordReader(0, ',')) {
        recordReader.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile()));
        DataSetIterator iterator = new RecordReaderDataSetIterator(recordReader, 150, FEATURES_COUNT, CLASSES_COUNT);
        allData = iterator.next();
    }
    allData.shuffle(42);
    DataNormalization normalizer = new NormalizerStandardize();
    normalizer.fit(allData);
    normalizer.transform(allData);
    SplitTestAndTrain testAndTrain = allData.splitTestAndTrain(0.65);
    DataSet trainingData = testAndTrain.getTrain();
    DataSet testData = testAndTrain.getTest();
    MultiLayerConfiguration configuration = new NeuralNetConfiguration.Builder().iterations(1000).activation(Activation.TANH).weightInit(WeightInit.XAVIER).learningRate(0.1).regularization(true).l2(0.0001).list().layer(0, new DenseLayer.Builder().nIn(FEATURES_COUNT).nOut(3).build()).layer(1, new DenseLayer.Builder().nIn(3).nOut(3).build()).layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD).activation(Activation.SOFTMAX).nIn(3).nOut(CLASSES_COUNT).build()).backprop(true).pretrain(false).build();
    MultiLayerNetwork model = new MultiLayerNetwork(configuration);
    model.init();
    model.fit(trainingData);
    INDArray output = model.output(testData.getFeatureMatrix());
    Evaluation eval = new Evaluation(CLASSES_COUNT);
    eval.eval(testData.getLabels(), output);
    System.out.println(eval.stats());
}
Also used : Evaluation(org.deeplearning4j.eval.Evaluation) DataSet(org.nd4j.linalg.dataset.DataSet) RecordReader(org.datavec.api.records.reader.RecordReader) CSVRecordReader(org.datavec.api.records.reader.impl.csv.CSVRecordReader) RecordReaderDataSetIterator(org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) FileSplit(org.datavec.api.split.FileSplit) ClassPathResource(org.datavec.api.util.ClassPathResource) DataNormalization(org.nd4j.linalg.dataset.api.preprocessor.DataNormalization) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) DenseLayer(org.deeplearning4j.nn.conf.layers.DenseLayer) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CSVRecordReader(org.datavec.api.records.reader.impl.csv.CSVRecordReader) NormalizerStandardize(org.nd4j.linalg.dataset.api.preprocessor.NormalizerStandardize) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) RecordReaderDataSetIterator(org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator) SplitTestAndTrain(org.nd4j.linalg.dataset.SplitTestAndTrain)

Aggregations

ClassPathResource (org.datavec.api.util.ClassPathResource)72 Test (org.junit.Test)63 File (java.io.File)45 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)28 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)27 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)27 INDArray (org.nd4j.linalg.api.ndarray.INDArray)24 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)23 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)23 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)20 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)12 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)11 WordVectors (org.deeplearning4j.models.embeddings.wordvectors.WordVectors)10 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)10 ArrayList (java.util.ArrayList)9 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)8 DataSet (org.nd4j.linalg.dataset.DataSet)8 AggregatingSentenceIterator (org.deeplearning4j.text.sentenceiterator.AggregatingSentenceIterator)7 FileSentenceIterator (org.deeplearning4j.text.sentenceiterator.FileSentenceIterator)7 InputStream (java.io.InputStream)6