Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j: class WordVectorSerializerTest, method testOutputStream.
@Test
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false)
                    .negative(5.0).cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
                    .vocabCache(cache).seed(42).windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    // Write the vectors as text and read them back
    WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file));
    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);
    INDArray day2 = vec2.getWordVectorMatrix("day");
    assertEquals(day1, day2);

    // Full-model round trip through writeWord2VecModel/readWord2VecModel
    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, tempFile);
    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
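As a follow-up (not part of the original test), a minimal sketch of how a full model written by writeWord2VecModel could be reloaded and queried elsewhere; the file name "model.bin" is only a placeholder, and similarity()/wordsNearest() are the standard WordVectors query methods.

// Sketch only: reload a full model saved via writeWord2VecModel() and query it.
// "model.bin" is a placeholder for the file written in the test above.
Word2Vec restored = WordVectorSerializer.readWord2VecModel(new File("model.bin"));
double sim = restored.similarity("day", "night");                  // cosine similarity of two word vectors
Collection<String> neighbours = restored.wordsNearest("day", 10);  // 10 nearest words in vector space
System.out.println("day/night similarity: " + sim);
System.out.println("nearest to 'day': " + neighbours);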
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j: class Word2VecIteratorTest, method testLabeledExample.
@Test
public void testLabeledExample() throws Exception {
    INDArray unk = vec.getWordVectorMatrix(Word2Vec.DEFAULT_UNK);
    assertNotEquals(null, unk);

    // An out-of-vocabulary token should still resolve to a (non-null) vector
    unk = vec.getWordVectorMatrix("2131241sdasdas");
    assertNotEquals(null, unk);

    Word2VecDataSetIterator iter = new Word2VecDataSetIterator(vec,
                    new LabelAwareFileSentenceIterator(null, new ClassPathResource("labeled/").getFile()),
                    Arrays.asList("negative", "positive", "neutral"));
    DataSet next = iter.next();
}
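The test stops after fetching a single batch. A hedged sketch of how that DataSet might be inspected; getFeatures() and getLabels() are the usual nd4j DataSet accessors, and the printed shapes are only illustrative.

// Sketch: inspect the batch produced by the Word2VecDataSetIterator above.
DataSet batch = iter.next();
INDArray features = batch.getFeatures();  // feature matrix assembled from the word vectors
INDArray labels = batch.getLabels();      // label matrix over "negative"/"positive"/"neutral"
System.out.println("features shape: " + Arrays.toString(features.shape()));
System.out.println("labels shape:   " + Arrays.toString(labels.shape()));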
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j: class SentenceIteratorTest, method testLabelAware.
@Test
public void testLabelAware() throws Exception {
    // Input lines follow the pattern "<label>;<sentence>": label at position 0, sentence at position 1
    String s = "1; hello";
    ByteArrayInputStream bis = new ByteArrayInputStream(s.getBytes());
    LabelAwareSentenceIterator labelAwareSentenceIterator = new LabelAwareListSentenceIterator(bis, ";", 0, 1);
    assertTrue(labelAwareSentenceIterator.hasNext());
    labelAwareSentenceIterator.nextSentence();
    assertEquals("1", labelAwareSentenceIterator.currentLabel());

    InputStream is2 = new ClassPathResource("labelawaresentenceiterator.txt").getInputStream();
    LabelAwareSentenceIterator labelAwareSentenceIterator2 = new LabelAwareListSentenceIterator(is2, ";", 0, 1);
    int count = 0;
    Map<Integer, String> labels = new HashMap<>();
    while (labelAwareSentenceIterator2.hasNext()) {
        String sentence = labelAwareSentenceIterator2.nextSentence();
        labels.put(count, labelAwareSentenceIterator2.currentLabel());
        count++;
    }
    assertEquals("SENT37", labels.get(0));
    assertEquals("SENT38", labels.get(1));
    assertEquals("SENT39", labels.get(2));
    assertEquals("SENT42", labels.get(3));
    assertEquals(4, count);
}
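For reference, a small self-contained sketch of the same "<label>;<sentence>" pattern over an in-memory, multi-line input; the example lines and labels are invented, but the constructor arguments mirror the ones used in the test.

// Sketch: every line is "<label>;<sentence>", split on ';' with the label at position 0.
String data = "POS;this movie was great\nNEG;this movie was terrible";
LabelAwareSentenceIterator it =
        new LabelAwareListSentenceIterator(new ByteArrayInputStream(data.getBytes()), ";", 0, 1);
while (it.hasNext()) {
    String sentence = it.nextSentence();
    System.out.println(it.currentLabel() + " -> " + sentence);
}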
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j: class GloveTest, method testGloVe1.
@Ignore
@Test
public void testGloVe1() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Glove glove = new Glove.Builder().iterate(iter).tokenizerFactory(t).alpha(0.75).learningRate(0.1)
                    .epochs(45).xMax(100).shuffle(true).symmetric(true).build();
    glove.fit();

    double simD = glove.similarity("day", "night");
    double simP = glove.similarity("best", "police");
    log.info("Day/night similarity: " + simD);
    log.info("Best/police similarity: " + simP);

    Collection<String> words = glove.wordsNearest("day", 10);
    log.info("Nearest words to 'day': " + words);

    assertTrue(simD > 0.7);
    // simP should be close to 0 for unrelated words
    assertTrue(simP < 0.5);
    assertTrue(words.contains("night"));
    assertTrue(words.contains("year"));
    assertTrue(words.contains("week"));

    File tempFile = File.createTempFile("glove", "temp");
    tempFile.deleteOnExit();

    INDArray day1 = glove.getWordVectorMatrix("day").dup();
    WordVectorSerializer.writeWordVectors(glove, tempFile);
    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);
    INDArray day2 = vectors.getWordVectorMatrix("day").dup();
    assertEquals(day1, day2);

    tempFile.delete();
}
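Since loadTxtVectors() returns the generic WordVectors interface, reloaded GloVe vectors can be queried exactly like the in-memory model; a brief sketch, where "glove_vectors.txt" stands in for a file previously written with writeWordVectors().

// Sketch: query text-format GloVe vectors after reloading them.
WordVectors reloaded = WordVectorSerializer.loadTxtVectors(new File("glove_vectors.txt"));
double simDayNight = reloaded.similarity("day", "night");      // expected to match the pre-save value
Collection<String> nearest = reloaded.wordsNearest("day", 10);
System.out.println("day/night after reload: " + simDayNight + ", nearest: " + nearest);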
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j: class ParagraphVectorsTest, method testDirectInference.
@Test
public void testDirectInference() throws Exception {
    ClassPathResource resource_sentences = new ClassPathResource("/big/raw_sentences.txt");
    ClassPathResource resource_mixed = new ClassPathResource("/paravec");
    SentenceIterator iter = new AggregatingSentenceIterator.Builder()
                    .addSentenceIterator(new BasicLineIterator(resource_sentences.getFile()))
                    .addSentenceIterator(new FileSentenceIterator(resource_mixed.getFile())).build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(3)
                    .learningRate(0.025).layerSize(150).minLearningRate(0.001)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();
    wordVectors.fit();

    ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10)
                    .useHierarchicSoftmax(true).trainWordVectors(true).useExistingWordVectors(wordVectors)
                    .negativeSample(0).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");
    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
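The test only logs one cosine similarity. As a hedged extension, a third sentence that is deliberately close in wording to the first could be inferred and compared, using the same inferVector() and Transforms.cosineSim() calls; the sentences and the expectation that the closer wording scores higher are illustrative, not part of the original test.

// Sketch: a near-duplicate of the first sentence is expected to score higher than the unrelated one.
INDArray vecSimilar = pv.inferVector("This text is quite awesome");
double simCloser = Transforms.cosineSim(vec1, vecSimilar);
double simFarther = Transforms.cosineSim(vec1, vec2);
System.out.println("similar sentence:   " + simCloser);
System.out.println("unrelated sentence: " + simFarther);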