Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
From the class VocabConstructorTest, method testMergedVocabWithLabels1.
/**
 * Builds a source vocabulary from {@code big/raw_sentences.txt}, then merges it into a
 * second vocabulary built from the labeled documents under {@code /paravec/labeled}.
 * Checks that the merged cache holds the source words plus three extra entries, that
 * sampled indexes resolve to the same words in both caches, and that the three label
 * entries sit at indexes at or beyond the source vocabulary size.
 */
@Test
public void testMergedVocabWithLabels1() throws Exception {
    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();
    AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();

    // Stage 1: fill the source cache from the plain-text corpus.
    BasicLineIterator lineIterator =
            new BasicLineIterator(new ClassPathResource("big/raw_sentences.txt").getFile());
    SentenceTransformer lineTransformer = new SentenceTransformer.Builder()
            .iterator(lineIterator)
            .tokenizerFactory(t)
            .build();
    AbstractSequenceIterator<VocabWord> lineSequences =
            new AbstractSequenceIterator.Builder<>(lineTransformer).build();

    VocabConstructor<VocabWord> sourceBuilder = new VocabConstructor.Builder<VocabWord>()
            .addSource(lineSequences, 1)
            .setTargetVocabCache(cacheSource)
            .build();
    sourceBuilder.buildJointVocabulary(false, true);

    int sourceSize = cacheSource.numWords();
    log.info("Source Vocab size: " + sourceSize);

    // Stage 2: build the target cache from the labeled corpus and merge the source into it.
    FileLabelAwareIterator labeledDocuments = new FileLabelAwareIterator.Builder()
            .addSourceFolder(new ClassPathResource("/paravec/labeled").getFile())
            .build();
    SentenceTransformer labeledTransformer = new SentenceTransformer.Builder()
            .iterator(labeledDocuments)
            .tokenizerFactory(t)
            .build();
    AbstractSequenceIterator<VocabWord> labeledSequences =
            new AbstractSequenceIterator.Builder<>(labeledTransformer).build();

    VocabConstructor<VocabWord> mergeBuilder = new VocabConstructor.Builder<VocabWord>()
            .addSource(labeledSequences, 1)
            .setTargetVocabCache(cacheTarget)
            .build();
    mergeBuilder.buildMergedVocabulary(cacheSource, true);

    // The three additional entries in the target cache are the labels.
    assertEquals(sourceSize + 3, cacheTarget.numWords());

    // Transferred elements must keep the index they had in the source cache.
    for (int index : new int[] {17, 45, 89}) {
        assertEquals(cacheSource.wordAtIndex(index), cacheTarget.wordAtIndex(index));
    }

    // Labels must be placed past the source index space. Indexes are zero-based,
    // so the first free slot is exactly sourceSize.
    for (String label : new String[] {"Zfinance", "Zscience", "Zhealth"}) {
        assertTrue(cacheTarget.indexOf(label) >= sourceSize);
    }
}
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
From the class AbstractCacheTest, method testWordsOccurencies.
/**
 * Adds three tokens with frequencies 1.0, 2.0 and 3.0 to an empty cache and checks
 * that the cache reports three words and a total occurrence count of six.
 */
@Test
public void testWordsOccurencies() throws Exception {
    AbstractCache<VocabWord> vocab = new AbstractCache.Builder<VocabWord>().build();

    VocabWord once = new VocabWord(1.0, "word");
    VocabWord twice = new VocabWord(2.0, "test");
    VocabWord thrice = new VocabWord(3.0, "tester");

    vocab.addToken(once);
    vocab.addToken(twice);
    vocab.addToken(thrice);

    // Three distinct tokens; 1 + 2 + 3 occurrences in total.
    assertEquals(3, vocab.numWords());
    assertEquals(6, vocab.totalWordOccurrences());
}
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
From the class SequenceVectorsTest, method testAbstractW2VModel.
/**
 * End-to-end check of a {@code SequenceVectors} model over {@code VocabWord} elements:
 * builds a vocabulary from {@code big/raw_sentences.txt} with a minimum frequency of 5,
 * verifies the vocabulary size and total occurrence count, trains the model for one
 * epoch and asserts that the "day"/"night" similarity exceeds 0.6.
 */
@Test
public void testAbstractW2VModel() throws Exception {
    ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
    File file = resource.getFile();
    logger.info("dtype: {}", Nd4j.dataType());

    AbstractCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();

    // First we build the line iterator over the corpus file.
    BasicLineIterator underlyingIterator = new BasicLineIterator(file);

    // Lines have to be converted into Sequences of VocabWords;
    // in this example SentenceTransformer does that.
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    SentenceTransformer transformer = new SentenceTransformer.Builder()
            .iterator(underlyingIterator)
            .tokenizerFactory(t)
            .build();

    // The transformer is then packed into an AbstractSequenceIterator.
    AbstractSequenceIterator<VocabWord> sequenceIterator =
            new AbstractSequenceIterator.Builder<>(transformer).build();

    // Build the vocabulary out of the sequence iterator.
    // This phase can be skipped by setting SequenceVectors.resetModel(true),
    // in which case the vocabulary is built internally.
    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequenceIterator, 5)
            .setTargetVocabCache(vocabCache)
            .build();
    constructor.buildJointVocabulary(false, true);

    assertEquals(242, vocabCache.numWords());
    assertEquals(634303, vocabCache.totalWordOccurrences());

    VocabWord wordz = vocabCache.wordFor("day");
    logger.info("Wordz: {}", wordz);

    // Build the WeightLookupTable instance for the new model.
    WeightLookupTable<VocabWord> lookupTable = new InMemoryLookupTable.Builder<VocabWord>()
            .lr(0.025)
            .vectorLength(150)
            .useAdaGrad(false)
            .cache(vocabCache)
            .build();

    // An explicit weight reset is only needed because resetModel(false) is passed
    // to the SequenceVectors builder below; with resetModel(true) it is called internally.
    lookupTable.resetWeights(true);

    // Build the SequenceVectors model: element representations only, one iteration, one epoch.
    SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(new VectorsConfiguration())
            .minWordFrequency(5)
            .lookupTable(lookupTable)
            .iterate(sequenceIterator)
            .vocabCache(vocabCache)
            .batchSize(250)
            .iterations(1)
            .epochs(1)
            .resetModel(false)
            .trainElementsRepresentation(true)
            .trainSequencesRepresentation(false)
            .build();

    // After all options are set, training is started with fit().
    logger.info("Starting training...");
    vectors.fit();
    // Nothing is persisted here, so the message reports training completion
    // (the previous "Model saved..." text was misleading).
    logger.info("Training finished...");

    // As soon as fit() returns the model is considered built and can be tested.
    // Note: all similarity context is handled via SequenceElement labels, so when
    // SequenceVectors is used for complex objects/relations, label uniqueness and
    // meaning are the caller's responsibility.
    double sim = vectors.similarity("day", "night");
    logger.info("Day/night similarity: {}", sim);
    assertTrue(sim > 0.6d);

    Collection<String> labels = vectors.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': {}", labels);
}
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
From the class SequenceVectorsTest, method testGlove1.
/**
 * Trains a {@code SequenceVectors} model with the GloVe elements learning algorithm on
 * {@code big/raw_sentences.txt}, logs several pairwise similarities and the nearest
 * labels to "day", and asserts that the "day"/"night" similarity exceeds 0.6.
 * Currently disabled via {@code @Ignore}.
 */
@Ignore
@Test
public void testGlove1() throws Exception {
    logger.info("Max available memory: " + Runtime.getRuntime().maxMemory());

    // Corpus -> line iterator -> tokenized sentences -> sequences of VocabWord.
    File corpus = new ClassPathResource("big/raw_sentences.txt").getFile();
    BasicLineIterator lines = new BasicLineIterator(corpus);

    TokenizerFactory tokenizers = new DefaultTokenizerFactory();
    tokenizers.setTokenPreProcessor(new CommonPreprocessor());

    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
            .iterator(lines)
            .tokenizerFactory(tokenizers)
            .build();
    AbstractSequenceIterator<VocabWord> sequences =
            new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    VectorsConfiguration settings = new VectorsConfiguration();
    settings.setWindow(5);
    settings.setLearningRate(0.06);
    settings.setLayersSize(100);

    // resetModel(true): no vocabulary or lookup table is supplied by this test.
    SequenceVectors<VocabWord> model = new SequenceVectors.Builder<VocabWord>(settings)
            .iterate(sequences)
            .iterations(1)
            .epochs(45)
            .elementsLearningAlgorithm(new GloVe.Builder<VocabWord>()
                    .shuffle(true)
                    .symmetric(true)
                    .learningRate(0.05)
                    .alpha(0.75)
                    .xMax(100.0)
                    .build())
            .resetModel(true)
            .trainElementsRepresentation(true)
            .trainSequencesRepresentation(false)
            .build();
    model.fit();

    // Informational similarity probes; only the last day/night value is asserted.
    double score = model.similarity("day", "night");
    logger.info("Day/night similarity: " + score);

    score = model.similarity("day", "another");
    logger.info("Day/another similarity: " + score);

    score = model.similarity("night", "year");
    logger.info("Night/year similarity: " + score);

    score = model.similarity("night", "me");
    logger.info("Night/me similarity: " + score);

    score = model.similarity("day", "know");
    logger.info("Day/know similarity: " + score);

    score = model.similarity("best", "police");
    logger.info("Best/police similarity: " + score);

    Collection<String> nearest = model.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': " + nearest);

    score = model.similarity("day", "night");
    assertTrue(score > 0.6d);
}
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
From the class PopularityWalkerTest, method testPopularityWalker4.
/**
 * Runs a forward-only {@code PopularityWalker} (minimum popularity mode, spread 3,
 * proportional spectrum) 50 times over the shared {@code graph}. Every walk must start
 * at the element labeled "0" and step to "3", "8" or "9"; each of those three labels
 * must show up as the second element at least once across the runs.
 */
@Test
public void testPopularityWalker4() throws Exception {
    GraphWalker<VocabWord> walker = new PopularityWalker.Builder<>(graph)
            .setWalkDirection(WalkDirection.FORWARD_ONLY)
            .setNoEdgeHandling(NoEdgeHandling.CUTOFF_ON_DISCONNECTED)
            .setWalkLength(10)
            .setPopularityMode(PopularityMode.MINIMUM)
            .setPopularitySpread(3)
            .setSpreadSpectrum(SpreadSpectrum.PROPORTIONAL)
            .build();

    System.out.println("Connected [3] size: " + graph.getConnectedVertices(3).size());
    System.out.println("Connected [4] size: " + graph.getConnectedVertices(4).size());

    // Latched flags: once a label has been observed the flag stays set.
    boolean sawThree = false;
    boolean sawEight = false;
    boolean sawNine = false;

    for (int run = 0; run < 50; run++) {
        Sequence<VocabWord> path = walker.next();
        assertEquals("0", path.getElements().get(0).getLabel());

        String secondLabel = path.getElements().get(1).getLabel();
        System.out.println("Position at 1: [" + secondLabel + "]");

        boolean isThree = secondLabel.equals("3");
        boolean isEight = secondLabel.equals("8");
        boolean isNine = secondLabel.equals("9");

        sawThree |= isThree;
        sawEight |= isEight;
        sawNine |= isNine;

        assertTrue(isEight || isThree || isNine);

        // Rewind the walker so the next iteration starts a fresh walk.
        walker.reset(false);
    }

    assertTrue(sawThree);
    assertTrue(sawEight);
    assertTrue(sawNine);
}
Aggregations