Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
In the class ParagraphVectors, the method inferVector:
/**
 * This method calculates an inferred vector for the given document.
 *
 * @param document the document, as a list of vocabulary words
 * @param learningRate the starting learning rate for inference
 * @param minLearningRate the minimum learning rate
 * @param iterations the number of inference iterations
 * @return the inferred paragraph vector
 */
public INDArray inferVector(@NonNull List<VocabWord> document, double learningRate, double minLearningRate,
                int iterations) {
    SequenceLearningAlgorithm<VocabWord> learner = sequenceLearningAlgorithm;
    if (learner == null) {
        // double-checked locking: lazily create the PV-DM learner exactly once
        synchronized (this) {
            if (sequenceLearningAlgorithm == null) {
                log.info("Creating new PV-DM learner...");
                learner = new DM<VocabWord>();
                learner.configure(vocab, lookupTable, configuration);
                sequenceLearningAlgorithm = learner;
            } else {
                learner = sequenceLearningAlgorithm;
            }
        }
    }
    // re-read the shared learner reference
    learner = sequenceLearningAlgorithm;
    if (document.isEmpty())
        throw new ND4JIllegalStateException("Impossible to apply inference to empty list of words");
    Sequence<VocabWord> sequence = new Sequence<>();
    sequence.addElements(document);
    // label the sequence with a random pseudo-word so it cannot collide with real labels
    sequence.setSequenceLabel(new VocabWord(1.0, String.valueOf(new Random().nextInt())));
    initLearners();
    return learner.inferSequence(sequence, seed, learningRate, minLearningRate, iterations);
}
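A minimal usage sketch of this overload, assuming a trained ParagraphVectors instance named vectors and the TokenizerFactory it was configured with (both hypothetical names); the vocabulary lookup mirrors the one in predictSeveral further below, and vectors.vocab() is assumed to expose the model's VocabCache:

// Sketch only: "vectors" and "tokenizerFactory" are assumed to exist.
List<String> tokens = tokenizerFactory.create("an unseen document").getTokens();
List<VocabWord> document = new ArrayList<>();
for (String token : tokens) {
    if (vectors.vocab().containsWord(token)) // skip out-of-vocabulary tokens
        document.add(vectors.vocab().wordFor(token));
}
// 0.01, 0.001 and 10 are illustrative hyperparameters, not recommendations
INDArray inferred = vectors.inferVector(document, 0.01, 0.001, 10);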
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
In the class BaseTextVectorizer, the method buildVocab:
public void buildVocab() {
    if (vocabCache == null)
        vocabCache = new AbstractCache.Builder<VocabWord>().build();
    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(this.iterator)
                    .tokenizerFactory(tokenizerFactory).build();
    AbstractSequenceIterator<VocabWord> iterator = new AbstractSequenceIterator.Builder<>(transformer).build();
    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(iterator, minWordFrequency).setTargetVocabCache(vocabCache)
                    .setStopWords(stopWords).allowParallelTokenization(isParallel).build();
    constructor.buildJointVocabulary(false, true);
}
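buildVocab() wires a SentenceTransformer into an AbstractSequenceIterator and hands the result to a VocabConstructor. A hedged sketch of driving the same pipeline by hand, assuming a configured SentenceIterator (sentenceIterator) and TokenizerFactory (tokenizerFactory), with an illustrative minimum word frequency of 5:

// Sketch: the same vocabulary-building pipeline, assembled manually.
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
SentenceTransformer transformer = new SentenceTransformer.Builder()
                .iterator(sentenceIterator) // any SentenceIterator over the corpus
                .tokenizerFactory(tokenizerFactory)
                .build();
AbstractSequenceIterator<VocabWord> sequences = new AbstractSequenceIterator.Builder<>(transformer).build();
VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                .addSource(sequences, 5) // 5 = illustrative minWordFrequency
                .setTargetVocabCache(cache)
                .build();
constructor.buildJointVocabulary(false, true);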
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
In the class SentenceTransformer, the method transformToSequence:
@Override
public Sequence<VocabWord> transformToSequence(String object) {
    Sequence<VocabWord> sequence = new Sequence<>();
    Tokenizer tokenizer = tokenizerFactory.create(object);
    List<String> list = tokenizer.getTokens();
    for (String token : list) {
        // skip null, empty, or whitespace-only tokens
        if (token == null || token.isEmpty() || token.trim().isEmpty())
            continue;
        VocabWord word = new VocabWord(1.0, token);
        sequence.addElement(word);
    }
    sequence.setSequenceId(sentenceCounter.getAndIncrement());
    return sequence;
}
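transformToSequence drops blank tokens and stamps each sequence with a monotonically increasing id. A usage sketch, assuming a configured SentenceIterator (sentenceIterator, hypothetical) and the stock DefaultTokenizerFactory:

// Sketch: turn a single sentence into a Sequence<VocabWord>.
SentenceTransformer transformer = new SentenceTransformer.Builder()
                .iterator(sentenceIterator) // any SentenceIterator over the corpus
                .tokenizerFactory(new DefaultTokenizerFactory())
                .build();
Sequence<VocabWord> sequence = transformer.transformToSequence("First line of the corpus");
for (VocabWord word : sequence.getElements())
    System.out.println(word.getLabel()); // prints each surviving token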
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
In the class VocabularyHolder, the method transferBackToVocabCache:
/**
 * This method is required for compatibility purposes.
 * It transfers the vocabulary from this VocabularyHolder into the given VocabCache.
 *
 * @param cache the target VocabCache
 * @param emptyHolder whether to clear this holder after the transfer
 */
public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) {
    if (!(cache instanceof InMemoryLookupCache))
        throw new IllegalStateException("Sorry, only InMemoryLookupCache is supported.");
    // make sure that Huffman codes are updated before transfer
    //updateHuffmanCodes();
    List<VocabularyWord> words = words();
    for (VocabularyWord word : words) {
        if (word.getWord().isEmpty())
            continue;
        VocabWord vocabWord = new VocabWord(1, word.getWord());
        // if we're transferring the full model, it CAN contain a HistoricalGradient for the AdaptiveGradient feature
        if (word.getHistoricalGradient() != null) {
            INDArray gradient = Nd4j.create(word.getHistoricalGradient());
            vocabWord.setHistoricalGradient(gradient);
        }
        // put the VocabWord into both the Tokens and Vocabs maps
        ((InMemoryLookupCache) cache).getVocabs().put(word.getWord(), vocabWord);
        ((InMemoryLookupCache) cache).getTokens().put(word.getWord(), vocabWord);
        // update Huffman tree information
        if (word.getHuffmanNode() != null) {
            vocabWord.setIndex(word.getHuffmanNode().getIdx());
            vocabWord.setCodeLength(word.getHuffmanNode().getLength());
            vocabWord.setPoints(arrayToList(word.getHuffmanNode().getPoint(), word.getHuffmanNode().getLength()));
            vocabWord.setCodes(arrayToList(word.getHuffmanNode().getCode(), word.getHuffmanNode().getLength()));
            // put the word into the index
            cache.addWordToIndex(word.getHuffmanNode().getIdx(), word.getWord());
        }
        // the > 1 check is required since the VocabCache implementation implies 1 as the base word count, not 0
        if (word.getCount() > 1)
            cache.incrementWordCount(word.getWord(), word.getCount() - 1);
    }
    // at this point it is safe to clear the holder's maps
    if (emptyHolder) {
        idxMap.clear();
        vocabulary.clear();
    }
}
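Since only InMemoryLookupCache is accepted, a call site is straightforward. A minimal sketch, assuming holder is an already-populated VocabularyHolder (hypothetical name):

// Sketch: transfer the holder's vocabulary into a fresh InMemoryLookupCache.
VocabCache cache = new InMemoryLookupCache();
holder.transferBackToVocabCache(cache, true); // true: clear the holder once the transfer completes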
Use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.
In the class ParagraphVectors, the method predictSeveral:
/**
 * Predicts several labels for the given document.
 * Similarity is computed with respect to the mean of the
 * representations of the words in the document.
 *
 * @param rawText the raw text of the document
 * @param limit the maximum number of labels to return
 * @return possible labels, in descending order of similarity
 */
@Deprecated
public Collection<String> predictSeveral(String rawText, int limit) {
    if (tokenizerFactory == null)
        throw new IllegalStateException("TokenizerFactory should be defined prior to the predict() call");
    List<String> tokens = tokenizerFactory.create(rawText).getTokens();
    List<VocabWord> document = new ArrayList<>();
    for (String token : tokens) {
        // keep only tokens that are present in the model's vocabulary
        if (vocab.containsWord(token)) {
            document.add(vocab.wordFor(token));
        }
    }
    return predictSeveral(document, limit);
}
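Although deprecated, the method can still be exercised against a trained model. A hedged sketch, assuming vectors is a ParagraphVectors model trained with labels and with a TokenizerFactory set (hypothetical name):

// Sketch: fetch up to three candidate labels for a raw document.
Collection<String> labels = vectors.predictSeveral("raw text of an unseen document", 3);
for (String label : labels)
    System.out.println(label); // labels arrive in descending order of similarity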