
Example 66 with VocabWord

Use of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project.

From the class ParagraphVectors, method inferVector.

/**
     * This method calculates an inferred vector for the given document.
     *
     * @param document        document to infer a vector for, as a list of in-vocabulary words
     * @param learningRate    initial learning rate used during inference
     * @param minLearningRate minimum learning rate used during inference
     * @param iterations      number of inference iterations
     * @return the inferred paragraph vector
     */
public INDArray inferVector(@NonNull List<VocabWord> document, double learningRate, double minLearningRate, int iterations) {
    SequenceLearningAlgorithm<VocabWord> learner = sequenceLearningAlgorithm;
    if (learner == null) {
        // lazily create the PV-DM learner on first use; double-checked locking guards concurrent callers
        synchronized (this) {
            if (sequenceLearningAlgorithm == null) {
                log.info("Creating new PV-DM learner...");
                learner = new DM<VocabWord>();
                learner.configure(vocab, lookupTable, configuration);
                sequenceLearningAlgorithm = learner;
            } else {
                learner = sequenceLearningAlgorithm;
            }
        }
    }
    if (document.isEmpty())
        throw new ND4JIllegalStateException("Impossible to apply inference to empty list of words");
    // wrap the document into a Sequence with a random throwaway label, then run inference
    Sequence<VocabWord> sequence = new Sequence<>();
    sequence.addElements(document);
    sequence.setSequenceLabel(new VocabWord(1.0, String.valueOf(new Random().nextInt())));
    initLearners();
    INDArray inf = learner.inferSequence(sequence, seed, learningRate, minLearningRate, iterations);
    return inf;
}
Also used: INDArray (org.nd4j.linalg.api.ndarray.INDArray), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException), Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence)
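
For context, a minimal usage sketch follows. It assumes an already trained ParagraphVectors model whose tokenizerFactory and vocab fields are accessible (the same accessors used in the predictSeveral example further below); the helper name and the learning-rate values are illustrative, not part of deeplearning4j.

// Hypothetical helper, assuming access to tokenizerFactory, vocab and the inferVector method above.
public INDArray inferVectorForText(String rawText) {
    List<String> tokens = tokenizerFactory.create(rawText).getTokens();
    List<VocabWord> document = new ArrayList<>();
    for (String token : tokens) {
        // only words already present in the vocabulary can take part in inference
        if (vocab.containsWord(token))
            document.add(vocab.wordFor(token));
    }
    // 0.01 / 0.001 / 5 are arbitrary example values for learningRate, minLearningRate and iterations
    return inferVector(document, 0.01, 0.001, 5);
}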

Example 67 with VocabWord

Use of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project.

From the class BaseTextVectorizer, method buildVocab.

public void buildVocab() {
    if (vocabCache == null)
        vocabCache = new AbstractCache.Builder<VocabWord>().build();

    SentenceTransformer transformer = new SentenceTransformer.Builder()
                    .iterator(this.iterator)
                    .tokenizerFactory(tokenizerFactory)
                    .build();

    AbstractSequenceIterator<VocabWord> iterator = new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(iterator, minWordFrequency)
                    .setTargetVocabCache(vocabCache)
                    .setStopWords(stopWords)
                    .allowParallelTokenization(isParallel)
                    .build();

    constructor.buildJointVocabulary(false, true);
}
Also used: AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator), VocabConstructor (org.deeplearning4j.models.word2vec.wordstore.VocabConstructor), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer), AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)
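
The same vocabulary-construction pipeline can be sketched in isolation, outside of BaseTextVectorizer. The sentenceIterator and tokenizerFactory variables and the minimum-frequency value below are assumptions for illustration; the builder calls mirror the ones in buildVocab above.

// Hedged sketch of the pipeline: SentenceTransformer -> AbstractSequenceIterator -> VocabConstructor.
// `sentenceIterator` and `tokenizerFactory` are assumed to be configured elsewhere.
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

SentenceTransformer transformer = new SentenceTransformer.Builder()
                .iterator(sentenceIterator)
                .tokenizerFactory(tokenizerFactory)
                .build();

AbstractSequenceIterator<VocabWord> sequenceIterator =
                new AbstractSequenceIterator.Builder<>(transformer).build();

// keep only words that occur at least 5 times (arbitrary example threshold)
VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                .addSource(sequenceIterator, 5)
                .setTargetVocabCache(cache)
                .build();

constructor.buildJointVocabulary(false, true);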

Example 68 with VocabWord

Use of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project.

From the class SentenceTransformer, method transformToSequence.

@Override
public Sequence<VocabWord> transformToSequence(String object) {
    Sequence<VocabWord> sequence = new Sequence<>();
    Tokenizer tokenizer = tokenizerFactory.create(object);
    List<String> list = tokenizer.getTokens();
    for (String token : list) {
        if (token == null || token.isEmpty() || token.trim().isEmpty())
            continue;
        VocabWord word = new VocabWord(1.0, token);
        sequence.addElement(word);
    }
    sequence.setSequenceId(sentenceCounter.getAndIncrement());
    return sequence;
}
Also used: VocabWord (org.deeplearning4j.models.word2vec.VocabWord), Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence), Tokenizer (org.deeplearning4j.text.tokenization.tokenizer.Tokenizer)
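
As a usage sketch, the transformer can be applied to a single raw string. The builder chain and the DefaultTokenizerFactory mirror the other examples in this list, while the sample sentence, the sentenceIterator variable, and the accessor names getElements() and getLabel() are assumptions.

// Hedged sketch: turning one raw sentence into a Sequence<VocabWord> via the method above.
// `sentenceIterator` is an assumed, already configured SentenceIterator.
TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
SentenceTransformer transformer = new SentenceTransformer.Builder()
                .iterator(sentenceIterator)
                .tokenizerFactory(tokenizerFactory)
                .build();

Sequence<VocabWord> sequence = transformer.transformToSequence("first line of the corpus");
// getElements() and getLabel() are assumed accessors for the tokens and their text
for (VocabWord word : sequence.getElements())
    System.out.println(word.getLabel());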

Example 69 with VocabWord

Use of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project.

From the class VocabularyHolder, method transferBackToVocabCache.

/**
     * This method is required for compatibility purposes.
     * It transfers the vocabulary from this VocabularyHolder into a VocabCache.
     *
     * @param cache       target VocabCache (currently only InMemoryLookupCache is supported)
     * @param emptyHolder if true, this holder is cleared after the transfer
     */
public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) {
    if (!(cache instanceof InMemoryLookupCache))
        throw new IllegalStateException("Sorry, only InMemoryLookupCache use implemented.");
    // make sure that huffman codes are updated before transfer
    //updateHuffmanCodes();
    List<VocabularyWord> words = words();
    for (VocabularyWord word : words) {
        if (word.getWord().isEmpty())
            continue;
        VocabWord vocabWord = new VocabWord(1, word.getWord());
        // if we're transferring full model, it CAN contain HistoricalGradient for AdaptiveGradient feature
        if (word.getHistoricalGradient() != null) {
            INDArray gradient = Nd4j.create(word.getHistoricalGradient());
            vocabWord.setHistoricalGradient(gradient);
        }
        // put VocabWord into both Tokens and Vocabs maps
        ((InMemoryLookupCache) cache).getVocabs().put(word.getWord(), vocabWord);
        ((InMemoryLookupCache) cache).getTokens().put(word.getWord(), vocabWord);
        // update Huffman tree information
        if (word.getHuffmanNode() != null) {
            vocabWord.setIndex(word.getHuffmanNode().getIdx());
            vocabWord.setCodeLength(word.getHuffmanNode().getLength());
            vocabWord.setPoints(arrayToList(word.getHuffmanNode().getPoint(), word.getHuffmanNode().getLength()));
            vocabWord.setCodes(arrayToList(word.getHuffmanNode().getCode(), word.getHuffmanNode().getLength()));
            // put word into index
            cache.addWordToIndex(word.getHuffmanNode().getIdx(), word.getWord());
        }
        // the > 1 check is required since the VocabCache implementation implies a base word count of 1, not 0
        if (word.getCount() > 1)
            cache.incrementWordCount(word.getWord(), word.getCount() - 1);
    }
    // at this point it is safe to clear the holder's own vocabularies
    if (emptyHolder) {
        idxMap.clear();
        vocabulary.clear();
    }
}
Also used: INDArray (org.nd4j.linalg.api.ndarray.INDArray), VocabWord (org.deeplearning4j.models.word2vec.VocabWord), InMemoryLookupCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache)
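
A short, hedged usage sketch: transfer the holder's contents into a fresh InMemoryLookupCache and empty the holder afterwards. The vocabularyHolder variable, the no-argument InMemoryLookupCache constructor, and the looked-up word are assumptions; containsWord is the same VocabCache accessor used in the predictSeveral example below.

// `vocabularyHolder` is assumed to be an already populated VocabularyHolder.
InMemoryLookupCache cache = new InMemoryLookupCache();
// second argument true clears the holder once the transfer is complete
vocabularyHolder.transferBackToVocabCache(cache, true);

// after the transfer, words can be queried through the VocabCache API
System.out.println(cache.containsWord("day"));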

Example 70 with VocabWord

Use of org.deeplearning4j.models.word2vec.VocabWord in the deeplearning4j project.

From the class ParagraphVectors, method predictSeveral.

/**
     * Predicts several labels for the given document.
     * Similarity is computed against the mean of the
     * representations of the words in the document.
     *
     * @param rawText raw text of the document
     * @param limit   maximum number of labels to return
     * @return possible labels, in descending order of similarity
     */
@Deprecated
public Collection<String> predictSeveral(String rawText, int limit) {
    if (tokenizerFactory == null)
        throw new IllegalStateException("TokenizerFactory should be defined, prior to predict() call");
    List<String> tokens = tokenizerFactory.create(rawText).getTokens();
    List<VocabWord> document = new ArrayList<>();
    for (String token : tokens) {
        if (vocab.containsWord(token)) {
            document.add(vocab.wordFor(token));
        }
    }
    return predictSeveral(document, limit);
}
Also used: ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException), VocabWord (org.deeplearning4j.models.word2vec.VocabWord)
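
To close the loop, a hedged sketch of calling this method: paragraphVectors is assumed to be a ParagraphVectors model trained on labelled documents with a TokenizerFactory configured, and the text and limit are illustrative values.

// Note: predictSeveral is marked @Deprecated in the snippet above.
Collection<String> labels = paragraphVectors.predictSeveral("this is some document text", 3);
for (String label : labels)
    System.out.println(label);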

Aggregations

VocabWord (org.deeplearning4j.models.word2vec.VocabWord): 110
Test (org.junit.Test): 54
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 31
AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache): 26
ClassPathResource (org.datavec.api.util.ClassPathResource): 23
BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator): 22
File (java.io.File): 20
InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable): 19
TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory): 19
ArrayList (java.util.ArrayList): 17
DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory): 17
CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor): 15
SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator): 14
AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator): 13
SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer): 13
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 12
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException): 12
Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence): 11
Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec): 11
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline): 10