Search in sources :

Example 71 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method writeVocabCache.

/**
     * This method saves vocab cache to provided OutputStream.
     * Every element of the cache, taken in index order from 0 to numWords() - 1, is written
     * as the result of its toJSON() call followed by a line separator, encoded as UTF-8.
     * Please note: it saves only vocab content, so it's suitable mostly for BagOfWords/TF-IDF vectorizers
     *
     * The provided stream is always closed when this method returns, whether it succeeds or fails.
     *
     * @param vocabCache vocabulary whose elements will be serialized
     * @param stream destination stream; it is closed by this method
     * @throws IOException if the underlying stream reported an error while writing
     */
public static void writeVocabCache(@NonNull VocabCache<VocabWord> vocabCache, @NonNull OutputStream stream) throws IOException {
    // try-with-resources closes the writer (and the wrapped stream) even if serialization of a word throws
    try (PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(stream, "UTF-8")))) {
        for (int x = 0; x < vocabCache.numWords(); x++) {
            VocabWord word = vocabCache.elementAtIndex(x);
            writer.println(word.toJSON());
        }
        // PrintWriter never throws on I/O failure; checkError() flushes and reports whether one occurred
        if (writer.checkError()) {
            throw new IOException("Failed to write vocab cache to the provided stream");
        }
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord)

Example 72 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method readVocabCache.

/**
     * Builds a vocab cache out of the content of the provided InputStream.
     * The stream is read as UTF-8 text, line by line; every line is passed to
     * VocabWordFactory.deserialize(), and the resulting word is added to the cache as a token
     * and registered in the index under the word's own index and label.
     * Please note: only vocab content is restored, so it's suitable mostly for BagOfWords/TF-IDF vectorizers
     *
     * This method does not close the provided stream.
     *
     * @param stream source of the serialized vocabulary, one element per line
     * @return cache holding every word read from the stream
     * @throws IOException if reading from the stream fails
     */
public static VocabCache<VocabWord> readVocabCache(@NonNull InputStream stream) throws IOException {
    BufferedReader input = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    VocabWordFactory wordFactory = new VocabWordFactory();
    // readLine() returns null once the end of the stream has been reached
    for (String entry = input.readLine(); entry != null; entry = input.readLine()) {
        VocabWord element = wordFactory.deserialize(entry);
        cache.addToken(element);
        cache.addWordToIndex(element.getIndex(), element.getLabel());
    }
    return cache;
}
Also used : VocabWordFactory(org.deeplearning4j.models.sequencevectors.serialization.VocabWordFactory) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)

Example 73 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class InMemoryLookupCache method importVocabulary.

/**
 * Merges the words of another vocab cache into this one.
 * Words whose label is not yet present in vocabs are registered both as token and as vocab word;
 * in every case the word's element frequency is added to the per-word frequency counter
 * and to the total number of word occurrences.
 *
 * @param vocabCache cache whose vocab words should be imported
 */
@Override
public void importVocabulary(VocabCache<VocabWord> vocabCache) {
    for (VocabWord imported : vocabCache.vocabWords()) {
        String label = imported.getLabel();
        // only labels unknown so far need to be registered; known ones just get their counters bumped
        if (!vocabs.containsKey(label)) {
            tokens.put(label, imported);
            vocabs.put(label, imported);
        }
        wordFrequencies.incrementCount(label, imported.getElementFrequency());
        totalWordOccurrences.addAndGet((long) imported.getElementFrequency());
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord)

Example 74 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class InMemoryLookupCache method putVocabWord.

/**
     * Promotes a word that is already known as a token to a vocab word.
     * The words "STOP" and "UNK" are silently ignored.
     *
     * @param word label of the token to register as a vocab word; must not be null or empty
     * @throws IllegalArgumentException if the word is null or empty
     * @throws IllegalStateException if no token exists for the word
     */
@Override
@Deprecated
public synchronized void putVocabWord(String word) {
    if (word == null || word.isEmpty()) {
        throw new IllegalArgumentException("Word can't be empty or null");
    }
    // STOP and UNK are not added as tokens
    if ("STOP".equals(word) || "UNK".equals(word)) {
        return;
    }
    VocabWord existing = tokenFor(word);
    if (existing == null) {
        throw new IllegalStateException("Word " + word + " not found as token in vocab");
    }
    addWordToIndex(existing.getIndex(), word);
    if (!hasToken(word)) {
        throw new IllegalStateException("Unable to add token " + word + " when not already a token");
    }
    vocabs.put(word, existing);
    wordIndex.add(word, existing.getIndex());
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord)

Example 75 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

the class InMemoryLookupCache method addWordToIndex.

/**
     * Registers a word in the index at the given position.
     * If the word is not known as a token yet, a new token with frequency 1.0 is created for it.
     * If the word is not a vocab word yet, its token gets the given index assigned and is stored in vocabs.
     *
     * @param index position the word should take in the index
     * @param word label to register; must not be null or empty
     * @throws IllegalArgumentException if the word is null or empty
     */
@Override
public synchronized void addWordToIndex(int index, String word) {
    if (word == null || word.isEmpty()) {
        throw new IllegalArgumentException("Word can't be empty or null");
    }
    if (!tokens.containsKey(word)) {
        VocabWord token = new VocabWord(1.0, word);
        tokens.put(word, token);
        wordFrequencies.incrementCount(word, 1.0);
    }
    /*
            If we're speaking about adding any word to index directly, it means it's going to be vocab word, not token
         */
    if (!vocabs.containsKey(word)) {
        VocabWord vw = tokenFor(word);
        // the index is assigned once; the original repeated this call after the put with the same value
        vw.setIndex(index);
        vocabs.put(word, vw);
    }
    if (!wordFrequencies.containsKey(word)) {
        wordFrequencies.incrementCount(word, 1);
    }
    wordIndex.add(word, index);
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord)

Aggregations

VocabWord (org.deeplearning4j.models.word2vec.VocabWord)110 Test (org.junit.Test)54 INDArray (org.nd4j.linalg.api.ndarray.INDArray)31 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)26 ClassPathResource (org.datavec.api.util.ClassPathResource)23 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)22 File (java.io.File)20 InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable)19 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)19 ArrayList (java.util.ArrayList)17 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)17 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)15 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)14 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)13 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)13 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)12 ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException)12 Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence)11 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)11 TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline)10