Search in sources:

Example 36 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

In class AbstractCacheTest, method testNumWords.

/**
 * Adds two tokens with different labels to a freshly built cache and checks
 * that {@code numWords()} reports both of them.
 */
@Test
public void testNumWords() throws Exception {
    final AbstractCache<VocabWord> vocabulary = new AbstractCache.Builder<VocabWord>().build();
    final String[] labels = {"word", "test"};
    for (String label : labels) {
        vocabulary.addToken(new VocabWord(1.0, label));
    }
    assertEquals(2, vocabulary.numWords());
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) Test(org.junit.Test)

Example 37 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

In class TokenizerFunction, method call.

/**
 * Tokenizes one input string and wraps every non-empty token in a
 * {@link VocabWord} with an initial frequency of 1.0.
 *
 * @param s the text to tokenize
 * @return a sequence holding one element per non-null, non-empty token, in tokenizer order
 * @throws Exception if creating the tokenizer factory or tokenizing fails
 */
@Override
public Sequence<VocabWord> call(String s) throws Exception {
    // The factory is created on first use.
    if (tokenizerFactory == null) {
        instantiateTokenizerFactory();
    }
    final List<String> pieces = tokenizerFactory.create(s).getTokens();
    final Sequence<VocabWord> result = new Sequence<>();
    for (String piece : pieces) {
        if (piece != null && !piece.isEmpty()) {
            result.addElement(new VocabWord(1.0, piece));
        }
    }
    return result;
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) Sequence(org.deeplearning4j.models.sequencevectors.sequence.Sequence)

Example 38 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

In class Glove, method train.

/**
 * Trains GloVe embeddings on a corpus held in a Spark RDD.
 *
 * <p>Steps, as performed by this method:
 * <ol>
 *   <li>read the training parameters from the {@code SparkConf} of the context behind {@code rdd};</li>
 *   <li>run a {@code TextPipeline} over the corpus to build the vocabulary cache;</li>
 *   <li>build and reset a {@code GloveWeightLookupTable} for that vocabulary;</li>
 *   <li>compute co-occurrence counts with Spark, fold them into a single {@code CounterMap},
 *       cap every count at the table's {@code maxCount}, and re-parallelize the resulting triples;</li>
 *   <li>for each iteration: map every word pair to a {@code GloveChange}, collect the changes,
 *       apply them to the lookup table, and reshuffle the pairs.</li>
 * </ol>
 *
 * <p>The fold, both {@code collect()} calls per iteration and the shuffle materialize their
 * data in the JVM running this method.
 *
 * @param rdd the corpus, passed as-is to {@code TextPipeline}
 * @return the vocabulary cache paired with the trained lookup table
 * @throws Exception declared by the signature; not thrown explicitly in this method
 */
public Pair<VocabCache<VocabWord>, GloveWeightLookupTable> train(JavaRDD<String> rdd) throws Exception {
    // Parameters are re-read from the Spark configuration on every call, so each
    // `train()` can use different parameters.
    final JavaSparkContext sc = new JavaSparkContext(rdd.context());
    final SparkConf conf = sc.getConf();
    // NOTE(review): vectorLength, useAdaGrad, negative, window, alpha and minAlpha are read
    // here but never referenced again in this method. The lookup table below reads its
    // settings through the GlovePerformer.* keys, and the co-occurrence step uses
    // `windowSize`, not `window` - confirm whether that is intended.
    final int vectorLength = assignVar(VECTOR_LENGTH, conf, Integer.class);
    final boolean useAdaGrad = assignVar(ADAGRAD, conf, Boolean.class);
    final double negative = assignVar(NEGATIVE, conf, Double.class);
    final int numWords = assignVar(NUM_WORDS, conf, Integer.class);
    final int window = assignVar(WINDOW, conf, Integer.class);
    final double alpha = assignVar(ALPHA, conf, Double.class);
    final double minAlpha = assignVar(MIN_ALPHA, conf, Double.class);
    final int iterations = assignVar(ITERATIONS, conf, Integer.class);
    final int nGrams = assignVar(N_GRAMS, conf, Integer.class);
    final String tokenizer = assignVar(TOKENIZER, conf, String.class);
    final String tokenPreprocessor = assignVar(TOKEN_PREPROCESSOR, conf, String.class);
    final boolean removeStop = assignVar(REMOVE_STOPWORDS, conf, Boolean.class);
    // Tokenizer settings are packed into a map and broadcast for TextPipeline.
    Map<String, Object> tokenizerVarMap = new HashMap<String, Object>() {

        {
            put("numWords", numWords);
            put("nGrams", nGrams);
            put("tokenizer", tokenizer);
            put("tokenPreprocessor", tokenPreprocessor);
            put("removeStop", removeStop);
        }
    };
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(tokenizerVarMap);
    TextPipeline pipeline = new TextPipeline(rdd, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    // Total word count as reported by the pipeline. In this method it is only carried
    // inside vocabAndNumWords; its second element is never read here.
    Long totalWordCount = pipeline.getTotalWordCount();
    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
    JavaRDD<Pair<List<String>, AtomicLong>> sentenceWordsCountRDD = pipeline.getSentenceWordsCountRDD();
    final Pair<VocabCache<VocabWord>, Long> vocabAndNumWords = new Pair<>(vocabCache, totalWordCount);
    // vocabCacheBroadcast is not declared in this method; it is assigned here and then
    // read by the Spark functions below.
    vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());
    // The table is configured from the GlovePerformer.* keys, with the literal defaults
    // given inline (lr 0.01, maxCount 100, vectorLength 300, xMax 0.75).
    final GloveWeightLookupTable gloveWeightLookupTable = new GloveWeightLookupTable.Builder().cache(vocabAndNumWords.getFirst()).lr(conf.getDouble(GlovePerformer.ALPHA, 0.01)).maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100)).vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300)).xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75)).build();
    gloveWeightLookupTable.resetWeights();
    // Start both AdaGrad histories at one: the bias history gets one entry per row of
    // syn0, the weight history gets the same shape as syn0.
    gloveWeightLookupTable.getBiasAdaGrad().historicalGradient = Nd4j.ones(gloveWeightLookupTable.getSyn0().rows());
    gloveWeightLookupTable.getWeightAdaGrad().historicalGradient = Nd4j.ones(gloveWeightLookupTable.getSyn0().shape());
    log.info("Created lookup table of size " + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));
    // Per-sentence co-occurrence maps are folded into one CounterMap.
    // `symmetric` and `windowSize` are not declared in this method.
    CounterMap<String, String> coOccurrenceCounts = sentenceWordsCountRDD.map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize)).fold(new CounterMap<String, String>(), new CoOccurrenceCounts());
    Iterator<Pair<String, String>> pair2 = coOccurrenceCounts.getPairIterator();
    List<Triple<String, String, Double>> counts = new ArrayList<>();
    // Cap every count at maxCount and flatten the map into (word1, word2, count) triples.
    while (pair2.hasNext()) {
        Pair<String, String> next = pair2.next();
        if (coOccurrenceCounts.getCount(next.getFirst(), next.getSecond()) > gloveWeightLookupTable.getMaxCount()) {
            coOccurrenceCounts.setCount(next.getFirst(), next.getSecond(), gloveWeightLookupTable.getMaxCount());
        }
        counts.add(new Triple<>(next.getFirst(), next.getSecond(), coOccurrenceCounts.getCount(next.getFirst(), next.getSecond())));
    }
    log.info("Calculated co occurrences");
    // Put the capped triples back into Spark, keyed by the first word.
    JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
    JavaPairRDD<String, Tuple2<String, Double>> pairs = parallel.mapToPair(new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {

        @Override
        public Tuple2<String, Tuple2<String, Double>> call(Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
            return new Tuple2<>(stringStringDoubleTriple.getFirst(), new Tuple2<>(stringStringDoubleTriple.getSecond(), stringStringDoubleTriple.getThird()));
        }
    });
    // Replace both word strings with their VocabWord entries from the broadcast vocabulary.
    JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab = pairs.mapToPair(new PairFunction<Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {

        @Override
        public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
            VocabWord w1 = vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._1());
            VocabWord w2 = vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._2()._1());
            return new Tuple2<>(w1, new Tuple2<>(w2, stringTuple2Tuple2._2()._2()));
        }
    });
    // Training loop: each pass turns every pair into a GloveChange, collects the changes,
    // and applies them to gloveWeightLookupTable one by one.
    for (int i = 0; i < iterations; i++) {
        JavaRDD<GloveChange> change = pairsVocab.map(new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {

            @Override
            public GloveChange call(Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2) throws Exception {
                VocabWord w1 = vocabWordTuple2Tuple2._1();
                VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                INDArray bias = gloveWeightLookupTable.getBias();
                // The (capped) co-occurrence count of the pair.
                double score = vocabWordTuple2Tuple2._2()._2();
                double xMax = gloveWeightLookupTable.getxMax();
                double maxCount = gloveWeightLookupTable.getMaxCount();
                // prediction = dot(w1, w2) + bias[w1] + bias[w2]
                double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());
                // weight = min(1, score / maxCount) ^ xMax
                // NOTE(review): xMax is used as the exponent and maxCount as the cutoff here,
                // yet the next line compares score against xMax - confirm the naming and
                // whether that comparison is intended.
                double weight = FastMath.pow(Math.min(1.0, (score / maxCount)), xMax);
                double fDiff = score > xMax ? prediction : weight * (prediction - Math.log(score));
                // A NaN difference is replaced by a small constant.
                if (Double.isNaN(fDiff))
                    fDiff = Nd4j.EPS_THRESHOLD;
                // The difference is passed to update() as the gradient for both words.
                double gradient = fDiff;
                Pair<INDArray, Double> w1Update = update(gloveWeightLookupTable.getWeightAdaGrad(), gloveWeightLookupTable.getBiasAdaGrad(), gloveWeightLookupTable.getSyn0(), gloveWeightLookupTable.getBias(), w1, w1Vector, w2Vector, gradient);
                Pair<INDArray, Double> w2Update = update(gloveWeightLookupTable.getWeightAdaGrad(), gloveWeightLookupTable.getBiasAdaGrad(), gloveWeightLookupTable.getSyn0(), gloveWeightLookupTable.getBias(), w2, w2Vector, w1Vector, gradient);
                // NOTE(review): the last two arguments pass the bias history for w2 and then
                // w1, while every preceding pair is ordered w1 then w2 - verify against the
                // GloveChange constructor.
                return new GloveChange(w1, w2, w1Update.getFirst(), w2Update.getFirst(), w1Update.getSecond(), w2Update.getSecond(), fDiff, gloveWeightLookupTable.getWeightAdaGrad().getHistoricalGradient().slice(w1.getIndex()), gloveWeightLookupTable.getWeightAdaGrad().getHistoricalGradient().slice(w2.getIndex()), gloveWeightLookupTable.getBiasAdaGrad().getHistoricalGradient().getDouble(w2.getIndex()), gloveWeightLookupTable.getBiasAdaGrad().getHistoricalGradient().getDouble(w1.getIndex()));
            }
        });
        // Collect all changes and apply them sequentially; `error` is the sum of the
        // per-pair errors for this iteration.
        List<GloveChange> gloveChanges = change.collect();
        double error = 0.0;
        for (GloveChange change2 : gloveChanges) {
            change2.apply(gloveWeightLookupTable);
            error += change2.getError();
        }
        // Reshuffle the training pairs for the next iteration by collecting, shuffling and
        // re-parallelizing them. NOTE(review): `List l` is a raw type.
        List l = pairsVocab.collect();
        Collections.shuffle(l);
        pairsVocab = sc.parallelizePairs(l);
        log.info("Error at iteration " + i + " was " + error);
    }
    return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
}
Also used : CoOccurrenceCounts(org.deeplearning4j.spark.models.embeddings.glove.cooccurrences.CoOccurrenceCounts) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) Triple(org.deeplearning4j.berkeley.Triple) VocabCache(org.deeplearning4j.models.word2vec.wordstore.VocabCache) AtomicLong(java.util.concurrent.atomic.AtomicLong) CounterMap(org.deeplearning4j.berkeley.CounterMap) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) CoOccurrenceCalculator(org.deeplearning4j.spark.models.embeddings.glove.cooccurrences.CoOccurrenceCalculator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Pair(org.deeplearning4j.berkeley.Pair) GloveWeightLookupTable(org.deeplearning4j.models.glove.GloveWeightLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Tuple2(scala.Tuple2) SparkConf(org.apache.spark.SparkConf)

Example 39 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

In class CoOccurrenceCalculator, method call.

/**
 * Computes distance-weighted co-occurrence counts for one sentence.
 *
 * <p>For every in-vocabulary word at position {@code i}, each in-vocabulary word at a later
 * position {@code j} inside the window (at most {@code windowSize} positions ahead)
 * contributes {@code 1 / (j - i + Nd4j.EPS_THRESHOLD)}. The pair is keyed with the word of
 * the lower vocabulary index first. When {@code symmetric} is set, the reversed pair
 * receives the same increment. Tokens whose vocabulary index is negative are skipped, and
 * so are pairs whose two tokens map to the same vocabulary index.
 *
 * @param pair the sentence tokens (first element); the second element is not used here
 * @return the co-occurrence counts contributed by this sentence
 * @throws Exception declared by the interface; not thrown explicitly here
 */
@Override
public CounterMap<String, String> call(Pair<List<String>, AtomicLong> pair) throws Exception {
    List<String> sentence = pair.getFirst();
    CounterMap<String, String> coOccurrenceCounts = new CounterMap<>();
    VocabCache vocab = this.vocab.value();
    for (int i = 0; i < sentence.size(); i++) {
        String centerWord = sentence.get(i);
        int wordIdx = vocab.indexOf(centerWord);
        // Check the index before touching the vocabulary entry. Dereferencing
        // vocab.wordFor(...) ahead of this guard can fail for tokens missing from the
        // vocabulary (presumably wordFor yields null for them - verify).
        if (wordIdx < 0)
            continue;
        int windowStop = Math.min(i + windowSize + 1, sentence.size());
        // Start at i + 1: position i would always be discarded by the same-index check.
        for (int j = i + 1; j < windowStop; j++) {
            String contextWord = sentence.get(j);
            int otherWord = vocab.indexOf(contextWord);
            if (otherWord < 0 || otherWord == wordIdx)
                continue;
            double increment = 1.0 / (j - i + Nd4j.EPS_THRESHOLD);
            // The word with the lower vocabulary index is always the primary key.
            boolean centerFirst = wordIdx < otherWord;
            String primary = centerFirst ? centerWord : contextWord;
            String secondary = centerFirst ? contextWord : centerWord;
            coOccurrenceCounts.incrementCount(primary, secondary, increment);
            if (symmetric)
                coOccurrenceCounts.incrementCount(secondary, primary, increment);
        }
    }
    return coOccurrenceCounts;
}
Also used : CounterMap(org.deeplearning4j.berkeley.CounterMap) VocabCache(org.deeplearning4j.models.word2vec.wordstore.VocabCache) VocabWord(org.deeplearning4j.models.word2vec.VocabWord)

Example 40 with VocabWord

use of org.deeplearning4j.models.word2vec.VocabWord in project deeplearning4j by deeplearning4j.

In class FirstIterationFunctionAdapter, method trainSentence.

/**
 * Runs one skip-gram pass over a sentence.
 *
 * <p>For every position the random state is advanced and a value {@code b} in
 * {@code [0, window)} is drawn from it. {@code skipGram} is then invoked for that position
 * unless the word there is {@code null}. The random state is advanced for null entries too.
 *
 * @param vocabWordsList the words of the sentence; a {@code null} or empty list is a no-op
 * @param currentSentenceAlpha the value passed to {@code skipGram} as its last argument
 */
public void trainSentence(List<VocabWord> vocabWordsList, double currentSentenceAlpha) {
    if (vocabWordsList == null || vocabWordsList.isEmpty()) {
        return;
    }
    for (int ithWordInSentence = 0; ithWordInSentence < vocabWordsList.size(); ithWordInSentence++) {
        // Advance the linear congruential state.
        nextRandom.set(Math.abs(nextRandom.get() * 25214903917L + 11));
        // Take the modulo in long arithmetic and only then narrow to int. Casting first, as
        // in `(int) (long) x % window`, truncates the state to 32 bits, which can be
        // negative and so yield a negative b. floorMod also keeps b non-negative when
        // Math.abs overflows on Long.MIN_VALUE.
        final long randomState = nextRandom.get();
        int b = (int) Math.floorMod(randomState, (long) window);
        VocabWord currentWord = vocabWordsList.get(ithWordInSentence);
        if (currentWord != null) {
            skipGram(ithWordInSentence, vocabWordsList, b, currentSentenceAlpha);
        }
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord)

Aggregations

VocabWord (org.deeplearning4j.models.word2vec.VocabWord)110 Test (org.junit.Test)54 INDArray (org.nd4j.linalg.api.ndarray.INDArray)31 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)26 ClassPathResource (org.datavec.api.util.ClassPathResource)23 BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator)22 File (java.io.File)20 InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable)19 TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory)19 ArrayList (java.util.ArrayList)17 DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory)17 CommonPreprocessor (org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor)15 SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator)14 AbstractSequenceIterator (org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator)13 SentenceTransformer (org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer)13 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)12 ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException)12 Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence)11 Word2Vec (org.deeplearning4j.models.word2vec.Word2Vec)11 TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline)10