Search in sources :

Example 6 with Huffman

use of org.deeplearning4j.models.word2vec.Huffman in project deeplearning4j by deeplearning4j.

From the class BinaryCoOccurrenceReaderTest, the method testHasMoreObjects1.

@Test
public void testHasMoreObjects1() throws Exception {
    // Temp file backing the co-occurrence write/read round trip; cleaned up on JVM exit.
    File tempFile = File.createTempFile("tmp", "tmp");
    tempFile.deleteOnExit();

    // Build a minimal vocabulary of three words so co-occurrence pairs can reference them.
    VocabCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
    VocabWord word1 = new VocabWord(1.0, "human");
    VocabWord word2 = new VocabWord(2.0, "animal");
    VocabWord word3 = new VocabWord(3.0, "unknown");
    vocabCache.addToken(word1);
    vocabCache.addToken(word2);
    vocabCache.addToken(word3);

    // Huffman indexing is applied before writing so element indexes are stable for the reader.
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();
    huffman.applyIndexes(vocabCache);

    // Write two weighted co-occurrence pairs to the temp file.
    BinaryCoOccurrenceWriter<VocabWord> writer = new BinaryCoOccurrenceWriter<>(tempFile);
    CoOccurrenceWeight<VocabWord> object1 = new CoOccurrenceWeight<>();
    object1.setElement1(word1);
    object1.setElement2(word2);
    object1.setWeight(3.14159265);
    writer.writeObject(object1);

    CoOccurrenceWeight<VocabWord> object2 = new CoOccurrenceWeight<>();
    object2.setElement1(word2);
    object2.setElement2(word3);
    object2.setWeight(0.197);
    writer.writeObject(object2);
    writer.finish();

    // Read both objects back; each must be non-null.
    // NOTE(review): despite the test name, hasMoreObjects() is never invoked here —
    // the test only verifies that two sequential nextObject() calls succeed. TODO confirm intent.
    BinaryCoOccurrenceReader<VocabWord> reader = new BinaryCoOccurrenceReader<>(tempFile, vocabCache, null);
    CoOccurrenceWeight<VocabWord> r1 = reader.nextObject();
    // Parameterized logging avoids eager string concatenation when the level is disabled.
    log.info("Object received: {}", r1);
    assertNotEquals(null, r1);

    r1 = reader.nextObject();
    log.info("Object received: {}", r1);
    assertNotEquals(null, r1);
}
Also used : Huffman(org.deeplearning4j.models.word2vec.Huffman) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) File(java.io.File) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) Test(org.junit.Test)

Example 7 with Huffman

use of org.deeplearning4j.models.word2vec.Huffman in project deeplearning4j by deeplearning4j.

From the class AbstractCacheTest, the method testHuffman.

@Test
public void testHuffman() throws Exception {
    // Populate a fresh cache with three tokens of strictly increasing frequency.
    AbstractCache<VocabWord> vocab = new AbstractCache.Builder<VocabWord>().build();
    double frequency = 1.0;
    for (String token : new String[] {"word", "test", "tester"}) {
        vocab.addToken(new VocabWord(frequency, token));
        frequency += 1.0;
    }
    assertEquals(3, vocab.numWords());

    // Build the Huffman tree and push the resulting indexes back into the cache.
    Huffman tree = new Huffman(vocab.tokens());
    tree.build();
    tree.applyIndexes(vocab);

    // Higher-frequency words must receive lower indexes: tester(3.0) -> 0, test(2.0) -> 1, word(1.0) -> 2.
    assertEquals("tester", vocab.wordAtIndex(0));
    assertEquals("test", vocab.wordAtIndex(1));
    assertEquals("word", vocab.wordAtIndex(2));

    // The token object itself carries the assigned index as well.
    VocabWord mostFrequent = vocab.tokenFor("tester");
    assertEquals(0, mostFrequent.getIndex());
}
Also used : Huffman(org.deeplearning4j.models.word2vec.Huffman) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) Test(org.junit.Test)

Example 8 with Huffman

use of org.deeplearning4j.models.word2vec.Huffman in project deeplearning4j by deeplearning4j.

From the class TextPipelineTest, the method testSyn0AfterFirstIteration.

@Test
public void testSyn0AfterFirstIteration() throws Exception {
    // Set up a Spark context and an RDD over the test corpus.
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    //  word2vec.setRemoveStop(false);
    // Broadcast the tokenizer configuration so executors can tokenize consistently.
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
    // Huffman tree is built over the vocabulary here.
    // NOTE(review): unlike the other examples, applyIndexes(vocabCache) is NOT called after
    // build() — confirm whether buildVocabCache() already applied indexes internally.
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();
    // Get total word count and put into word2vec variable map
    Map<String, Object> word2vecVarMap = word2vec.getWord2vecVarMap();
    word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount());
    double[] expTable = word2vec.getExpTable();
    // Pair each sentence's word list with its cumulative word-count offset.
    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
    // Broadcast shared state (config map and precomputed exp table) to executors.
    Broadcast<Map<String, Object>> word2vecVarMapBroadcast = sc.broadcast(word2vecVarMap);
    Broadcast<double[]> expTableBroadcast = sc.broadcast(expTable);
    // Run the first word2vec training iteration over each partition, mapping results
    // to (VocabWord, syn0 vector) pairs.
    FirstIterationFunction firstIterationFunction = new FirstIterationFunction(word2vecVarMapBroadcast, expTableBroadcast, pipeline.getBroadCastVocabCache());
    JavaRDD<Pair<VocabWord, INDArray>> pointSyn0Vec = vocabWordListSentenceCumSumRDD.mapPartitions(firstIterationFunction).map(new MapToPairFunction());
    // NOTE(review): the test makes no assertions and never materializes pointSyn0Vec
    // (Spark transformations are lazy), so the iteration may never actually execute —
    // consider adding an action (e.g. count()) plus assertions. TODO confirm.
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) MapToPairFunction(org.deeplearning4j.spark.models.embeddings.word2vec.MapToPairFunction) FirstIterationFunction(org.deeplearning4j.spark.models.embeddings.word2vec.FirstIterationFunction) CountCumSum(org.deeplearning4j.spark.text.functions.CountCumSum) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Pair(org.deeplearning4j.berkeley.Pair) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) AtomicLong(java.util.concurrent.atomic.AtomicLong) Huffman(org.deeplearning4j.models.word2vec.Huffman) AtomicLong(java.util.concurrent.atomic.AtomicLong) Test(org.junit.Test)

Example 9 with Huffman

use of org.deeplearning4j.models.word2vec.Huffman in project deeplearning4j by deeplearning4j.

From the class TextPipeline, the method buildVocabCache.

public void buildVocabCache() {
    // Split the raw corpus into token lists.
    JavaRDD<List<String>> tokens = tokenize();
    // Drive the word-frequency accumulator and cache the per-sentence counts RDD,
    // since it is reused later in the pipeline.
    sentenceWordsCountRDD = updateAndReturnAccumulatorVal(tokens).cache();
    // Pull the accumulated frequencies back to the driver and fold the surviving
    // words (those above the minimum count) into the vocab cache.
    Counter<String> frequencies = wordFreqAcc.value();
    filterMinWordAddVocab(frequencies);
    // The Huffman tree must be constructed and its indexes applied BEFORE the
    // vocab is broadcast, so executors see the final word indexes.
    Huffman huffmanTree = new Huffman(vocabCache.vocabWords());
    huffmanTree.build();
    huffmanTree.applyIndexes(vocabCache);
    // Vocab is now complete — ship it to the executors.
    vocabCacheBroadcast = sc.broadcast(vocabCache);
}
Also used : Huffman(org.deeplearning4j.models.word2vec.Huffman) ArrayList(java.util.ArrayList) List(java.util.List)

Aggregations

Huffman (org.deeplearning4j.models.word2vec.Huffman)9 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)5 Test (org.junit.Test)5 AbstractCache (org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache)4 File (java.io.File)2 AtomicLong (java.util.concurrent.atomic.AtomicLong)2 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)2 TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline)2 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Pair (org.deeplearning4j.berkeley.Pair)1 ShallowSequenceElement (org.deeplearning4j.models.sequencevectors.sequence.ShallowSequenceElement)1 FirstIterationFunction (org.deeplearning4j.spark.models.embeddings.word2vec.FirstIterationFunction)1 MapToPairFunction (org.deeplearning4j.spark.models.embeddings.word2vec.MapToPairFunction)1 CountCumSum (org.deeplearning4j.spark.text.functions.CountCumSum)1