Usage example of org.deeplearning4j.models.word2vec.Huffman from the deeplearning4j project.
From class BinaryCoOccurrenceReaderTest, method testHasMoreObjects1:
@Test
public void testHasMoreObjects1() throws Exception {
    // Scratch file for the co-occurrence records; removed when the JVM exits.
    File file = File.createTempFile("tmp", "tmp");
    file.deleteOnExit();

    // Build a tiny three-word vocabulary.
    VocabCache<VocabWord> vocab = new AbstractCache.Builder<VocabWord>().build();
    VocabWord human = new VocabWord(1.0, "human");
    VocabWord animal = new VocabWord(2.0, "animal");
    VocabWord unknown = new VocabWord(3.0, "unknown");
    vocab.addToken(human);
    vocab.addToken(animal);
    vocab.addToken(unknown);

    // Assign Huffman-tree indexes to the vocabulary before writing pairs.
    Huffman huffman = new Huffman(vocab.vocabWords());
    huffman.build();
    huffman.applyIndexes(vocab);

    // Persist two weighted co-occurrence pairs to the temp file.
    BinaryCoOccurrenceWriter<VocabWord> writer = new BinaryCoOccurrenceWriter<>(file);

    CoOccurrenceWeight<VocabWord> firstPair = new CoOccurrenceWeight<>();
    firstPair.setElement1(human);
    firstPair.setElement2(animal);
    firstPair.setWeight(3.14159265);
    writer.writeObject(firstPair);

    CoOccurrenceWeight<VocabWord> secondPair = new CoOccurrenceWeight<>();
    secondPair.setElement1(animal);
    secondPair.setElement2(unknown);
    secondPair.setWeight(0.197);
    writer.writeObject(secondPair);

    writer.finish();

    // Read the pairs back; both reads should yield non-null objects.
    BinaryCoOccurrenceReader<VocabWord> reader = new BinaryCoOccurrenceReader<>(file, vocab, null);

    CoOccurrenceWeight<VocabWord> restored = reader.nextObject();
    log.info("Object received: " + restored);
    assertNotEquals(null, restored);

    restored = reader.nextObject();
    log.info("Object received: " + restored);
    assertNotEquals(null, restored);
}
Usage example of org.deeplearning4j.models.word2vec.Huffman from the deeplearning4j project.
From class AbstractCacheTest, method testHuffman:
@Test
public void testHuffman() throws Exception {
    // Vocabulary of three tokens with increasing frequencies.
    AbstractCache<VocabWord> vocab = new AbstractCache.Builder<VocabWord>().build();
    vocab.addToken(new VocabWord(1.0, "word"));
    vocab.addToken(new VocabWord(2.0, "test"));
    vocab.addToken(new VocabWord(3.0, "tester"));
    assertEquals(3, vocab.numWords());

    // Build the Huffman tree and re-index the cache by descending frequency.
    Huffman tree = new Huffman(vocab.tokens());
    tree.build();
    tree.applyIndexes(vocab);

    // Most frequent token gets index 0, least frequent gets the highest index.
    assertEquals("tester", vocab.wordAtIndex(0));
    assertEquals("test", vocab.wordAtIndex(1));
    assertEquals("word", vocab.wordAtIndex(2));

    VocabWord mostFrequent = vocab.tokenFor("tester");
    assertEquals(0, mostFrequent.getIndex());
}
Usage example of org.deeplearning4j.models.word2vec.Huffman from the deeplearning4j project.
From class TextPipelineTest, method testSyn0AfterFirstIteration:
@Test
public void testSyn0AfterFirstIteration() throws Exception {
// Spark context and the corpus RDD come from test-fixture helpers.
JavaSparkContext sc = getContext();
JavaRDD<String> corpusRDD = getCorpusRDD(sc);
// word2vec.setRemoveStop(false);
// Broadcast tokenizer configuration so executors can tokenize locally.
Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
// Build the vocabulary, then the per-sentence VocabWord list RDD.
pipeline.buildVocabCache();
pipeline.buildVocabWordListRDD();
VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
// Build the Huffman tree over the vocabulary.
// NOTE(review): unlike the other Huffman usages, applyIndexes(vocabCache) is
// not called here — presumably buildVocabCache() already applied indexes;
// confirm against TextPipeline.
Huffman huffman = new Huffman(vocabCache.vocabWords());
huffman.build();
// Get total word count and put into word2vec variable map
Map<String, Object> word2vecVarMap = word2vec.getWord2vecVarMap();
word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount());
double[] expTable = word2vec.getExpTable();
// Per-sentence word counts and the tokenized sentences themselves.
JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
// Cumulative sum of sentence counts, zipped 1:1 with the sentence RDD so each
// sentence carries its running word offset into the corpus.
CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
// Broadcast the training parameters and the precomputed exp table to executors.
Broadcast<Map<String, Object>> word2vecVarMapBroadcast = sc.broadcast(word2vecVarMap);
Broadcast<double[]> expTableBroadcast = sc.broadcast(expTable);
// Run the first training iteration per partition and map to (word, syn0 vector) pairs.
// NOTE(review): pointSyn0Vec is never asserted on in this visible span — the
// pipeline is only triggered lazily, if at all; verify the test's intent.
FirstIterationFunction firstIterationFunction = new FirstIterationFunction(word2vecVarMapBroadcast, expTableBroadcast, pipeline.getBroadCastVocabCache());
JavaRDD<Pair<VocabWord, INDArray>> pointSyn0Vec = vocabWordListSentenceCumSumRDD.mapPartitions(firstIterationFunction).map(new MapToPairFunction());
}
Usage example of org.deeplearning4j.models.word2vec.Huffman from the deeplearning4j project.
From class TextPipeline, method buildVocabCache:
/**
 * Builds the vocabulary cache from the corpus: tokenizes, counts word
 * frequencies, filters rare words, applies Huffman indexes, and finally
 * broadcasts the completed vocab cache to the cluster.
 */
public void buildVocabCache() {
    // Tokenize the corpus into lists of string tokens.
    JavaRDD<List<String>> tokens = tokenize();

    // Accumulate word frequencies while mapping to per-sentence counts;
    // cached because it is consumed again downstream.
    sentenceWordsCountRDD = updateAndReturnAccumulatorVal(tokens).cache();

    // Pull the aggregated frequencies out of the accumulator.
    Counter<String> frequencies = wordFreqAcc.value();

    // Drop words below the minimum count and populate the vocab cache.
    filterMinWordAddVocab(frequencies);

    // huffman tree should be built BEFORE vocab broadcast
    Huffman huffmanTree = new Huffman(vocabCache.vocabWords());
    huffmanTree.build();
    huffmanTree.applyIndexes(vocabCache);

    // Vocab cache is now complete — ship it to the executors.
    vocabCacheBroadcast = sc.broadcast(vocabCache);
}
End of aggregated usage examples.