Use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.
From the class TextPipelineTest, method testCountCumSum.
@Test
public void testCountCumSum() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
    List<Long> sentenceCountCumSumList = sentenceCountCumSumRDD.collect();

    assertTrue(sentenceCountCumSumList.get(0) == 6L);
    assertTrue(sentenceCountCumSumList.get(1) == 9L);

    sc.stop();
}
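For context, CountCumSum turns per-sentence token counts into a running total, which downstream code can use as per-sentence offsets into the corpus. A minimal plain-Java sketch of that computation, assuming per-sentence counts of 6 and 3 (inferred from the asserted cumulative sums, not read from the pipeline internals):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CumSumSketch {
    public static void main(String[] args) {
        // Assumed per-sentence token counts, inferred from the assertions above
        List<Long> sentenceCounts = Arrays.asList(6L, 3L);
        List<Long> cumSum = new ArrayList<>();
        long running = 0L;
        for (long count : sentenceCounts) {
            running += count; // each entry is the token total up to and including this sentence
            cumSum.add(running);
        }
        System.out.println(cumSum); // prints [6, 9], matching the assertions
    }
}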
Use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.
From the class TextPipelineTest, method testHuffman.
@Test
public void testHuffman() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();

    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();
    huffman.applyIndexes(vocabCache);

    Collection<VocabWord> vocabWords = vocabCache.vocabWords();
    System.out.println("Huffman Test:");
    for (VocabWord vocabWord : vocabWords) {
        System.out.println("Word: " + vocabWord);
        System.out.println(vocabWord.getCodes());
        System.out.println(vocabWord.getPoints());
    }

    sc.stop();
}
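The Huffman class builds a binary tree over vocabulary frequencies so that frequent words receive short codes; the codes and points printed above describe each word's path through that tree, as used by hierarchical softmax. A minimal sketch of the classic Huffman construction (not the dl4j implementation itself), with frequencies assumed to match this test corpus:

import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.PriorityQueue;

public class HuffmanSketch {
    static class Node {
        final String word;
        final long freq;
        final Node left, right;
        Node(String word, long freq, Node left, Node right) {
            this.word = word; this.freq = freq; this.left = left; this.right = right;
        }
    }

    public static void main(String[] args) {
        // Assumed vocabulary frequencies, consistent with the corpus used elsewhere in this class
        Map<String, Long> freqs = new LinkedHashMap<>();
        freqs.put("STOP", 4L);
        freqs.put("strange", 2L);
        freqs.put("red", 1L);
        freqs.put("flowers", 1L);
        freqs.put("world", 1L);

        PriorityQueue<Node> pq = new PriorityQueue<>(Comparator.comparingLong((Node n) -> n.freq));
        freqs.forEach((w, f) -> pq.add(new Node(w, f, null, null)));
        while (pq.size() > 1) {
            Node a = pq.poll();
            Node b = pq.poll();
            pq.add(new Node(null, a.freq + b.freq, a, b)); // merge the two rarest subtrees
        }
        printCodes(pq.poll(), "");
    }

    static void printCodes(Node n, String code) {
        if (n.word != null) {
            System.out.println(n.word + " -> " + code); // frequent words get shorter codes
            return;
        }
        printCodes(n.left, code + "0");
        printCodes(n.right, code + "1");
    }
}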
Use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.
From the class TextPipelineTest, method testWordFreqAccIdentifyingStopWords.
@Test
public void testWordFreqAccIdentifyingStopWords() throws Exception {
    JavaSparkContext sc = getContext();
    // word2vec.setRemoveStop(false);
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
    pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);

    Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();
    assertEquals(wordFreqCounter.getCount("is"), 0, 0);
    assertEquals(wordFreqCounter.getCount("this"), 0, 0);
    assertEquals(wordFreqCounter.getCount("are"), 0, 0);
    assertEquals(wordFreqCounter.getCount("a"), 0, 0);
    assertEquals(wordFreqCounter.getCount("STOP"), 4, 0);
    assertEquals(wordFreqCounter.getCount("strange"), 2, 0);
    assertEquals(wordFreqCounter.getCount("flowers"), 1, 0);
    assertEquals(wordFreqCounter.getCount("world"), 1, 0);
    assertEquals(wordFreqCounter.getCount("red"), 1, 0);

    sc.stop();
}
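The assertions above show how the word-frequency accumulator treats stop words: each stop-word token is tallied under the special key "STOP" rather than under its own surface form, which is why "is", "this", "are", and "a" all count zero while "STOP" counts four. A minimal plain-Java sketch of that accounting; the token list and stop-word set are assumptions chosen to reproduce the expected counts, not the actual test fixture:

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class StopWordTallySketch {
    public static void main(String[] args) {
        Set<String> stopWords = new HashSet<>(Arrays.asList("this", "is", "a", "are"));
        // Assumed tokenized corpus, consistent with the asserted counts above
        List<String> tokens = Arrays.asList("this", "is", "a", "strange", "strange", "world",
                "flowers", "are", "red");

        Map<String, Integer> counts = new HashMap<>();
        for (String token : tokens) {
            // Stop words are folded into the single "STOP" key rather than counted individually
            String key = stopWords.contains(token) ? "STOP" : token;
            counts.merge(key, 1, Integer::sum);
        }
        System.out.println(counts); // STOP=4, strange=2, world=1, flowers=1, red=1
    }
}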
Use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.
From the class TextPipelineTest, method testBuildVocabCache.
@Test
public void testBuildVocabCache() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();

    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
    assertNotNull(vocabCache);
    log.info("VocabWords: " + vocabCache.words());
    assertEquals(5, vocabCache.numWords());

    VocabWord redVocab = vocabCache.tokenFor("red");
    VocabWord flowerVocab = vocabCache.tokenFor("flowers");
    VocabWord worldVocab = vocabCache.tokenFor("world");
    VocabWord strangeVocab = vocabCache.tokenFor("strange");
    log.info("Red word: " + redVocab);
    log.info("Flower word: " + flowerVocab);
    log.info("World word: " + worldVocab);
    log.info("Strange word: " + strangeVocab);

    assertEquals(redVocab.getWord(), "red");
    assertEquals(redVocab.getElementFrequency(), 1, 0);
    assertEquals(flowerVocab.getWord(), "flowers");
    assertEquals(flowerVocab.getElementFrequency(), 1, 0);
    assertEquals(worldVocab.getWord(), "world");
    assertEquals(worldVocab.getElementFrequency(), 1, 0);
    assertEquals(strangeVocab.getWord(), "strange");
    assertEquals(strangeVocab.getElementFrequency(), 2, 0);

    sc.stop();
}
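Note that numWords() returning 5 covers the four content words checked above plus, plausibly, the STOP token seen in testWordFreqAccIdentifyingStopWords. Conceptually, buildVocabCache() admits a word into the vocabulary only if its accumulated frequency meets the configured minimum. A minimal sketch of that cutoff, assuming a minWordFrequency of 1 (an assumption consistent with every word above surviving):

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;

public class MinFreqFilterSketch {
    public static void main(String[] args) {
        // Assumed accumulated frequencies from the tokenization step
        Map<String, Long> wordFreq = new LinkedHashMap<>();
        wordFreq.put("STOP", 4L);
        wordFreq.put("strange", 2L);
        wordFreq.put("red", 1L);
        wordFreq.put("flowers", 1L);
        wordFreq.put("world", 1L);

        long minWordFrequency = 1L; // assumed configuration value
        Map<String, Long> vocab = new TreeMap<>();
        wordFreq.forEach((word, freq) -> {
            if (freq >= minWordFrequency) {
                vocab.put(word, freq); // words below the cutoff never enter the vocabulary
            }
        });
        System.out.println(vocab.size() + " words admitted: " + vocab); // 5 words admitted
    }
}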
Use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.
From the class TextPipelineTest, method testFilterMinWordAddVocab.
@Test
public void testFilterMinWordAddVocab() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
    pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);

    Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();
    pipeline.filterMinWordAddVocab(wordFreqCounter);
    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
    assertNotNull(vocabCache);

    VocabWord redVocab = vocabCache.tokenFor("red");
    VocabWord flowerVocab = vocabCache.tokenFor("flowers");
    VocabWord worldVocab = vocabCache.tokenFor("world");
    VocabWord strangeVocab = vocabCache.tokenFor("strange");

    assertEquals(redVocab.getWord(), "red");
    assertEquals(redVocab.getElementFrequency(), 1, 0);
    assertEquals(flowerVocab.getWord(), "flowers");
    assertEquals(flowerVocab.getElementFrequency(), 1, 0);
    assertEquals(worldVocab.getWord(), "world");
    assertEquals(worldVocab.getElementFrequency(), 1, 0);
    assertEquals(strangeVocab.getWord(), "strange");
    assertEquals(strangeVocab.getElementFrequency(), 2, 0);

    sc.stop();
}
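This test reaches the same vocabulary as testBuildVocabCache, but drives the pipeline step by step: tokenize(), then updateAndReturnAccumulatorVal(), then filterMinWordAddVocab(). A minimal end-to-end sketch of those three stages in plain Java; the corpus lines, stop-word set, tokenization rule, and minWordFrequency are all assumptions chosen to match the assertions, not the actual fixture:

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class PipelineStagesSketch {
    public static void main(String[] args) {
        // Assumed corpus and configuration, consistent with the assertions above
        List<String> corpus = Arrays.asList("This is a strange strange world.", "Flowers are red.");
        Set<String> stopWords = new HashSet<>(Arrays.asList("this", "is", "a", "are"));
        long minWordFrequency = 1L;

        // Stage 1: tokenize each line (lower-case, strip punctuation, split on whitespace)
        // Stage 2: accumulate frequencies, folding stop words into "STOP"
        Map<String, Long> freq = new HashMap<>();
        for (String line : corpus) {
            for (String token : line.toLowerCase().replaceAll("[^a-z\\s]", "").split("\\s+")) {
                freq.merge(stopWords.contains(token) ? "STOP" : token, 1L, Long::sum);
            }
        }

        // Stage 3: admit only words meeting the frequency cutoff into the vocabulary
        freq.entrySet().removeIf(e -> e.getValue() < minWordFrequency);
        System.out.println(freq); // STOP=4, strange=2, world=1, flowers=1, red=1
    }
}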