Search in sources :

Example 11 with TextPipeline

use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testTokenizer.

/**
 * Checks the output of {@link TextPipeline#tokenize()} on the test corpus:
 * two tokenized sentences are expected, the first of which is
 * "this is a strange strange world" split into its individual words.
 *
 * @throws Exception if the Spark job or the pipeline fails
 */
@Test
public void testTokenizer() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
        assertEquals(2, tokenizedRDD.count());
        assertEquals(Arrays.asList("this", "is", "a", "strange", "strange", "world"), tokenizedRDD.first());
    } finally {
        // Stop the context even when an assertion above fails, so a failing
        // test does not leave a running Spark context behind.
        sc.stop();
    }
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) Test(org.junit.Test)

Example 12 with TextPipeline

use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testWordFreqAccIdentifyStopWords.

/**
 * Checks the word-frequency accumulator and the per-sentence word counts when
 * the pipeline is configured from the {@code word2vec} fixture.
 *
 * <p>With this configuration the accumulator is expected to hold a count of 4
 * under the key {@code "STOP"}, while the remaining words keep their own
 * counts. The per-sentence token lists are still expected to contain every
 * original word (6 and 3 words respectively).
 *
 * @throws Exception if the Spark job or the pipeline fails
 */
@Test
public void testWordFreqAccIdentifyStopWords() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
        JavaRDD<Pair<List<String>, AtomicLong>> sentenceWordsCountRDD = pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);

        // JUnit's contract is assertEquals(expected, actual[, delta]); keeping
        // that order makes failure messages report the right "expected" value.
        Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();
        assertEquals(4, wordFreqCounter.getCount("STOP"), 0);
        assertEquals(2, wordFreqCounter.getCount("strange"), 0);
        assertEquals(1, wordFreqCounter.getCount("flowers"), 0);
        assertEquals(1, wordFreqCounter.getCount("world"), 0);
        assertEquals(1, wordFreqCounter.getCount("red"), 0);

        List<Pair<List<String>, AtomicLong>> ret = sentenceWordsCountRDD.collect();
        assertEquals(Arrays.asList("this", "is", "a", "strange", "strange", "world"), ret.get(0).getFirst());
        assertEquals(Arrays.asList("flowers", "are", "red"), ret.get(1).getFirst());
        assertEquals(6, ret.get(0).getSecond().get());
        assertEquals(3, ret.get(1).getSecond().get());
    } finally {
        // Stop the context even when an assertion above fails, so a failing
        // test does not leave a running Spark context behind.
        sc.stop();
    }
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) Pair(org.deeplearning4j.berkeley.Pair) Test(org.junit.Test)

Example 13 with TextPipeline

use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testWordFreqAccNotIdentifyingStopWords.

/**
 * Checks the word-frequency accumulator when the pipeline is configured from
 * the {@code word2vecNoStop} fixture.
 *
 * <p>With this configuration every word of the corpus, including "is",
 * "this", "are" and "a", is expected to be counted under its own key.
 *
 * @throws Exception if the Spark job or the pipeline fails
 */
@Test
public void testWordFreqAccNotIdentifyingStopWords() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
        // Only the accumulator is inspected below; the returned RDD is not needed.
        pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);

        // JUnit's contract is assertEquals(expected, actual, delta); keeping
        // that order makes failure messages report the right "expected" value.
        Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();
        assertEquals(1, wordFreqCounter.getCount("is"), 0);
        assertEquals(1, wordFreqCounter.getCount("this"), 0);
        assertEquals(1, wordFreqCounter.getCount("are"), 0);
        assertEquals(1, wordFreqCounter.getCount("a"), 0);
        assertEquals(2, wordFreqCounter.getCount("strange"), 0);
        assertEquals(1, wordFreqCounter.getCount("flowers"), 0);
        assertEquals(1, wordFreqCounter.getCount("world"), 0);
        assertEquals(1, wordFreqCounter.getCount("red"), 0);
    } finally {
        // Stop the context even when an assertion above fails, so a failing
        // test does not leave a running Spark context behind.
        sc.stop();
    }
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) Test(org.junit.Test)

Example 14 with TextPipeline

use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testSyn0AfterFirstIteration.

/**
 * Wires up the first word2vec training iteration on Spark: builds the
 * vocabulary, a Huffman tree over it, the cumulative sentence counts, and maps
 * {@link FirstIterationFunction} over the zipped (sentence, cumulative count)
 * RDD.
 *
 * <p>NOTE(review): the resulting RDD is only defined through
 * {@code mapPartitions}/{@code map}, which are lazy Spark transformations, and
 * no action is invoked on it here. The first-iteration function is therefore
 * presumably never executed by this test, and nothing is asserted about its
 * output - confirm whether an action and assertions are intended.
 *
 * @throws Exception if the Spark job or the pipeline fails
 */
@Test
public void testSyn0AfterFirstIteration() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        pipeline.buildVocabWordListRDD();
        VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
        Huffman huffman = new Huffman(vocabCache.vocabWords());
        huffman.build();
        // Get total word count and put into word2vec variable map
        Map<String, Object> word2vecVarMap = word2vec.getWord2vecVarMap();
        word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount());
        double[] expTable = word2vec.getExpTable();
        JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
        JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
        CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
        JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
        // Pair every sentence's word list with its cumulative sentence count.
        JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
        Broadcast<Map<String, Object>> word2vecVarMapBroadcast = sc.broadcast(word2vecVarMap);
        Broadcast<double[]> expTableBroadcast = sc.broadcast(expTable);
        FirstIterationFunction firstIterationFunction = new FirstIterationFunction(word2vecVarMapBroadcast, expTableBroadcast, pipeline.getBroadCastVocabCache());
        JavaRDD<Pair<VocabWord, INDArray>> pointSyn0Vec = vocabWordListSentenceCumSumRDD.mapPartitions(firstIterationFunction).map(new MapToPairFunction());
    } finally {
        // Stop the context like the sibling tests do, and do so even when the
        // body throws, so no running Spark context is left behind.
        sc.stop();
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) MapToPairFunction(org.deeplearning4j.spark.models.embeddings.word2vec.MapToPairFunction) FirstIterationFunction(org.deeplearning4j.spark.models.embeddings.word2vec.FirstIterationFunction) CountCumSum(org.deeplearning4j.spark.text.functions.CountCumSum) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Pair(org.deeplearning4j.berkeley.Pair) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) AtomicLong(java.util.concurrent.atomic.AtomicLong) Huffman(org.deeplearning4j.models.word2vec.Huffman) Test(org.junit.Test)

Example 15 with TextPipeline

use of org.deeplearning4j.spark.text.functions.TextPipeline in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testFirstIteration.

/**
 * Runs {@link FirstIterationFunctionAdapter} directly (outside of a Spark
 * transformation) on the collected (sentence, cumulative count) pairs and
 * checks that it yields at least one (word, vector) entry.
 *
 * @throws Exception if the Spark job, the pipeline or the adapter call fails
 */
@Test
public void testFirstIteration() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        pipeline.buildVocabWordListRDD();
        // No Huffman tree is built for the vocabulary in this test.
        VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
        VocabWord token = vocabCache.tokenFor("strange");
        VocabWord word = vocabCache.wordFor("strange");
        log.info("Strange token: " + token);
        log.info("Strange word: " + word);
        // Get total word count and put into word2vec variable map
        Map<String, Object> word2vecVarMap = word2vec.getWord2vecVarMap();
        word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount());
        double[] expTable = word2vec.getExpTable();
        JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
        JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
        CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
        JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
        // Pair every sentence's word list with its cumulative sentence count.
        JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
        Broadcast<Map<String, Object>> word2vecVarMapBroadcast = sc.broadcast(word2vecVarMap);
        Broadcast<double[]> expTableBroadcast = sc.broadcast(expTable);
        // Collect to the driver so the adapter can be called as a plain method.
        Iterator<Tuple2<List<VocabWord>, Long>> iterator = vocabWordListSentenceCumSumRDD.collect().iterator();
        FirstIterationFunctionAdapter firstIterationFunction = new FirstIterationFunctionAdapter(word2vecVarMapBroadcast, expTableBroadcast, pipeline.getBroadCastVocabCache());
        Iterable<Map.Entry<VocabWord, INDArray>> ret = firstIterationFunction.call(iterator);
        assertTrue(ret.iterator().hasNext());
    } finally {
        // Stop the context like the sibling tests do, and do so even when an
        // assertion fails, so no running Spark context is left behind.
        sc.stop();
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) CountCumSum(org.deeplearning4j.spark.text.functions.CountCumSum) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) AtomicLong(java.util.concurrent.atomic.AtomicLong) Tuple2(scala.Tuple2) FirstIterationFunctionAdapter(org.deeplearning4j.spark.models.embeddings.word2vec.FirstIterationFunctionAdapter) Test(org.junit.Test)

Aggregations

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)15 TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline)15 Test (org.junit.Test)13 VocabWord (org.deeplearning4j.models.word2vec.VocabWord)10 AtomicLong (java.util.concurrent.atomic.AtomicLong)8 CountCumSum (org.deeplearning4j.spark.text.functions.CountCumSum)6 Pair (org.deeplearning4j.berkeley.Pair)4 Tuple2 (scala.Tuple2)4 Huffman (org.deeplearning4j.models.word2vec.Huffman)2 VocabCache (org.deeplearning4j.models.word2vec.wordstore.VocabCache)2 INDArray (org.nd4j.linalg.api.ndarray.INDArray)2 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 SparkConf (org.apache.spark.SparkConf)1 FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction)1 CounterMap (org.deeplearning4j.berkeley.CounterMap)1 Triple (org.deeplearning4j.berkeley.Triple)1