use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class Word2VecTest method testSparkW2VonBiggerCorpus.
/**
 * Trains a Spark Word2Vec model on the {@code spark_word2vec_test.txt} classpath resource using a
 * {@code local[8]} master, then writes the resulting lookup table to a fixed path on disk.
 * The test is annotated with {@code @Ignore}.
 *
 * @throws Exception if the corpus resource cannot be resolved or training/serialization fails
 */
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    SparkConf sparkConf = new SparkConf()
            .setMaster("local[8]")
            .setAppName("sparktest")
            .set("spark.driver.maxResultSize", "4g")
            .set("spark.driver.memory", "8g")
            .set("spark.executor.memory", "8g");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    Word2Vec word2Vec;
    try {
        String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();
        // Read in data
        JavaRDD<String> corpus = sc.textFile(dataPath);

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new LowCasePreProcessor());

        word2Vec = new Word2Vec.Builder()
                .setNGrams(1)
                .tokenizerFactory(t)
                .seed(42L)
                .negative(3)
                .useAdaGrad(false)
                .layerSize(100)
                .windowSize(5)
                .learningRate(0.025)
                .minLearningRate(0.0001)
                .iterations(1)
                .batchSize(100)
                .minWordFrequency(5)
                .useUnknown(true)
                .build();
        word2Vec.train(corpus);
    } finally {
        // Stop the context on every path so a failure above does not leave it running.
        sc.stop();
    }

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class Word2VecTest method testConcepts.
/**
 * Trains a Spark Word2Vec model on the {@code raw_sentences.txt} classpath resource and checks:
 * <ul>
 *   <li>the nearest words of "day" contain "night", "week" and "year";</li>
 *   <li>the nearest words of "two" contain "four" and "five" but not the stop word "three";</li>
 *   <li>after writing the lookup table with {@code WordVectorSerializer} and loading it back,
 *       the index and vector of "day" and the vocabulary element at index 0 are unchanged.</li>
 * </ul>
 *
 * @throws Exception if the corpus resource cannot be resolved, or training or file I/O fails
 */
@Test
public void testConcepts() throws Exception {
    // These are all default values for word2vec
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    Word2Vec word2Vec;
    try {
        String dataPath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
        // Read in data
        JavaRDD<String> corpus = sc.textFile(dataPath);

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        word2Vec = new Word2Vec.Builder()
                .setNGrams(1)
                .tokenizerFactory(t)
                .seed(42L)
                .negative(10)
                .useAdaGrad(false)
                .layerSize(150)
                .windowSize(5)
                .learningRate(0.025)
                .minLearningRate(0.0001)
                .iterations(1)
                .batchSize(100)
                .minWordFrequency(5)
                .stopWords(Arrays.asList("three"))
                .useUnknown(true)
                .build();
        word2Vec.train(corpus);

        System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK"));

        double sim = word2Vec.similarity("day", "night");
        System.out.println("day/night similarity: " + sim);

        Collection<String> words = word2Vec.wordsNearest("day", 10);
        printWords("day", words, word2Vec);
        assertTrue(words.contains("night"));
        assertTrue(words.contains("week"));
        assertTrue(words.contains("year"));

        sim = word2Vec.similarity("two", "four");
        System.out.println("two/four similarity: " + sim);

        words = word2Vec.wordsNearest("two", 10);
        printWords("two", words, word2Vec);
        // three should be absent due to stopWords
        assertFalse(words.contains("three"));
        assertTrue(words.contains("five"));
        assertTrue(words.contains("four"));
    } finally {
        // Stop the context even when training or an assertion above fails.
        sc.stop();
    }

    // test serialization
    File tempFile = File.createTempFile("temp", "tmp");
    tempFile.deleteOnExit();

    // Snapshot of the trained model taken before the round trip.
    int idx1 = word2Vec.vocab().wordFor("day").getIndex();
    INDArray array1 = word2Vec.getWordVectorMatrix("day").dup();
    VocabWord word1 = word2Vec.vocab().elementAtIndex(0);

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), tempFile);
    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);

    // The same values read from the reloaded vectors.
    VocabWord word2 = ((VocabCache<VocabWord>) vectors.vocab()).elementAtIndex(0);
    VocabWord wordIT = ((VocabCache<VocabWord>) vectors.vocab()).wordFor("it");
    int idx2 = vectors.vocab().wordFor("day").getIndex();
    INDArray array2 = vectors.getWordVectorMatrix("day").dup();

    System.out.println("word 'i': " + word2);
    System.out.println("word 'it': " + wordIT);

    assertEquals(idx1, idx2);
    assertEquals(word1, word2);
    assertEquals(array1, array2);
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class SparkSequenceVectorsTest method setUp.
/**
 * Prepares the fixture before each test: builds {@code sequencesCyclic} the first time it is
 * found to be {@code null}, and assigns a new {@code local[8]} Spark context to {@code sc}.
 *
 * @throws Exception declared for the test framework; nothing is thrown explicitly here
 */
@Before
public void setUp() throws Exception {
    if (sequencesCyclic == null) {
        sequencesCyclic = new ArrayList<>();
        // 10 sequences in total
        for (int seqIdx = 0; seqIdx < 10; seqIdx++) {
            Sequence<VocabWord> sequence = new Sequence<>();
            // Labels "0".."9" are added once per sequence, i.e. 10 occurrences each overall.
            for (int e = 0; e < 10; e++) {
                sequence.addElement(new VocabWord(1.0, "" + e, (long) e));
            }
            // "0" is added a second time per sequence, so it occurs 20 times overall.
            sequence.addElement(new VocabWord(1.0, "0", 0L));
            sequencesCyclic.add(sequence);
        }
    }

    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("SeqVecTests");
    sc = new JavaSparkContext(sparkConf);
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class BaseSparkTest method getContext.
/**
 * Returns the Spark context held in {@code sc}, creating it on first use.
 *
 * <p>A newly created context uses master {@code local[4]}, application name {@code sparktest},
 * and has {@code Word2VecVariables.NUM_WORDS} set to {@code "1"}.
 *
 * @return the existing context if {@code sc} is already set, otherwise the one just created
 */
public JavaSparkContext getContext() {
    if (sc == null) {
        // set to test mode
        SparkConf testConf = new SparkConf()
                .setMaster("local[4]")
                .setAppName("sparktest")
                .set(Word2VecVariables.NUM_WORDS, String.valueOf(1));
        sc = new JavaSparkContext(testConf);
    }
    return sc;
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TextPipelineTest method testBuildVocabWordListRDD.
/**
 * Builds the vocabulary cache and the per-sentence vocab-word lists with {@code TextPipeline}
 * and checks the total word count, the first two sentence counts, and that the first two
 * sentences contain the expected tokens.
 *
 * @throws Exception if building the pipeline or collecting its RDDs fails
 */
@Test
public void testBuildVocabWordListRDD() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        pipeline.buildVocabWordListRDD();

        JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
        JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

        List<List<VocabWord>> vocabWordList = vocabWordListRDD.collect();
        List<VocabWord> firstSentenceVocabList = vocabWordList.get(0);
        List<VocabWord> secondSentenceVocabList = vocabWordList.get(1);
        System.out.println(Arrays.deepToString(firstSentenceVocabList.toArray()));

        // Extract the word strings, skipping null entries in the collected lists.
        List<String> firstSentenceTokenList = new ArrayList<>();
        List<String> secondSentenceTokenList = new ArrayList<>();
        for (VocabWord v : firstSentenceVocabList) {
            if (v != null) {
                firstSentenceTokenList.add(v.getWord());
            }
        }
        for (VocabWord v : secondSentenceVocabList) {
            if (v != null) {
                secondSentenceTokenList.add(v.getWord());
            }
        }

        // Collect once and reuse the result for both sentence-count assertions.
        List<AtomicLong> sentenceCounts = sentenceCountRDD.collect();

        // Expected value goes first so that failure messages report expected/actual correctly.
        assertEquals(9, pipeline.getTotalWordCount(), 0);
        assertEquals(6, sentenceCounts.get(0).get());
        assertEquals(3, sentenceCounts.get(1).get());
        assertTrue(firstSentenceTokenList.containsAll(Arrays.asList("strange", "strange", "world")));
        assertTrue(secondSentenceTokenList.containsAll(Arrays.asList("flowers", "red")));
    } finally {
        // Stop the context even when an assertion above fails.
        sc.stop();
    }
}
Aggregations