use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TextPipelineTest method testWordFreqAccIdentifyingStopWords.
/**
 * Tokenizes the test corpus through {@link TextPipeline}, updates the pipeline's
 * accumulators, and checks the resulting word-frequency counter.
 *
 * <p>The counter is expected to report a count of 0 for "is", "this", "are" and "a",
 * a count of 4 for the token "STOP", and the raw frequencies of the remaining words.
 * (Presumably the tokenizer folds stop words into the "STOP" token -- confirm against
 * the pipeline's tokenizer configuration.)
 *
 * @throws Exception if building the context, corpus or pipeline fails
 */
@Test
public void testWordFreqAccIdentifyingStopWords() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);

        JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
        pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);
        Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();

        // JUnit argument order is (expected, actual, delta); keeping it straight makes
        // failure messages report the right value as "expected".
        assertEquals(0, wordFreqCounter.getCount("is"), 0);
        assertEquals(0, wordFreqCounter.getCount("this"), 0);
        assertEquals(0, wordFreqCounter.getCount("are"), 0);
        assertEquals(0, wordFreqCounter.getCount("a"), 0);
        assertEquals(4, wordFreqCounter.getCount("STOP"), 0);
        assertEquals(2, wordFreqCounter.getCount("strange"), 0);
        assertEquals(1, wordFreqCounter.getCount("flowers"), 0);
        assertEquals(1, wordFreqCounter.getCount("world"), 0);
        assertEquals(1, wordFreqCounter.getCount("red"), 0);
    } finally {
        // Stop the context even when an assertion fails, so it does not leak into later tests.
        sc.stop();
    }
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TextPipelineTest method testBuildVocabCache.
/**
 * Builds the vocabulary cache from the test corpus via {@link TextPipeline#buildVocabCache()}
 * and verifies its contents.
 *
 * <p>The cache must be non-null, hold exactly 5 words, and contain entries for "red",
 * "flowers", "world" (element frequency 1 each) and "strange" (element frequency 2).
 *
 * @throws Exception if building the context, corpus or pipeline fails
 */
@Test
public void testBuildVocabCache() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();

        VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
        assertTrue(vocabCache != null);
        log.info("VocabWords: " + vocabCache.words());
        assertEquals(5, vocabCache.numWords());

        VocabWord redVocab = vocabCache.tokenFor("red");
        VocabWord flowerVocab = vocabCache.tokenFor("flowers");
        VocabWord worldVocab = vocabCache.tokenFor("world");
        VocabWord strangeVocab = vocabCache.tokenFor("strange");
        log.info("Red word: " + redVocab);
        log.info("Flower word: " + flowerVocab);
        log.info("World word: " + worldVocab);
        log.info("Strange word: " + strangeVocab);

        // JUnit argument order is (expected, actual[, delta]).
        assertEquals("red", redVocab.getWord());
        assertEquals(1, redVocab.getElementFrequency(), 0);
        assertEquals("flowers", flowerVocab.getWord());
        assertEquals(1, flowerVocab.getElementFrequency(), 0);
        assertEquals("world", worldVocab.getWord());
        assertEquals(1, worldVocab.getElementFrequency(), 0);
        assertEquals("strange", strangeVocab.getWord());
        assertEquals(2, strangeVocab.getElementFrequency(), 0);
    } finally {
        // Stop the context even when an assertion fails, so it does not leak into later tests.
        sc.stop();
    }
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TestSparkComputationGraph method testBasic.
/**
 * Smoke test for {@link SparkComputationGraph}: fits a small two-layer graph first on an
 * RDD of {@code MultiDataSet} built from "iris.txt", then on an RDD of {@code DataSet}
 * taken from {@link IrisDataSetIterator}. The test passes if both fits complete without
 * throwing; no numeric results are asserted.
 *
 * @throws Exception if reading the iris resource or fitting fails
 */
@Test
public void testBasic() throws Exception {
    JavaSparkContext context = this.sc;

    // Load iris as MultiDataSet objects, one example each (columns 0-3 in, column 4 one-hot out).
    RecordReader irisReader = new CSVRecordReader(0, ",");
    irisReader.initialize(new FileSplit(new ClassPathResource("iris.txt").getTempFileFromArchive()));
    MultiDataSetIterator multiIter = new RecordReaderMultiDataSetIterator.Builder(1)
            .addReader("iris", irisReader)
            .addInput("iris", 0, 3)
            .addOutputOneHot("iris", 4, 3)
            .build();
    List<MultiDataSet> multiDataSets = new ArrayList<>(150);
    while (multiIter.hasNext()) {
        multiDataSets.add(multiIter.next());
    }

    // Graph: in -> dense(4 -> 2) -> out(2 -> 3, MCXENT).
    ComputationGraphConfiguration graphConf = new NeuralNetConfiguration.Builder()
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
            .learningRate(0.1)
            .graphBuilder()
            .addInputs("in")
            .addLayer("dense", new DenseLayer.Builder().nIn(4).nOut(2).build(), "in")
            .addLayer("out", new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT).nIn(2).nOut(3).build(), "dense")
            .setOutputs("out")
            .pretrain(false)
            .backprop(true)
            .build();
    ComputationGraph graph = new ComputationGraph(graphConf);
    graph.init();

    TrainingMaster trainingMaster = new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0);
    SparkComputationGraph sparkGraph = new SparkComputationGraph(context, graph, trainingMaster);
    sparkGraph.setListeners(Collections.<IterationListener>singleton(new ScoreIterationListener(1)));

    JavaRDD<MultiDataSet> multiRdd = context.parallelize(multiDataSets);
    sparkGraph.fitMultiDataSet(multiRdd);

    // Second pass: fit the same Spark graph from plain DataSet objects.
    DataSetIterator irisIter = new IrisDataSetIterator(1, 150);
    List<DataSet> dataSets = new ArrayList<>();
    while (irisIter.hasNext()) {
        dataSets.add(irisIter.next());
    }
    JavaRDD<DataSet> dataSetRdd = context.parallelize(dataSets);
    sparkGraph.fit(dataSetRdd);
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TestCompareParameterAveragingSparkVsSingleMachine method testOneExecutor.
/**
 * Compares single-machine training with Spark training on a context created for one worker.
 *
 * <p>Idea: a single worker/executor on Spark should give identical results to a single
 * machine. For each {@code saveUpdater} setting, a local {@link MultiLayerNetwork} and a
 * {@link SparkDl4jMultiLayer} built from the same configuration are fit on three
 * minibatches (seeds 1, 2, 3); the initial and final parameters must match, and the
 * parameters must have changed during training.
 */
@Test
public void testOneExecutor() {
    final int miniBatchSize = 10;
    final int nWorkers = 1;

    for (boolean saveUpdater : new boolean[] { true, false }) {
        JavaSparkContext sc = getContext(nWorkers);
        try {
            int[] seeds = { 1, 2, 3 };

            // Local reference run: one fit per seed.
            MultiLayerNetwork localNet = new MultiLayerNetwork(getConf(12345, Updater.RMSPROP));
            localNet.init();
            INDArray initialParams = localNet.params().dup();
            for (int seed : seeds) {
                DataSet batch = getOneDataSet(miniBatchSize, seed);
                if (!saveUpdater) {
                    // Discard updater state before each fit when it is not meant to be kept.
                    localNet.setUpdater(null);
                }
                localNet.fit(batch);
            }
            INDArray finalParams = localNet.params().dup();

            // Spark run: the same minibatches, supplied as individual examples.
            TrainingMaster trainingMaster = getTrainingMaster(1, miniBatchSize, saveUpdater);
            SparkDl4jMultiLayer sparkNet =
                    new SparkDl4jMultiLayer(sc, getConf(12345, Updater.RMSPROP), trainingMaster);
            sparkNet.setCollectTrainingStats(true);
            INDArray initialSparkParams = sparkNet.getNetwork().params().dup();
            for (int seed : seeds) {
                List<DataSet> examples = getOneDataSetAsIndividalExamples(miniBatchSize, seed);
                JavaRDD<DataSet> examplesRdd = sc.parallelize(examples);
                sparkNet.fit(examplesRdd);
            }
            INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

            assertEquals(initialParams, initialSparkParams);
            assertNotEquals(initialParams, finalParams);
            assertEquals(finalParams, finalSparkParams);
        } finally {
            sc.stop();
        }
    }
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TestCompareParameterAveragingSparkVsSingleMachine method testAverageEveryStepGraph.
/**
 * Compares a locally trained {@link ComputationGraph} with a {@link SparkComputationGraph}
 * trained on a context created for {@code nWorkers} workers.
 *
 * <p>Idea: averaging every step with SGD (SGD updater + optimizer) is mathematically
 * identical to doing the learning on a single machine for synchronous distributed training.
 * This holds ONLY if all workers get an identical number of examples, which is not
 * guaranteed when the data has to be split with RDD.randomSplit (e.g. 39 examples on
 * 4 executors), and ONLY with the SGD updater.
 *
 * <p>For each {@code saveUpdater} setting, both networks are fit on three minibatches of
 * {@code miniBatchSizePerWorker * nWorkers} examples (seeds 1, 2, 3). Initial parameters
 * must be equal, final parameters must agree to 1e-5, and the scores to 1e-3.
 */
@Test
public void testAverageEveryStepGraph() {
    final int miniBatchSizePerWorker = 10;
    final int nWorkers = 4;
    final int examplesPerFit = miniBatchSizePerWorker * nWorkers;

    for (boolean saveUpdater : new boolean[] { true, false }) {
        JavaSparkContext sc = getContext(nWorkers);
        try {
            int[] seeds = { 1, 2, 3 };

            // Local reference run: one fit per seed on the full-size minibatch.
            ComputationGraph localNet = new ComputationGraph(getGraphConf(12345, Updater.SGD));
            localNet.init();
            INDArray initialParams = localNet.params().dup();
            for (int seed : seeds) {
                DataSet batch = getOneDataSet(examplesPerFit, seed);
                if (!saveUpdater) {
                    // Discard updater state before each fit when it is not meant to be kept.
                    localNet.setUpdater(null);
                }
                localNet.fit(batch);
            }
            INDArray finalParams = localNet.params().dup();

            // Spark run on the nWorkers context: same data, supplied as individual examples.
            TrainingMaster trainingMaster = getTrainingMaster(1, miniBatchSizePerWorker, saveUpdater);
            SparkComputationGraph sparkNet =
                    new SparkComputationGraph(sc, getGraphConf(12345, Updater.SGD), trainingMaster);
            sparkNet.setCollectTrainingStats(true);
            INDArray initialSparkParams = sparkNet.getNetwork().params().dup();
            for (int seed : seeds) {
                List<DataSet> examples = getOneDataSetAsIndividalExamples(examplesPerFit, seed);
                JavaRDD<DataSet> examplesRdd = sc.parallelize(examples);
                sparkNet.fit(examplesRdd);
            }
            System.out.println(sparkNet.getSparkTrainingStats().statsAsString());
            INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

            // Dump both parameter vectors so a mismatch can be diagnosed from the test output.
            float[] localFinal = finalParams.data().asFloat();
            float[] sparkFinal = finalSparkParams.data().asFloat();
            System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat()));
            System.out.println("Initial (Spark) params: " + Arrays.toString(initialSparkParams.data().asFloat()));
            System.out.println("Final (Local) params: " + Arrays.toString(localFinal));
            System.out.println("Final (Spark) params: " + Arrays.toString(sparkFinal));

            assertEquals(initialParams, initialSparkParams);
            assertNotEquals(initialParams, finalParams);
            assertArrayEquals(localFinal, sparkFinal, 1e-5f);

            double sparkScore = sparkNet.getScore();
            assertTrue(sparkScore > 0.0);
            assertEquals(localNet.score(), sparkScore, 1e-3);
        } finally {
            sc.stop();
        }
    }
}
Aggregations