use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TestCompareParameterAveragingSparkVsSingleMachine method testAverageEveryStepGraphCNN.
/**
 * Compares a CNN ComputationGraph trained locally against the same configuration trained through
 * SparkComputationGraph, once with the updater state kept and once with it discarded.
 *
 * Premise (for plain SGD only): when parameters are averaged after every step and every worker
 * sees the same number of examples, synchronous distributed training is mathematically the same
 * as single-machine training. The equivalence breaks if the data cannot be divided evenly between
 * workers (e.g. 39 examples on 4 executors), which is why each step uses exactly
 * miniBatchSizePerWorker * nWorkers examples.
 */
@Test
public void testAverageEveryStepGraphCNN() {
    final int miniBatchSizePerWorker = 10;
    final int nWorkers = 4;
    final int examplesPerStep = miniBatchSizePerWorker * nWorkers;
    // One minibatch per seed, so three training steps in total.
    final int[] seeds = {1, 2, 3};

    for (boolean saveUpdater : new boolean[] {true, false}) {
        JavaSparkContext sc = getContext(nWorkers);
        try {
            // --- Reference run: local training, one fit per seed ---
            ComputationGraph localNet = new ComputationGraph(getGraphConfCNN(12345, Updater.SGD));
            localNet.init();
            INDArray initialParams = localNet.params().dup();
            for (int seed : seeds) {
                DataSet batch = getOneDataSetCNN(examplesPerStep, seed);
                if (!saveUpdater) {
                    // Mirror the Spark side, which does not carry updater state in this mode.
                    localNet.setUpdater(null);
                }
                localNet.fit(batch);
            }
            INDArray finalParams = localNet.params().dup();

            // --- Spark run: same configuration and seeds, data supplied as individual examples ---
            TrainingMaster tm = getTrainingMaster(1, miniBatchSizePerWorker, saveUpdater);
            SparkComputationGraph sparkNet =
                    new SparkComputationGraph(sc, getGraphConfCNN(12345, Updater.SGD), tm);
            sparkNet.setCollectTrainingStats(true);
            INDArray initialSparkParams = sparkNet.getNetwork().params().dup();
            for (int seed : seeds) {
                List<DataSet> examples = getOneDataSetAsIndividalExamplesCNN(examplesPerStep, seed);
                JavaRDD<DataSet> rdd = sc.parallelize(examples);
                sparkNet.fit(rdd);
            }
            System.out.println(sparkNet.getSparkTrainingStats().statsAsString());
            INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

            float[] localInitial = initialParams.data().asFloat();
            float[] sparkInitial = initialSparkParams.data().asFloat();
            float[] localFinal = finalParams.data().asFloat();
            float[] sparkFinal = finalSparkParams.data().asFloat();
            System.out.println("Initial (Local) params: " + Arrays.toString(localInitial));
            System.out.println("Initial (Spark) params: " + Arrays.toString(sparkInitial));
            System.out.println("Final (Local) params: " + Arrays.toString(localFinal));
            System.out.println("Final (Spark) params: " + Arrays.toString(sparkFinal));

            // Same starting point, and (within float tolerance) the same end point.
            assertArrayEquals(localInitial, sparkInitial, 1e-8f);
            assertArrayEquals(localFinal, sparkFinal, 1e-6f);

            double sparkScore = sparkNet.getScore();
            assertTrue(sparkScore > 0.0);
            assertEquals(localNet.score(), sparkScore, 1e-3);
        } finally {
            // Always release the Spark context, even when an assertion fails.
            sc.stop();
        }
    }
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TestCompareParameterAveragingSparkVsSingleMachine method testAverageEveryStep.
/**
 * Compares a MultiLayerNetwork trained locally against the same configuration trained through
 * SparkDl4jMultiLayer with parameter averaging after every step, once with the updater state kept
 * and once with it discarded.
 *
 * Premise (for plain SGD only): averaging every step is mathematically identical to
 * single-machine training for synchronous distributed training, provided every worker receives
 * the same number of examples. A split that is not divisible by the number of workers
 * (e.g. 39 examples on 4 executors) breaks the equivalence, so each step uses exactly
 * miniBatchSizePerWorker * nWorkers examples.
 */
@Test
public void testAverageEveryStep() {
    final int miniBatchSizePerWorker = 10;
    final int nWorkers = 4;
    final int examplesPerStep = miniBatchSizePerWorker * nWorkers;
    // One minibatch per seed, so three training steps in total.
    final int[] seeds = {1, 2, 3};

    for (boolean saveUpdater : new boolean[] {true, false}) {
        JavaSparkContext sc = getContext(nWorkers);
        try {
            // --- Reference run: local training, one fit per seed ---
            MultiLayerNetwork localNet = new MultiLayerNetwork(getConf(12345, Updater.SGD));
            localNet.init();
            INDArray initialParams = localNet.params().dup();
            for (int seed : seeds) {
                DataSet batch = getOneDataSet(examplesPerStep, seed);
                if (!saveUpdater) {
                    // Mirror the Spark side, which does not carry updater state in this mode.
                    localNet.setUpdater(null);
                }
                localNet.fit(batch);
            }
            INDArray finalParams = localNet.params().dup();

            // --- Spark run: averaging frequency 1, no prefetch, Export training approach ---
            ParameterAveragingTrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(1)
                    .averagingFrequency(1)
                    .batchSizePerWorker(miniBatchSizePerWorker)
                    .saveUpdater(saveUpdater)
                    .workerPrefetchNumBatches(0)
                    .rddTrainingApproach(RDDTrainingApproach.Export)
                    .build();
            SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, getConf(12345, Updater.SGD), tm);
            sparkNet.setCollectTrainingStats(true);
            INDArray initialSparkParams = sparkNet.getNetwork().params().dup();
            for (int seed : seeds) {
                List<DataSet> examples = getOneDataSetAsIndividalExamples(examplesPerStep, seed);
                JavaRDD<DataSet> rdd = sc.parallelize(examples);
                sparkNet.fit(rdd);
            }
            System.out.println(sparkNet.getSparkTrainingStats().statsAsString());
            INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

            System.out.println("Initial (Local) params: " + Arrays.toString(initialParams.data().asFloat()));
            System.out.println("Initial (Spark) params: " + Arrays.toString(initialSparkParams.data().asFloat()));
            System.out.println("Final (Local) params: " + Arrays.toString(finalParams.data().asFloat()));
            System.out.println("Final (Spark) params: " + Arrays.toString(finalSparkParams.data().asFloat()));

            // Same starting point; training must actually move the parameters; same end point.
            assertEquals(initialParams, initialSparkParams);
            assertNotEquals(initialParams, finalParams);
            assertEquals(finalParams, finalSparkParams);

            double sparkScore = sparkNet.getScore();
            assertTrue(sparkScore > 0.0);
            assertEquals(localNet.score(), sparkScore, 1e-3);
        } finally {
            // Always release the Spark context, even when an assertion fails.
            sc.stop();
        }
    }
}
use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.
the class TestTrainingStatsCollection method testStatsCollection.
/**
 * Runs a single Spark parameter-averaging fit with stats collection enabled and verifies the
 * collected statistics at the three nesting levels reached here: the training master stats, the
 * common Spark training stats nested inside them, and the parameter-averaging worker stats nested
 * inside those. Finally exercises the stat-file and HTML export paths.
 *
 * @throws Exception if reflective access to the {@code columnNames} fields fails or an export
 *         call throws
 */
@Test
public void testStatsCollection() throws Exception {
    int nWorkers = 4;
    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[" + nWorkers + "]");
    sparkConf.setAppName("Test");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    try {
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                .iterations(1)
                .list()
                .layer(0, new DenseLayer.Builder().nIn(10).nOut(10).build())
                .layer(1, new OutputLayer.Builder().nIn(10).nOut(10).build())
                .pretrain(false)
                .backprop(true)
                .build();

        int miniBatchSizePerWorker = 10;
        int averagingFrequency = 5;
        int numberOfAveragings = 3;
        // Sized so the expected counts asserted below come out as whole numbers per worker.
        int totalExamples = nWorkers * miniBatchSizePerWorker * averagingFrequency * numberOfAveragings;

        Nd4j.getRandom().setSeed(12345);
        List<DataSet> list = new ArrayList<>();
        for (int i = 0; i < totalExamples; i++) {
            INDArray f = Nd4j.rand(1, 10);
            INDArray l = Nd4j.rand(1, 10);
            list.add(new DataSet(f, l));
        }

        // RDDs are immutable: repartition() returns a NEW RDD, so its result must be kept.
        // (Previously the return value was discarded, making the call a no-op.)
        JavaRDD<DataSet> rdd = sc.parallelize(list).repartition(nWorkers);

        ParameterAveragingTrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(nWorkers, 1)
                .averagingFrequency(averagingFrequency)
                .batchSizePerWorker(miniBatchSizePerWorker)
                .saveUpdater(true)
                .workerPrefetchNumBatches(0)
                .repartionData(Repartition.Always)
                .build();
        SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf, tm);
        sparkNet.setCollectTrainingStats(true);
        sparkNet.fit(rdd);

        //Collect the expected keys: each stats class declares them in a "columnNames" field
        final String columnNamesField = "columnNames";
        List<String> expectedStatNames = new ArrayList<>();
        Class<?>[] classes = new Class[] {
                CommonSparkTrainingStats.class,
                ParameterAveragingTrainingMasterStats.class,
                ParameterAveragingTrainingWorkerStats.class };
        for (Class<?> statsClass : classes) {
            Field field = statsClass.getDeclaredField(columnNamesField);
            field.setAccessible(true);
            // Read with a null receiver, i.e. the field is accessed as a static member.
            // The cast is unchecked; this test relies on the field holding a collection of strings.
            @SuppressWarnings("unchecked")
            Collection<String> c = (Collection<String>) field.get(null);
            expectedStatNames.addAll(c);
        }
        System.out.println(expectedStatNames);

        SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
        Set<String> actualKeySet = stats.getKeySet();
        assertEquals(expectedStatNames.size(), actualKeySet.size());
        for (String s : stats.getKeySet()) {
            assertTrue(expectedStatNames.contains(s));
            assertNotNull(stats.getValue(s));
        }

        String statsAsString = stats.statsAsString();
        System.out.println(statsAsString);
        //One line per stat
        assertEquals(actualKeySet.size(), statsAsString.split("\n").length);

        //Go through nested stats
        //First: master stats
        assertTrue(stats instanceof ParameterAveragingTrainingMasterStats);
        ParameterAveragingTrainingMasterStats masterStats = (ParameterAveragingTrainingMasterStats) stats;

        List<EventStats> exportTimeStats = masterStats.getParameterAveragingMasterExportTimesMs();
        assertEquals(1, exportTimeStats.size());
        assertDurationGreaterZero(exportTimeStats);
        assertNonNullFields(exportTimeStats);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(exportTimeStats, 1, 1, 1);

        List<EventStats> countRddTime = masterStats.getParameterAveragingMasterCountRddSizeTimesMs();
        //occurs once per fit
        assertEquals(1, countRddTime.size());
        assertDurationGreaterEqZero(countRddTime);
        assertNonNullFields(countRddTime);
        //should occur only in master once
        assertExpectedNumberMachineIdsJvmIdsThreadIds(countRddTime, 1, 1, 1);

        List<EventStats> broadcastCreateTime = masterStats.getParameterAveragingMasterBroadcastCreateTimesMs();
        assertEquals(numberOfAveragings, broadcastCreateTime.size());
        assertDurationGreaterEqZero(broadcastCreateTime);
        assertNonNullFields(broadcastCreateTime);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(broadcastCreateTime, 1, 1, 1);

        List<EventStats> fitTimes = masterStats.getParameterAveragingMasterFitTimesMs();
        //i.e., number of times fit(JavaRDD<DataSet>) was called
        assertEquals(1, fitTimes.size());
        assertDurationGreaterZero(fitTimes);
        assertNonNullFields(fitTimes);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(fitTimes, 1, 1, 1);

        List<EventStats> splitTimes = masterStats.getParameterAveragingMasterSplitTimesMs();
        //Splitting of the data set is executed once only (i.e., one fit(JavaRDD<DataSet>) call)
        assertEquals(1, splitTimes.size());
        assertDurationGreaterEqZero(splitTimes);
        assertNonNullFields(splitTimes);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(splitTimes, 1, 1, 1);

        List<EventStats> aggregateTimesMs = masterStats.getParamaterAveragingMasterAggregateTimesMs();
        assertEquals(numberOfAveragings, aggregateTimesMs.size());
        assertDurationGreaterEqZero(aggregateTimesMs);
        assertNonNullFields(aggregateTimesMs);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(aggregateTimesMs, 1, 1, 1);

        List<EventStats> processParamsTimesMs = masterStats.getParameterAveragingMasterProcessParamsUpdaterTimesMs();
        assertEquals(numberOfAveragings, processParamsTimesMs.size());
        assertDurationGreaterEqZero(processParamsTimesMs);
        assertNonNullFields(processParamsTimesMs);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(processParamsTimesMs, 1, 1, 1);

        List<EventStats> repartitionTimesMs = masterStats.getParameterAveragingMasterRepartitionTimesMs();
        assertEquals(numberOfAveragings, repartitionTimesMs.size());
        assertDurationGreaterEqZero(repartitionTimesMs);
        assertNonNullFields(repartitionTimesMs);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(repartitionTimesMs, 1, 1, 1);

        //Second: Common spark training stats
        SparkTrainingStats commonStats = masterStats.getNestedTrainingStats();
        assertNotNull(commonStats);
        assertTrue(commonStats instanceof CommonSparkTrainingStats);
        CommonSparkTrainingStats cStats = (CommonSparkTrainingStats) commonStats;

        List<EventStats> workerFlatMapTotalTimeMs = cStats.getWorkerFlatMapTotalTimeMs();
        assertEquals(numberOfAveragings * nWorkers, workerFlatMapTotalTimeMs.size());
        assertDurationGreaterZero(workerFlatMapTotalTimeMs);
        assertNonNullFields(workerFlatMapTotalTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(workerFlatMapTotalTimeMs, 1, 1, nWorkers);

        List<EventStats> workerFlatMapGetInitialModelTimeMs = cStats.getWorkerFlatMapGetInitialModelTimeMs();
        assertEquals(numberOfAveragings * nWorkers, workerFlatMapGetInitialModelTimeMs.size());
        assertDurationGreaterEqZero(workerFlatMapGetInitialModelTimeMs);
        assertNonNullFields(workerFlatMapGetInitialModelTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(workerFlatMapGetInitialModelTimeMs, 1, 1, nWorkers);

        List<EventStats> workerFlatMapDataSetGetTimesMs = cStats.getWorkerFlatMapDataSetGetTimesMs();
        int numMinibatchesProcessed = workerFlatMapDataSetGetTimesMs.size();
        //1 for every time we get a data set
        int expectedNumMinibatchesProcessed = numberOfAveragings * nWorkers * averagingFrequency;
        //Sometimes random split is just bad - some executors might miss out on getting the expected amount of data
        assertTrue(numMinibatchesProcessed >= expectedNumMinibatchesProcessed - 5);

        List<EventStats> workerFlatMapProcessMiniBatchTimesMs = cStats.getWorkerFlatMapProcessMiniBatchTimesMs();
        assertTrue(workerFlatMapProcessMiniBatchTimesMs.size() >= numberOfAveragings * nWorkers * averagingFrequency - 5);
        assertDurationGreaterEqZero(workerFlatMapProcessMiniBatchTimesMs);
        assertNonNullFields(workerFlatMapDataSetGetTimesMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(workerFlatMapDataSetGetTimesMs, 1, 1, nWorkers);

        //Third: ParameterAveragingTrainingWorker stats
        SparkTrainingStats paramAvgStats = cStats.getNestedTrainingStats();
        assertNotNull(paramAvgStats);
        assertTrue(paramAvgStats instanceof ParameterAveragingTrainingWorkerStats);
        ParameterAveragingTrainingWorkerStats pStats = (ParameterAveragingTrainingWorkerStats) paramAvgStats;

        List<EventStats> parameterAveragingWorkerBroadcastGetValueTimeMs = pStats.getParameterAveragingWorkerBroadcastGetValueTimeMs();
        assertEquals(numberOfAveragings * nWorkers, parameterAveragingWorkerBroadcastGetValueTimeMs.size());
        assertDurationGreaterEqZero(parameterAveragingWorkerBroadcastGetValueTimeMs);
        assertNonNullFields(parameterAveragingWorkerBroadcastGetValueTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(parameterAveragingWorkerBroadcastGetValueTimeMs, 1, 1, nWorkers);

        List<EventStats> parameterAveragingWorkerInitTimeMs = pStats.getParameterAveragingWorkerInitTimeMs();
        assertEquals(numberOfAveragings * nWorkers, parameterAveragingWorkerInitTimeMs.size());
        assertDurationGreaterEqZero(parameterAveragingWorkerInitTimeMs);
        assertNonNullFields(parameterAveragingWorkerInitTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(parameterAveragingWorkerInitTimeMs, 1, 1, nWorkers);

        List<EventStats> parameterAveragingWorkerFitTimesMs = pStats.getParameterAveragingWorkerFitTimesMs();
        assertTrue(parameterAveragingWorkerFitTimesMs.size() >= numberOfAveragings * nWorkers * averagingFrequency - 5);
        assertDurationGreaterEqZero(parameterAveragingWorkerFitTimesMs);
        assertNonNullFields(parameterAveragingWorkerFitTimesMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(parameterAveragingWorkerFitTimesMs, 1, 1, nWorkers);

        // Worker stats are the innermost level: nothing nested below them.
        assertNull(pStats.getNestedTrainingStats());

        //Finally: try exporting stats
        String tempDir = System.getProperty("java.io.tmpdir");
        String outDir = FilenameUtils.concat(tempDir, "dl4j_testTrainingStatsCollection");
        stats.exportStatFiles(outDir, sc.sc());

        String htmlPlotsPath = FilenameUtils.concat(outDir, "AnalysisPlots.html");
        StatsUtils.exportStatsAsHtml(stats, htmlPlotsPath, sc);

        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        StatsUtils.exportStatsAsHTML(stats, baos);
        baos.close();
        // Decode what was written to the stream and check that the export produced some output
        // (the decoded string was previously computed but never used).
        String html = new String(baos.toByteArray(), "UTF-8");
        assertTrue(html.length() > 0);
    } finally {
        // Always release the Spark context, even when an assertion fails.
        sc.stop();
    }
}
use of org.apache.spark.api.java.JavaSparkContext in project azure-tools-for-java by Microsoft.
the class JavaSparkPi method main.
/**
 * Estimates Pi with a Monte Carlo simulation run on Spark: samples random points in the square
 * [-1, 1) x [-1, 1), counts those that fall inside the unit circle, and prints
 * {@code 4 * inside / total}.
 *
 * @param args optional single argument: the number of slices (RDD partitions) to use; defaults
 *        to 2 when exactly one argument is not supplied. 100000 samples are drawn per slice.
 * @throws Exception declared for compatibility with existing callers; a non-numeric argument
 *         surfaces as a {@link NumberFormatException}
 */
public static void main(String[] args) throws Exception {
    // Runs against a local two-thread master. To run the application in the cluster instead,
    // build the configuration without setMaster(...):
    //   new SparkConf().setAppName("JavaSparkPi")
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi").setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    try {
        int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2;
        int n = 100000 * slices;

        // The element values are irrelevant; the list only determines how many samples are drawn.
        List<Integer> l = new ArrayList<Integer>(n);
        for (int i = 0; i < n; i++) {
            l.add(i);
        }
        JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);

        int count = dataSet.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) {
                // One random point; contributes 1 when it lies inside the unit circle.
                double x = Math.random() * 2 - 1;
                double y = Math.random() * 2 - 1;
                return (x * x + y * y < 1) ? 1 : 0;
            }
        }).reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) {
                return integer + integer2;
            }
        });

        System.out.println("Pi is roughly " + 4.0 * count / n);
    } finally {
        // Stop the context even if argument parsing or the Spark job throws, so it is not leaked.
        jsc.stop();
    }
}
use of org.apache.spark.api.java.JavaSparkContext in project geode by apache.
the class OQLJavaDemo method main.
/**
 * Runs the OQL query {@code select * from /str_str_region} through the Geode Spark connector
 * and prints the resulting DataFrame.
 *
 * @param argv exactly one argument: the Geode locator string, e.g. {@code 192.168.1.47[10335]};
 *        with any other argument count a usage message is printed to stderr and the method
 *        returns without creating a Spark context
 */
public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: OQLJavaDemo <locators>\n");
        return;
    }

    SparkConf conf = new SparkConf().setAppName("OQLJavaDemo");
    // Locator format example: "192.168.1.47[10335]"
    conf.set(GeodeLocatorPropKey, argv[0]);

    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame df = javaFunctions(sqlContext).geodeOQL("select * from /str_str_region");
        System.out.println("======= DataFrame =======\n");
        df.show();
    } finally {
        // Stop the context even if the query fails, so it is not leaked.
        sc.stop();
    }
}
Aggregations