Search in sources :

Example 6 with SparkComputationGraph

use of org.deeplearning4j.spark.impl.graph.SparkComputationGraph in project deeplearning4j by deeplearning4j.

the class TestPreProcessedData method testPreprocessedDataCompGraphDataSet.

/**
 * Tests _loading_ of preprocessed DataSet data: every batch produced by an
 * {@code IrisDataSetIterator} is saved to its own file in a temp directory, a
 * {@code SparkComputationGraph} is fitted from that directory path, and the collected
 * training stats are checked against the expected number of worker fits and averagings.
 */
@Test
public void testPreprocessedDataCompGraphDataSet() {
    //Test _loading_ of preprocessed DataSet data
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 10;
    String path = FilenameUtils.concat(System.getProperty("java.io.tmpdir"), "dl4j_testpreprocdata2");
    File f = new File(path);
    if (f.exists()) {
        //File.delete() cannot remove a non-empty directory: clear out files left by an earlier run first,
        // otherwise they would be loaded alongside the data written below
        File[] leftovers = f.listFiles();
        if (leftovers != null) {
            for (File leftover : leftovers) {
                leftover.delete();
            }
        }
        f.delete();
    }
    f.mkdir();
    DataSetIterator iter = new IrisDataSetIterator(dataSetObjSize, 150);
    int i = 0;
    while (iter.hasNext()) {
        File f2 = new File(FilenameUtils.concat(path, "data" + (i++) + ".bin"));
        iter.next().save(f2);
    }
    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
                    .updater(Updater.RMSPROP)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                    .iterations(1)
                    .graphBuilder()
                    .addInputs("in")
                    .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(4).nOut(3)
                                    .activation(Activation.TANH).build(), "in")
                    .addLayer("1", new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                    LossFunctions.LossFunction.MCXENT).nIn(3).nOut(3)
                                                    .activation(Activation.SOFTMAX).build(), "0")
                    .setOutputs("1")
                    .pretrain(false)
                    .backprop(true)
                    .build();
    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
                    new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                                    .batchSizePerWorker(batchSizePerExecutor)
                                    .averagingFrequency(1)
                                    .repartionData(Repartition.Always)
                                    .build());
    sparkNet.setCollectTrainingStats(true);
    //Backslashes (Windows paths) are converted to forward slashes to form the file URI
    sparkNet.fit("file:///" + path.replaceAll("\\\\", "/"));
    SparkTrainingStats sts = sparkNet.getSparkTrainingStats();
    //4 'fits' per averaging (4 executors, 1 averaging freq); 10 examples each -> 40 examples per fit. 150/40 = 3 averagings (round down); 3*4 = 12
    int expNumFits = 12;
    //Unfortunately: perfect partitioning isn't guaranteed by SparkUtils.balancedRandomSplit (esp. if original partitions are all size 1
    // which appears to be occurring at least some of the time), but we should get close to what we expect...
    assertTrue(Math.abs(expNumFits - sts.getValue("ParameterAveragingWorkerFitTimesMs").size()) < 3);
    assertEquals(3, sts.getValue("ParameterAveragingMasterMapPartitionsTimesMs").size());
}
Also used : SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) ParameterAveragingTrainingMaster(org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster) SparkTrainingStats(org.deeplearning4j.spark.api.stats.SparkTrainingStats) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) File(java.io.File) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) PortableDataStreamDataSetIterator(org.deeplearning4j.spark.iterator.PortableDataStreamDataSetIterator) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 7 with SparkComputationGraph

use of org.deeplearning4j.spark.impl.graph.SparkComputationGraph in project deeplearning4j by deeplearning4j.

the class TestPreProcessedData method testPreprocessedDataCompGraphMultiDataSet.

/**
 * Tests _loading_ of preprocessed MultiDataSet data: every batch produced by an
 * {@code IrisDataSetIterator} is wrapped in a {@code MultiDataSet} and saved to its own file
 * in a temp directory, a {@code SparkComputationGraph} is fitted from that directory path, and
 * the collected training stats are checked against the expected number of worker fits and
 * averagings.
 *
 * @throws IOException declared because of the {@code MultiDataSet} save call
 */
@Test
public void testPreprocessedDataCompGraphMultiDataSet() throws IOException {
    //Test _loading_ of preprocessed MultiDataSet data
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 10;
    String path = FilenameUtils.concat(System.getProperty("java.io.tmpdir"), "dl4j_testpreprocdata3");
    File f = new File(path);
    if (f.exists()) {
        //File.delete() cannot remove a non-empty directory: clear out files left by an earlier run first,
        // otherwise they would be loaded alongside the data written below
        File[] leftovers = f.listFiles();
        if (leftovers != null) {
            for (File leftover : leftovers) {
                leftover.delete();
            }
        }
        f.delete();
    }
    f.mkdir();
    DataSetIterator iter = new IrisDataSetIterator(dataSetObjSize, 150);
    int i = 0;
    while (iter.hasNext()) {
        File f2 = new File(FilenameUtils.concat(path, "data" + (i++) + ".bin"));
        DataSet ds = iter.next();
        MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
        mds.save(f2);
    }
    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
                    .updater(Updater.RMSPROP)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                    .iterations(1)
                    .graphBuilder()
                    .addInputs("in")
                    .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(4).nOut(3)
                                    .activation(Activation.TANH).build(), "in")
                    .addLayer("1", new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                    LossFunctions.LossFunction.MCXENT).nIn(3).nOut(3)
                                                    .activation(Activation.SOFTMAX).build(), "0")
                    .setOutputs("1")
                    .pretrain(false)
                    .backprop(true)
                    .build();
    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
                    new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                                    .batchSizePerWorker(batchSizePerExecutor)
                                    .averagingFrequency(1)
                                    .repartionData(Repartition.Always)
                                    .build());
    sparkNet.setCollectTrainingStats(true);
    //Backslashes (Windows paths) are converted to forward slashes to form the file URI
    sparkNet.fitMultiDataSet("file:///" + path.replaceAll("\\\\", "/"));
    SparkTrainingStats sts = sparkNet.getSparkTrainingStats();
    //4 'fits' per averaging (4 executors, 1 averaging freq); 10 examples each -> 40 examples per fit. 150/40 = 3 averagings (round down); 3*4 = 12
    int expNumFits = 12;
    //Unfortunately: perfect partitioning isn't guaranteed by SparkUtils.balancedRandomSplit (esp. if original partitions are all size 1
    // which appears to be occurring at least some of the time), but we should get close to what we expect...
    assertTrue(Math.abs(expNumFits - sts.getValue("ParameterAveragingWorkerFitTimesMs").size()) < 3);
    assertEquals(3, sts.getValue("ParameterAveragingMasterMapPartitionsTimesMs").size());
}
Also used : SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) ParameterAveragingTrainingMaster(org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster) SparkTrainingStats(org.deeplearning4j.spark.api.stats.SparkTrainingStats) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) File(java.io.File) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) PortableDataStreamDataSetIterator(org.deeplearning4j.spark.iterator.PortableDataStreamDataSetIterator) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 8 with SparkComputationGraph

use of org.deeplearning4j.spark.impl.graph.SparkComputationGraph in project deeplearning4j by deeplearning4j.

the class TestSparkMultiLayerParameterAveraging method testFitViaStringPathsCompGraph.

/**
 * Tests fitting a {@code SparkComputationGraph} from an RDD of file path strings, first with
 * saved {@code DataSet} files ({@code fitPaths}) and then with saved {@code MultiDataSet}
 * files ({@code fitPathsMultiDataSet}). After each fit the network parameters must differ
 * from their values before the fit.
 *
 * @throws Exception if creating the temp directories, saving data, or listing files fails
 */
@Test
public void testFitViaStringPathsCompGraph() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPathsCG");
    Path tempDir2 = Files.createTempDirectory("DL4J-testFitViaStringPathsCG-MDS");
    File tempDirF = tempDir.toFile();
    File tempDirF2 = tempDir2.toFile();
    //Directories are registered first; deleteOnExit processes registrations in reverse order, so the
    // files registered below are removed before their (then empty) parent directories
    tempDirF.deleteOnExit();
    tempDirF2.deleteOnExit();
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        File nextFile2 = new File(tempDirF2, i + ".bin");
        //Without registering each file, the non-empty temp directories could not be deleted at JVM exit
        nextFile.deleteOnExit();
        nextFile2.deleteOnExit();
        DataSet ds = iter.next();
        MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
        ds.save(nextFile);
        mds.save(nextFile2);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    System.out.println("Saved to: " + tempDirF2.getAbsolutePath());
    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
                    .updater(Updater.RMSPROP)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                    .iterations(1)
                    .graphBuilder()
                    .addInputs("in")
                    .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                                    .activation(Activation.TANH).build(), "in")
                    .addLayer("1", new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                    LossFunctions.LossFunction.MCXENT).nIn(50).nOut(10)
                                                    .activation(Activation.SOFTMAX).build(), "0")
                    .setOutputs("1")
                    .pretrain(false)
                    .backprop(true)
                    .build();
    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
                    new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                                    .workerPrefetchNumBatches(0)
                                    .batchSizePerWorker(batchSizePerExecutor)
                                    .averagingFrequency(1)
                                    .repartionData(Repartition.Always)
                                    .build());
    sparkNet.setCollectTrainingStats(true);
    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    //dup() so the 'before' snapshot is a separate array from the network's parameters
    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
    //Same thing, but for MultiDataSet objects:
    config = new Configuration();
    hdfs = FileSystem.get(tempDir2.toUri(), config);
    fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir2.toString()), false);
    paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    paramsBefore = sparkNet.getNetwork().params().dup();
    pathRdd = sc.parallelize(paths);
    sparkNet.fitPathsMultiDataSet(pathRdd);
    paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
}
Also used : OutputLayer(org.deeplearning4j.nn.conf.layers.OutputLayer) SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) Configuration(org.apache.hadoop.conf.Configuration) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) SparkTrainingStats(org.deeplearning4j.spark.api.stats.SparkTrainingStats) FileSystem(org.apache.hadoop.fs.FileSystem) Path(java.nio.file.Path) MnistDataSetIterator(org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) File(java.io.File) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) MnistDataSetIterator(org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 9 with SparkComputationGraph

use of org.deeplearning4j.spark.impl.graph.SparkComputationGraph in project deeplearning4j by deeplearning4j.

the class TestMiscFunctions method testFeedForwardWithKeyGraph.

/**
 * Checks that {@code SparkComputationGraph.feedForwardWithKey} yields, for every key, the same
 * first output array as calling {@code outputSingle} directly on a local
 * {@code ComputationGraph} with the same pair of inputs.
 */
@Test
public void testFeedForwardWithKeyGraph() {
    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
                    .weightInit(WeightInit.XAVIER)
                    .graphBuilder()
                    .addInputs("in1", "in2")
                    .addLayer("0", new DenseLayer.Builder().nIn(4).nOut(3).build(), "in1")
                    .addLayer("1", new DenseLayer.Builder().nIn(4).nOut(3).build(), "in2")
                    .addLayer("2", new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT).nIn(6).nOut(3)
                                    .activation(Activation.SOFTMAX).build(), "0", "1")
                    .setOutputs("2")
                    .build();
    ComputationGraph net = new ComputationGraph(conf);
    net.init();

    DataSetIterator irisIter = new IrisDataSetIterator(150, 150);
    DataSet allExamples = irisIter.next();

    List<INDArray> expected = new ArrayList<>();
    List<Tuple2<Integer, INDArray[]>> keyedInputs = new ArrayList<>();
    Random rng = new Random(12345);
    int key = 0;
    int offset = 0;
    while (offset < 150) {
        //Each slice holds 1 to 5 examples (inclusive), clipped so the last slice ends at row 150
        int sliceSize = Math.min(rng.nextInt(5) + 1, 150 - offset);
        INDArray slice = allExamples.getFeatures().get(NDArrayIndex.interval(offset, offset + sliceSize),
                        NDArrayIndex.all());
        //The same slice is fed to both graph inputs, locally and in the keyed input list
        expected.add(net.outputSingle(false, slice, slice));
        keyedInputs.add(new Tuple2<>(key, new INDArray[] { slice, slice }));
        key++;
        offset += sliceSize;
    }

    JavaPairRDD<Integer, INDArray[]> rdd = sc.parallelizePairs(keyedInputs);
    SparkComputationGraph graph = new SparkComputationGraph(sc, net, null);
    Map<Integer, INDArray[]> actualByKey = graph.feedForwardWithKey(rdd, 16).collectAsMap();

    //Keys were assigned 0, 1, 2, ... in the same order the expected outputs were recorded
    int idx = 0;
    for (INDArray exp : expected) {
        INDArray act = actualByKey.get(idx)[0];
        assertEquals(exp, act);
        idx++;
    }
}
Also used : OutputLayer(org.deeplearning4j.nn.conf.layers.OutputLayer) SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSet(org.nd4j.linalg.dataset.api.DataSet) DenseLayer(org.deeplearning4j.nn.conf.layers.DenseLayer) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Tuple2(scala.Tuple2) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) ComputationGraph(org.deeplearning4j.nn.graph.ComputationGraph) SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 10 with SparkComputationGraph

use of org.deeplearning4j.spark.impl.graph.SparkComputationGraph in project deeplearning4j by deeplearning4j.

the class TestCompareParameterAveragingSparkVsSingleMachine method testOneExecutorGraph.

/**
 * Idea: a single worker/executor on Spark should give identical results to a single machine.
 * Runs once with {@code saveUpdater} true and once with it false; each run trains a local
 * {@code ComputationGraph} and a {@code SparkComputationGraph} on the same three seeded
 * minibatches and compares their initial and final parameters.
 */
@Test
public void testOneExecutorGraph() {
    final int miniBatchSize = 10;
    final int nWorkers = 1;
    final boolean[] saveUpdaterOptions = { true, false };

    for (boolean saveUpdater : saveUpdaterOptions) {
        JavaSparkContext sparkContext = getContext(nWorkers);
        try {
            int[] seeds = { 1, 2, 3 };

            //Local training: one fit per seed (3 minibatches)
            ComputationGraph localNet = new ComputationGraph(getGraphConf(12345, Updater.RMSPROP));
            localNet.init();
            INDArray initialParams = localNet.params().dup();
            for (int seed : seeds) {
                DataSet batch = getOneDataSet(miniBatchSize, seed);
                if (!saveUpdater) {
                    //Updater is cleared before every fit when it is not being saved
                    localNet.setUpdater(null);
                }
                localNet.fit(batch);
            }
            INDArray finalParams = localNet.params().dup();

            //Spark training with one executor: same 3 minibatches, each fitted separately
            TrainingMaster tm = getTrainingMaster(1, miniBatchSize, saveUpdater);
            SparkComputationGraph sparkNet =
                            new SparkComputationGraph(sparkContext, getGraphConf(12345, Updater.RMSPROP), tm);
            sparkNet.setCollectTrainingStats(true);
            INDArray initialSparkParams = sparkNet.getNetwork().params().dup();
            for (int seed : seeds) {
                List<DataSet> examples = getOneDataSetAsIndividalExamples(miniBatchSize, seed);
                JavaRDD<DataSet> examplesRdd = sparkContext.parallelize(examples);
                sparkNet.fit(examplesRdd);
            }
            INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

            assertEquals(initialParams, initialSparkParams);
            assertNotEquals(initialParams, finalParams);
            assertEquals(finalParams, finalSparkParams);
        } finally {
            //A new context is created on every loop pass, so stop this one before the next
            sparkContext.stop();
        }
    }
}
Also used : SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) INDArray(org.nd4j.linalg.api.ndarray.INDArray) DataSet(org.nd4j.linalg.dataset.DataSet) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ComputationGraph(org.deeplearning4j.nn.graph.ComputationGraph) SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) TrainingMaster(org.deeplearning4j.spark.api.TrainingMaster) Test(org.junit.Test)

Aggregations

SparkComputationGraph (org.deeplearning4j.spark.impl.graph.SparkComputationGraph)11 Test (org.junit.Test)9 ComputationGraphConfiguration (org.deeplearning4j.nn.conf.ComputationGraphConfiguration)8 DataSet (org.nd4j.linalg.dataset.DataSet)7 BaseSparkTest (org.deeplearning4j.spark.BaseSparkTest)6 INDArray (org.nd4j.linalg.api.ndarray.INDArray)6 IrisDataSetIterator (org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator)5 NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration)5 ComputationGraph (org.deeplearning4j.nn.graph.ComputationGraph)5 DataSetIterator (org.nd4j.linalg.dataset.api.iterator.DataSetIterator)5 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)4 TrainingMaster (org.deeplearning4j.spark.api.TrainingMaster)4 SparkTrainingStats (org.deeplearning4j.spark.api.stats.SparkTrainingStats)4 MultiDataSet (org.nd4j.linalg.dataset.MultiDataSet)4 File (java.io.File)3 LabeledPoint (org.apache.spark.mllib.regression.LabeledPoint)3 OutputLayer (org.deeplearning4j.nn.conf.layers.OutputLayer)3 ParameterAveragingTrainingMaster (org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster)3 MnistDataSetIterator (org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator)2 MultiLayerConfiguration (org.deeplearning4j.nn.conf.MultiLayerConfiguration)2