
Example 1 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class BaseOptimizer method optimize.

/**
 * Optimize call. This runs the optimizer.
 * @return whether it converged or not
 */
// TODO add flag to allow retaining state between mini batches and when to apply updates
@Override
public boolean optimize() {
    //validate the input before training
    INDArray gradient;
    INDArray searchDirection;
    INDArray parameters;
    Pair<Gradient, Double> pair = gradientAndScore();
    if (searchState.isEmpty()) {
        searchState.put(GRADIENT_KEY, pair.getFirst().gradient());
        //Only do this once
        setupSearchState(pair);
    } else {
        searchState.put(GRADIENT_KEY, pair.getFirst().gradient());
    }
    /*
     * Commented out for now; this has been problematic for testing/debugging
     * Revisit & re-enable later.
     */
    for (TerminationCondition condition : terminationConditions) {
        if (condition.terminate(0.0, 0.0, new Object[] { pair.getFirst().gradient() })) {
            log.info("Hit termination condition " + condition.getClass().getName());
            return true;
        }
    }
    //calculate initial search direction
    preProcessLine();
    for (int i = 0; i < conf.getNumIterations(); i++) {
        gradient = (INDArray) searchState.get(GRADIENT_KEY);
        searchDirection = (INDArray) searchState.get(SEARCH_DIR);
        parameters = (INDArray) searchState.get(PARAMS_KEY);
        //perform one line search optimization
        try {
            step = lineMaximizer.optimize(parameters, gradient, searchDirection);
        } catch (InvalidStepException e) {
            log.warn("Invalid step...continuing another iteration: {}", e.getMessage());
            step = 0.0;
        }
        //Update parameters based on final/best step size returned by line search:
        if (step != 0.0) {
            //Calculate params. given step size
            stepFunction.step(parameters, searchDirection, step);
            model.setParams(parameters);
        } else {
            log.debug("Step size returned by line search is 0.0.");
        }
        pair = gradientAndScore();
        //updates searchDirection
        postStep(pair.getFirst().gradient());
        //invoke listeners
        int iterationCount = BaseOptimizer.getIterationCount(model);
        for (IterationListener listener : iterationListeners) listener.iterationDone(model, iterationCount);
        //check for termination conditions based on absolute change in score
        checkTerminalConditions(pair.getFirst().gradient(), oldScore, score, i);
        incrementIterationCount(model, 1);
    }
    return true;
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) InvalidStepException(org.deeplearning4j.exception.InvalidStepException)
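
For orientation, a Gradient is essentially a map from variable names to INDArray gradients, plus a flattened view over all of them. Below is a minimal sketch, assuming only the accessors that appear in the examples on this page (setGradientFor, gradientForVariable, getGradientFor, gradient); the variable names and shapes are illustrative, not taken from the project.

import java.util.Arrays;

import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GradientSketch {
    public static void main(String[] args) {
        //Hypothetical gradient arrays; shapes chosen only for illustration
        INDArray weightGrad = Nd4j.rand(4, 3);
        INDArray biasGrad = Nd4j.rand(1, 3);

        Gradient g = new DefaultGradient();
        //Store each gradient under its variable name
        g.setGradientFor("W", weightGrad);
        g.setGradientFor("b", biasGrad);

        //gradientForVariable() exposes the per-variable map the updater examples iterate over
        for (String var : g.gradientForVariable().keySet()) {
            System.out.println(var + " -> " + Arrays.toString(g.getGradientFor(var).shape()));
        }

        //gradient() returns the gradients flattened into a single array, which is what
        //BaseOptimizer stores under GRADIENT_KEY above
        INDArray flat = g.gradient();
        System.out.println("Flattened gradient length: " + flat.length());
    }
}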

Example 2 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class MultiLayerUpdater method update.

@Override
public void update(Layer layer, Gradient gradient, int iteration, int batchSize) {
    MultiLayerNetwork mln = (MultiLayerNetwork) layer;
    Gradient[] layerGradients = new Gradient[layerUpdaters.length];
    for (int i = 0; i < layerGradients.length; i++) layerGradients[i] = new DefaultGradient();
    for (Map.Entry<String, INDArray> gradientPair : gradient.gradientForVariable().entrySet()) {
        String key = gradientPair.getKey();
        int idx = key.indexOf('_');
        if (idx == -1)
            throw new IllegalStateException("Invalid key: MultiLayerNetwork Gradient key does not have layer separator: \"" + key + "\"");
        int layerIdx = Integer.parseInt(key.substring(0, idx));
        String newKey = key.substring(idx + 1);
        layerGradients[layerIdx].gradientForVariable().put(newKey, gradientPair.getValue());
    }
    for (int i = 0; i < layerUpdaters.length; i++) {
        layerUpdaters[i].update(mln.getLayer(i), layerGradients[i], iteration, batchSize);
    }
}
Also used : DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork) Map(java.util.Map)
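
The update method above relies on MultiLayerNetwork gradient keys following a "layerIndex_variableName" convention (for example "0_W" or "1_b") and splits on the first underscore. A small sketch of that key handling, using illustrative keys and shapes rather than values from the project:

import java.util.Arrays;
import java.util.Map;

import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GradientKeySplitSketch {
    public static void main(String[] args) {
        Gradient netGradient = new DefaultGradient();
        //Illustrative keys: "<layer index>_<variable name>"
        netGradient.setGradientFor("0_W", Nd4j.rand(4, 3));
        netGradient.setGradientFor("0_b", Nd4j.rand(1, 3));
        netGradient.setGradientFor("1_W", Nd4j.rand(3, 2));

        for (Map.Entry<String, INDArray> e : netGradient.gradientForVariable().entrySet()) {
            String key = e.getKey();
            //The first underscore separates the layer index from the per-layer variable name
            int idx = key.indexOf('_');
            int layerIdx = Integer.parseInt(key.substring(0, idx));
            String varName = key.substring(idx + 1);
            System.out.println("layer " + layerIdx + ", variable " + varName
                    + ", gradient shape " + Arrays.toString(e.getValue().shape()));
        }
    }
}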

Example 3 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class TestCompGraphCNN method testBackwardIrisBasic.

@Test
public void testBackwardIrisBasic() {
    //Now: set parameters of both networks to be identical. Then feedforward, and check we get the same outputs
    Nd4j.getRandom().setSeed(12345);
    INDArray input = ds.getFeatureMatrix();
    INDArray labels = ds.getLabels();
    graph.setInput(0, input.dup());
    graph.setLabel(0, labels.dup());
    //Compute gradients
    graph.computeGradientAndScore();
    Pair<Gradient, Double> graphGradScore = graph.gradientAndScore();
    // Check gradients
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Test(org.junit.Test)
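
The test above stops at the placeholder comment, so the actual assertions are not shown. One way such a check could look (a sketch, not the project's assertion code) is a basic sanity pass over the per-variable gradients:

import java.util.Map;

import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;

public class GradientSanityCheck {
    /** Returns true if every per-variable gradient is present, non-empty and finite. */
    public static boolean allFinite(Gradient g) {
        for (Map.Entry<String, INDArray> e : g.gradientForVariable().entrySet()) {
            INDArray grad = e.getValue();
            if (grad == null || grad.length() == 0) {
                return false;
            }
            //A simple sum is a cheap, version-portable way to surface NaN/Inf entries
            double sum = grad.sumNumber().doubleValue();
            if (Double.isNaN(sum) || Double.isInfinite(sum)) {
                return false;
            }
        }
        return true;
    }
}

In the test this could be invoked as assertTrue(GradientSanityCheck.allFinite(graphGradScore.getFirst())); the helper name is hypothetical.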

Example 4 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class TestVariableLengthTSCG method testVariableLengthSimple.

@Test
public void testVariableLengthSimple() {
    //Test: Simple RNN layer + RNNOutputLayer
    //Length of 4 for standard
    //Length of 5 with last time step output mask set to 0
    //Expect the same gradients etc in both cases...
    int[] miniBatchSizes = { 1, 2, 5 };
    int nOut = 1;
    Random r = new Random(12345);
    for (int nExamples : miniBatchSizes) {
        Nd4j.getRandom().setSeed(12345);
        ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1).updater(Updater.SGD).learningRate(0.1).seed(12345).graphBuilder().addInputs("in").addLayer("0", new GravesLSTM.Builder().activation(Activation.TANH).nIn(2).nOut(2).build(), "in").addLayer("1", new RnnOutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MSE).nIn(2).nOut(1).build(), "0").setOutputs("1").build();
        ComputationGraph net = new ComputationGraph(conf);
        net.init();
        INDArray in1 = Nd4j.rand(new int[] { nExamples, 2, 4 });
        INDArray in2 = Nd4j.rand(new int[] { nExamples, 2, 5 });
        in2.put(new INDArrayIndex[] { NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.interval(0, 3, true) }, in1);
        assertEquals(in1, in2.get(NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.interval(0, 4)));
        INDArray labels1 = Nd4j.rand(new int[] { nExamples, 1, 4 });
        INDArray labels2 = Nd4j.create(nExamples, 1, 5);
        labels2.put(new INDArrayIndex[] { NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.interval(0, 3, true) }, labels1);
        assertEquals(labels1, labels2.get(NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.interval(0, 4)));
        INDArray labelMask = Nd4j.ones(nExamples, 5);
        for (int j = 0; j < nExamples; j++) {
            labelMask.putScalar(new int[] { j, 4 }, 0);
        }
        net.setInput(0, in1);
        net.setLabel(0, labels1);
        net.computeGradientAndScore();
        double score1 = net.score();
        Gradient g1 = net.gradient();
        net.setInput(0, in2);
        net.setLabel(0, labels2);
        net.setLayerMaskArrays(null, new INDArray[] { labelMask });
        net.computeGradientAndScore();
        double score2 = net.score();
        Gradient g2 = net.gradient();
        //Scores and gradients should be identical for two cases (given mask array)
        assertEquals(score1, score2, 0.0);
        Map<String, INDArray> g1map = g1.gradientForVariable();
        Map<String, INDArray> g2map = g2.gradientForVariable();
        for (String s : g1map.keySet()) {
            INDArray g1s = g1map.get(s);
            INDArray g2s = g2map.get(s);
            assertEquals(s, g1s, g2s);
        }
        //Modify values at the masked (5th) time step, and check that neither (a) the score nor (b) the gradients change
        for (int i = 0; i < nExamples; i++) {
            for (int j = 0; j < nOut; j++) {
                double d = r.nextDouble();
                labels2.putScalar(new int[] { i, j, 4 }, d);
            }
            net.setLabel(0, labels2);
            net.computeGradientAndScore();
            double score2a = net.score();
            Gradient g2a = net.gradient();
            assertEquals(score2, score2a, 0.0);
            for (String s : g2map.keySet()) {
                INDArray g2s = g2map.get(s);
                INDArray g2sa = g2a.getGradientFor(s);
                assertEquals(s, g2s, g2sa);
            }
        }
    }
}
Also used : RnnOutputLayer(org.deeplearning4j.nn.conf.layers.RnnOutputLayer) Gradient(org.deeplearning4j.nn.gradient.Gradient) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) Random(java.util.Random) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) Test(org.junit.Test)

Example 5 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class GradientCheckUtil method checkGradients.

/**
 * Check backprop gradients for a MultiLayerNetwork.
 * @param mln MultiLayerNetwork to test. This must be initialized.
 * @param epsilon Usually on the order of 1e-4 or so.
 * @param maxRelError Maximum relative error. Usually < 1e-5 or so, though may be more for deep networks or those with nonlinear activation
 * @param minAbsoluteError Minimum absolute error to cause a failure. Numerical gradients can be non-zero due to precision issues.
 *                         For example, 0.0 vs. 1e-18: relative error is 1.0, but not really a failure
 * @param print Whether to print full pass/failure details for each parameter gradient
 * @param exitOnFirstError If true: return upon first failure. If false: continue checking even if
 *                         one parameter gradient has failed. Typically use false for debugging, true for unit tests.
 * @param input Input array to use for forward pass. May be mini-batch data.
 * @param labels Labels/targets to use to calculate backprop gradient. May be mini-batch data.
 * @return true if gradients pass the check, false otherwise.
 */
public static boolean checkGradients(MultiLayerNetwork mln, double epsilon, double maxRelError, double minAbsoluteError, boolean print, boolean exitOnFirstError, INDArray input, INDArray labels) {
    //Basic sanity checks on input:
    if (epsilon <= 0.0 || epsilon > 0.1)
        throw new IllegalArgumentException("Invalid epsilon: expect epsilon in range (0,0.1], usually 1e-4 or so");
    if (maxRelError <= 0.0 || maxRelError > 0.25)
        throw new IllegalArgumentException("Invalid maxRelativeError: " + maxRelError);
    if (!(mln.getOutputLayer() instanceof IOutputLayer))
        throw new IllegalArgumentException("Cannot check backprop gradients without OutputLayer");
    //Check network configuration:
    int layerCount = 0;
    for (NeuralNetConfiguration n : mln.getLayerWiseConfigurations().getConfs()) {
        org.deeplearning4j.nn.conf.Updater u = n.getLayer().getUpdater();
        if (u == org.deeplearning4j.nn.conf.Updater.SGD) {
            //Must have LR of 1.0
            double lr = n.getLayer().getLearningRate();
            if (lr != 1.0) {
                throw new IllegalStateException("When using SGD updater, must also use lr=1.0 for layer " + layerCount + "; got " + u + " with lr=" + lr + " for layer \"" + n.getLayer().getLayerName() + "\"");
            }
        } else if (u != org.deeplearning4j.nn.conf.Updater.NONE) {
            throw new IllegalStateException("Must have Updater.NONE (or SGD + lr=1.0) for layer " + layerCount + "; got " + u);
        }
        double dropout = n.getLayer().getDropOut();
        if (n.isUseRegularization() && dropout != 0.0) {
            throw new IllegalStateException("Must have dropout == 0.0 for gradient checks - got dropout = " + dropout + " for layer " + layerCount);
        }
        IActivation activation = n.getLayer().getActivationFn();
        if (activation != null) {
            if (!VALID_ACTIVATION_FUNCTIONS.contains(activation.getClass())) {
                log.warn("Layer " + layerCount + " is possibly using an unsuitable activation function: " + activation.getClass() + ". Activation functions for gradient checks must be smooth (like sigmoid, tanh, softmax) and not " + "contain discontinuities like ReLU or LeakyReLU (these may cause spurious failures)");
            }
        }
    }
    mln.setInput(input);
    mln.setLabels(labels);
    mln.computeGradientAndScore();
    Pair<Gradient, Double> gradAndScore = mln.gradientAndScore();
    Updater updater = UpdaterCreator.getUpdater(mln);
    updater.update(mln, gradAndScore.getFirst(), 0, mln.batchSize());
    //need dup: gradients are a *view* of the full gradient array (which will change every time backprop is done)
    INDArray gradientToCheck = gradAndScore.getFirst().gradient().dup();
    //need dup: params are a *view* of full parameters
    INDArray originalParams = mln.params().dup();
    int nParams = originalParams.length();
    Map<String, INDArray> paramTable = mln.paramTable();
    List<String> paramNames = new ArrayList<>(paramTable.keySet());
    int[] paramEnds = new int[paramNames.size()];
    paramEnds[0] = paramTable.get(paramNames.get(0)).length();
    for (int i = 1; i < paramEnds.length; i++) {
        paramEnds[i] = paramEnds[i - 1] + paramTable.get(paramNames.get(i)).length();
    }
    int totalNFailures = 0;
    double maxError = 0.0;
    DataSet ds = new DataSet(input, labels);
    int currParamNameIdx = 0;
    //Assumption here: params is a view that we can modify in-place
    INDArray params = mln.params();
    for (int i = 0; i < nParams; i++) {
        //Get param name
        if (i >= paramEnds[currParamNameIdx]) {
            currParamNameIdx++;
        }
        String paramName = paramNames.get(currParamNameIdx);
        //(w+epsilon): Do forward pass and score
        double origValue = params.getDouble(i);
        params.putScalar(i, origValue + epsilon);
        double scorePlus = mln.score(ds, true);
        //(w-epsilon): Do forward pass and score
        params.putScalar(i, origValue - epsilon);
        double scoreMinus = mln.score(ds, true);
        //Reset original param value
        params.putScalar(i, origValue);
        //Calculate numerical parameter gradient:
        double scoreDelta = scorePlus - scoreMinus;
        double numericalGradient = scoreDelta / (2 * epsilon);
        if (Double.isNaN(numericalGradient))
            throw new IllegalStateException("Numerical gradient was NaN for parameter " + i + " of " + nParams);
        double backpropGradient = gradientToCheck.getDouble(i);
        //http://cs231n.github.io/neural-networks-3/#gradcheck
        //use mean centered
        double relError = Math.abs(backpropGradient - numericalGradient) / (Math.abs(numericalGradient) + Math.abs(backpropGradient));
        //Edge case, e.g. RNNs with a time series length of 1: both gradients exactly 0.0
        if (backpropGradient == 0.0 && numericalGradient == 0.0)
            relError = 0.0;
        if (relError > maxError)
            maxError = relError;
        if (relError > maxRelError || Double.isNaN(relError)) {
            double absError = Math.abs(backpropGradient - numericalGradient);
            if (absError < minAbsoluteError) {
                log.info("Param " + i + " (" + paramName + ") passed: grad= " + backpropGradient + ", numericalGrad= " + numericalGradient + ", relError= " + relError + "; absolute error = " + absError + " < minAbsoluteError = " + minAbsoluteError);
            } else {
                if (print)
                    log.info("Param " + i + " (" + paramName + ") FAILED: grad= " + backpropGradient + ", numericalGrad= " + numericalGradient + ", relError= " + relError + ", scorePlus=" + scorePlus + ", scoreMinus= " + scoreMinus);
                if (exitOnFirstError)
                    return false;
                totalNFailures++;
            }
        } else if (print) {
            log.info("Param " + i + " (" + paramName + ") passed: grad= " + backpropGradient + ", numericalGrad= " + numericalGradient + ", relError= " + relError);
        }
    }
    if (print) {
        int nPass = nParams - totalNFailures;
        log.info("GradientCheckUtil.checkGradients(): " + nParams + " params checked, " + nPass + " passed, " + totalNFailures + " failed. Largest relative error = " + maxError);
    }
    return totalNFailures == 0;
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DataSet(org.nd4j.linalg.dataset.DataSet) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) ArrayList(java.util.ArrayList) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) IActivation(org.nd4j.linalg.activations.IActivation) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ComputationGraphUpdater(org.deeplearning4j.nn.updater.graph.ComputationGraphUpdater) Updater(org.deeplearning4j.nn.api.Updater) IOutputLayer(org.deeplearning4j.nn.api.layers.IOutputLayer)
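
The numerical core of checkGradients is a central-difference estimate compared against the backprop gradient using a symmetric ("mean centered") relative error, per the cs231n reference in the code. A standalone sketch of just that arithmetic, with hypothetical score values:

public class NumericalGradientSketch {
    /** Central-difference estimate: (f(w + eps) - f(w - eps)) / (2 * eps). */
    public static double numericalGradient(double scorePlus, double scoreMinus, double epsilon) {
        return (scorePlus - scoreMinus) / (2.0 * epsilon);
    }

    /** Symmetric relative error between the backprop and numerical gradients. */
    public static double relativeError(double backpropGradient, double numericalGradient) {
        //Edge case from the method above: both exactly zero counts as a perfect match
        if (backpropGradient == 0.0 && numericalGradient == 0.0) {
            return 0.0;
        }
        return Math.abs(backpropGradient - numericalGradient)
                / (Math.abs(numericalGradient) + Math.abs(backpropGradient));
    }

    public static void main(String[] args) {
        double epsilon = 1e-4;
        //Hypothetical scores evaluated at w + epsilon and w - epsilon
        double numerical = numericalGradient(0.523412, 0.523370, epsilon);
        System.out.println("numerical gradient = " + numerical
                + ", relative error vs. 0.21 = " + relativeError(0.21, numerical));
    }
}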

Aggregations

Gradient (org.deeplearning4j.nn.gradient.Gradient) 105
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 100
DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient) 72
Test (org.junit.Test) 52
NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration) 35
Pair (org.deeplearning4j.berkeley.Pair) 28
Layer (org.deeplearning4j.nn.api.Layer) 28
Updater (org.deeplearning4j.nn.api.Updater) 25
DenseLayer (org.deeplearning4j.nn.conf.layers.DenseLayer) 24
OutputLayer (org.deeplearning4j.nn.conf.layers.OutputLayer) 21
MultiLayerConfiguration (org.deeplearning4j.nn.conf.MultiLayerConfiguration) 9
MultiLayerNetwork (org.deeplearning4j.nn.multilayer.MultiLayerNetwork) 8
IActivation (org.nd4j.linalg.activations.IActivation) 6
HashMap (java.util.HashMap) 5
DataSetIterator (org.nd4j.linalg.dataset.api.iterator.DataSetIterator) 5
ArrayList (java.util.ArrayList) 4
IrisDataSetIterator (org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) 4
DL4JInvalidInputException (org.deeplearning4j.exception.DL4JInvalidInputException) 4
IOutputLayer (org.deeplearning4j.nn.api.layers.IOutputLayer) 4
ComputationGraphConfiguration (org.deeplearning4j.nn.conf.ComputationGraphConfiguration) 4