Example 11 with DefaultGradient

Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

From class TestDecayPolicies, method testLearningRateScheduleSingleLayer:

@Test
public void testLearningRateScheduleSingleLayer() {
    Map<Integer, Double> learningRateAfter = new HashMap<>();
    learningRateAfter.put(1, 0.2);
    int iterations = 2;
    for (org.deeplearning4j.nn.conf.Updater updaterFunc : updaters) {
        double lr = 1e-2;
        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                .learningRate(lr)
                .learningRateSchedule(learningRateAfter)
                .learningRateDecayPolicy(LearningRatePolicy.Schedule)
                .iterations(iterations)
                .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).updater(updaterFunc).build())
                .build();
        int numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true);
        Updater updater = UpdaterCreator.getUpdater(layer);
        int stateSize = updater.stateSizeForLayer(layer);
        if (stateSize > 0)
            updater.setStateViewArray(layer, Nd4j.create(1, stateSize), true);
        Gradient gradientActual = new DefaultGradient();
        gradientActual.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
        gradientActual.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());
        Gradient gradientExpected = new DefaultGradient();
        gradientExpected.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
        gradientExpected.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());
        for (int i = 0; i < 2; i++) {
            updater.update(layer, gradientActual, i, 1);
            if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.SGD))
                lr = testSGDComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            else if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.ADAGRAD))
                lr = testAdaGradComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            else if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.ADAM))
                lr = testAdamComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            else if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.RMSPROP))
                lr = testRMSPropComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            assertEquals(lr, layer.conf().getLearningRateByParam("W"), 1e-4);
        }
    }
}
Also used: Gradient (org.deeplearning4j.nn.gradient.Gradient), DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient), HashMap (java.util.HashMap), NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration), Layer (org.deeplearning4j.nn.api.Layer), OutputLayer (org.deeplearning4j.nn.conf.layers.OutputLayer), DenseLayer (org.deeplearning4j.nn.conf.layers.DenseLayer), INDArray (org.nd4j.linalg.api.ndarray.INDArray), Updater (org.deeplearning4j.nn.api.Updater), Test (org.junit.Test)
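
For readers unfamiliar with the Schedule decay policy exercised above, the test asserts that the learning rate stays at its initial value (1e-2) until iteration 1 and then switches to the scheduled value (0.2). The following standalone sketch only illustrates that expected behaviour; the helper expectedLr is hypothetical and not part of deeplearning4j.

// Minimal sketch (assumption, not deeplearning4j code): how a Schedule
// learning-rate policy is expected to resolve the LR per iteration.
import java.util.HashMap;
import java.util.Map;

public class LearningRateScheduleSketch {

    // Hypothetical helper: start from the initial LR and apply every
    // scheduled override whose iteration has already been reached.
    static double expectedLr(double initialLr, Map<Integer, Double> schedule, int iteration) {
        double lr = initialLr;
        for (int i = 0; i <= iteration; i++) {
            if (schedule.containsKey(i)) {
                lr = schedule.get(i);
            }
        }
        return lr;
    }

    public static void main(String[] args) {
        Map<Integer, Double> learningRateAfter = new HashMap<>();
        learningRateAfter.put(1, 0.2);
        System.out.println(expectedLr(1e-2, learningRateAfter, 0)); // 0.01
        System.out.println(expectedLr(1e-2, learningRateAfter, 1)); // 0.2, as asserted in the test
    }
}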

Example 12 with DefaultGradient

Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

From class TestGradientNormalization, method testL2ClippingPerLayer:

@Test
public void testL2ClippingPerLayer() {
    Nd4j.getRandom().setSeed(12345);
    double threshold = 3;
    for (int t = 0; t < 2; t++) {
        //t=0: small -> no clipping
        //t=1: large -> clipping
        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                .layer(new DenseLayer.Builder().nIn(10).nOut(20)
                        .updater(org.deeplearning4j.nn.conf.Updater.NONE)
                        .gradientNormalization(GradientNormalization.ClipL2PerLayer)
                        .gradientNormalizationThreshold(threshold).build())
                .build();
        int numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true);
        Updater updater = UpdaterCreator.getUpdater(layer);
        INDArray weightGrad = Nd4j.rand(10, 20).muli((t == 0 ? 0.05 : 10));
        INDArray biasGrad = Nd4j.rand(1, 10).muli((t == 0 ? 0.05 : 10));
        INDArray weightGradCopy = weightGrad.dup();
        INDArray biasGradCopy = biasGrad.dup();
        Gradient gradient = new DefaultGradient();
        gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);
        double layerGradL2 = gradient.gradient().norm2Number().doubleValue();
        if (t == 0)
            assertTrue(layerGradL2 < threshold);
        else
            assertTrue(layerGradL2 > threshold);
        updater.update(layer, gradient, 0, 1);
        if (t == 0) {
            //norm2 < threshold -> no change
            assertEquals(weightGradCopy, weightGrad);
            assertEquals(biasGradCopy, biasGrad);
            continue;
        } else {
            //norm2 > threshold -> rescale
            assertNotEquals(weightGradCopy, weightGrad);
            assertNotEquals(biasGradCopy, biasGrad);
        }
        //for above threshold only...
        double scalingFactor = threshold / layerGradL2;
        INDArray expectedWeightGrad = weightGradCopy.mul(scalingFactor);
        INDArray expectedBiasGrad = biasGradCopy.mul(scalingFactor);
        assertEquals(expectedWeightGrad, gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY));
        assertEquals(expectedBiasGrad, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
    }
}
Also used: DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient), Gradient (org.deeplearning4j.nn.gradient.Gradient), INDArray (org.nd4j.linalg.api.ndarray.INDArray), Updater (org.deeplearning4j.nn.api.Updater), NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration), Layer (org.deeplearning4j.nn.api.Layer), DenseLayer (org.deeplearning4j.nn.conf.layers.DenseLayer), Test (org.junit.Test)
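
The assertions above boil down to a simple rule: if the per-layer L2 norm of the gradient exceeds the threshold, every gradient array in the layer is rescaled by threshold / norm2; otherwise it is left untouched. A minimal sketch of that rule on a plain double[] follows (an illustration only, not the updater implementation).

// Minimal sketch (assumption): the ClipL2PerLayer rule verified by the test above.
public class ClipL2PerLayerSketch {

    static void clipL2PerLayer(double[] grad, double threshold) {
        double sumSq = 0.0;
        for (double g : grad) {
            sumSq += g * g;
        }
        double l2 = Math.sqrt(sumSq);
        if (l2 > threshold) {
            double scale = threshold / l2; // same scaling factor computed in the test
            for (int i = 0; i < grad.length; i++) {
                grad[i] *= scale;
            }
        }
        // l2 <= threshold: gradient is unchanged, matching the t == 0 branch
    }

    public static void main(String[] args) {
        double[] grad = {3.0, 4.0};    // L2 norm = 5
        clipL2PerLayer(grad, 3.0);     // rescaled to norm 3 -> {1.8, 2.4}
        System.out.println(grad[0] + ", " + grad[1]);
    }
}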

Example 13 with DefaultGradient

Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

From class CudnnBatchNormalizationHelper, method backpropGradient:

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray input, INDArray epsilon, int[] shape, INDArray gamma, INDArray dGammaView, INDArray dBetaView, double eps) {
    if (eps < CUDNN_BN_MIN_EPSILON) {
        throw new IllegalArgumentException("Error: eps < CUDNN_BN_MIN_EPSILON (" + eps + " < " + CUDNN_BN_MIN_EPSILON + ")");
    }
    int miniBatch = input.size(0);
    int depth = input.size(1);
    int inH = input.size(2);
    int inW = input.size(3);
    Gradient retGradient = new DefaultGradient();
    if (!Shape.strideDescendingCAscendingF(epsilon)) {
        // apparently not supported by cuDNN
        epsilon = epsilon.dup();
    }
    int[] srcStride = input.stride();
    int[] deltaStride = epsilon.stride();
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, miniBatch, depth, inH, inW, srcStride[0], srcStride[1], srcStride[2], srcStride[3]));
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.deltaTensorDesc, dataType, miniBatch, depth, inH, inW, deltaStride[0], deltaStride[1], deltaStride[2], deltaStride[3]));
    INDArray nextEpsilon = Nd4j.createUninitialized(new int[] { miniBatch, depth, inH, inW }, 'c');
    int[] dstStride = nextEpsilon.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, miniBatch, depth, inH, inW, dstStride[0], dstStride[1], dstStride[2], dstStride[3]));
    int[] gammaStride = gamma.stride();
    checkCudnn(cudnnSetTensor4dDescriptor(cudnnContext.gammaBetaTensorDesc, tensorFormat, dataType, shape[0], shape[1], shape.length > 2 ? shape[2] : 1, shape.length > 3 ? shape[3] : 1));
    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareActionAllWrite(input, epsilon, nextEpsilon, gamma, dGammaView, dBetaView);
    Pointer srcData = allocator.getPointer(input, context);
    Pointer epsData = allocator.getPointer(epsilon, context);
    Pointer dstData = allocator.getPointer(nextEpsilon, context);
    Pointer gammaData = allocator.getPointer(gamma, context);
    Pointer dGammaData = allocator.getPointer(dGammaView, context);
    Pointer dBetaData = allocator.getPointer(dBetaView, context);
    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(context.getOldStream())));
    checkCudnn(cudnnBatchNormalizationBackward(cudnnContext, batchNormMode, alpha, beta, alpha, alpha, cudnnContext.srcTensorDesc, srcData, cudnnContext.deltaTensorDesc, epsData, cudnnContext.dstTensorDesc, dstData, cudnnContext.gammaBetaTensorDesc, gammaData, dGammaData, dBetaData, eps, meanCache, varCache));
    allocator.getFlowController().registerActionAllWrite(context, input, epsilon, nextEpsilon, gamma, dGammaView, dBetaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
    return new Pair<>(retGradient, nextEpsilon);
}
Also used: AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator), Allocator (org.nd4j.jita.allocator.Allocator), Gradient (org.deeplearning4j.nn.gradient.Gradient), DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient), GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner), INDArray (org.nd4j.linalg.api.ndarray.INDArray), CudaContext (org.nd4j.linalg.jcublas.context.CudaContext), DoublePointer (org.bytedeco.javacpp.DoublePointer), FloatPointer (org.bytedeco.javacpp.FloatPointer), ShortPointer (org.bytedeco.javacpp.ShortPointer), Pointer (org.bytedeco.javacpp.Pointer), Pair (org.deeplearning4j.berkeley.Pair)

Example 14 with DefaultGradient

Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

From class BatchNormalization, method backpropGradient:

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    INDArray nextEpsilon;
    int[] shape = getShape(epsilon);
    // number of examples in the batch
    int batchSize = epsilon.size(0);
    org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
    INDArray gamma = null;
    INDArray dGammaView;
    INDArray dBetaView;
    INDArray dGlobalMeanView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_MEAN);
    INDArray dGlobalVarView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_VAR);
    if (layerConf.isLockGammaBeta()) {
        int[] tempShape = new int[] { 1, shape[1] };
        dGammaView = Nd4j.createUninitialized(tempShape, 'c');
        dBetaView = Nd4j.createUninitialized(tempShape, 'c');
    } else {
        gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
        dGammaView = gradientViews.get(BatchNormalizationParamInitializer.GAMMA);
        dBetaView = gradientViews.get(BatchNormalizationParamInitializer.BETA);
    }
    Gradient retGradient = new DefaultGradient();
    if (helper != null && epsilon.rank() == 4) {
        //Note that cudnn does not support dense (2d) batch norm case as of v5.1
        if (layerConf.isLockGammaBeta()) {
            gamma = Nd4j.valueArrayOf(new int[] { 1, shape[1] }, layerConf.getGamma());
        }
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, shape, gamma, dGammaView, dBetaView, layerConf.getEps());
        if (ret != null) {
            return ret;
        }
    }
    if (epsilon.rank() == 2) {
        //TODO: handle fixed beta/gamma case...
        //dL/dGamma = sum_examples dL/dOut .* xHat
        INDArray dGamma = epsilon.mul(xHat).sum(0);
        //dL/dBeta = sum_examples dL/dOut
        INDArray dBeta = epsilon.sum(0);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            //dL/dxHat = dL/dOut . gamma        Shape: [minibatchSize, nOut]
            dxhat = epsilon.mulRowVector(gamma);
        }
        //dL/dVariance
        //Shape: [1, miniBatch]
        INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
        //Shape: [1, nOut]
        INDArray dLdmu = dxmu1.addi(dxmu2);
        //Note the array reuse here: dxhat, xMu, dLdVar, dLdmu - all are invalid after this line (but aren't used later anyway)
        INDArray dLdx = dxhat.diviRowVector(std).addi(xMu.muliRowVector(dLdVar.muli(2.0 / batchSize))).addiRowVector(dLdmu.muli(1.0 / batchSize));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else if (epsilon.rank() == 4) {
        INDArray dGamma = epsilon.mul(xHat).sum(0, 2, 3);
        INDArray dBeta = epsilon.sum(0, 2, 3);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            dxhat = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(epsilon, gamma, Nd4j.createUninitialized(epsilon.shape(), epsilon.ordering()), 1));
        }
        //dL/dVariance
        INDArray dLdVar = dxhat.mul(xMu).sum(0, 2, 3).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        int effectiveBatchSize = input.size(0) * input.size(2) * input.size(3);
        INDArray dxmu1 = dxhat.sum(0, 2, 3).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0, 2, 3).muli(-2.0 / effectiveBatchSize).muli(dLdVar);
        INDArray dLdmu = dxmu1.addi(dxmu2);
        INDArray dLdx = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(dxhat, std, dxhat, 1)).addi(Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(xMu, dLdVar.muli(2.0 / effectiveBatchSize), xMu, 1)));
        Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(dLdx, dLdmu.muli(1.0 / effectiveBatchSize), dLdx, 1));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else {
        // TODO setup BatchNorm for RNN http://arxiv.org/pdf/1510.01378v1.pdf
        throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
    }
    return new Pair<>(retGradient, nextEpsilon);
}
Also used: Gradient (org.deeplearning4j.nn.gradient.Gradient), DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient), BroadcastMulOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp), BroadcastDivOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastDivOp), BroadcastAddOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp), INDArray (org.nd4j.linalg.api.ndarray.INDArray), Pair (org.deeplearning4j.berkeley.Pair)
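
For reference, the rank-2 branch above is the standard batch-normalization backward pass. In the code's own notation (xHat for the normalized input, xMu for x minus the batch mean, std for the standard deviation, m for batchSize), the gradients it computes are, roughly (ignoring how eps enters std):

% Batch-norm backward pass as implemented in the rank-2 branch above (a sketch, not the source).
\frac{\partial L}{\partial \gamma} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}\,\hat{x}_i,
\qquad
\frac{\partial L}{\partial \beta} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i},
\qquad
\frac{\partial L}{\partial \hat{x}_i} = \frac{\partial L}{\partial y_i}\,\gamma

\frac{\partial L}{\partial \sigma^2} = -\frac{1}{2}\,\sigma^{-3}\sum_{i=1}^{m} \frac{\partial L}{\partial \hat{x}_i}\,(x_i - \mu),
\qquad
\frac{\partial L}{\partial \mu} = -\frac{1}{\sigma}\sum_{i=1}^{m}\frac{\partial L}{\partial \hat{x}_i}
  \;-\; \frac{2}{m}\,\frac{\partial L}{\partial \sigma^2}\sum_{i=1}^{m}(x_i - \mu)

\frac{\partial L}{\partial x_i} = \frac{1}{\sigma}\,\frac{\partial L}{\partial \hat{x}_i}
  \;+\; \frac{2}{m}\,\frac{\partial L}{\partial \sigma^2}\,(x_i - \mu)
  \;+\; \frac{1}{m}\,\frac{\partial L}{\partial \mu}

The rank-4 branch computes the same quantities, with the sums taken over dimensions 0, 2 and 3 and m replaced by the effectiveBatchSize (miniBatch * height * width).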

Example 15 with DefaultGradient

Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

From class LocalResponseNormalization, method backpropGradient:

public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    if (helper != null) {
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, k, n, alpha, beta);
        if (ret != null) {
            return ret;
        }
    }
    int channel = input.size(1);
    INDArray tmp, addVal;
    Gradient retGradient = new DefaultGradient();
    INDArray reverse = activations.mul(epsilon);
    INDArray sumPart = reverse.dup();
    // sumPart = sum(a^j_{x,y} * gb^j_{x,y})
    for (int i = 1; i < halfN + 1; i++) {
        tmp = sumPart.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() });
        addVal = reverse.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() });
        sumPart.put(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() }, tmp.addi(addVal));
        tmp = sumPart.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() });
        addVal = reverse.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() });
        sumPart.put(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() }, tmp.addi(addVal));
    }
    // gx = gy * unitScale**-beta - 2 * alpha * beta * sumPart/unitScale * a^i_{x,y}    - rearranged for more in-place ops
    INDArray nextEpsilon = epsilon.mul(scale).subi(sumPart.muli(input).divi(unitScale).muli(2 * alpha * beta));
    return new Pair<>(retGradient, nextEpsilon);
}
Also used: Gradient (org.deeplearning4j.nn.gradient.Gradient), DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient), INDArray (org.nd4j.linalg.api.ndarray.INDArray), INDArrayIndex (org.nd4j.linalg.indexing.INDArrayIndex), Pair (org.deeplearning4j.berkeley.Pair)
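
Written out in the notation of the code comments above (a^j_{x,y} for activations, gb^j_{x,y} for epsilon, u for unitScale, and scale taken to be u^{-beta} as the comment implies; this is a sketch of the final line, not a derivation from the source), the gradient that the method returns is roughly:

% LRN backward pass as computed on the final line above.
g^i_{x,y} = gb^i_{x,y}\,u^{-\beta}
  \;-\; \frac{2\,\alpha\,\beta}{u}\; a^i_{x,y} \sum_{j \in \mathrm{window}(i)} a^j_{x,y}\, gb^j_{x,y}

The loop over i builds the windowed sum (sumPart) by shifting the channel axis up and down by up to halfN positions, so each channel accumulates contributions from its neighbours within the normalization window.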

Aggregations

DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient): 59
Gradient (org.deeplearning4j.nn.gradient.Gradient): 58
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 56
Test (org.junit.Test): 26
Pair (org.deeplearning4j.berkeley.Pair): 23
Updater (org.deeplearning4j.nn.api.Updater): 23
NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration): 22
DenseLayer (org.deeplearning4j.nn.conf.layers.DenseLayer): 22
Layer (org.deeplearning4j.nn.api.Layer): 20
OutputLayer (org.deeplearning4j.nn.conf.layers.OutputLayer): 16
HashMap (java.util.HashMap): 5
MultiLayerNetwork (org.deeplearning4j.nn.multilayer.MultiLayerNetwork): 4
Allocator (org.nd4j.jita.allocator.Allocator): 4
AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator): 4
IActivation (org.nd4j.linalg.activations.IActivation): 4
GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner): 4
CudaContext (org.nd4j.linalg.jcublas.context.CudaContext): 4
Map (java.util.Map): 3
DoublePointer (org.bytedeco.javacpp.DoublePointer): 3
FloatPointer (org.bytedeco.javacpp.FloatPointer): 3