
Example 26 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class TestGradientNormalization method testL2ClippingPerLayer.

@Test
public void testL2ClippingPerLayer() {
    Nd4j.getRandom().setSeed(12345);
    double threshold = 3;
    for (int t = 0; t < 2; t++) {
        //t=0: small -> no clipping
        //t=1: large -> clipping
        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().layer(new DenseLayer.Builder().nIn(10).nOut(20).updater(org.deeplearning4j.nn.conf.Updater.NONE).gradientNormalization(GradientNormalization.ClipL2PerLayer).gradientNormalizationThreshold(threshold).build()).build();
        int numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true);
        Updater updater = UpdaterCreator.getUpdater(layer);
        INDArray weightGrad = Nd4j.rand(10, 20).muli((t == 0 ? 0.05 : 10));
        INDArray biasGrad = Nd4j.rand(1, 10).muli((t == 0 ? 0.05 : 10));
        INDArray weightGradCopy = weightGrad.dup();
        INDArray biasGradCopy = biasGrad.dup();
        Gradient gradient = new DefaultGradient();
        gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);
        double layerGradL2 = gradient.gradient().norm2Number().doubleValue();
        if (t == 0)
            assertTrue(layerGradL2 < threshold);
        else
            assertTrue(layerGradL2 > threshold);
        updater.update(layer, gradient, 0, 1);
        if (t == 0) {
            //norm2 < threshold -> no change
            assertEquals(weightGradCopy, weightGrad);
            assertEquals(biasGradCopy, biasGrad);
            continue;
        } else {
            //norm2 > threshold -> rescale
            assertNotEquals(weightGradCopy, weightGrad);
            assertNotEquals(biasGradCopy, biasGrad);
        }
        //for above threshold only...
        double scalingFactor = threshold / layerGradL2;
        INDArray expectedWeightGrad = weightGradCopy.mul(scalingFactor);
        INDArray expectedBiasGrad = biasGradCopy.mul(scalingFactor);
        assertEquals(expectedWeightGrad, gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY));
        assertEquals(expectedBiasGrad, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
    }
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Updater(org.deeplearning4j.nn.api.Updater) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) Layer(org.deeplearning4j.nn.api.Layer) DenseLayer(org.deeplearning4j.nn.conf.layers.DenseLayer) Test(org.junit.Test)
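For reference, the rescaling the test expects can be reproduced outside the updater. Below is a minimal sketch, not the actual ClipL2PerLayer implementation; the helper name clipGradientsByL2 is made up for this example. It computes the joint L2 norm over all of a layer's gradient arrays and rescales them in place when it exceeds the threshold:

import java.util.Map;

import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class ClipL2PerLayerSketch {

    //Hypothetical helper: clip so that the combined L2 norm of all gradient arrays is at most 'threshold'
    static void clipGradientsByL2(Map<String, INDArray> grads, double threshold) {
        double sumSquares = 0.0;
        for (INDArray g : grads.values()) {
            double n = g.norm2Number().doubleValue();
            sumSquares += n * n;
        }
        double layerL2 = Math.sqrt(sumSquares);
        if (layerL2 > threshold) {
            //Same rescaling the test checks: expected = original * (threshold / layerL2)
            double scalingFactor = threshold / layerL2;
            for (INDArray g : grads.values()) {
                g.muli(scalingFactor);
            }
        }
        //Below the threshold the gradients are left unchanged (the t == 0 branch of the test)
    }

    public static void main(String[] args) {
        Gradient gradient = new DefaultGradient();
        gradient.setGradientFor("W", Nd4j.rand(10, 20).muli(10));
        gradient.setGradientFor("b", Nd4j.rand(1, 20).muli(10));
        clipGradientsByL2(gradient.gradientForVariable(), 3.0);
    }
}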

Example 27 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class CudnnBatchNormalizationHelper method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray input, INDArray epsilon, int[] shape, INDArray gamma, INDArray dGammaView, INDArray dBetaView, double eps) {
    if (eps < CUDNN_BN_MIN_EPSILON) {
        throw new IllegalArgumentException("Error: eps < CUDNN_BN_MIN_EPSILON (" + eps + " < " + CUDNN_BN_MIN_EPSILON + ")");
    }
    int miniBatch = input.size(0);
    int depth = input.size(1);
    int inH = input.size(2);
    int inW = input.size(3);
    Gradient retGradient = new DefaultGradient();
    if (!Shape.strideDescendingCAscendingF(epsilon)) {
        // apparently not supported by cuDNN
        epsilon = epsilon.dup();
    }
    int[] srcStride = input.stride();
    int[] deltaStride = epsilon.stride();
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, miniBatch, depth, inH, inW, srcStride[0], srcStride[1], srcStride[2], srcStride[3]));
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.deltaTensorDesc, dataType, miniBatch, depth, inH, inW, deltaStride[0], deltaStride[1], deltaStride[2], deltaStride[3]));
    INDArray nextEpsilon = Nd4j.createUninitialized(new int[] { miniBatch, depth, inH, inW }, 'c');
    int[] dstStride = nextEpsilon.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, miniBatch, depth, inH, inW, dstStride[0], dstStride[1], dstStride[2], dstStride[3]));
    int[] gammaStride = gamma.stride();
    checkCudnn(cudnnSetTensor4dDescriptor(cudnnContext.gammaBetaTensorDesc, tensorFormat, dataType, shape[0], shape[1], shape.length > 2 ? shape[2] : 1, shape.length > 3 ? shape[3] : 1));
    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareActionAllWrite(input, epsilon, nextEpsilon, gamma, dGammaView, dBetaView);
    Pointer srcData = allocator.getPointer(input, context);
    Pointer epsData = allocator.getPointer(epsilon, context);
    Pointer dstData = allocator.getPointer(nextEpsilon, context);
    Pointer gammaData = allocator.getPointer(gamma, context);
    Pointer dGammaData = allocator.getPointer(dGammaView, context);
    Pointer dBetaData = allocator.getPointer(dBetaView, context);
    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(context.getOldStream())));
    checkCudnn(cudnnBatchNormalizationBackward(cudnnContext, batchNormMode, alpha, beta, alpha, alpha, cudnnContext.srcTensorDesc, srcData, cudnnContext.deltaTensorDesc, epsData, cudnnContext.dstTensorDesc, dstData, cudnnContext.gammaBetaTensorDesc, gammaData, dGammaData, dBetaData, eps, meanCache, varCache));
    allocator.getFlowController().registerActionAllWrite(context, input, epsilon, nextEpsilon, gamma, dGammaView, dBetaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
    return new Pair<>(retGradient, nextEpsilon);
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) Allocator(org.nd4j.jita.allocator.Allocator) Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) ShortPointer(org.bytedeco.javacpp.ShortPointer) Pointer(org.bytedeco.javacpp.Pointer) Pair(org.deeplearning4j.berkeley.Pair)
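The helper packs the parameter gradients and the epsilon for the layer below into a Pair. A minimal consumer sketch, assuming a Pair produced by a call like the one above (this is not library code, just an illustration of the return contract):

import org.deeplearning4j.berkeley.Pair;
import org.deeplearning4j.nn.gradient.Gradient;
import org.deeplearning4j.nn.params.BatchNormalizationParamInitializer;
import org.nd4j.linalg.api.ndarray.INDArray;

public class BnBackpropResultSketch {

    //Unpack a Pair<Gradient, INDArray> such as the one returned by backpropGradient above.
    //dGamma/dBeta are the same arrays that were passed in as dGammaView/dBetaView, so the
    //layer's flattened gradient view is already populated by the time this runs.
    static INDArray unpack(Pair<Gradient, INDArray> result) {
        Gradient g = result.getFirst();
        INDArray dGamma = g.getGradientFor(BatchNormalizationParamInitializer.GAMMA);
        INDArray dBeta = g.getGradientFor(BatchNormalizationParamInitializer.BETA);
        System.out.println("dGamma length: " + dGamma.length() + ", dBeta length: " + dBeta.length());
        return result.getSecond(); //epsilon to pass to the layer below this one
    }
}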

Example 28 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class TestConvolution method testCompareCudnnStandardOutputsVsMode.

@Test
public void testCompareCudnnStandardOutputsVsMode() throws Exception {
    ConvolutionMode[] cm = new ConvolutionMode[] { ConvolutionMode.Strict, ConvolutionMode.Same };
    for (ConvolutionMode c : cm) {
        for (boolean conv : new boolean[] { true, false }) {
            org.deeplearning4j.nn.conf.layers.Layer l;
            if (conv) {
                l = new ConvolutionLayer.Builder().nOut(4).kernelSize(4, 4).stride(2, 2).build();
            } else {
                l = new SubsamplingLayer.Builder().kernelSize(4, 4).stride(2, 2).build();
            }
            MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345).regularization(true).l2(0.0005).learningRate(.01).weightInit(WeightInit.XAVIER).convolutionMode(c).list().layer(0, l).layer(1, new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD).nOut(10).activation(Activation.SOFTMAX).build())
                    //See note below
                    .setInputType(InputType.convolutionalFlat(28, 28, 1)).backprop(true).pretrain(false).build();
            Nd4j.getRandom().setSeed(12345);
            MultiLayerNetwork net1 = new MultiLayerNetwork(conf);
            net1.init();
            net1.initGradientsView();
            Nd4j.getRandom().setSeed(12345);
            MultiLayerNetwork net2 = new MultiLayerNetwork(conf);
            net2.init();
            net2.initGradientsView();
            Layer layerCudnn = net1.getLayer(0);
            Layer layerStandard = net2.getLayer(0);
            Field f = layerStandard.getClass().getDeclaredField("helper");
            f.setAccessible(true);
            f.set(layerStandard, null);
            if (f.get(layerCudnn) == null)
                throw new RuntimeException();
            if (f.get(layerStandard) != null)
                throw new RuntimeException();
            //(20-4+0)/2 +1 = 9
            INDArray in = Nd4j.rand(new int[] { 1, 1, 20, 20 });
            INDArray outCudnn = layerCudnn.activate(in);
            INDArray outStd = layerStandard.activate(in);
            assertEquals(outStd, outCudnn);
            //Check backprop:
            INDArray epsilon = Nd4j.rand(outStd.shape());
            Pair<Gradient, INDArray> pCudnn = layerCudnn.backpropGradient(epsilon);
            Pair<Gradient, INDArray> pStd = layerStandard.backpropGradient(epsilon);
            System.out.println(Arrays.toString(pStd.getSecond().data().asFloat()));
            System.out.println(Arrays.toString(pCudnn.getSecond().data().asFloat()));
            INDArray epsOutStd = pStd.getSecond();
            INDArray epsOutCudnn = pCudnn.getSecond();
            assertTrue(epsOutStd.equalsWithEps(epsOutCudnn, 1e-4));
            INDArray gradStd = pStd.getFirst().gradient();
            INDArray gradCudnn = pCudnn.getFirst().gradient();
            assertTrue(gradStd.equalsWithEps(gradCudnn, 1e-4));
        }
    }
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) ConvolutionLayer(org.deeplearning4j.nn.conf.layers.ConvolutionLayer) Layer(org.deeplearning4j.nn.api.Layer) OutputLayer(org.deeplearning4j.nn.conf.layers.OutputLayer) SubsamplingLayer(org.deeplearning4j.nn.conf.layers.SubsamplingLayer) Field(java.lang.reflect.Field) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) INDArray(org.nd4j.linalg.api.ndarray.INDArray) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork) ConvolutionMode(org.deeplearning4j.nn.conf.ConvolutionMode) Test(org.junit.Test)
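The reflection trick used above (nulling the private helper field so one network falls back to the built-in implementation) can be factored out. A minimal sketch, under the same assumption that the field is named "helper" in this DL4J version:

import java.lang.reflect.Field;

import org.deeplearning4j.nn.api.Layer;

public class DisableCudnnHelperSketch {

    //Force a layer onto the built-in (non-cuDNN) code path by clearing its helper,
    //exactly as the test does; the field name is an implementation detail of this version.
    static void disableHelper(Layer layer) throws NoSuchFieldException, IllegalAccessException {
        Field f = layer.getClass().getDeclaredField("helper");
        f.setAccessible(true);
        f.set(layer, null);
    }
}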

Example 29 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class BatchNormalization method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    INDArray nextEpsilon;
    int[] shape = getShape(epsilon);
    // number examples in batch
    int batchSize = epsilon.size(0);
    org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
    INDArray gamma = null;
    INDArray dGammaView;
    INDArray dBetaView;
    INDArray dGlobalMeanView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_MEAN);
    INDArray dGlobalVarView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_VAR);
    if (layerConf.isLockGammaBeta()) {
        int[] tempShape = new int[] { 1, shape[1] };
        dGammaView = Nd4j.createUninitialized(tempShape, 'c');
        dBetaView = Nd4j.createUninitialized(tempShape, 'c');
    } else {
        gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
        dGammaView = gradientViews.get(BatchNormalizationParamInitializer.GAMMA);
        dBetaView = gradientViews.get(BatchNormalizationParamInitializer.BETA);
    }
    Gradient retGradient = new DefaultGradient();
    if (helper != null && epsilon.rank() == 4) {
        //Note that cudnn does not support dense (2d) batch norm case as of v5.1
        if (layerConf.isLockGammaBeta()) {
            gamma = Nd4j.valueArrayOf(new int[] { 1, shape[1] }, layerConf.getGamma());
        }
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, shape, gamma, dGammaView, dBetaView, layerConf.getEps());
        if (ret != null) {
            return ret;
        }
    }
    if (epsilon.rank() == 2) {
        //TODO: handle fixed beta/gamma case...
        //dL/dGamma = sum_examples dL/dOut .* xHat
        INDArray dGamma = epsilon.mul(xHat).sum(0);
        //dL/dBeta = sum_examples dL/dOut
        INDArray dBeta = epsilon.sum(0);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            //dL/dxHat = dL/dOut . gamma        Shape: [minibatchSize, nOut]
            dxhat = epsilon.mulRowVector(gamma);
        }
        //dL/dVariance
        //Shape: [1, nOut]
        INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
        //Shape: [1, nOut]
        INDArray dLdmu = dxmu1.addi(dxmu2);
        //Note the array reuse here: dxhat, xMu, dLdVar, dLdmu - all are invalid after this line (but aren't used later anyway)
        INDArray dLdx = dxhat.diviRowVector(std).addi(xMu.muliRowVector(dLdVar.muli(2.0 / batchSize))).addiRowVector(dLdmu.muli(1.0 / batchSize));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else if (epsilon.rank() == 4) {
        INDArray dGamma = epsilon.mul(xHat).sum(0, 2, 3);
        INDArray dBeta = epsilon.sum(0, 2, 3);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            dxhat = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(epsilon, gamma, Nd4j.createUninitialized(epsilon.shape(), epsilon.ordering()), 1));
        }
        //dL/dVariance
        INDArray dLdVar = dxhat.mul(xMu).sum(0, 2, 3).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        int effectiveBatchSize = input.size(0) * input.size(2) * input.size(3);
        INDArray dxmu1 = dxhat.sum(0, 2, 3).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0, 2, 3).muli(-2.0 / effectiveBatchSize).muli(dLdVar);
        INDArray dLdmu = dxmu1.addi(dxmu2);
        INDArray dLdx = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(dxhat, std, dxhat, 1)).addi(Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(xMu, dLdVar.muli(2.0 / effectiveBatchSize), xMu, 1)));
        Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(dLdx, dLdmu.muli(1.0 / effectiveBatchSize), dLdx, 1));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else {
        // TODO setup BatchNorm for RNN http://arxiv.org/pdf/1510.01378v1.pdf
        throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
    }
    return new Pair<>(retGradient, nextEpsilon);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) BroadcastMulOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) BroadcastDivOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastDivOp) BroadcastAddOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
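For reference, the rank-2 branch above follows the standard batch-normalization backward equations (Ioffe and Szegedy, 2015), with m the minibatch size, std = sqrt(sigma^2 + eps), xHat the normalized activations and epsilon playing the role of dL/dy:

$$\frac{\partial L}{\partial \gamma} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}\,\hat{x}_i, \qquad \frac{\partial L}{\partial \beta} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}, \qquad \frac{\partial L}{\partial \hat{x}_i} = \frac{\partial L}{\partial y_i}\,\gamma$$

$$\frac{\partial L}{\partial \sigma^2} = -\frac{1}{2}\left(\sigma^2+\epsilon\right)^{-3/2}\sum_{i=1}^{m} \frac{\partial L}{\partial \hat{x}_i}\,(x_i-\mu), \qquad \frac{\partial L}{\partial \mu} = -\frac{\sum_i \partial L/\partial \hat{x}_i}{\sqrt{\sigma^2+\epsilon}} + \frac{\partial L}{\partial \sigma^2}\cdot\frac{-2}{m}\sum_{i=1}^{m}(x_i-\mu)$$

$$\frac{\partial L}{\partial x_i} = \frac{\partial L/\partial \hat{x}_i}{\sqrt{\sigma^2+\epsilon}} + \frac{\partial L}{\partial \sigma^2}\cdot\frac{2(x_i-\mu)}{m} + \frac{1}{m}\frac{\partial L}{\partial \mu}$$

The rank-4 branch is the same computation with the sums taken over dimensions 0, 2 and 3 and m replaced by the effective batch size miniBatch * inH * inW.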

Example 30 with Gradient

use of org.deeplearning4j.nn.gradient.Gradient in project deeplearning4j by deeplearning4j.

the class LocalResponseNormalization method backpropGradient.

public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    if (helper != null) {
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, k, n, alpha, beta);
        if (ret != null) {
            return ret;
        }
    }
    int channel = input.size(1);
    INDArray tmp, addVal;
    Gradient retGradient = new DefaultGradient();
    INDArray reverse = activations.mul(epsilon);
    INDArray sumPart = reverse.dup();
    // sumPart = sum(a^j_{x,y} * gb^j_{x,y})
    for (int i = 1; i < halfN + 1; i++) {
        tmp = sumPart.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() });
        addVal = reverse.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() });
        sumPart.put(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() }, tmp.addi(addVal));
        tmp = sumPart.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() });
        addVal = reverse.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() });
        sumPart.put(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() }, tmp.addi(addVal));
    }
    // gx = gy * unitScale**-beta - 2 * alpha * beta * sumPart/unitScale * a^i_{x,y}    - rearranged for more in-place ops
    INDArray nextEpsilon = epsilon.mul(scale).subi(sumPart.muli(input).divi(unitScale).muli(2 * alpha * beta));
    return new Pair<>(retGradient, nextEpsilon);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) Pair(org.deeplearning4j.berkeley.Pair)
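For context, a restatement of the math the code's own comment refers to. The forward pass (in the notation of Krizhevsky et al., 2012) computes, with unitScale the bracketed term and scale = unitScale^{-beta}:

$$b^{i}_{x,y} = a^{i}_{x,y}\left(k + \alpha \sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)} \left(a^{j}_{x,y}\right)^2\right)^{-\beta}$$

and the backward pass above evaluates the rearranged gradient expression from the comment, where sumPart accumulates the products b^j * dL/db^j over the same window of n neighbouring channels:

$$\frac{\partial L}{\partial a^{i}_{x,y}} = \frac{\partial L}{\partial b^{i}_{x,y}}\,\mathrm{unitScale}^{-\beta} - \frac{2\alpha\beta\, a^{i}_{x,y}}{\mathrm{unitScale}} \sum_{j} b^{j}_{x,y}\,\frac{\partial L}{\partial b^{j}_{x,y}}$$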

Aggregations

Gradient (org.deeplearning4j.nn.gradient.Gradient) : 105 usages
INDArray (org.nd4j.linalg.api.ndarray.INDArray) : 100 usages
DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient) : 72 usages
Test (org.junit.Test) : 52 usages
NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration) : 35 usages
Pair (org.deeplearning4j.berkeley.Pair) : 28 usages
Layer (org.deeplearning4j.nn.api.Layer) : 28 usages
Updater (org.deeplearning4j.nn.api.Updater) : 25 usages
DenseLayer (org.deeplearning4j.nn.conf.layers.DenseLayer) : 24 usages
OutputLayer (org.deeplearning4j.nn.conf.layers.OutputLayer) : 21 usages
MultiLayerConfiguration (org.deeplearning4j.nn.conf.MultiLayerConfiguration) : 9 usages
MultiLayerNetwork (org.deeplearning4j.nn.multilayer.MultiLayerNetwork) : 8 usages
IActivation (org.nd4j.linalg.activations.IActivation) : 6 usages
HashMap (java.util.HashMap) : 5 usages
DataSetIterator (org.nd4j.linalg.dataset.api.iterator.DataSetIterator) : 5 usages
ArrayList (java.util.ArrayList) : 4 usages
IrisDataSetIterator (org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) : 4 usages
DL4JInvalidInputException (org.deeplearning4j.exception.DL4JInvalidInputException) : 4 usages
IOutputLayer (org.deeplearning4j.nn.api.layers.IOutputLayer) : 4 usages
ComputationGraphConfiguration (org.deeplearning4j.nn.conf.ComputationGraphConfiguration) : 4 usages