Example 16 with DefaultGradient

use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

the class SubsamplingLayer method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    int miniBatch = input.size(0);
    int inDepth = input.size(1);
    int inH = input.size(2);
    int inW = input.size(3);
    int[] kernel = layerConf().getKernelSize();
    int[] strides = layerConf().getStride();
    int[] pad;
    int[] outSize;
    if (convolutionMode == ConvolutionMode.Same) {
        //Also performs validation
        outSize = ConvolutionUtils.getOutputSize(input, kernel, strides, null, convolutionMode);
        pad = ConvolutionUtils.getSameModeTopLeftPadding(outSize, new int[] { inH, inW }, kernel, strides);
    } else {
        pad = layerConf().getPadding();
        //Also performs validation
        outSize = ConvolutionUtils.getOutputSize(input, kernel, strides, pad, convolutionMode);
    }
    int outH = outSize[0];
    int outW = outSize[1];
    if (helper != null && Nd4j.dataType() != DataBuffer.Type.HALF) {
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, kernel, strides, pad, layerConf().getPoolingType(), convolutionMode);
        if (ret != null) {
            return ret;
        }
    }
    //subsampling doesn't have weights and thus gradients are not calculated for this layer
    //only scale and reshape epsilon
    int inputHeight = input().size(-2);
    int inputWidth = input().size(-1);
    Gradient retGradient = new DefaultGradient();
    //Epsilons in shape: [miniBatch, depth, outH, outW]
    //Epsilons out shape: [miniBatch, depth, inH, inW]
    //Two possibilities here for the epsilons:
    //(a) Epsilons come from a dense/output layer above, with c order and strides [depth*H*W, H*W, W, 1]
    //(b) Epsilons come from CNN layer above, with c order and strides [H*W, depth*H*W, W, 1] (i.e., due to permute)
    //We want to reshape epsilons to 1d here, but to do this without a copy: we end up with different orders of
    // elements in the buffer, for the "dense above" and "cnn above" cases.
    //Fortunately, we can just permute things when we do the im2col reshaping; then, the order of the rows in
    // col2d will match the order of the 1d epsilons...
    //With the 1d epsilons order matching the rows order for the 2d im2col: we can just do a muliColumnVector op,
    // instead of a slower broadcast muli op
    boolean cOrderStrides = false;
    if (epsilon.ordering() != 'c') {
        epsilon = epsilon.dup('c');
        cOrderStrides = true;
    }
    if (!cOrderStrides && Shape.strideDescendingCAscendingF(epsilon)) {
        cOrderStrides = true;
    } else if (!Arrays.equals(new int[] { outH * outW, inDepth * outH * outW, outW, 1 }, epsilon.stride())) {
        //Unexpected/unusual strides, not either (a) or (b) cases above
        epsilon = epsilon.dup('c');
        cOrderStrides = true;
    }
    INDArray col6d;
    INDArray col6dPermuted;
    INDArray epsilon1d;
    if (cOrderStrides) {
        //"Dense/Output layer above strides... i.e., standard c-order strides
        col6d = Nd4j.create(new int[] { miniBatch, inDepth, outH, outW, kernel[0], kernel[1] }, 'c');
        col6dPermuted = col6d.permute(0, 1, 4, 5, 2, 3);
        //zero copy reshape
        epsilon1d = epsilon.reshape('c', ArrayUtil.prod(epsilon.length()), 1);
    } else {
        //"CNN layer above" strides...
        col6d = Nd4j.create(new int[] { inDepth, miniBatch, outH, outW, kernel[0], kernel[1] }, 'c');
        col6dPermuted = col6d.permute(1, 0, 4, 5, 2, 3);
        INDArray epsilonTemp = epsilon.permute(1, 0, 2, 3);
        //Should be a zero-copy reshape always
        epsilon1d = epsilonTemp.reshape('c', new int[] { ArrayUtil.prod(epsilon.length()), 1 });
    }
    INDArray col2d = col6d.reshape('c', miniBatch * inDepth * outH * outW, kernel[0] * kernel[1]);
    switch(layerConf().getPoolingType()) {
        case MAX:
            //Execute im2col, then reshape to 2d. Note rows are in a different order for cOrderStrides true vs false cases
            Convolution.im2col(input, kernel[0], kernel[1], strides[0], strides[1], pad[0], pad[1], convolutionMode == ConvolutionMode.Same, col6dPermuted);
            INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(col2d, 1));
            isMax.muliColumnVector(epsilon1d);
            break;
        case AVG:
            //TODO: We could further optimize this by creating an uninitialized array, and doing a 'putiColumnVector' operation
            // instead of a zero initialization + an addiColumnVector op
            col2d.addiColumnVector(epsilon1d);
            break;
        case PNORM:
            int pnorm = layerConf().getPnorm();
            //First: do forward pass to get pNorm array
            Convolution.im2col(input, kernel[0], kernel[1], strides[0], strides[1], pad[0], pad[1], convolutionMode == ConvolutionMode.Same, col6dPermuted);
            //dup as we need col2d again later
            INDArray pNorm = Transforms.abs(col2d, true);
            Transforms.pow(pNorm, pnorm, false);
            pNorm = pNorm.sum(1);
            Transforms.pow(pNorm, (1.0 / pnorm), false);
            //dL/dIn = dL/dOut * dOut/dIn
            //dOut/dIn = in .* |in|^(p-2) /  ||in||_p^(p-1), where ||in||_p is the output p-norm
            INDArray numerator;
            if (pnorm == 2) {
                numerator = col2d;
            } else {
                INDArray absp2 = Transforms.pow(Transforms.abs(col2d, true), pnorm - 2, false);
                numerator = col2d.muli(absp2);
            }
            INDArray denom = Transforms.pow(pNorm, pnorm - 1, false);
            double eps = layerConf().getEps();
            // in case of 0
            Transforms.max(denom, eps, false);
            numerator.muliColumnVector(denom.rdivi(epsilon1d));
            break;
        case NONE:
            return new Pair<>(retGradient, epsilon);
        default:
            throw new IllegalStateException("Unknown or unsupported pooling type: " + layerConf().getPoolingType());
    }
    //Finally: we want the output strides for the epsilons to match the strides in the activations from the layer below
    //Assuming the layer below is a CNN layer (very likely) we want [H*W, depth*H*W, W, 1] instead of the standard
    // c-order [depth*H*W, H*W, W, 1] strides
    //To achieve this: [depth, miniBatch, H, W] in c order, then permute to [miniBatch, depth, H, W]
    //This gives us proper strides of 1 on the muli...
    INDArray tempEpsilon = Nd4j.create(new int[] { inDepth, miniBatch, inH, inW }, 'c');
    INDArray outEpsilon = tempEpsilon.permute(1, 0, 2, 3);
    Convolution.col2im(col6dPermuted, outEpsilon, strides[0], strides[1], pad[0], pad[1], inputHeight, inputWidth);
    if (layerConf().getPoolingType() == PoolingType.AVG)
        outEpsilon.divi(ArrayUtil.prod(layerConf().getKernelSize()));
    return new Pair<>(retGradient, outEpsilon);
}
Also used : IsMax(org.nd4j.linalg.api.ops.impl.transforms.IsMax) Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
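
The MAX branch above routes each upstream gradient value back to the single input position that produced the window maximum: im2col lays the pooling windows out as rows of col2d, IsMax turns each row into a one-hot mask at the argmax, and muliColumnVector scales each mask row by the corresponding epsilon. The following is a toy, self-contained sketch of that routing for one 1D window in plain Java (hypothetical class name, not DL4J code):

// Toy illustration (not DL4J code): backprop through a single 1D max-pooling window.
// The upstream gradient is routed entirely to the argmax element, which is what the
// IsMax mask + muliColumnVector combination achieves per row of col2d above.
public class MaxPoolBackpropSketch {
    public static void main(String[] args) {
        double[] input = {0.2, 1.5, -0.3, 0.9};   // one pooling window (kernel size 4)
        double upstreamGrad = 2.0;                 // dL/dOut for this window's pooled output

        // Forward: find the max position (what im2col + IsMax identify per window)
        int argmax = 0;
        for (int i = 1; i < input.length; i++) {
            if (input[i] > input[argmax]) argmax = i;
        }

        // Backward: the gradient flows only to the max element; all others get zero
        double[] inputGrad = new double[input.length];
        inputGrad[argmax] = upstreamGrad;

        System.out.println(java.util.Arrays.toString(inputGrad)); // [0.0, 2.0, 0.0, 0.0]
    }
}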

Example 17 with DefaultGradient

use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

the class EmbeddingLayer method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    //If this layer is layer L, then epsilon is (w^(L+1)*(d^(L+1))^T) (or equivalent)
    INDArray z = preOutput(input);
    //INDArray activationDerivative = Nd4j.getExecutioner().execAndReturn(Nd4j.getOpFactory().createTransform(conf().getLayer().getActivationFunction(), z).derivative());
    //        INDArray activationDerivative = conf().getLayer().getActivationFn().getGradient(z);
    //        INDArray delta = epsilon.muli(activationDerivative);
    //TODO handle activation function params
    INDArray delta = conf().getLayer().getActivationFn().backprop(z, epsilon).getFirst();
    if (maskArray != null) {
        delta.muliColumnVector(maskArray);
    }
    INDArray weights = getParam(DefaultParamInitializer.WEIGHT_KEY);
    INDArray weightGradients = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY);
    weightGradients.assign(0);
    int[] indexes = new int[input.length()];
    for (int i = 0; i < indexes.length; i++) {
        indexes[i] = input.getInt(i, 0);
        weightGradients.getRow(indexes[i]).addi(delta.getRow(i));
    }
    INDArray biasGradientsView = gradientViews.get(DefaultParamInitializer.BIAS_KEY);
    INDArray biasGradients = delta.sum(0);
    //TODO do this without the assign...
    biasGradientsView.assign(biasGradients);
    Gradient ret = new DefaultGradient();
    ret.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGradients);
    ret.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGradientsView);
    //Don't bother returning epsilons: no layer below this one...
    return new Pair<>(ret, null);
}
Also used : DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) Gradient(org.deeplearning4j.nn.gradient.Gradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
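
The accumulation loop above is effectively a scatter-add: each example's delta row is added into the weight-gradient row selected by that example's input index, so an index that occurs several times in the minibatch accumulates several delta rows. A minimal, hypothetical plain-array sketch of the same idea (not DL4J code):

// Toy illustration (not DL4J code): embedding backprop as a scatter-add.
// Each input is a row index; its delta row is added to the matching weight-gradient row.
public class EmbeddingBackpropSketch {
    public static void main(String[] args) {
        int vocabSize = 5, embeddingDim = 3;
        int[] indexes = {2, 0, 2};                 // minibatch of 3 token indices
        double[][] delta = {                       // dL/dOutput, one row per example
            {0.1, 0.2, 0.3},
            {1.0, 0.0, -1.0},
            {0.5, 0.5, 0.5}
        };

        double[][] weightGrad = new double[vocabSize][embeddingDim]; // starts at zero
        for (int i = 0; i < indexes.length; i++) {
            for (int j = 0; j < embeddingDim; j++) {
                weightGrad[indexes[i]][j] += delta[i][j]; // rows used twice accumulate twice
            }
        }
        // Row 2 receives the sum of the first and third delta rows (approx. [0.6, 0.7, 0.8])
        System.out.println(java.util.Arrays.deepToString(weightGrad));
    }
}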

Example 18 with DefaultGradient

use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

the class BaseOutputLayer method getGradientsAndDelta.

/** Returns tuple: {Gradient,Delta,Output} given preOut */
private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray preOut) {
    ILossFunction lossFunction = layerConf().getLossFn();
    INDArray labels2d = getLabels2d();
    if (labels2d.size(1) != preOut.size(1)) {
        throw new DL4JInvalidInputException("Labels array numColumns (size(1) = " + labels2d.size(1) + ") does not match output layer" + " number of outputs (nOut = " + preOut.size(1) + ")");
    }
    //INDArray delta = lossFunction.computeGradient(labels2d, preOut, layerConf().getActivationFunction(), maskArray);
    INDArray delta = lossFunction.computeGradient(labels2d, preOut, layerConf().getActivationFn(), maskArray);
    Gradient gradient = new DefaultGradient();
    INDArray weightGradView = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY);
    INDArray biasGradView = gradientViews.get(DefaultParamInitializer.BIAS_KEY);
    //Equivalent to:  weightGradView.assign(input.transpose().mmul(delta));
    Nd4j.gemm(input, delta, weightGradView, true, false, 1.0, 0.0);
    biasGradView.assign(delta.sum(0));
    gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGradView);
    gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGradView);
    return new Pair<>(gradient, delta);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ILossFunction(org.nd4j.linalg.lossfunctions.ILossFunction) DL4JInvalidInputException(org.deeplearning4j.exception.DL4JInvalidInputException) Pair(org.deeplearning4j.berkeley.Pair)
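
The Nd4j.gemm call above computes weightGradView = input^T * delta (transposeA = true, as the adjacent comment notes), and biasGradView is assigned the column-wise sum of delta. A small hypothetical plain-Java sketch of those two reductions with hand-rolled loops (not DL4J code):

// Toy illustration (not DL4J code): dL/dW = input^T * delta, dL/db = column sums of delta.
public class OutputLayerGradSketch {
    public static void main(String[] args) {
        double[][] input = {{1, 2}, {3, 4}};    // [miniBatch=2, nIn=2]
        double[][] delta = {{0.5, -1}, {1, 2}}; // [miniBatch=2, nOut=2]
        int miniBatch = 2, nIn = 2, nOut = 2;

        double[][] weightGrad = new double[nIn][nOut];
        double[] biasGrad = new double[nOut];
        for (int m = 0; m < miniBatch; m++) {
            for (int o = 0; o < nOut; o++) {
                biasGrad[o] += delta[m][o];                         // column-wise sum of delta
                for (int i = 0; i < nIn; i++) {
                    weightGrad[i][o] += input[m][i] * delta[m][o];  // input^T * delta
                }
            }
        }
        System.out.println(java.util.Arrays.deepToString(weightGrad)); // [[3.5, 5.0], [5.0, 6.0]]
        System.out.println(java.util.Arrays.toString(biasGrad));       // [1.5, 1.0]
    }
}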

Example 19 with DefaultGradient

use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

the class BasePretrainNetwork method createGradient.

protected Gradient createGradient(INDArray wGradient, INDArray vBiasGradient, INDArray hBiasGradient) {
    Gradient ret = new DefaultGradient();
    // The order of the following statements matters!! The gradient is being flattened and applied to
    // flattened params in this order.
    // The order might need to be handled via ordering
    ret.gradientForVariable().put(PretrainParamInitializer.WEIGHT_KEY, wGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.BIAS_KEY, hBiasGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.VISIBLE_BIAS_KEY, vBiasGradient);
    return ret;
}
Also used : DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) Gradient(org.deeplearning4j.nn.gradient.Gradient)
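
The ordering comment matters because the per-variable gradients are later flattened and applied to a flattened parameter vector in the same order: weights, then hidden bias, then visible bias. A hypothetical sketch of that idea using a plain LinkedHashMap, which preserves insertion order (an illustration only, not the DefaultGradient implementation):

import java.util.LinkedHashMap;
import java.util.Map;

// Toy illustration (not DL4J code): why insertion order matters when gradients
// keyed by parameter name are flattened into one contiguous vector.
public class GradientOrderSketch {
    public static void main(String[] args) {
        Map<String, double[]> grads = new LinkedHashMap<>();
        grads.put("W", new double[]{1, 2, 3, 4});   // weight gradient
        grads.put("b", new double[]{5, 6});         // hidden bias gradient
        grads.put("vb", new double[]{7});           // visible bias gradient

        // Flatten in insertion order; a different order would mis-align the flattened
        // gradient with the flattened parameter vector it is applied to.
        StringBuilder flat = new StringBuilder();
        for (Map.Entry<String, double[]> e : grads.entrySet()) {
            for (double v : e.getValue()) flat.append(v).append(' ');
        }
        System.out.println(flat.toString().trim()); // 1.0 2.0 3.0 4.0 5.0 6.0 7.0
    }
}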

Example 20 with DefaultGradient

use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.

the class LossLayer method getGradientsAndDelta.

/** Returns tuple: {Gradient,Delta,Output} given preOut */
private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray preOut) {
    // delta calculation
    ILossFunction lossFunction = layerConf().getLossFn();
    INDArray delta = lossFunction.computeGradient(getLabels2d(), preOut, layerConf().getActivationFn(), maskArray);
    // grab the empty gradient
    Gradient gradient = new DefaultGradient();
    return new Pair<>(gradient, delta);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ILossFunction(org.nd4j.linalg.lossfunctions.ILossFunction) Pair(org.deeplearning4j.berkeley.Pair)
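
Here the layer has no parameters, so the DefaultGradient is returned empty and only delta (dL/dPreOut) matters. For intuition, assuming a plain mean-squared-error averaged over the outputs and an identity activation, that delta is 2 * (output - labels) / nOut per example; the sketch below illustrates the arithmetic only and is not DL4J's loss implementation:

// Toy illustration (not DL4J code): delta for mean-squared error with identity activation.
// With L = (1/nOut) * sum((out - label)^2), dL/dPreOut = 2 * (out - label) / nOut per example.
public class MseDeltaSketch {
    public static void main(String[] args) {
        double[] labels = {1.0, 0.0};
        double[] preOut = {0.8, 0.3};   // identity activation: output == preOut
        int nOut = labels.length;

        double[] delta = new double[nOut];
        for (int i = 0; i < nOut; i++) {
            delta[i] = 2.0 * (preOut[i] - labels[i]) / nOut;
        }
        System.out.println(java.util.Arrays.toString(delta)); // approx. [-0.2, 0.3]
    }
}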

Aggregations

DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient) 59
Gradient (org.deeplearning4j.nn.gradient.Gradient) 58
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 56
Test (org.junit.Test) 26
Pair (org.deeplearning4j.berkeley.Pair) 23
Updater (org.deeplearning4j.nn.api.Updater) 23
NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration) 22
DenseLayer (org.deeplearning4j.nn.conf.layers.DenseLayer) 22
Layer (org.deeplearning4j.nn.api.Layer) 20
OutputLayer (org.deeplearning4j.nn.conf.layers.OutputLayer) 16
HashMap (java.util.HashMap) 5
MultiLayerNetwork (org.deeplearning4j.nn.multilayer.MultiLayerNetwork) 4
Allocator (org.nd4j.jita.allocator.Allocator) 4
AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator) 4
IActivation (org.nd4j.linalg.activations.IActivation) 4
GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner) 4
CudaContext (org.nd4j.linalg.jcublas.context.CudaContext) 4
Map (java.util.Map) 3
DoublePointer (org.bytedeco.javacpp.DoublePointer) 3
FloatPointer (org.bytedeco.javacpp.FloatPointer) 3