
Example 21 with Pair

use of org.deeplearning4j.berkeley.Pair in project deeplearning4j by deeplearning4j.

the class CudnnBatchNormalizationHelper method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray input, INDArray epsilon, int[] shape, INDArray gamma, INDArray dGammaView, INDArray dBetaView, double eps) {
    if (eps < CUDNN_BN_MIN_EPSILON) {
        throw new IllegalArgumentException("Error: eps < CUDNN_BN_MIN_EPSILON (" + eps + " < " + CUDNN_BN_MIN_EPSILON + ")");
    }
    int miniBatch = input.size(0);
    int depth = input.size(1);
    int inH = input.size(2);
    int inW = input.size(3);
    Gradient retGradient = new DefaultGradient();
    if (!Shape.strideDescendingCAscendingF(epsilon)) {
        // apparently not supported by cuDNN
        epsilon = epsilon.dup();
    }
    int[] srcStride = input.stride();
    int[] deltaStride = epsilon.stride();
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, miniBatch, depth, inH, inW, srcStride[0], srcStride[1], srcStride[2], srcStride[3]));
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.deltaTensorDesc, dataType, miniBatch, depth, inH, inW, deltaStride[0], deltaStride[1], deltaStride[2], deltaStride[3]));
    INDArray nextEpsilon = Nd4j.createUninitialized(new int[] { miniBatch, depth, inH, inW }, 'c');
    int[] dstStride = nextEpsilon.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, miniBatch, depth, inH, inW, dstStride[0], dstStride[1], dstStride[2], dstStride[3]));
    int[] gammaStride = gamma.stride();
    checkCudnn(cudnnSetTensor4dDescriptor(cudnnContext.gammaBetaTensorDesc, tensorFormat, dataType, shape[0], shape[1], shape.length > 2 ? shape[2] : 1, shape.length > 3 ? shape[3] : 1));
    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareActionAllWrite(input, epsilon, nextEpsilon, gamma, dGammaView, dBetaView);
    Pointer srcData = allocator.getPointer(input, context);
    Pointer epsData = allocator.getPointer(epsilon, context);
    Pointer dstData = allocator.getPointer(nextEpsilon, context);
    Pointer gammaData = allocator.getPointer(gamma, context);
    Pointer dGammaData = allocator.getPointer(dGammaView, context);
    Pointer dBetaData = allocator.getPointer(dBetaView, context);
    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(context.getOldStream())));
    checkCudnn(cudnnBatchNormalizationBackward(cudnnContext, batchNormMode, alpha, beta, alpha, alpha, cudnnContext.srcTensorDesc, srcData, cudnnContext.deltaTensorDesc, epsData, cudnnContext.dstTensorDesc, dstData, cudnnContext.gammaBetaTensorDesc, gammaData, dGammaData, dBetaData, eps, meanCache, varCache));
    allocator.getFlowController().registerActionAllWrite(context, input, epsilon, nextEpsilon, gamma, dGammaView, dBetaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
    return new Pair<>(retGradient, nextEpsilon);
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) Allocator(org.nd4j.jita.allocator.Allocator) Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) ShortPointer(org.bytedeco.javacpp.ShortPointer) Pointer(org.bytedeco.javacpp.Pointer) Pair(org.deeplearning4j.berkeley.Pair)
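
For context, a minimal caller-side sketch of how a layer would consume this helper's return value. The variable names (nOut, epsilonNext) and the eps value are illustrative assumptions, not taken from the source; the fallback-on-null pattern mirrors the layer code in the next example.

Pair<Gradient, INDArray> result = helper.backpropGradient(input, epsilon, new int[] { 1, nOut }, gamma, dGammaView, dBetaView, 1e-4);
if (result != null) {
    // cuDNN path succeeded: unpack the pair
    Gradient gradient = result.getFirst();       // carries the dGamma and dBeta gradient views
    INDArray epsilonNext = result.getSecond();   // dL/dInput, forwarded to the layer below
}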

Example 22 with Pair

use of org.deeplearning4j.berkeley.Pair in project deeplearning4j by deeplearning4j.

the class BatchNormalization method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    INDArray nextEpsilon;
    int[] shape = getShape(epsilon);
    // number of examples in the batch
    int batchSize = epsilon.size(0);
    org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
    INDArray gamma = null;
    INDArray dGammaView;
    INDArray dBetaView;
    INDArray dGlobalMeanView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_MEAN);
    INDArray dGlobalVarView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_VAR);
    if (layerConf.isLockGammaBeta()) {
        int[] tempShape = new int[] { 1, shape[1] };
        dGammaView = Nd4j.createUninitialized(tempShape, 'c');
        dBetaView = Nd4j.createUninitialized(tempShape, 'c');
    } else {
        gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
        dGammaView = gradientViews.get(BatchNormalizationParamInitializer.GAMMA);
        dBetaView = gradientViews.get(BatchNormalizationParamInitializer.BETA);
    }
    Gradient retGradient = new DefaultGradient();
    if (helper != null && epsilon.rank() == 4) {
        //Note that cudnn does not support dense (2d) batch norm case as of v5.1
        if (layerConf.isLockGammaBeta()) {
            gamma = Nd4j.valueArrayOf(new int[] { 1, shape[1] }, layerConf.getGamma());
        }
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, shape, gamma, dGammaView, dBetaView, layerConf.getEps());
        if (ret != null) {
            return ret;
        }
    }
    if (epsilon.rank() == 2) {
        //TODO: handle fixed beta/gamma case...
        //dL/dGamma = sum_examples dL/dOut .* xHat
        INDArray dGamma = epsilon.mul(xHat).sum(0);
        //dL/dBeta = sum_examples dL/dOut
        INDArray dBeta = epsilon.sum(0);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            //dL/dxHat = dL/dOut . gamma        Shape: [minibatchSize, nOut]
            dxhat = epsilon.mulRowVector(gamma);
        }
        //dL/dVariance
        //Shape: [1, nOut]
        INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
        //Shape: [1, nOut]
        INDArray dLdmu = dxmu1.addi(dxmu2);
        //Note the array reuse here: dxhat, xMu, dLdVar, dLdmu - all are invalid after this line (but aren't used later anyway)
        INDArray dLdx = dxhat.diviRowVector(std).addi(xMu.muliRowVector(dLdVar.muli(2.0 / batchSize))).addiRowVector(dLdmu.muli(1.0 / batchSize));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else if (epsilon.rank() == 4) {
        INDArray dGamma = epsilon.mul(xHat).sum(0, 2, 3);
        INDArray dBeta = epsilon.sum(0, 2, 3);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            dxhat = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(epsilon, gamma, Nd4j.createUninitialized(epsilon.shape(), epsilon.ordering()), 1));
        }
        //dL/dVariance
        INDArray dLdVar = dxhat.mul(xMu).sum(0, 2, 3).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        int effectiveBatchSize = input.size(0) * input.size(2) * input.size(3);
        INDArray dxmu1 = dxhat.sum(0, 2, 3).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0, 2, 3).muli(-2.0 / effectiveBatchSize).muli(dLdVar);
        INDArray dLdmu = dxmu1.addi(dxmu2);
        INDArray dLdx = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(dxhat, std, dxhat, 1)).addi(Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(xMu, dLdVar.muli(2.0 / effectiveBatchSize), xMu, 1)));
        Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(dLdx, dLdmu.muli(1.0 / effectiveBatchSize), dLdx, 1));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else {
        // TODO setup BatchNorm for RNN http://arxiv.org/pdf/1510.01378v1.pdf
        throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
    }
    return new Pair<>(retGradient, nextEpsilon);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) BroadcastMulOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) BroadcastDivOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastDivOp) BroadcastAddOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
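
As a quick standalone illustration of the rank-2 reductions above (the literal values are invented purely for demonstration): dGamma sums dL/dOut .* xHat over the minibatch, and dBeta sums dL/dOut.

// xHat: normalized activations, eps: dL/dOut, both shaped [minibatch = 2, nOut = 2]
INDArray xHat = Nd4j.create(new double[][] { { 0.5, -1.0 }, { -0.5, 1.0 } });
INDArray eps = Nd4j.create(new double[][] { { 0.1, 0.2 }, { 0.3, 0.4 } });
INDArray dGamma = eps.mul(xHat).sum(0);   // [1, nOut] = [-0.10, 0.20]
INDArray dBeta = eps.sum(0);              // [1, nOut] = [ 0.40, 0.60]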

Example 23 with Pair

use of org.deeplearning4j.berkeley.Pair in project deeplearning4j by deeplearning4j.

the class LocalResponseNormalization method backpropGradient.

public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    if (helper != null) {
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, k, n, alpha, beta);
        if (ret != null) {
            return ret;
        }
    }
    int channel = input.size(1);
    INDArray tmp, addVal;
    Gradient retGradient = new DefaultGradient();
    INDArray reverse = activations.mul(epsilon);
    INDArray sumPart = reverse.dup();
    // sumPart = sum(a^j_{x,y} * gb^j_{x,y})
    for (int i = 1; i < halfN + 1; i++) {
        tmp = sumPart.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() });
        addVal = reverse.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() });
        sumPart.put(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() }, tmp.addi(addVal));
        tmp = sumPart.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() });
        addVal = reverse.get(new INDArrayIndex[] { NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(), NDArrayIndex.all() });
        sumPart.put(new INDArrayIndex[] { NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(), NDArrayIndex.all() }, tmp.addi(addVal));
    }
    // gx = gy * unitScale**-beta - 2 * alpha * beta * sumPart/unitScale * a^i_{x,y}    - rearranged for more in-place ops
    INDArray nextEpsilon = epsilon.mul(scale).subi(sumPart.muli(input).divi(unitScale).muli(2 * alpha * beta));
    return new Pair<>(retGradient, nextEpsilon);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) Pair(org.deeplearning4j.berkeley.Pair)
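
The loop above builds a sliding-window sum across channels: after it completes, sumPart[:, c, :, :] holds the sum of reverse over channels c - halfN through c + halfN, clipped at the channel boundaries. A sketch of the same computation with explicit per-channel indexing (illustrative only, not from the source; it is slower than the vectorized interval form used above):

// Equivalent sliding-window sum, one channel at a time.
for (int c = 0; c < channel; c++) {
    for (int j = Math.max(0, c - halfN); j <= Math.min(channel - 1, c + halfN); j++) {
        if (j == c)
            continue; // sumPart was initialised as reverse.dup(), so the centre term is already counted
        INDArray dst = sumPart.get(NDArrayIndex.all(), NDArrayIndex.point(c), NDArrayIndex.all(), NDArrayIndex.all());
        INDArray src = reverse.get(NDArrayIndex.all(), NDArrayIndex.point(j), NDArrayIndex.all(), NDArrayIndex.all());
        dst.addi(src); // the views share data with sumPart, so this updates it in place
    }
}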

Example 24 with Pair

use of org.deeplearning4j.berkeley.Pair in project deeplearning4j by deeplearning4j.

the class SubsamplingLayer method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    int miniBatch = input.size(0);
    int inDepth = input.size(1);
    int inH = input.size(2);
    int inW = input.size(3);
    int[] kernel = layerConf().getKernelSize();
    int[] strides = layerConf().getStride();
    int[] pad;
    int[] outSize;
    if (convolutionMode == ConvolutionMode.Same) {
        //Also performs validation
        outSize = ConvolutionUtils.getOutputSize(input, kernel, strides, null, convolutionMode);
        pad = ConvolutionUtils.getSameModeTopLeftPadding(outSize, new int[] { inH, inW }, kernel, strides);
    } else {
        pad = layerConf().getPadding();
        //Also performs validation
        outSize = ConvolutionUtils.getOutputSize(input, kernel, strides, pad, convolutionMode);
    }
    int outH = outSize[0];
    int outW = outSize[1];
    if (helper != null && Nd4j.dataType() != DataBuffer.Type.HALF) {
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, kernel, strides, pad, layerConf().getPoolingType(), convolutionMode);
        if (ret != null) {
            return ret;
        }
    }
    //subsampling doesn't have weights and thus gradients are not calculated for this layer
    //only scale and reshape epsilon
    int inputHeight = input().size(-2);
    int inputWidth = input().size(-1);
    Gradient retGradient = new DefaultGradient();
    //Epsilons in shape: [miniBatch, depth, outH, outW]
    //Epsilons out shape: [miniBatch, depth, inH, inW]
    //Two possibilities here for the epsilons:
    //(a) Epsilons come from a dense/output layer above, with c order and strides [depth*H*W, H*W, W, 1]
    //(b) Epsilons come from CNN layer above, with c order and strides [H*W, depth*H*W, W, 1] (i.e., due to permute)
    //We want to reshape epsilons to 1d here, but to do this without a copy: we end up with different orders of
    // elements in the buffer, for the "dense above" and "cnn above" cases.
    //Fortunately, we can just permute things when we do the im2col reshaping; then, the order of the rows in
    // col2d will match the order of the 1d epsilons...
    //With the 1d epsilons order matching the rows order for the 2d im2col: we can just do a muliColumnVector op,
    // instead of a slower broadcast muli op
    boolean cOrderStrides = false;
    if (epsilon.ordering() != 'c') {
        epsilon = epsilon.dup('c');
        cOrderStrides = true;
    }
    if (!cOrderStrides && Shape.strideDescendingCAscendingF(epsilon)) {
        cOrderStrides = true;
    } else if (!Arrays.equals(new int[] { outH * outW, inDepth * outH * outW, outW, 1 }, epsilon.stride())) {
        //Unexpected/unusual strides, not either (a) or (b) cases above
        epsilon = epsilon.dup('c');
        cOrderStrides = true;
    }
    INDArray col6d;
    INDArray col6dPermuted;
    INDArray epsilon1d;
    if (cOrderStrides) {
        //"Dense/Output layer above strides... i.e., standard c-order strides
        col6d = Nd4j.create(new int[] { miniBatch, inDepth, outH, outW, kernel[0], kernel[1] }, 'c');
        col6dPermuted = col6d.permute(0, 1, 4, 5, 2, 3);
        //zero copy reshape
        epsilon1d = epsilon.reshape('c', ArrayUtil.prod(epsilon.length()), 1);
    } else {
        //"CNN layer above" strides...
        col6d = Nd4j.create(new int[] { inDepth, miniBatch, outH, outW, kernel[0], kernel[1] }, 'c');
        col6dPermuted = col6d.permute(1, 0, 4, 5, 2, 3);
        INDArray epsilonTemp = epsilon.permute(1, 0, 2, 3);
        //Should be a zero-copy reshape always
        epsilon1d = epsilonTemp.reshape('c', new int[] { ArrayUtil.prod(epsilon.length()), 1 });
    }
    INDArray col2d = col6d.reshape('c', miniBatch * inDepth * outH * outW, kernel[0] * kernel[1]);
    switch(layerConf().getPoolingType()) {
        case MAX:
            //Execute im2col, then reshape to 2d. Note rows are in a different order for cOrderStrides true vs false cases
            Convolution.im2col(input, kernel[0], kernel[1], strides[0], strides[1], pad[0], pad[1], convolutionMode == ConvolutionMode.Same, col6dPermuted);
            INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(col2d, 1));
            isMax.muliColumnVector(epsilon1d);
            break;
        case AVG:
            //TODO: We could further optimize this by creating an uninitialized array, and doing a 'putiColumnVector' operation
            // instead of a zero initialization + an addiColumnVector op
            col2d.addiColumnVector(epsilon1d);
            break;
        case PNORM:
            int pnorm = layerConf().getPnorm();
            //First: do forward pass to get pNorm array
            Convolution.im2col(input, kernel[0], kernel[1], strides[0], strides[1], pad[0], pad[1], convolutionMode == ConvolutionMode.Same, col6dPermuted);
            //dup as we need col2d again later
            INDArray pNorm = Transforms.abs(col2d, true);
            Transforms.pow(pNorm, pnorm, false);
            pNorm = pNorm.sum(1);
            Transforms.pow(pNorm, (1.0 / pnorm), false);
            //dL/dIn = dL/dOut * dOut/dIn
            //dOut/dIn = in .* |in|^(p-2) /  ||in||_p^(p-1), where ||in||_p is the output p-norm
            INDArray numerator;
            if (pnorm == 2) {
                numerator = col2d;
            } else {
                INDArray absp2 = Transforms.pow(Transforms.abs(col2d, true), pnorm - 2, false);
                numerator = col2d.muli(absp2);
            }
            INDArray denom = Transforms.pow(pNorm, pnorm - 1, false);
            double eps = layerConf().getEps();
            // in case of 0
            Transforms.max(denom, eps, false);
            numerator.muliColumnVector(denom.rdivi(epsilon1d));
            break;
        case NONE:
            return new Pair<>(retGradient, epsilon);
        default:
            throw new IllegalStateException("Unknown or unsupported pooling type: " + layerConf().getPoolingType());
    }
    //Finally: we want the output strides for the epsilons to match the strides in the activations from the layer below
    //Assuming the layer below is a CNN layer (very likely) we want [H*W, depth*H*W, W, 1] instead of the standard
    // c-order [depth*H*W, H*W, W, 1] strides
    //To achieve this: [depth, miniBatch, H, W] in c order, then permute to [miniBatch, depth, H, W]
    //This gives us proper strides of 1 on the muli...
    INDArray tempEpsilon = Nd4j.create(new int[] { inDepth, miniBatch, inH, inW }, 'c');
    INDArray outEpsilon = tempEpsilon.permute(1, 0, 2, 3);
    Convolution.col2im(col6dPermuted, outEpsilon, strides[0], strides[1], pad[0], pad[1], inputHeight, inputWidth);
    if (layerConf().getPoolingType() == PoolingType.AVG)
        outEpsilon.divi(ArrayUtil.prod(layerConf().getKernelSize()));
    return new Pair<>(retGradient, outEpsilon);
}
Also used : IsMax(org.nd4j.linalg.api.ops.impl.transforms.IsMax) Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
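
For intuition on the MAX branch above: im2col lays each pooling window out as one row of col2d, IsMax(col2d, 1) turns every row into a one-hot mask marking its maximum, and muliColumnVector(epsilon1d) routes each window's output gradient back to that single position. A tiny standalone sketch (values invented for illustration):

// Two flattened 2x2 pooling windows, one per row.
INDArray col2d = Nd4j.create(new double[][] { { 1, 3, 2, 0 }, { 5, 4, 4, 1 } });
INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(col2d, 1)); // one-hot mask per row
INDArray epsilon1d = Nd4j.create(new double[][] { { 0.7 }, { -0.2 } });    // dL/dOut, one value per window
isMax.muliColumnVector(epsilon1d); // row 0 -> [0, 0.7, 0, 0], row 1 -> [-0.2, 0, 0, 0]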

Example 25 with Pair

use of org.deeplearning4j.berkeley.Pair in project deeplearning4j by deeplearning4j.

the class AutoEncoder method sampleHiddenGivenVisible.

@Override
public Pair<INDArray, INDArray> sampleHiddenGivenVisible(INDArray v) {
    setInput(v);
    INDArray ret = encode(v, true);
    return new Pair<>(ret, ret);
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
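
Because the autoencoder's hidden representation is deterministic, both halves of the returned Pair reference the same encoded array; a minimal usage sketch (the autoEncoder and visible variables are assumed to be set up elsewhere):

Pair<INDArray, INDArray> sample = autoEncoder.sampleHiddenGivenVisible(visible);
INDArray hidden = sample.getFirst();
boolean sameArray = hidden == sample.getSecond(); // true: no sampling noise is applied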

Aggregations

Pair (org.deeplearning4j.berkeley.Pair) 81
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 56
Gradient (org.deeplearning4j.nn.gradient.Gradient) 28
DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient) 25
ArrayList (java.util.ArrayList) 8
DL4JInvalidInputException (org.deeplearning4j.exception.DL4JInvalidInputException) 7
VocabWord (org.deeplearning4j.models.word2vec.VocabWord) 7
AtomicLong (java.util.concurrent.atomic.AtomicLong) 5
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 5
Test (org.junit.Test) 5
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 4
SparkTrainingStats (org.deeplearning4j.spark.api.stats.SparkTrainingStats) 4
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline) 4
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException) 4
DoublePointer (org.bytedeco.javacpp.DoublePointer) 3
FloatPointer (org.bytedeco.javacpp.FloatPointer) 3
Pointer (org.bytedeco.javacpp.Pointer) 3
ShortPointer (org.bytedeco.javacpp.ShortPointer) 3
Counter (org.deeplearning4j.berkeley.Counter) 3
InMemoryLookupTable (org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) 3