use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
the class SubsamplingLayer method backpropGradient.
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    int miniBatch = input.size(0);
    int inDepth = input.size(1);
    int inH = input.size(2);
    int inW = input.size(3);

    int[] kernel = layerConf().getKernelSize();
    int[] strides = layerConf().getStride();
    int[] pad;
    int[] outSize;
    if (convolutionMode == ConvolutionMode.Same) {
        //Also performs validation
        outSize = ConvolutionUtils.getOutputSize(input, kernel, strides, null, convolutionMode);
        pad = ConvolutionUtils.getSameModeTopLeftPadding(outSize, new int[] {inH, inW}, kernel, strides);
    } else {
        pad = layerConf().getPadding();
        //Also performs validation
        outSize = ConvolutionUtils.getOutputSize(input, kernel, strides, pad, convolutionMode);
    }
    int outH = outSize[0];
    int outW = outSize[1];

    if (helper != null && Nd4j.dataType() != DataBuffer.Type.HALF) {
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, kernel, strides, pad,
                        layerConf().getPoolingType(), convolutionMode);
        if (ret != null) {
            return ret;
        }
    }

    //Subsampling doesn't have weights and thus gradients are not calculated for this layer;
    //only scale and reshape epsilon
    int inputHeight = input().size(-2);
    int inputWidth = input().size(-1);
    Gradient retGradient = new DefaultGradient();

    //Epsilons in shape: [miniBatch, depth, outH, outW]
    //Epsilons out shape: [miniBatch, depth, inH, inW]

    //Two possibilities here for the epsilons:
    //(a) Epsilons come from a dense/output layer above, with c order and strides [depth*H*W, H*W, W, 1]
    //(b) Epsilons come from a CNN layer above, with c order and strides [H*W, depth*H*W, W, 1] (i.e., due to permute)
    //We want to reshape epsilons to 1d here, but to do this without a copy we end up with different orders of
    // elements in the buffer, for the "dense above" and "cnn above" cases.
    //Fortunately, we can just permute things when we do the im2col reshaping; then, the order of the rows in
    // col2d will match the order of the 1d epsilons...
    //With the 1d epsilons order matching the rows order for the 2d im2col, we can just do a muliColumnVector op
    // instead of a slower broadcast muli op
    boolean cOrderStrides = false;
    if (epsilon.ordering() != 'c') {
        epsilon = epsilon.dup('c');
        cOrderStrides = true;
    }
    if (!cOrderStrides && Shape.strideDescendingCAscendingF(epsilon)) {
        cOrderStrides = true;
    } else if (!Arrays.equals(new int[] {outH * outW, inDepth * outH * outW, outW, 1}, epsilon.stride())) {
        //Unexpected/unusual strides, not either (a) or (b) cases above
        epsilon = epsilon.dup('c');
        cOrderStrides = true;
    }

    INDArray col6d;
    INDArray col6dPermuted;
    INDArray epsilon1d;
    if (cOrderStrides) {
        //"Dense/output layer above" strides, i.e., standard c-order strides
        col6d = Nd4j.create(new int[] {miniBatch, inDepth, outH, outW, kernel[0], kernel[1]}, 'c');
        col6dPermuted = col6d.permute(0, 1, 4, 5, 2, 3);
        //Zero-copy reshape
        epsilon1d = epsilon.reshape('c', ArrayUtil.prod(epsilon.length()), 1);
    } else {
        //"CNN layer above" strides...
        col6d = Nd4j.create(new int[] {inDepth, miniBatch, outH, outW, kernel[0], kernel[1]}, 'c');
        col6dPermuted = col6d.permute(1, 0, 4, 5, 2, 3);
        INDArray epsilonTemp = epsilon.permute(1, 0, 2, 3);
        //Should be a zero-copy reshape always
        epsilon1d = epsilonTemp.reshape('c', new int[] {ArrayUtil.prod(epsilon.length()), 1});
    }

    INDArray col2d = col6d.reshape('c', miniBatch * inDepth * outH * outW, kernel[0] * kernel[1]);

    switch (layerConf().getPoolingType()) {
        case MAX:
            //Execute im2col, then reshape to 2d. Note rows are in a different order for cOrderStrides true vs. false cases
            Convolution.im2col(input, kernel[0], kernel[1], strides[0], strides[1], pad[0], pad[1],
                            convolutionMode == ConvolutionMode.Same, col6dPermuted);
            INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(col2d, 1));
            isMax.muliColumnVector(epsilon1d);
            break;
        case AVG:
            //TODO: We could further optimize this by creating an uninitialized array and doing a 'putiColumnVector' operation
            // instead of a zero initialization + an addiColumnVector op
            col2d.addiColumnVector(epsilon1d);
            break;
        case PNORM:
            int pnorm = layerConf().getPnorm();

            //First: do forward pass to get the p-norm array
            Convolution.im2col(input, kernel[0], kernel[1], strides[0], strides[1], pad[0], pad[1],
                            convolutionMode == ConvolutionMode.Same, col6dPermuted);
            //dup as we need col2d again later
            INDArray pNorm = Transforms.abs(col2d, true);
            Transforms.pow(pNorm, pnorm, false);
            pNorm = pNorm.sum(1);
            Transforms.pow(pNorm, (1.0 / pnorm), false);

            //dL/dIn = dL/dOut * dOut/dIn
            //dOut/dIn = in .* |in|^(p-2) / ||in||_p^(p-1), where ||in||_p is the output p-norm
            INDArray numerator;
            if (pnorm == 2) {
                numerator = col2d;
            } else {
                INDArray absp2 = Transforms.pow(Transforms.abs(col2d, true), pnorm - 2, false);
                numerator = col2d.muli(absp2);
            }

            INDArray denom = Transforms.pow(pNorm, pnorm - 1, false);
            double eps = layerConf().getEps();
            //In case of a zero norm
            Transforms.max(denom, eps, false);
            numerator.muliColumnVector(denom.rdivi(epsilon1d));
            break;
        case NONE:
            return new Pair<>(retGradient, epsilon);
        default:
            throw new IllegalStateException("Unknown or unsupported pooling type: " + layerConf().getPoolingType());
    }

    //Finally: we want the output strides for the epsilons to match the strides in the activations from the layer below
    //Assuming the layer below is a CNN layer (very likely) we want [H*W, depth*H*W, W, 1] instead of the standard
    // c-order [depth*H*W, H*W, W, 1] strides
    //To achieve this: [depth, miniBatch, H, W] in c order, then permute to [miniBatch, depth, H, W]
    //This gives us proper strides of 1 on the muli...
    INDArray tempEpsilon = Nd4j.create(new int[] {inDepth, miniBatch, inH, inW}, 'c');
    INDArray outEpsilon = tempEpsilon.permute(1, 0, 2, 3);
    Convolution.col2im(col6dPermuted, outEpsilon, strides[0], strides[1], pad[0], pad[1], inputHeight, inputWidth);

    if (layerConf().getPoolingType() == PoolingType.AVG)
        outEpsilon.divi(ArrayUtil.prod(layerConf().getKernelSize()));
    return new Pair<>(retGradient, outEpsilon);
}
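The create-then-permute trick in the final block is easy to verify in isolation: creating the array with the leading dimensions swapped and then permuting back yields a view whose strides differ from the standard c-order layout while keeping a stride of 1 on the width dimension. The following standalone sketch (the class name StrideDemo and the concrete shape values are made up for illustration, not taken from the layer) just prints the strides for both layouts:

import java.util.Arrays;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class StrideDemo {
    public static void main(String[] args) {
        int miniBatch = 2, depth = 3, h = 4, w = 5;

        //Plain c-order [miniBatch, depth, h, w]: strides [depth*h*w, h*w, w, 1]
        INDArray standard = Nd4j.create(new int[] {miniBatch, depth, h, w}, 'c');
        System.out.println(Arrays.toString(standard.stride())); //[60, 20, 5, 1]

        //Create [depth, miniBatch, h, w] in c order, then permute to [miniBatch, depth, h, w]:
        //the resulting view has non-standard leading strides but still stride 1 on the width dimension
        INDArray tmp = Nd4j.create(new int[] {depth, miniBatch, h, w}, 'c');
        INDArray permuted = tmp.permute(1, 0, 2, 3);
        System.out.println(Arrays.toString(permuted.stride())); //[20, 40, 5, 1]
    }
}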
use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
the class EmbeddingLayer method backpropGradient.
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    //If this layer is layer L, then epsilon is (w^(L+1)*(d^(L+1))^T) (or equivalent)
    INDArray z = preOutput(input);
    //INDArray activationDerivative = Nd4j.getExecutioner().execAndReturn(Nd4j.getOpFactory().createTransform(conf().getLayer().getActivationFunction(), z).derivative());
    //INDArray activationDerivative = conf().getLayer().getActivationFn().getGradient(z);
    //INDArray delta = epsilon.muli(activationDerivative);
    //TODO handle activation function params
    INDArray delta = conf().getLayer().getActivationFn().backprop(z, epsilon).getFirst();

    if (maskArray != null) {
        delta.muliColumnVector(maskArray);
    }

    INDArray weights = getParam(DefaultParamInitializer.WEIGHT_KEY);
    INDArray weightGradients = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY);
    weightGradients.assign(0);

    int[] indexes = new int[input.length()];
    for (int i = 0; i < indexes.length; i++) {
        indexes[i] = input.getInt(i, 0);
        weightGradients.getRow(indexes[i]).addi(delta.getRow(i));
    }

    INDArray biasGradientsView = gradientViews.get(DefaultParamInitializer.BIAS_KEY);
    INDArray biasGradients = delta.sum(0);
    //TODO do this without the assign...
    biasGradientsView.assign(biasGradients);

    Gradient ret = new DefaultGradient();
    ret.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGradients);
    ret.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGradientsView);

    //Don't bother returning epsilons: no layer below this one...
    return new Pair<>(ret, null);
}
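The loop above is effectively a scatter-add: each row of delta is accumulated into the weight-gradient row selected by the corresponding input index, so repeated indices accumulate their contributions. A minimal standalone sketch of that access pattern (the class name EmbeddingGradDemo, vocabulary size, and embedding dimension are hypothetical, not from the layer) might look like:

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class EmbeddingGradDemo {
    public static void main(String[] args) {
        int numExamples = 3, vocabSize = 5, embeddingSize = 2;

        //Column vector of word indices, one per example; index 2 appears twice
        INDArray input = Nd4j.create(new double[] {2, 0, 2}, new int[] {3, 1});
        INDArray delta = Nd4j.rand(numExamples, embeddingSize);
        INDArray weightGradients = Nd4j.zeros(vocabSize, embeddingSize);

        for (int i = 0; i < numExamples; i++) {
            int idx = input.getInt(i, 0);
            //getRow returns a view, so addi writes through to weightGradients;
            //rows selected by repeated indices accumulate their deltas
            weightGradients.getRow(idx).addi(delta.getRow(i));
        }
        System.out.println(weightGradients);
    }
}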
use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
the class BaseOutputLayer method getGradientsAndDelta.
/** Returns tuple: {Gradient, Delta} given preOut */
private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray preOut) {
    ILossFunction lossFunction = layerConf().getLossFn();
    INDArray labels2d = getLabels2d();
    if (labels2d.size(1) != preOut.size(1)) {
        throw new DL4JInvalidInputException("Labels array numColumns (size(1) = " + labels2d.size(1)
                        + ") does not match output layer number of outputs (nOut = " + preOut.size(1) + ")");
    }

    //INDArray delta = lossFunction.computeGradient(labels2d, preOut, layerConf().getActivationFunction(), maskArray);
    INDArray delta = lossFunction.computeGradient(labels2d, preOut, layerConf().getActivationFn(), maskArray);

    Gradient gradient = new DefaultGradient();
    INDArray weightGradView = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY);
    INDArray biasGradView = gradientViews.get(DefaultParamInitializer.BIAS_KEY);

    //Equivalent to: weightGradView.assign(input.transpose().mmul(delta));
    Nd4j.gemm(input, delta, weightGradView, true, false, 1.0, 0.0);
    biasGradView.assign(delta.sum(0));

    gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGradView);
    gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGradView);

    return new Pair<>(gradient, delta);
}
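The Nd4j.gemm call writes input^T * delta directly into the pre-allocated gradient view (transposeA = true, alpha = 1.0, beta = 0.0), avoiding the temporary array that input.transpose().mmul(delta) would allocate. A small standalone check of that equivalence, with made-up shapes and the hypothetical class name GemmDemo, could look like:

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GemmDemo {
    public static void main(String[] args) {
        int miniBatch = 4, nIn = 3, nOut = 2;
        INDArray input = Nd4j.rand(miniBatch, nIn);
        INDArray delta = Nd4j.rand(miniBatch, nOut);

        //Destination plays the role of the [nIn, nOut] weight-gradient view;
        //created in f order, which is the layout gemm expects for its result array
        INDArray weightGradView = Nd4j.create(new int[] {nIn, nOut}, 'f');
        Nd4j.gemm(input, delta, weightGradView, true, false, 1.0, 0.0);

        //Same result via an explicit transpose + matrix multiply (allocates a temporary)
        INDArray reference = input.transpose().mmul(delta);
        System.out.println(weightGradView.equalsWithEps(reference, 1e-6));
    }
}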
use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
the class BasePretrainNetwork method createGradient.
protected Gradient createGradient(INDArray wGradient, INDArray vBiasGradient, INDArray hBiasGradient) {
    Gradient ret = new DefaultGradient();
    //The order of the following statements matters! The gradient is being flattened and applied to
    // flattened params in this order.
    //The order might need to be handled via ordering
    ret.gradientForVariable().put(PretrainParamInitializer.WEIGHT_KEY, wGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.BIAS_KEY, hBiasGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.VISIBLE_BIAS_KEY, vBiasGradient);
    return ret;
}
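The comment about ordering matters because gradientForVariable() preserves insertion order, so any code that iterates the map (for example, to flatten the gradient against the flattened params) sees the entries exactly as they were put: weights, hidden bias, visible bias. A hedged standalone sketch (class name GradientOrderDemo and shapes are hypothetical) that simply makes the iteration order visible:

import java.util.Arrays;
import java.util.Map;
import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.deeplearning4j.nn.params.PretrainParamInitializer;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GradientOrderDemo {
    public static void main(String[] args) {
        Gradient g = new DefaultGradient();
        g.gradientForVariable().put(PretrainParamInitializer.WEIGHT_KEY, Nd4j.zeros(3, 4));
        g.gradientForVariable().put(PretrainParamInitializer.BIAS_KEY, Nd4j.zeros(1, 4));
        g.gradientForVariable().put(PretrainParamInitializer.VISIBLE_BIAS_KEY, Nd4j.zeros(1, 3));

        //Iteration order matches insertion order: W, b, vb
        for (Map.Entry<String, INDArray> entry : g.gradientForVariable().entrySet()) {
            System.out.println(entry.getKey() + " -> " + Arrays.toString(entry.getValue().shape()));
        }
    }
}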
use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
the class LossLayer method getGradientsAndDelta.
/** Returns tuple: {Gradient, Delta} given preOut */
private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray preOut) {
    //Delta calculation
    ILossFunction lossFunction = layerConf().getLossFn();
    INDArray delta = lossFunction.computeGradient(getLabels2d(), preOut, layerConf().getActivationFn(), maskArray);

    //Grab the empty gradient (LossLayer has no parameters, so nothing to populate)
    Gradient gradient = new DefaultGradient();
    return new Pair<>(gradient, delta);
}
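Here computeGradient takes (labels, preOutput, activationFn, mask) and returns the gradient of the loss with respect to preOutput, which is exactly the delta passed back up the pair. A standalone hedged sketch using one concrete loss/activation pair (LossMCXENT with softmax, chosen only for illustration; the class name LossGradientDemo and the shapes are made up):

import org.nd4j.linalg.activations.IActivation;
import org.nd4j.linalg.activations.impl.ActivationSoftmax;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.lossfunctions.ILossFunction;
import org.nd4j.linalg.lossfunctions.impl.LossMCXENT;

public class LossGradientDemo {
    public static void main(String[] args) {
        ILossFunction lossFunction = new LossMCXENT();
        IActivation activation = new ActivationSoftmax();

        //Two examples, three classes: one-hot labels and arbitrary pre-activations
        INDArray labels = Nd4j.create(new double[][] {{0, 0, 1}, {1, 0, 0}});
        INDArray preOut = Nd4j.rand(2, 3);

        //Gradient of the loss w.r.t. preOut; null mask means no masking
        INDArray delta = lossFunction.computeGradient(labels, preOut, activation, null);
        System.out.println(delta);
    }
}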