Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
In class TestDecayPolicies, method testLearningRateScheduleSingleLayer:
@Test
public void testLearningRateScheduleSingleLayer() {
    Map<Integer, Double> learningRateAfter = new HashMap<>();
    learningRateAfter.put(1, 0.2);
    int iterations = 2;
    for (org.deeplearning4j.nn.conf.Updater updaterFunc : updaters) {
        double lr = 1e-2;
        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                        .learningRate(lr)
                        .learningRateSchedule(learningRateAfter)
                        .learningRateDecayPolicy(LearningRatePolicy.Schedule)
                        .iterations(iterations)
                        .layer(new DenseLayer.Builder().nIn(nIn).nOut(nOut).updater(updaterFunc).build())
                        .build();
        int numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true);
        Updater updater = UpdaterCreator.getUpdater(layer);
        int stateSize = updater.stateSizeForLayer(layer);
        if (stateSize > 0)
            updater.setStateViewArray(layer, Nd4j.create(1, stateSize), true);
        Gradient gradientActual = new DefaultGradient();
        gradientActual.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
        gradientActual.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());
        Gradient gradientExpected = new DefaultGradient();
        gradientExpected.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
        gradientExpected.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());
        for (int i = 0; i < 2; i++) {
            updater.update(layer, gradientActual, i, 1);
            if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.SGD))
                lr = testSGDComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            else if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.ADAGRAD))
                lr = testAdaGradComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            else if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.ADAM))
                lr = testAdamComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            else if (updaterFunc.equals(org.deeplearning4j.nn.conf.Updater.RMSPROP))
                lr = testRMSPropComputation(gradientActual, gradientExpected, lr, learningRateAfter, i);
            assertEquals(lr, layer.conf().getLearningRateByParam("W"), 1e-4);
        }
    }
}
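The per-updater helpers (testSGDComputation and the other test*Computation methods) are not shown in this snippet. As a reading aid only, here is a minimal sketch of what the SGD check could look like, assuming the updater scales the gradient in place by the current learning rate and that the scheduled rate replaces the base rate at the iterations listed in the map; the helper body below is an assumption, not the actual TestDecayPolicies code.

// Hypothetical sketch, not the actual DL4J test helper: verifies a plain SGD step
// (gradient scaled in place by the current learning rate), taking the scheduled
// rate from the map when the current iteration has an entry.
private double testSGDComputation(Gradient gradientActual, Gradient gradientExpected, double lr,
                Map<Integer, Double> learningRateAfter, int i) {
    if (learningRateAfter.containsKey(i)) {
        lr = learningRateAfter.get(i);    // e.g. iteration 1 -> 0.2 in this test
    }
    for (Map.Entry<String, INDArray> entry : gradientExpected.gradientForVariable().entrySet()) {
        entry.getValue().muli(lr);        // mirror the in-place scaling assumed for the actual gradient
        assertEquals(entry.getValue(), gradientActual.getGradientFor(entry.getKey()));
    }
    return lr;
}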
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
In class TestGradientNormalization, method testL2ClippingPerLayer:
@Test
public void testL2ClippingPerLayer() {
    Nd4j.getRandom().setSeed(12345);
    double threshold = 3;
    for (int t = 0; t < 2; t++) {
        //t=0: small -> no clipping
        //t=1: large -> clipping
        NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder()
                        .layer(new DenseLayer.Builder().nIn(10).nOut(20)
                                        .updater(org.deeplearning4j.nn.conf.Updater.NONE)
                                        .gradientNormalization(GradientNormalization.ClipL2PerLayer)
                                        .gradientNormalizationThreshold(threshold).build())
                        .build();
        int numParams = conf.getLayer().initializer().numParams(conf);
        INDArray params = Nd4j.create(1, numParams);
        Layer layer = conf.getLayer().instantiate(conf, null, 0, params, true);
        Updater updater = UpdaterCreator.getUpdater(layer);
        INDArray weightGrad = Nd4j.rand(10, 20).muli((t == 0 ? 0.05 : 10));
        INDArray biasGrad = Nd4j.rand(1, 10).muli((t == 0 ? 0.05 : 10));
        INDArray weightGradCopy = weightGrad.dup();
        INDArray biasGradCopy = biasGrad.dup();
        Gradient gradient = new DefaultGradient();
        gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGrad);
        double layerGradL2 = gradient.gradient().norm2Number().doubleValue();
        if (t == 0)
            assertTrue(layerGradL2 < threshold);
        else
            assertTrue(layerGradL2 > threshold);
        updater.update(layer, gradient, 0, 1);
        if (t == 0) {
            //norm2 < threshold -> no change
            assertEquals(weightGradCopy, weightGrad);
            assertEquals(biasGradCopy, biasGrad);
            continue;
        } else {
            //norm2 > threshold -> rescale
            assertNotEquals(weightGradCopy, weightGrad);
            assertNotEquals(biasGradCopy, biasGrad);
        }
        //for above threshold only...
        double scalingFactor = threshold / layerGradL2;
        INDArray expectedWeightGrad = weightGradCopy.mul(scalingFactor);
        INDArray expectedBiasGrad = biasGradCopy.mul(scalingFactor);
        assertEquals(expectedWeightGrad, gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY));
        assertEquals(expectedBiasGrad, gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY));
    }
}
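In equation form, the per-layer L2 clipping this test verifies leaves the gradient untouched when its L2 norm is at or below the threshold t, and otherwise rescales the entire layer gradient by t over its norm:

g \leftarrow
\begin{cases}
g, & \lVert g \rVert_2 \le t \\[4pt]
\dfrac{t}{\lVert g \rVert_2}\, g, & \lVert g \rVert_2 > t
\end{cases}

which is exactly the scalingFactor = threshold / layerGradL2 factor asserted at the end of the loop.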
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
In class CudnnBatchNormalizationHelper, method backpropGradient:
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray input, INDArray epsilon, int[] shape, INDArray gamma,
                INDArray dGammaView, INDArray dBetaView, double eps) {
    if (eps < CUDNN_BN_MIN_EPSILON) {
        throw new IllegalArgumentException(
                        "Error: eps < CUDNN_BN_MIN_EPSILON (" + eps + " < " + CUDNN_BN_MIN_EPSILON + ")");
    }
    int miniBatch = input.size(0);
    int depth = input.size(1);
    int inH = input.size(2);
    int inW = input.size(3);
    Gradient retGradient = new DefaultGradient();
    if (!Shape.strideDescendingCAscendingF(epsilon)) {
        // apparently not supported by cuDNN
        epsilon = epsilon.dup();
    }
    int[] srcStride = input.stride();
    int[] deltaStride = epsilon.stride();
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, miniBatch, depth, inH, inW,
                    srcStride[0], srcStride[1], srcStride[2], srcStride[3]));
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.deltaTensorDesc, dataType, miniBatch, depth, inH, inW,
                    deltaStride[0], deltaStride[1], deltaStride[2], deltaStride[3]));
    INDArray nextEpsilon = Nd4j.createUninitialized(new int[] {miniBatch, depth, inH, inW}, 'c');
    int[] dstStride = nextEpsilon.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, miniBatch, depth, inH, inW,
                    dstStride[0], dstStride[1], dstStride[2], dstStride[3]));
    int[] gammaStride = gamma.stride();
    checkCudnn(cudnnSetTensor4dDescriptor(cudnnContext.gammaBetaTensorDesc, tensorFormat, dataType, shape[0],
                    shape[1], shape.length > 2 ? shape[2] : 1, shape.length > 3 ? shape[3] : 1));
    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareActionAllWrite(input, epsilon, nextEpsilon, gamma,
                    dGammaView, dBetaView);
    Pointer srcData = allocator.getPointer(input, context);
    Pointer epsData = allocator.getPointer(epsilon, context);
    Pointer dstData = allocator.getPointer(nextEpsilon, context);
    Pointer gammaData = allocator.getPointer(gamma, context);
    Pointer dGammaData = allocator.getPointer(dGammaView, context);
    Pointer dBetaData = allocator.getPointer(dBetaView, context);
    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(context.getOldStream())));
    checkCudnn(cudnnBatchNormalizationBackward(cudnnContext, batchNormMode, alpha, beta, alpha, alpha,
                    cudnnContext.srcTensorDesc, srcData, cudnnContext.deltaTensorDesc, epsData,
                    cudnnContext.dstTensorDesc, dstData, cudnnContext.gammaBetaTensorDesc, gammaData, dGammaData,
                    dBetaData, eps, meanCache, varCache));
    allocator.getFlowController().registerActionAllWrite(context, input, epsilon, nextEpsilon, gamma, dGammaView,
                    dBetaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
    retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
    return new Pair<>(retGradient, nextEpsilon);
}
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
In class BatchNormalization, method backpropGradient:
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    INDArray nextEpsilon;
    int[] shape = getShape(epsilon);
    // number of examples in the batch
    int batchSize = epsilon.size(0);
    org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
    INDArray gamma = null;
    INDArray dGammaView;
    INDArray dBetaView;
    INDArray dGlobalMeanView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_MEAN);
    INDArray dGlobalVarView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_VAR);
    if (layerConf.isLockGammaBeta()) {
        int[] tempShape = new int[] {1, shape[1]};
        dGammaView = Nd4j.createUninitialized(tempShape, 'c');
        dBetaView = Nd4j.createUninitialized(tempShape, 'c');
    } else {
        gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
        dGammaView = gradientViews.get(BatchNormalizationParamInitializer.GAMMA);
        dBetaView = gradientViews.get(BatchNormalizationParamInitializer.BETA);
    }
    Gradient retGradient = new DefaultGradient();
    if (helper != null && epsilon.rank() == 4) {
        //Note that cuDNN does not support the dense (2d) batch norm case as of v5.1
        if (layerConf.isLockGammaBeta()) {
            gamma = Nd4j.valueArrayOf(new int[] {1, shape[1]}, layerConf.getGamma());
        }
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, shape, gamma, dGammaView, dBetaView,
                        layerConf.getEps());
        if (ret != null) {
            return ret;
        }
    }
    if (epsilon.rank() == 2) {
        //TODO: handle fixed beta/gamma case...
        //dL/dGamma = sum_examples dL/dOut .* xHat
        INDArray dGamma = epsilon.mul(xHat).sum(0);
        //dL/dBeta = sum_examples dL/dOut
        INDArray dBeta = epsilon.sum(0);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            //dL/dxHat = dL/dOut . gamma        Shape: [minibatchSize, nOut]
            dxhat = epsilon.mulRowVector(gamma);
        }
        //dL/dVariance        Shape: [1, nOut]
        INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
        //Shape: [1, nOut]
        INDArray dLdmu = dxmu1.addi(dxmu2);
        //Note the array reuse here: dxhat, xMu, dLdVar, dLdmu are all invalid after this line (but aren't used later anyway)
        INDArray dLdx = dxhat.diviRowVector(std).addi(xMu.muliRowVector(dLdVar.muli(2.0 / batchSize)))
                        .addiRowVector(dLdmu.muli(1.0 / batchSize));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else if (epsilon.rank() == 4) {
        INDArray dGamma = epsilon.mul(xHat).sum(0, 2, 3);
        INDArray dBeta = epsilon.sum(0, 2, 3);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            dxhat = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(epsilon, gamma,
                            Nd4j.createUninitialized(epsilon.shape(), epsilon.ordering()), 1));
        }
        //dL/dVariance
        INDArray dLdVar = dxhat.mul(xMu).sum(0, 2, 3).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        int effectiveBatchSize = input.size(0) * input.size(2) * input.size(3);
        INDArray dxmu1 = dxhat.sum(0, 2, 3).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0, 2, 3).muli(-2.0 / effectiveBatchSize).muli(dLdVar);
        INDArray dLdmu = dxmu1.addi(dxmu2);
        INDArray dLdx = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(dxhat, std, dxhat, 1))
                        .addi(Nd4j.getExecutioner().execAndReturn(
                                        new BroadcastMulOp(xMu, dLdVar.muli(2.0 / effectiveBatchSize), xMu, 1)));
        Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(dLdx, dLdmu.muli(1.0 / effectiveBatchSize), dLdx, 1));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else {
        // TODO set up BatchNorm for RNNs: http://arxiv.org/pdf/1510.01378v1.pdf
        throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
    }
    return new Pair<>(retGradient, nextEpsilon);
}
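For reference, the rank-2 branch follows the standard batch normalization backward pass (Ioffe & Szegedy, 2015). In the code's terms, std = sqrt(sigma^2 + eps), xMu = x - mu, xHat = xMu / std, epsilon = dL/dy, and m = batchSize:

\frac{\partial L}{\partial \gamma} = \sum_i \frac{\partial L}{\partial y_i}\,\hat{x}_i, \qquad
\frac{\partial L}{\partial \beta} = \sum_i \frac{\partial L}{\partial y_i}, \qquad
\frac{\partial L}{\partial \hat{x}_i} = \frac{\partial L}{\partial y_i}\,\gamma

\frac{\partial L}{\partial \sigma^2} = \sum_i \frac{\partial L}{\partial \hat{x}_i}\,(x_i - \mu)\cdot\left(-\tfrac{1}{2}\right)(\sigma^2 + \epsilon)^{-3/2}

\frac{\partial L}{\partial \mu} = -\frac{1}{\sqrt{\sigma^2 + \epsilon}}\sum_i \frac{\partial L}{\partial \hat{x}_i} \;+\; \frac{\partial L}{\partial \sigma^2}\cdot\frac{-2}{m}\sum_i (x_i - \mu)

\frac{\partial L}{\partial x_i} = \frac{1}{\sqrt{\sigma^2 + \epsilon}}\,\frac{\partial L}{\partial \hat{x}_i} \;+\; \frac{\partial L}{\partial \sigma^2}\cdot\frac{2(x_i - \mu)}{m} \;+\; \frac{1}{m}\frac{\partial L}{\partial \mu}

The rank-4 branch computes the same quantities with the sums taken over the minibatch and both spatial dimensions, and with m replaced by effectiveBatchSize.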
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
In class LocalResponseNormalization, method backpropGradient:
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    if (helper != null) {
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, k, n, alpha, beta);
        if (ret != null) {
            return ret;
        }
    }
    int channel = input.size(1);
    INDArray tmp, addVal;
    Gradient retGradient = new DefaultGradient();
    INDArray reverse = activations.mul(epsilon);
    INDArray sumPart = reverse.dup();
    // sumPart = sum(a^j_{x,y} * gb^j_{x,y})
    for (int i = 1; i < halfN + 1; i++) {
        tmp = sumPart.get(new INDArrayIndex[] {NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(),
                        NDArrayIndex.all()});
        addVal = reverse.get(new INDArrayIndex[] {NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(),
                        NDArrayIndex.all()});
        sumPart.put(new INDArrayIndex[] {NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(),
                        NDArrayIndex.all()}, tmp.addi(addVal));
        tmp = sumPart.get(new INDArrayIndex[] {NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(),
                        NDArrayIndex.all()});
        addVal = reverse.get(new INDArrayIndex[] {NDArrayIndex.all(), interval(i, channel), NDArrayIndex.all(),
                        NDArrayIndex.all()});
        sumPart.put(new INDArrayIndex[] {NDArrayIndex.all(), interval(0, channel - i), NDArrayIndex.all(),
                        NDArrayIndex.all()}, tmp.addi(addVal));
    }
    // gx = gy * unitScale^{-beta} - 2 * alpha * beta * sumPart/unitScale * a^i_{x,y}  (rearranged for more in-place ops)
    INDArray nextEpsilon = epsilon.mul(scale).subi(sumPart.muli(input).divi(unitScale).muli(2 * alpha * beta));
    return new Pair<>(retGradient, nextEpsilon);
}
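Written out in the code's own names (unitScale u = k + alpha * sum over the channel window of (a^j)^2, scale = u^{-beta}, b^j the layer activations, and dL/db^j the incoming epsilon), the formula in the final comment corresponds to the standard LRN gradient:

\frac{\partial L}{\partial a^i_{x,y}} = \frac{\partial L}{\partial b^i_{x,y}}\, u_{x,y}^{-\beta} \;-\; \frac{2\alpha\beta\, a^i_{x,y}}{u_{x,y}} \sum_{j \in \mathrm{window}(i)} b^j_{x,y}\,\frac{\partial L}{\partial b^j_{x,y}}

Here sumPart accumulates the windowed sum of b^j * dL/db^j through the shifted in-place adds in the loop above; this is offered as a reading aid, not as additional library behavior.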