use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.
the class BatchNormalization method preOutput.
public INDArray preOutput(INDArray x, TrainingMode training) {
INDArray activations;
// TODO add this directly in layer or get the layer prior...
// batchnorm true but need to clarify if activation before or after
org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
int[] shape = getShape(x);
// xHat = (x-xmean) / sqrt(var + epsilon)
//Note that for CNNs, mean and variance are calculated per feature map (i.e., per channel) rather than per individual activation
//Pg5 of http://arxiv.org/pdf/1502.03167v3.pdf
// "For convolutional layers, we additionally want the normalization to obey the convolutional property – so that
// different elements of the same feature map, at different locations, are normalized in the same way. To achieve
// this, we jointly normalize all the activations in a minibatch, over all locations."
INDArray mean, var;
if (training == TrainingMode.TRAIN) {
switch(x.rank()) {
case 2:
// mean and variance over samples in batch
mean = x.mean(0);
var = x.var(false, 0);
break;
case 4:
// mean and variance over samples AND locations
mean = x.mean(0, 2, 3);
var = x.var(false, 0, 2, 3);
break;
default:
throw new IllegalStateException("Batch normalization on activations of rank " + x.rank() + " not supported");
}
var.addi(layerConf.getEps());
} else {
// Global mean and variance estimate - used after training
mean = getParam(BatchNormalizationParamInitializer.GLOBAL_MEAN);
var = getParam(BatchNormalizationParamInitializer.GLOBAL_VAR);
}
std = Transforms.sqrt(var, true);
INDArray gamma = null;
INDArray beta = null;
INDArray globalMeanView = getParam(BatchNormalizationParamInitializer.GLOBAL_MEAN);
INDArray globalVarView = getParam(BatchNormalizationParamInitializer.GLOBAL_VAR);
if (layerConf.isLockGammaBeta()) {
if (helper != null && input.rank() == 4) {
//TODO: don't create these each iteration, when using cudnn
int[] gammaBetaShape = new int[] { 1, layerConf().getNOut() };
gamma = Nd4j.valueArrayOf(gammaBetaShape, layerConf().getGamma());
beta = Nd4j.valueArrayOf(gammaBetaShape, layerConf().getBeta());
}
} else {
gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
beta = getParam(BatchNormalizationParamInitializer.BETA);
}
if (helper != null && input.rank() == 4) {
//Note that cudnn does not support dense (2d) batch norm case as of v5.1
double decay = layerConf.getDecay();
INDArray ret = helper.preOutput(x, training == TrainingMode.TRAIN, shape, gamma, beta, globalMeanView, globalVarView, decay, layerConf.getEps());
if (ret != null) {
return ret;
}
}
// BN(x_k) = gamma * xHat + beta (applying gamma and beta for each activation)
if (x.rank() == 2) {
xMu = x.subRowVector(mean);
xHat = xMu.divRowVector(std);
if (layerConf.isLockGammaBeta()) {
//Special case: gamma/beta have fixed values for all outputs
//Use mul/addi(Number) here to avoid allocating temp arrays of all same value
double g = layerConf.getGamma();
double b = layerConf.getBeta();
if (g != 1.0 || b != 0.0) {
activations = xHat.mul(g).addi(b);
} else {
//Default and most common case: gamma == 1.0 and beta == 0.0. No point executing a 1 * x + 0 op
activations = xHat;
}
} else {
//Standard case: gamma and beta are learned per parameter
activations = xHat.mulRowVector(gamma).addiRowVector(beta);
}
} else if (x.rank() == 4) {
if (!Shape.strideDescendingCAscendingF(x))
//TODO: temp Workaround for broadcast bug. To be removed when fixed
x = x.dup();
xMu = Nd4j.getExecutioner().execAndReturn(new BroadcastSubOp(x, mean, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
xHat = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(xMu, std, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
if (layerConf.isLockGammaBeta()) {
//Special case: gamma/beta have fixed values for all outputs
//Use mul/addi(Number) here to avoid allocating temp arrays of all same value
double g = layerConf.getGamma();
double b = layerConf.getBeta();
if (g != 1.0 || b != 0.0) {
activations = xHat.mul(g).addi(b);
} else {
//Default and most common case: gamma == 1.0 and beta == 0.0. No point executing a 1 * x + 0 op
activations = xHat;
}
} else {
//Standard case: gamma and beta are learned per parameter
activations = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(xHat, gamma, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
activations = Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(activations, beta, activations, 1));
}
} else {
// TODO setup BatchNorm for RNN http://arxiv.org/pdf/1510.01378v1.pdf
throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
}
// store mean and var if using batch mean while training
double decay;
if (training == TrainingMode.TRAIN) {
if (layerConf.isMinibatch()) {
//Standard case: Estimate global mean and variance stats by moving average
//globalMean = decay * globalMean + (1-decay) * minibatchMean
//globalVar = decay * globalVar + (1-decay) * minibatchVar
//Note that it's safe to do a muli on the 'mean' and 'var' variables: they can't be the global arrays when training == TrainingMode.TRAIN
decay = layerConf.getDecay();
globalMeanView.muli(decay).addi(mean.muli(1 - decay));
globalVarView.muli(decay).addi(var.muli(1 - decay));
} else {
//Special case: doing full-batch (entire data set) training (uncommon; only tiny data sets)
//In this case, minibatch and global stats are identical. Don't want to use a moving average estimate.
globalMeanView.assign(mean);
globalVarView.assign(var);
}
}
return activations;
}
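For reference, the following is a minimal standalone sketch of the rank-4 forward path above, not the DL4J layer itself: per-feature-map statistics are reduced over dimensions (0, 2, 3), then the normalization and the gamma/beta affine transform are applied with broadcast ops along the channel dimension (dimension 1). Shapes and values are illustrative, and the API calls assume the same nd4j version as the snippet above.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp;
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastDivOp;
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp;
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

public class BatchNormForwardSketch {
    public static void main(String[] args) {
        double eps = 1e-5;
        // Example activations: [minibatch, channels, height, width]
        INDArray x = Nd4j.rand(new int[] {4, 3, 5, 5});

        // Per-feature-map statistics: reduce over the batch and spatial dims (0, 2, 3)
        INDArray mean = x.mean(0, 2, 3);
        INDArray std = Transforms.sqrt(x.var(false, 0, 2, 3).addi(eps), false);

        // xHat = (x - mean) / sqrt(var + eps), broadcast along dimension 1 (channels)
        INDArray xMu = Nd4j.getExecutioner().execAndReturn(
                new BroadcastSubOp(x, mean, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
        INDArray xHat = Nd4j.getExecutioner().execAndReturn(
                new BroadcastDivOp(xMu, std, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));

        // out = gamma * xHat + beta, again broadcast along the channel dimension
        INDArray gamma = Nd4j.ones(1, 3);
        INDArray beta = Nd4j.zeros(1, 3);
        INDArray out = Nd4j.getExecutioner().execAndReturn(
                new BroadcastMulOp(xHat, gamma, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
        Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(out, beta, out, 1));

        // With gamma = 1 and beta = 0, each feature map of 'out' has approximately zero mean
        System.out.println(out.mean(0, 2, 3));
    }
}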
use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.
the class BatchNormalization method backpropGradient.
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
INDArray nextEpsilon;
int[] shape = getShape(epsilon);
// number examples in batch
int batchSize = epsilon.size(0);
org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
INDArray gamma = null;
INDArray dGammaView;
INDArray dBetaView;
INDArray dGlobalMeanView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_MEAN);
INDArray dGlobalVarView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_VAR);
if (layerConf.isLockGammaBeta()) {
int[] tempShape = new int[] { 1, shape[1] };
dGammaView = Nd4j.createUninitialized(tempShape, 'c');
dBetaView = Nd4j.createUninitialized(tempShape, 'c');
} else {
gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
dGammaView = gradientViews.get(BatchNormalizationParamInitializer.GAMMA);
dBetaView = gradientViews.get(BatchNormalizationParamInitializer.BETA);
}
Gradient retGradient = new DefaultGradient();
if (helper != null && epsilon.rank() == 4) {
//Note that cudnn does not support dense (2d) batch norm case as of v5.1
if (layerConf.isLockGammaBeta()) {
gamma = Nd4j.valueArrayOf(new int[] { 1, shape[1] }, layerConf.getGamma());
}
Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, shape, gamma, dGammaView, dBetaView, layerConf.getEps());
if (ret != null) {
return ret;
}
}
if (epsilon.rank() == 2) {
//TODO: handle fixed beta/gamma case...
//dL/dGamma = sum_examples dL/dOut .* xHat
INDArray dGamma = epsilon.mul(xHat).sum(0);
//dL/dBeta = sum_examples dL/dOut
INDArray dBeta = epsilon.sum(0);
INDArray dxhat;
if (layerConf.isLockGammaBeta()) {
dxhat = epsilon.mul(layerConf.getGamma());
} else {
//Standard case
//dL/dxHat = dL/dOut . gamma Shape: [minibatchSize, nOut]
dxhat = epsilon.mulRowVector(gamma);
}
//dL/dVariance
//Shape: [1, nOut]
INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
//dL/dmu
INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
//Shape: [1, nOut]
INDArray dLdmu = dxmu1.addi(dxmu2);
//Note the array reuse here: dxhat, xMu, dLdVar, dLdmu - all are invalid after this line (but aren't used later anyway)
INDArray dLdx = dxhat.diviRowVector(std).addi(xMu.muliRowVector(dLdVar.muli(2.0 / batchSize))).addiRowVector(dLdmu.muli(1.0 / batchSize));
//TODO rework this to avoid the assign here
dGammaView.assign(dGamma);
dBetaView.assign(dBeta);
retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
//TODO: do this properly
dGlobalMeanView.assign(0);
dGlobalVarView.assign(0);
retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
nextEpsilon = dLdx;
} else if (epsilon.rank() == 4) {
INDArray dGamma = epsilon.mul(xHat).sum(0, 2, 3);
INDArray dBeta = epsilon.sum(0, 2, 3);
INDArray dxhat;
if (layerConf.isLockGammaBeta()) {
dxhat = epsilon.mul(layerConf.getGamma());
} else {
//Standard case
dxhat = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(epsilon, gamma, Nd4j.createUninitialized(epsilon.shape(), epsilon.ordering()), 1));
}
//dL/dVariance
INDArray dLdVar = dxhat.mul(xMu).sum(0, 2, 3).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
//dL/dmu
int effectiveBatchSize = input.size(0) * input.size(2) * input.size(3);
INDArray dxmu1 = dxhat.sum(0, 2, 3).divi(std).negi();
INDArray dxmu2 = xMu.sum(0, 2, 3).muli(-2.0 / effectiveBatchSize).muli(dLdVar);
INDArray dLdmu = dxmu1.addi(dxmu2);
INDArray dLdx = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(dxhat, std, dxhat, 1)).addi(Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(xMu, dLdVar.muli(2.0 / effectiveBatchSize), xMu, 1)));
Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(dLdx, dLdmu.muli(1.0 / effectiveBatchSize), dLdx, 1));
//TODO rework this to avoid the assign here
dGammaView.assign(dGamma);
dBetaView.assign(dBeta);
retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
//TODO: do this properly
dGlobalMeanView.assign(0);
dGlobalVarView.assign(0);
retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
nextEpsilon = dLdx;
} else {
// TODO setup BatchNorm for RNN http://arxiv.org/pdf/1510.01378v1.pdf
throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
}
return new Pair<>(retGradient, nextEpsilon);
}
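The gradient reductions in the rank-4 branch mirror the forward statistics: dGamma and dBeta are sums over the same dimensions (0, 2, 3) that the mean and variance were taken over, and dL/dxHat is a channel-wise BroadcastMulOp with gamma. A rough standalone sketch, using hypothetical stand-ins for the cached xHat and the incoming epsilon rather than the layer's actual fields:

import java.util.Arrays;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp;
import org.nd4j.linalg.factory.Nd4j;

public class BatchNormBackpropSketch {
    public static void main(String[] args) {
        // Illustrative values standing in for the layer's cached state and incoming gradient
        INDArray epsilon = Nd4j.rand(new int[] {4, 3, 5, 5}); // dL/dOut
        INDArray xHat = Nd4j.rand(new int[] {4, 3, 5, 5});    // normalized activations from the forward pass
        INDArray gamma = Nd4j.ones(1, 3);

        // Parameter gradients: reduce over the batch and spatial dims, one value per feature map
        INDArray dGamma = epsilon.mul(xHat).sum(0, 2, 3); // dL/dGamma
        INDArray dBeta = epsilon.sum(0, 2, 3);            // dL/dBeta

        // dL/dxHat = dL/dOut * gamma, broadcast along the channel dimension
        INDArray dxhat = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(
                epsilon, gamma, Nd4j.createUninitialized(epsilon.shape(), epsilon.ordering()), 1));

        System.out.println("dGamma shape: " + Arrays.toString(dGamma.shape()));
        System.out.println("dxhat shape:  " + Arrays.toString(dxhat.shape()));
    }
}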
use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.
the class GlobalPoolingLayer method epsilonHelperFullArray.
private INDArray epsilonHelperFullArray(INDArray inputArray, INDArray epsilon, int[] poolDim) {
//Broadcast: occurs on the remaining dimensions, after the pool dimensions have been removed.
//TODO find a more efficient way to do this
int[] broadcastDims = new int[inputArray.rank() - poolDim.length];
int count = 0;
for (int i = 0; i < inputArray.rank(); i++) {
if (ArrayUtils.contains(poolDim, i))
continue;
broadcastDims[count++] = i;
}
switch(poolingType) {
case MAX:
INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(inputArray.dup(), poolDim));
return Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(isMax, epsilon, isMax, broadcastDims));
case AVG:
//if out = avg(in,dims) then dL/dIn = 1/N * dL/dOut
int n = 1;
for (int d : poolDim) {
n *= inputArray.size(d);
}
INDArray ret = Nd4j.create(inputArray.shape());
Nd4j.getExecutioner().exec(new BroadcastCopyOp(ret, epsilon, ret, broadcastDims));
ret.divi(n);
return ret;
case SUM:
INDArray retSum = Nd4j.create(inputArray.shape());
Nd4j.getExecutioner().exec(new BroadcastCopyOp(retSum, epsilon, retSum, broadcastDims));
return retSum;
case PNORM:
int pnorm = layerConf().getPnorm();
//First: do forward pass to get pNorm array
INDArray abs = Transforms.abs(inputArray, true);
Transforms.pow(abs, pnorm, false);
INDArray pNorm = Transforms.pow(abs.sum(poolDim), 1.0 / pnorm);
//dL/dIn = dL/dOut * dOut/dIn
//dOut/dIn = in .* |in|^(p-2) / ||in||_p^(p-1), where ||in||_p is the output p-norm
INDArray numerator;
if (pnorm == 2) {
numerator = inputArray.dup();
} else {
INDArray absp2 = Transforms.pow(Transforms.abs(inputArray, true), pnorm - 2, false);
numerator = inputArray.mul(absp2);
}
INDArray denom = Transforms.pow(pNorm, pnorm - 1, false);
denom.rdivi(epsilon);
Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(numerator, denom, numerator, broadcastDims));
return numerator;
default:
throw new RuntimeException("Unknown or not supported pooling type: " + poolingType);
}
}
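The MAX branch above is the pattern most directly tied to BroadcastMulOp: IsMax turns the input into a 0/1 mask along the pooled dimensions, and the pooled gradient is then broadcast back over the remaining dimensions. A small standalone sketch of just that branch, with illustrative shapes (global pooling over time for a [minibatch, channels, timeSeriesLength] input); class names and import locations assume the same nd4j version as the code above.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp;
import org.nd4j.linalg.api.ops.impl.transforms.IsMax;
import org.nd4j.linalg.factory.Nd4j;

public class MaxPoolBackpropSketch {
    public static void main(String[] args) {
        // [minibatch, channels, timeSeriesLength]; pool over dimension 2
        INDArray input = Nd4j.rand(new int[] {2, 4, 6});
        INDArray epsilon = Nd4j.rand(2, 4); // dL/dOut of the pooled activations
        int[] poolDim = {2};
        int[] broadcastDims = {0, 1};       // dimensions that remain after pooling

        // 1.0 at the maximum along the pooled dimension (per example and channel), 0.0 elsewhere
        INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(input.dup(), poolDim));

        // Route each pooled gradient back to the location of its max
        INDArray dLdIn = Nd4j.getExecutioner().execAndReturn(
                new BroadcastMulOp(isMax, epsilon, isMax, broadcastDims));

        System.out.println(dLdIn);
    }
}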
use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.
the class L2Vertex method doBackward.
@Override
public Pair<Gradient, INDArray[]> doBackward(boolean tbptt) {
if (!canDoBackward())
throw new IllegalStateException("Cannot do backward pass: error not set");
INDArray a = inputs[0];
INDArray b = inputs[1];
INDArray out = doForward(tbptt);
// Clamp to eps to avoid division by zero when the L2 distance is 0
Transforms.max(out, eps, false);
//dL/dlambda aka 'epsilon' - from layer above
INDArray dLdlambda = epsilon;
//s^(-1/2) = 1.0 / s^(1/2) = 1.0 / out
INDArray sNegHalf = out.rdiv(1.0);
INDArray diff = a.sub(b);
//Column vector for all cases
INDArray first = dLdlambda.mul(sNegHalf);
INDArray dLda;
INDArray dLdb;
if (a.rank() == 2) {
//2d case (MLPs etc)
dLda = diff.muliColumnVector(first);
dLdb = dLda.neg();
} else {
//RNN and CNN case - Broadcast along dimension 0
dLda = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(diff, first, diff, 0));
dLdb = dLda.neg();
}
return new Pair<>(null, new INDArray[] { dLda, dLdb });
}
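In the rank-3 (RNN) branch above, 'first' is a column vector with one scalar per example, and BroadcastMulOp along dimension 0 scales each example's entire [size, timeSeriesLength] slice by that scalar. A compact sketch with illustrative values; in the vertex itself 'first' is epsilon.mul(out.rdiv(1.0)) rather than a random array.

import java.util.Arrays;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp;
import org.nd4j.linalg.factory.Nd4j;

public class L2BackwardSketch {
    public static void main(String[] args) {
        // Two rank-3 activations: [minibatch, size, timeSeriesLength]
        INDArray a = Nd4j.rand(new int[] {3, 4, 5});
        INDArray b = Nd4j.rand(new int[] {3, 4, 5});
        INDArray diff = a.sub(b);

        // One scalar per example: dL/dlambda divided by that example's L2 distance (illustrative values)
        INDArray first = Nd4j.rand(3, 1);

        // Broadcast the per-example scalar over the whole slice; diff is overwritten in place
        INDArray dLda = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(diff, first, diff, 0));
        INDArray dLdb = dLda.neg();

        System.out.println(Arrays.toString(dLda.shape())); // [3, 4, 5]
    }
}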
use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project nd4j by deeplearning4j.
the class NativeOpExecutionerTest method testBroadcastMultiDim.
@Test
public void testBroadcastMultiDim() throws Exception {
// Broadcast 1d: OK
INDArray arr2d = Nd4j.ones(2, 3);
INDArray toBCRow = Nd4j.create(new double[] { 1, 0, 0 });
Nd4j.getExecutioner().exec(new BroadcastMulOp(arr2d, toBCRow, arr2d, 1));
INDArray exp2d = Nd4j.create(new double[][] { { 1, 0, 0 }, { 1, 0, 0 } });
assertEquals(exp2d, arr2d);
// Broadcast 2d on 3d:
INDArray arr3d = Nd4j.ones(2, 3, 5);
INDArray bc2d = Nd4j.create(new double[][] { { 1, 1, 1, 1, 1 }, { 1, 1, 1, 0, 0 } });
bc2d.get(NDArrayIndex.point(1), NDArrayIndex.interval(3, 5)).assign(0);
Nd4j.getExecutioner().exec(new BroadcastMulOp(arr3d, bc2d, arr3d, 0, 2));
INDArray exp3d = Nd4j.ones(2, 3, 5);
exp3d.get(NDArrayIndex.point(1), NDArrayIndex.all(), NDArrayIndex.interval(3, 5)).assign(0);
for (int i = 0; i < 2; i++) {
System.out.println("Arr - " + i);
System.out.println(arr3d.get(NDArrayIndex.point(i), NDArrayIndex.all(), NDArrayIndex.all()));
System.out.println("Exp - " + i);
System.out.println(exp3d.get(NDArrayIndex.point(i), NDArrayIndex.all(), NDArrayIndex.all()));
System.out.println();
}
assertEquals(exp3d, arr3d);
}