
Example 6 with BroadcastMulOp

use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.

the class BatchNormalization method preOutput.

public INDArray preOutput(INDArray x, TrainingMode training) {
    INDArray activations;
    // TODO add this directly in layer or get the layer prior...
    // batchnorm true but need to clarify if activation before or after
    org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
    int[] shape = getShape(x);
    // xHat = (x-xmean) / sqrt(var + epsilon)
    //Note that for CNNs, mean and variance are calculated per feature map (i.e., per channel/depth) rather than per activation
    //Pg5 of http://arxiv.org/pdf/1502.03167v3.pdf
    // "For convolutional layers, we additionally want the normalization to obey the convolutional property – so that
    //  different elements of the same feature map, at different locations, are normalized in the same way. To achieve
    //  this, we jointly normalize all the activations in a minibatch, over all locations."
    INDArray mean, var;
    if (training == TrainingMode.TRAIN) {
        switch(x.rank()) {
            case 2:
                // mean and variance over samples in batch
                mean = x.mean(0);
                var = x.var(false, 0);
                break;
            case 4:
                // mean and variance over samples AND locations
                mean = x.mean(0, 2, 3);
                var = x.var(false, 0, 2, 3);
                break;
            default:
                throw new IllegalStateException("Batch normalization on activations of rank " + x.rank() + " not supported");
        }
        var.addi(layerConf.getEps());
    } else {
        // Global mean and variance estimate - used after training
        mean = getParam(BatchNormalizationParamInitializer.GLOBAL_MEAN);
        var = getParam(BatchNormalizationParamInitializer.GLOBAL_VAR);
    }
    std = Transforms.sqrt(var, true);
    INDArray gamma = null;
    INDArray beta = null;
    INDArray globalMeanView = getParam(BatchNormalizationParamInitializer.GLOBAL_MEAN);
    INDArray globalVarView = getParam(BatchNormalizationParamInitializer.GLOBAL_VAR);
    if (layerConf.isLockGammaBeta()) {
        if (helper != null && input.rank() == 4) {
            //TODO: don't create these each iteration, when using cudnn
            int[] gammaBetaShape = new int[] { 1, layerConf().getNOut() };
            gamma = Nd4j.valueArrayOf(gammaBetaShape, layerConf().getGamma());
            beta = Nd4j.valueArrayOf(gammaBetaShape, layerConf().getBeta());
        }
    } else {
        gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
        beta = getParam(BatchNormalizationParamInitializer.BETA);
    }
    if (helper != null && input.rank() == 4) {
        //Note that cudnn does not support dense (2d) batch norm case as of v5.1
        double decay = layerConf.getDecay();
        INDArray ret = helper.preOutput(x, training == TrainingMode.TRAIN, shape, gamma, beta, globalMeanView, globalVarView, decay, layerConf.getEps());
        if (ret != null) {
            return ret;
        }
    }
    // BN(x_k) = gamma * xHat + beta (applying gamma and beta to each activation)
    if (x.rank() == 2) {
        xMu = x.subRowVector(mean);
        xHat = xMu.divRowVector(std);
        if (layerConf.isLockGammaBeta()) {
            //Special case: gamma/beta have fixed values for all outputs
            //Use mul/addi(Number) here to avoid allocating temp arrays of all same value
            double g = layerConf.getGamma();
            double b = layerConf.getBeta();
            if (g != 1.0 || b != 0.0) {
                activations = xHat.mul(g).addi(b);
            } else {
                //Default and most common case: gamma = 1.0 and beta = 0.0; no point executing a 1 * x + 0 op
                activations = xHat;
            }
        } else {
            //Standard case: gamma and beta are learned per parameter
            activations = xHat.mulRowVector(gamma).addiRowVector(beta);
        }
    } else if (x.rank() == 4) {
        if (!Shape.strideDescendingCAscendingF(x))
            //TODO: temp Workaround for broadcast bug. To be removed when fixed
            x = x.dup();
        xMu = Nd4j.getExecutioner().execAndReturn(new BroadcastSubOp(x, mean, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
        xHat = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(xMu, std, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
        if (layerConf.isLockGammaBeta()) {
            //Special case: gamma/beta have fixed values for all outputs
            //Use mul/addi(Number) here to avoid allocating temp arrays of all same value
            double g = layerConf.getGamma();
            double b = layerConf.getBeta();
            if (g != 1.0 || b != 0.0) {
                activations = xHat.mul(g).addi(b);
            } else {
                //Default and most common case: gamma = 1.0 and beta = 0.0; no point executing a 1 * x + 0 op
                activations = xHat;
            }
        } else {
            //Standard case: gamma and beta are learned per parameter
            activations = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(xHat, gamma, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
            activations = Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(activations, beta, activations, 1));
        }
    } else {
        // TODO setup BatchNorm for RNN http://arxiv.org/pdf/1510.01378v1.pdf
        throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
    }
    // store mean and var if using batch mean while training
    double decay;
    if (training == TrainingMode.TRAIN) {
        if (layerConf.isMinibatch()) {
            //Standard case: Estimate global mean and variance stats by moving average
            //globalMean = decay * globalMean + (1-decay) * minibatchMean
            //globalVar  = decay * globalVar  + (1-decay) * minibatchVar
            //Note that it's safe to do a muli on the 'mean' and 'var' variables: they can't be the global arrays when training == TrainingMode.TRAIN
            decay = layerConf.getDecay();
            globalMeanView.muli(decay).addi(mean.muli(1 - decay));
            globalVarView.muli(decay).addi(var.muli(1 - decay));
        } else {
            //Special case: doing full-batch (entire data set) training (uncommon; only tiny data sets)
            //In this case, minibatch and global stats are identical. Don't want to use a moving average estimate.
            globalMeanView.assign(mean);
            globalVarView.assign(var);
        }
    }
    return activations;
}
Also used : BroadcastAddOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp) INDArray(org.nd4j.linalg.api.ndarray.INDArray) BroadcastSubOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp) BroadcastMulOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) BroadcastDivOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastDivOp)
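
For orientation, here is a minimal standalone sketch (not taken from the project) of the rank-4 pattern used above: the last constructor argument of each broadcast op is the dimension along which the per-channel vector is applied, i.e. dimension 1 of an NCHW activation array. The shapes, variable names, and epsilon value below are illustrative assumptions.

INDArray x = Nd4j.rand(new int[] { 2, 3, 4, 4 });        // hypothetical [minibatch, channels, height, width] activations
INDArray mean = x.mean(0, 2, 3);                         // one mean per channel
INDArray var = x.var(false, 0, 2, 3).addi(1e-5);         // biased per-channel variance plus a small epsilon
INDArray std = Transforms.sqrt(var, true);
INDArray xMu = Nd4j.getExecutioner().execAndReturn(
        new BroadcastSubOp(x, mean, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
INDArray xHat = Nd4j.getExecutioner().execAndReturn(
        new BroadcastDivOp(xMu, std, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
INDArray gamma = Nd4j.ones(1, 3);                        // one gamma/beta value per channel
INDArray beta = Nd4j.zeros(1, 3);
INDArray out = Nd4j.getExecutioner().execAndReturn(
        new BroadcastMulOp(xHat, gamma, Nd4j.createUninitialized(x.shape(), x.ordering()), 1));
Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(out, beta, out, 1));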

Example 7 with BroadcastMulOp

use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.

the class BatchNormalization method backpropGradient.

@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    INDArray nextEpsilon;
    int[] shape = getShape(epsilon);
    // number of examples in the minibatch
    int batchSize = epsilon.size(0);
    org.deeplearning4j.nn.conf.layers.BatchNormalization layerConf = layerConf();
    INDArray gamma = null;
    INDArray dGammaView;
    INDArray dBetaView;
    INDArray dGlobalMeanView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_MEAN);
    INDArray dGlobalVarView = gradientViews.get(BatchNormalizationParamInitializer.GLOBAL_VAR);
    if (layerConf.isLockGammaBeta()) {
        int[] tempShape = new int[] { 1, shape[1] };
        dGammaView = Nd4j.createUninitialized(tempShape, 'c');
        dBetaView = Nd4j.createUninitialized(tempShape, 'c');
    } else {
        gamma = getParam(BatchNormalizationParamInitializer.GAMMA);
        dGammaView = gradientViews.get(BatchNormalizationParamInitializer.GAMMA);
        dBetaView = gradientViews.get(BatchNormalizationParamInitializer.BETA);
    }
    Gradient retGradient = new DefaultGradient();
    if (helper != null && epsilon.rank() == 4) {
        //Note that cudnn does not support dense (2d) batch norm case as of v5.1
        if (layerConf.isLockGammaBeta()) {
            gamma = Nd4j.valueArrayOf(new int[] { 1, shape[1] }, layerConf.getGamma());
        }
        Pair<Gradient, INDArray> ret = helper.backpropGradient(input, epsilon, shape, gamma, dGammaView, dBetaView, layerConf.getEps());
        if (ret != null) {
            return ret;
        }
    }
    if (epsilon.rank() == 2) {
        //TODO: handle fixed beta/gamma case...
        //dL/dGamma = sum_examples dL/dOut .* xHat
        INDArray dGamma = epsilon.mul(xHat).sum(0);
        //dL/dBeta = sum_examples dL/dOut
        INDArray dBeta = epsilon.sum(0);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            //dL/dxHat = dL/dOut . gamma        Shape: [minibatchSize, nOut]
            dxhat = epsilon.mulRowVector(gamma);
        }
        //dL/dVariance
        //Shape: [1, nOut]
        INDArray dLdVar = dxhat.mul(xMu).sum(0).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        INDArray dxmu1 = dxhat.sum(0).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0).muli(-2.0 / batchSize).muli(dLdVar);
        //Shape: [1, nOut]
        INDArray dLdmu = dxmu1.addi(dxmu2);
        //Note the array reuse here: dxhat, xMu, dLdVar, dLdmu - all are invalid after this line (but aren't used later anyway)
        INDArray dLdx = dxhat.diviRowVector(std).addi(xMu.muliRowVector(dLdVar.muli(2.0 / batchSize))).addiRowVector(dLdmu.muli(1.0 / batchSize));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else if (epsilon.rank() == 4) {
        INDArray dGamma = epsilon.mul(xHat).sum(0, 2, 3);
        INDArray dBeta = epsilon.sum(0, 2, 3);
        INDArray dxhat;
        if (layerConf.isLockGammaBeta()) {
            dxhat = epsilon.mul(layerConf.getGamma());
        } else {
            //Standard case
            dxhat = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(epsilon, gamma, Nd4j.createUninitialized(epsilon.shape(), epsilon.ordering()), 1));
        }
        //dL/dVariance
        INDArray dLdVar = dxhat.mul(xMu).sum(0, 2, 3).muli(-0.5).muli(Transforms.pow(std, -3.0, true));
        //dL/dmu
        int effectiveBatchSize = input.size(0) * input.size(2) * input.size(3);
        INDArray dxmu1 = dxhat.sum(0, 2, 3).divi(std).negi();
        INDArray dxmu2 = xMu.sum(0, 2, 3).muli(-2.0 / effectiveBatchSize).muli(dLdVar);
        INDArray dLdmu = dxmu1.addi(dxmu2);
        INDArray dLdx = Nd4j.getExecutioner().execAndReturn(new BroadcastDivOp(dxhat, std, dxhat, 1)).addi(Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(xMu, dLdVar.muli(2.0 / effectiveBatchSize), xMu, 1)));
        Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(dLdx, dLdmu.muli(1.0 / effectiveBatchSize), dLdx, 1));
        //TODO rework this to avoid the assign here
        dGammaView.assign(dGamma);
        dBetaView.assign(dBeta);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GAMMA, dGammaView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView);
        //TODO: do this properly
        dGlobalMeanView.assign(0);
        dGlobalVarView.assign(0);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_MEAN, dGlobalMeanView);
        retGradient.setGradientFor(BatchNormalizationParamInitializer.GLOBAL_VAR, dGlobalVarView);
        nextEpsilon = dLdx;
    } else {
        // TODO setup BatchNorm for RNN http://arxiv.org/pdf/1510.01378v1.pdf
        throw new IllegalStateException("The layer prior to BatchNorm in the configuration is not currently supported.");
    }
    return new Pair<>(retGradient, nextEpsilon);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) BroadcastMulOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) BroadcastDivOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastDivOp) BroadcastAddOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
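
For reference, the rank-2 branch above follows the standard batch-normalization backward pass from the Ioffe & Szegedy paper; with m the minibatch size and y the layer output, the quantities computed line by line are:

$$\frac{\partial L}{\partial \hat{x}} = \frac{\partial L}{\partial y}\,\gamma, \qquad \frac{\partial L}{\partial \sigma^2} = -\frac{1}{2}\,(\sigma^2+\epsilon)^{-3/2} \sum_i \frac{\partial L}{\partial \hat{x}_i}\,(x_i-\mu)$$

$$\frac{\partial L}{\partial \mu} = -\frac{1}{\sqrt{\sigma^2+\epsilon}} \sum_i \frac{\partial L}{\partial \hat{x}_i} \;+\; \frac{\partial L}{\partial \sigma^2}\,\frac{-2}{m} \sum_i (x_i-\mu)$$

$$\frac{\partial L}{\partial x_i} = \frac{1}{\sqrt{\sigma^2+\epsilon}}\,\frac{\partial L}{\partial \hat{x}_i} + \frac{2\,(x_i-\mu)}{m}\,\frac{\partial L}{\partial \sigma^2} + \frac{1}{m}\,\frac{\partial L}{\partial \mu}, \qquad \frac{\partial L}{\partial \gamma} = \sum_i \frac{\partial L}{\partial y_i}\,\hat{x}_i, \qquad \frac{\partial L}{\partial \beta} = \sum_i \frac{\partial L}{\partial y_i}$$

The rank-4 branch is the same computation with the sums taken over the minibatch and both spatial dimensions (dims 0, 2, 3), so m becomes the effectiveBatchSize used in the code.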

Example 8 with BroadcastMulOp

use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.

the class GlobalPoolingLayer method epsilonHelperFullArray.

private INDArray epsilonHelperFullArray(INDArray inputArray, INDArray epsilon, int[] poolDim) {
    //Broadcast: occurs on the remaining dimensions, after the pool dimensions have been removed.
    //TODO find a more efficient way to do this
    int[] broadcastDims = new int[inputArray.rank() - poolDim.length];
    int count = 0;
    for (int i = 0; i < inputArray.rank(); i++) {
        if (ArrayUtils.contains(poolDim, i))
            continue;
        broadcastDims[count++] = i;
    }
    switch(poolingType) {
        case MAX:
            INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(inputArray.dup(), poolDim));
            return Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(isMax, epsilon, isMax, broadcastDims));
        case AVG:
            //if out = avg(in,dims) then dL/dIn = 1/N * dL/dOut
            int n = 1;
            for (int d : poolDim) {
                n *= inputArray.size(d);
            }
            INDArray ret = Nd4j.create(inputArray.shape());
            Nd4j.getExecutioner().exec(new BroadcastCopyOp(ret, epsilon, ret, broadcastDims));
            ret.divi(n);
            return ret;
        case SUM:
            INDArray retSum = Nd4j.create(inputArray.shape());
            Nd4j.getExecutioner().exec(new BroadcastCopyOp(retSum, epsilon, retSum, broadcastDims));
            return retSum;
        case PNORM:
            int pnorm = layerConf().getPnorm();
            //First: do forward pass to get pNorm array
            INDArray abs = Transforms.abs(inputArray, true);
            Transforms.pow(abs, pnorm, false);
            INDArray pNorm = Transforms.pow(abs.sum(poolDim), 1.0 / pnorm);
            //dL/dIn = dL/dOut * dOut/dIn
            //dOut/dIn = in .* |in|^(p-2) /  ||in||_p^(p-1), where ||in||_p is the output p-norm
            INDArray numerator;
            if (pnorm == 2) {
                numerator = inputArray.dup();
            } else {
                INDArray absp2 = Transforms.pow(Transforms.abs(inputArray, true), pnorm - 2, false);
                numerator = inputArray.mul(absp2);
            }
            INDArray denom = Transforms.pow(pNorm, pnorm - 1, false);
            denom.rdivi(epsilon);
            Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(numerator, denom, numerator, broadcastDims));
            return numerator;
        default:
            throw new RuntimeException("Unknown or not supported pooling type: " + poolingType);
    }
}
Also used : IsMax(org.nd4j.linalg.api.ops.impl.transforms.IsMax) INDArray(org.nd4j.linalg.api.ndarray.INDArray) BroadcastMulOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) BroadcastCopyOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastCopyOp)
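
As a small illustration of the MAX branch above (a sketch under assumed shapes, not project code): for a rank-3 [minibatch, channels, timeSteps] input pooled over poolDim = {2}, the broadcast dimensions are {0, 1}, so the pooled gradient is routed back only to the argmax position of each series.

INDArray in = Nd4j.rand(new int[] { 2, 4, 5 });          // hypothetical [minibatch, channels, timeSteps] input
INDArray eps = Nd4j.rand(2, 4);                          // gradient w.r.t. the pooled [minibatch, channels] output
// 1.0 at the per-series maximum along dimension 2, 0.0 elsewhere
INDArray isMax = Nd4j.getExecutioner().execAndReturn(new IsMax(in.dup(), 2));
// copy each output gradient to its argmax location, broadcasting over dims {0, 1}
INDArray dLdIn = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(isMax, eps, isMax, 0, 1));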

Example 9 with BroadcastMulOp

use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project deeplearning4j by deeplearning4j.

the class L2Vertex method doBackward.

@Override
public Pair<Gradient, INDArray[]> doBackward(boolean tbptt) {
    if (!canDoBackward())
        throw new IllegalStateException("Cannot do backward pass: error not set");
    INDArray a = inputs[0];
    INDArray b = inputs[1];
    INDArray out = doForward(tbptt);
    // clamp the distance to eps to avoid division by zero when the two inputs are equal
    Transforms.max(out, eps, false);
    //dL/dlambda aka 'epsilon' - from layer above
    INDArray dLdlambda = epsilon;
    //s^(-1/2) = 1.0 / s^(1/2) = 1.0 / out
    INDArray sNegHalf = out.rdiv(1.0);
    INDArray diff = a.sub(b);
    //Column vector for all cases
    INDArray first = dLdlambda.mul(sNegHalf);
    INDArray dLda;
    INDArray dLdb;
    if (a.rank() == 2) {
        //2d case (MLPs etc)
        dLda = diff.muliColumnVector(first);
        dLdb = dLda.neg();
    } else {
        //RNN and CNN case - Broadcast along dimension 0
        dLda = Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(diff, first, diff, 0));
        dLdb = dLda.neg();
    }
    return new Pair<>(null, new INDArray[] { dLda, dLdb });
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) BroadcastMulOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) Pair(org.deeplearning4j.berkeley.Pair)
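
The arithmetic behind that broadcast, written out for reference: with out = ||a - b||_2 (clamped to eps),

$$\frac{\partial\,out}{\partial a} = \frac{a-b}{\lVert a-b \rVert_2}, \qquad \frac{\partial L}{\partial a} = \frac{\partial L}{\partial out} \cdot \frac{a-b}{out}, \qquad \frac{\partial L}{\partial b} = -\frac{\partial L}{\partial a}$$

which is exactly diff scaled by first = dLdlambda * (1 / out); the broadcast along dimension 0 applies each example's scalar to every element of that example's slice of diff.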

Example 10 with BroadcastMulOp

use of org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp in project nd4j by deeplearning4j.

the class NativeOpExecutionerTest method testBroadcastMultiDim.

@Test
public void testBroadcastMultiDim() throws Exception {
    // Broadcast 1d: OK
    INDArray arr2d = Nd4j.ones(2, 3);
    INDArray toBCRow = Nd4j.create(new double[] { 1, 0, 0 });
    Nd4j.getExecutioner().exec(new BroadcastMulOp(arr2d, toBCRow, arr2d, 1));
    INDArray exp2d = Nd4j.create(new double[][] { { 1, 0, 0 }, { 1, 0, 0 } });
    assertEquals(exp2d, arr2d);
    // Broadcast 2d on 3d:
    INDArray arr3d = Nd4j.ones(2, 3, 5);
    INDArray bc2d = Nd4j.create(new double[][] { { 1, 1, 1, 1, 1 }, { 1, 1, 1, 0, 0 } });
    bc2d.get(NDArrayIndex.point(1), NDArrayIndex.interval(3, 5)).assign(0);
    Nd4j.getExecutioner().exec(new BroadcastMulOp(arr3d, bc2d, arr3d, 0, 2));
    INDArray exp3d = Nd4j.ones(2, 3, 5);
    exp3d.get(NDArrayIndex.point(1), NDArrayIndex.all(), NDArrayIndex.interval(3, 5)).assign(0);
    for (int i = 0; i < 2; i++) {
        System.out.println("Arr - " + i);
        System.out.println(arr3d.get(NDArrayIndex.point(i), NDArrayIndex.all(), NDArrayIndex.all()));
        System.out.println("Exp - " + i);
        System.out.println(exp3d.get(NDArrayIndex.point(i), NDArrayIndex.all(), NDArrayIndex.all()));
        System.out.println();
    }
    assertEquals(exp3d, arr3d);
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) BroadcastMulOp(org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) Test(org.junit.Test)
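
A minimal companion sketch (not part of the test above): the same op broadcasting along dimension 0, i.e. scaling each row of a 2d array by a per-row factor.

INDArray arr = Nd4j.ones(2, 3);
INDArray perRow = Nd4j.create(new double[] { 2, 3 });    // one scale factor per row
Nd4j.getExecutioner().exec(new BroadcastMulOp(arr, perRow, arr, 0));
// arr is now [[2, 2, 2], [3, 3, 3]]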

Aggregations

BroadcastMulOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastMulOp) 18
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 16
BroadcastAddOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp) 9
Test (org.junit.Test) 7
BroadcastDivOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastDivOp) 6
MultiLayerConfiguration (org.deeplearning4j.nn.conf.MultiLayerConfiguration) 4
MultiLayerNetwork (org.deeplearning4j.nn.multilayer.MultiLayerNetwork) 4
Pair (org.deeplearning4j.berkeley.Pair) 3
BroadcastCopyOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastCopyOp) 3
IsMax (org.nd4j.linalg.api.ops.impl.transforms.IsMax) 3
Gradient (org.deeplearning4j.nn.gradient.Gradient) 2
BroadcastSubOp (org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp) 2
Layer (org.deeplearning4j.nn.api.Layer) 1
DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient) 1
BaseNd4jTest (org.nd4j.linalg.BaseNd4jTest) 1