
Example 1 with ActivationSigmoid

use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.

the class LSTMHelpers method backpropGradientHelper.

public static Pair<Gradient, INDArray> backpropGradientHelper(final NeuralNetConfiguration conf,
final IActivation gateActivationFn, final INDArray input,
final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray inputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength, final FwdPassReturn fwdPass, final boolean forwards, final String inputWeightKey, final String recurrentWeightKey, final String biasWeightKey,
final Map<String, INDArray> gradientViews,
INDArray maskArray //Input mask: should only be used with bidirectional RNNs + variable length
) {
    //Expect errors to have shape: [miniBatchSize,n^(L+1),timeSeriesLength]
    //i.e., n^L
    int hiddenLayerSize = recurrentWeights.size(0);
    //n^(L-1)
    int prevLayerSize = inputWeights.size(0);
    int miniBatchSize = epsilon.size(0);
    //Edge case: T=1 may have shape [miniBatchSize,n^(L+1)], equiv. to [miniBatchSize,n^(L+1),1]
    boolean is2dInput = epsilon.rank() < 3;
    int timeSeriesLength = (is2dInput ? 1 : epsilon.size(2));
    INDArray wFFTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize)).transpose();
    INDArray wOOTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize + 1)).transpose();
    INDArray wGGTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize + 2)).transpose();
    INDArray wIFOG = recurrentWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize));
    //F order here so that content for time steps are together
    //i.e., what would be W^L*(delta^L)^T. Shape: [m,n^(L-1),T]
    INDArray epsilonNext = Nd4j.create(new int[] { miniBatchSize, prevLayerSize, timeSeriesLength }, 'f');
    INDArray nablaCellStateNext = null;
    INDArray deltaifogNext = Nd4j.create(new int[] { miniBatchSize, 4 * hiddenLayerSize }, 'f');
    INDArray deltaiNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
    INDArray deltafNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(hiddenLayerSize, 2 * hiddenLayerSize));
    INDArray deltaoNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 3 * hiddenLayerSize));
    INDArray deltagNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(3 * hiddenLayerSize, 4 * hiddenLayerSize));
    Level1 l1BLAS = Nd4j.getBlasWrapper().level1();
    int endIdx = 0;
    if (truncatedBPTT) {
        endIdx = Math.max(0, timeSeriesLength - tbpttBackwardLength);
    }
    //Get gradients. Note that we have to manually zero these, as they might not be initialized (or might still have data from the last iteration)
    //Also note that they are in f order (as per param initializer) so can be used in gemm etc
    INDArray iwGradientsOut = gradientViews.get(inputWeightKey);
    //Order: {I,F,O,G,FF,OO,GG}
    INDArray rwGradientsOut = gradientViews.get(recurrentWeightKey);
    INDArray bGradientsOut = gradientViews.get(biasWeightKey);
    iwGradientsOut.assign(0);
    rwGradientsOut.assign(0);
    bGradientsOut.assign(0);
    INDArray rwGradientsIFOG = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize));
    INDArray rwGradientsFF = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize));
    INDArray rwGradientsOO = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize + 1));
    INDArray rwGradientsGG = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize + 2));
    boolean sigmoidGates = gateActivationFn instanceof ActivationSigmoid;
    IActivation afn = conf.getLayer().getActivationFn();
    INDArray timeStepMaskColumn = null;
    for (int iTimeIndex = timeSeriesLength - 1; iTimeIndex >= endIdx; iTimeIndex--) {
        int time = iTimeIndex;
        int inext = 1;
        if (!forwards) {
            time = timeSeriesLength - iTimeIndex - 1;
            inext = -1;
        }
        //First: calculate the components of nablaCellState that rely on the next time step's deltas, so we can overwrite the deltas
        INDArray nablaCellState;
        if (iTimeIndex != timeSeriesLength - 1) {
            nablaCellState = deltafNext.dup('f').muliRowVector(wFFTranspose);
            l1BLAS.axpy(nablaCellState.length(), 1.0, deltagNext.dup('f').muliRowVector(wGGTranspose), nablaCellState);
        } else {
            nablaCellState = Nd4j.create(new int[] { miniBatchSize, hiddenLayerSize }, 'f');
        }
        INDArray prevMemCellState = (iTimeIndex == 0 ? null : fwdPass.memCellState[time - inext]);
        INDArray prevHiddenUnitActivation = (iTimeIndex == 0 ? null : fwdPass.fwdPassOutputAsArrays[time - inext]);
        INDArray currMemCellState = fwdPass.memCellState[time];
        //LSTM unit output errors (dL/d(a_out)); not to be confused with \delta=dL/d(z_out)
        //(w^{L+1}*(delta^{(L+1)t})^T)^T or equiv.
        INDArray epsilonSlice = (is2dInput ? epsilon : epsilon.tensorAlongDimension(time, 1, 0));
        //Shape: [m,n^L]
        INDArray nablaOut = Shape.toOffsetZeroCopy(epsilonSlice, 'f');
        if (iTimeIndex != timeSeriesLength - 1) {
            //if t == timeSeriesLength-1 then deltaiNext etc are zeros
            Nd4j.gemm(deltaifogNext, wIFOG, nablaOut, false, true, 1.0, 1.0);
        }
        //Output gate deltas:
        INDArray sigmahOfS = fwdPass.memCellActivations[time];
        INDArray ao = fwdPass.oa[time];
        //Normally would use zo.dup() in above line, but won't be using zo again (for this time step). Ditto for zf, zg, zi
        INDArray deltao = deltaoNext;
        Nd4j.getExecutioner().exec(new MulOp(nablaOut, sigmahOfS, deltao));
        if (sigmoidGates) {
            //Equivalent to sigmoid deriv on zo
            INDArray sigmaoPrimeOfZo = Nd4j.getExecutioner().execAndReturn(new TimesOneMinus(ao.dup('f')));
            deltao.muli(sigmaoPrimeOfZo);
        } else {
            //Deltao needs to be modified in-place
            deltao.assign(gateActivationFn.backprop(fwdPass.oz[time], deltao).getFirst());
        //TODO: optimize (no assign)
        }
        //Memory cell error:
        //TODO activation functions with params
        INDArray temp = afn.backprop(currMemCellState.dup('f'), ao.muli(nablaOut)).getFirst();
        l1BLAS.axpy(nablaCellState.length(), 1.0, temp, nablaCellState);
        INDArray deltaMulRowWOO = deltao.dup('f').muliRowVector(wOOTranspose);
        //nablaCellState.addi(deltao.mulRowVector(wOOTranspose));
        l1BLAS.axpy(nablaCellState.length(), 1.0, deltaMulRowWOO, nablaCellState);
        if (iTimeIndex != timeSeriesLength - 1) {
            INDArray nextForgetGateAs = fwdPass.fa[time + inext];
            int length = nablaCellState.length();
            //nablaCellState.addi(nextForgetGateAs.mul(nablaCellStateNext))
            l1BLAS.axpy(length, 1.0, nextForgetGateAs.muli(nablaCellStateNext), nablaCellState);
        }
        //Store for use in next iteration
        nablaCellStateNext = nablaCellState;
        //Forget gate delta:
        INDArray af = fwdPass.fa[time];
        INDArray deltaf = null;
        if (iTimeIndex > 0) {
            deltaf = deltafNext;
            if (sigmoidGates) {
                Nd4j.getExecutioner().exec(new TimesOneMinus(af, deltaf));
                deltaf.muli(nablaCellState);
                deltaf.muli(prevMemCellState);
            } else {
                INDArray temp2 = nablaCellState.mul(prevMemCellState);
                //deltaf needs to be modified in-place
                deltaf.assign(gateActivationFn.backprop(fwdPass.fz[time].dup('f'), temp2).getFirst());
            //TODO activation functions with params
            }
        }
        //Shape: [m,n^L]
        //Input modulation gate delta:
        INDArray ag = fwdPass.ga[time];
        INDArray ai = fwdPass.ia[time];
        INDArray deltag = deltagNext;
        if (sigmoidGates) {
            //Equivalent to sigmoid deriv on zg
            Nd4j.getExecutioner().exec(new TimesOneMinus(ag, deltag));
            deltag.muli(ai);
            deltag.muli(nablaCellState);
        } else {
            INDArray temp2 = Nd4j.getExecutioner().execAndReturn(new MulOp(ai, nablaCellState, Nd4j.createUninitialized(ai.shape(), 'f')));
            deltag.assign(gateActivationFn.backprop(fwdPass.gz[time], temp2).getFirst());
        //TODO activation functions with params; optimize (no assign)
        }
        //Shape: [m,n^L]
        //Network input delta:
        INDArray zi = fwdPass.iz[time];
        INDArray deltai = deltaiNext;
        temp = Nd4j.getExecutioner().execAndReturn(new MulOp(ag, nablaCellState, Nd4j.createUninitialized(deltai.shape(), 'f')));
        deltai.assign(afn.backprop(zi, temp).getFirst());
        //Handle masking
        if (maskArray != null) {
            //Mask array is present: bidirectional RNN -> need to zero out these errors to avoid using errors from a masked time step
            // to calculate the parameter gradients.  Mask array has shape [minibatch, timeSeriesLength] -> get column(this time step)
            timeStepMaskColumn = maskArray.getColumn(time);
            deltaifogNext.muliColumnVector(timeStepMaskColumn);
        //Later, the deltaifogNext is used to calculate: input weight gradients, recurrent weight gradients, bias gradients
        }
        INDArray prevLayerActivationSlice = Shape.toMmulCompatible(is2dInput ? input : input.tensorAlongDimension(time, 1, 0));
        if (iTimeIndex > 0) {
            //Again, deltaifog_current == deltaifogNext at this point... same array
            Nd4j.gemm(prevLayerActivationSlice, deltaifogNext, iwGradientsOut, true, false, 1.0, 1.0);
        } else {
            INDArray iwGradients_i = iwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
            Nd4j.gemm(prevLayerActivationSlice, deltai, iwGradients_i, true, false, 1.0, 1.0);
            INDArray iwGradients_og = iwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            INDArray deltaog = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            Nd4j.gemm(prevLayerActivationSlice, deltaog, iwGradients_og, true, false, 1.0, 1.0);
        }
        if (iTimeIndex > 0) {
            //If t==0, then prevHiddenUnitActivation==zeros(n^L,n^L), so dL/dW for recurrent weights will end up as 0 anyway
            //At this point: deltaifog and deltaifogNext are the same thing...
            //So what we are actually doing here is sum of (prevAct^transpose * deltaifog_current)
            Nd4j.gemm(prevHiddenUnitActivation, deltaifogNext, rwGradientsIFOG, true, false, 1.0, 1.0);
            //Shape: [1,n^L]. sum(0) is sum over examples in mini-batch.
            //Can use axpy here because result of sum and rwGradients[4 to 6] have order Nd4j.order(), via Nd4j.create()
            //mul not mmul because these weights are from unit j->j only (whereas other recurrent weights are i->j for all i,j)
            INDArray dLdwFF = deltaf.dup('f').muli(prevMemCellState).sum(0);
            //rwGradients[4].addi(dLdwFF);    //dL/dw_{FF}
            l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwFF, rwGradientsFF);
            INDArray dLdwGG = deltag.dup('f').muli(prevMemCellState).sum(0);
            //rwGradients[6].addi(dLdwGG);
            l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwGG, rwGradientsGG);
        }
        //Expected shape: [1,n^L]. sum(0) is sum over examples in mini-batch.
        INDArray dLdwOO = deltao.dup('f').muli(currMemCellState).sum(0);
        //rwGradients[5].addi(dLdwOO);    //dL/dw_{OO}
        l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwOO, rwGradientsOO);
        if (iTimeIndex > 0) {
            l1BLAS.axpy(4 * hiddenLayerSize, 1.0, deltaifogNext.sum(0), bGradientsOut);
        } else {
            //Sneaky way to do bGradients_i += deltai.sum(0)
            l1BLAS.axpy(hiddenLayerSize, 1.0, deltai.sum(0), bGradientsOut);
            INDArray ogBiasToAdd = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize)).sum(0);
            INDArray ogBiasGrad = bGradientsOut.get(NDArrayIndex.point(0), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            l1BLAS.axpy(2 * hiddenLayerSize, 1.0, ogBiasToAdd, ogBiasGrad);
        }
        //Calculate epsilonNext - i.e., equiv. to what would be (w^L*(d^(Lt))^T)^T in a normal network
        //But here, need to add 4 weights * deltas for the IFOG gates
        //This slice: f order and contiguous, due to epsilonNext being defined as f order.
        INDArray epsilonNextSlice = epsilonNext.tensorAlongDimension(time, 1, 0);
        if (iTimeIndex > 0) {
            Nd4j.gemm(deltaifogNext, inputWeights, epsilonNextSlice, false, true, 1.0, 1.0);
        } else {
            //No contribution from forget gate at t=0
            INDArray wi = inputWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
            Nd4j.gemm(deltai, wi, epsilonNextSlice, false, true, 1.0, 1.0);
            INDArray deltaog = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            INDArray wog = inputWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            //epsilonNextSlice.addi(deltao.mmul(woTranspose)).addi(deltag.mmul(wgTranspose));
            Nd4j.gemm(deltaog, wog, epsilonNextSlice, false, true, 1.0, 1.0);
        }
        if (maskArray != null) {
            //Mask array is present: bidirectional RNN -> need to zero out these errors to avoid sending anything
            // but 0s to the layer below at this time step (for the given example)
            epsilonNextSlice.muliColumnVector(timeStepMaskColumn);
        }
    }
    Gradient retGradient = new DefaultGradient();
    retGradient.gradientForVariable().put(inputWeightKey, iwGradientsOut);
    retGradient.gradientForVariable().put(recurrentWeightKey, rwGradientsOut);
    retGradient.gradientForVariable().put(biasWeightKey, bGradientsOut);
    return new Pair<>(retGradient, epsilonNext);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) TimesOneMinus(org.nd4j.linalg.api.ops.impl.transforms.TimesOneMinus) MulOp(org.nd4j.linalg.api.ops.impl.transforms.arithmetic.MulOp) ActivationSigmoid(org.nd4j.linalg.activations.impl.ActivationSigmoid) Level1(org.nd4j.linalg.api.blas.Level1) IActivation(org.nd4j.linalg.activations.IActivation) NDArrayIndex.point(org.nd4j.linalg.indexing.NDArrayIndex.point) Pair(org.deeplearning4j.berkeley.Pair)
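The deltas computed inside the time loop above correspond, in the code's own notation, to the standard peephole-LSTM backprop equations. A sketch, assuming the sigmoidGates fast path (sigmoid gate activations, so the gate derivative is a(1-a)), writing σ_h for the layer activation afn and ⊙ for the element-wise product:

δ_o^t = ∇_{h^t}L ⊙ σ_h(c^t) ⊙ a_o^t (1 - a_o^t)
∇_{c^t}L = ∇_{h^t}L ⊙ a_o^t ⊙ σ_h'(c^t) + δ_o^t ⊙ w_OO + δ_f^{t+1} ⊙ w_FF + δ_g^{t+1} ⊙ w_GG + ∇_{c^{t+1}}L ⊙ a_f^{t+1}
δ_f^t = ∇_{c^t}L ⊙ c^{t-1} ⊙ a_f^t (1 - a_f^t)
δ_g^t = ∇_{c^t}L ⊙ a_i^t ⊙ a_g^t (1 - a_g^t)
δ_i^t = σ_h'(z_i^t) ⊙ ∇_{c^t}L ⊙ a_g^t

Here ∇_{h^t}L is the incoming epsilon slice plus the deltaifogNext * wIFOG^T contribution from the later time step. The gemm and axpy calls then accumulate x_t^T [δ_i, δ_f, δ_o, δ_g] into the input-weight gradients, (h^{t-1})^T [δ_i, δ_f, δ_o, δ_g] into the recurrent IFOG gradients, per-column sums of δ_f ⊙ c^{t-1}, δ_o ⊙ c^t and δ_g ⊙ c^{t-1} into the peephole gradients, and deltaifog * inputWeights^T into epsilonNext.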

Example 2 with ActivationSigmoid

use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.

the class GravesBidirectionalLSTMTest method testGravesBidirectionalLSTMForwardPassHelper.

@Test
public void testGravesBidirectionalLSTMForwardPassHelper() throws Exception {
    //GravesBidirectionalLSTM.activateHelper() has different behaviour (due to optimizations) when forBackprop==true vs false
    //But should otherwise provide identical activations
    Nd4j.getRandom().setSeed(12345);
    final int nIn = 10;
    final int layerSize = 15;
    final int miniBatchSize = 4;
    final int timeSeriesLength = 7;
    final NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().layer(new org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM.Builder().nIn(nIn).nOut(layerSize).weightInit(WeightInit.DISTRIBUTION).dist(new UniformDistribution(0, 1)).activation(Activation.TANH).build()).build();
    int numParams = conf.getLayer().initializer().numParams(conf);
    INDArray params = Nd4j.create(1, numParams);
    final GravesBidirectionalLSTM lstm = (GravesBidirectionalLSTM) conf.getLayer().instantiate(conf, null, 0, params, true);
    final INDArray input = Nd4j.rand(new int[] { miniBatchSize, nIn, timeSeriesLength });
    lstm.setInput(input);
    final INDArray fwdPassFalse = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(), lstm.input(), lstm.getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null, false, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null).fwdPassOutput;
    final INDArray[] fwdPassTrue = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(), lstm.input(), lstm.getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null, true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null).fwdPassOutputAsArrays;
    //I have no idea what the heck this does --Ben
    for (int i = 0; i < timeSeriesLength; i++) {
        final INDArray sliceFalse = fwdPassFalse.tensorAlongDimension(i, 1, 0);
        final INDArray sliceTrue = fwdPassTrue[i];
        assertTrue(sliceFalse.equals(sliceTrue));
    }
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) UniformDistribution(org.deeplearning4j.nn.conf.distribution.UniformDistribution) ActivationSigmoid(org.nd4j.linalg.activations.impl.ActivationSigmoid) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) Test(org.junit.Test)
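The sigmoidGates branch used in Examples 1 and 4 (and enabled here by passing new ActivationSigmoid() as the gate activation) relies on the identity σ'(z) = σ(z)(1 - σ(z)), which is why TimesOneMinus can be applied directly to the stored gate activations instead of calling gateActivationFn.backprop on the pre-activations. A minimal, self-contained sketch in plain Java (no ND4J types; the class and variable names are illustrative only):

public class SigmoidDerivativeSketch {

    static double sigmoid(double z) {
        return 1.0 / (1.0 + Math.exp(-z));
    }

    public static void main(String[] args) {
        double z = 0.37;                          // arbitrary gate pre-activation
        double a = sigmoid(z);                    // stored activation (cf. fwdPass.oa/fa/ga)
        double analytic = a * (1.0 - a);          // what TimesOneMinus computes from the activation
        double h = 1e-6;                          // central-difference check
        double numeric = (sigmoid(z + h) - sigmoid(z - h)) / (2.0 * h);
        System.out.printf("analytic=%.8f numeric=%.8f%n", analytic, numeric);
    }
}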

Example 3 with ActivationSigmoid

use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.

the class RBMTests method testComputeGradientAndScore.

@Test
public void testComputeGradientAndScore() {
    INDArray input = Nd4j.linspace(1, 10, 10);
    INDArray params = getStandardParams(10, 5);
    RBM rbm = getRBMLayer(10, 5, HiddenUnit.BINARY, VisibleUnit.BINARY, params, true, false, 1, LossFunctions.LossFunction.MSE);
    rbm.setInput(input);
    rbm.computeGradientAndScore();
    Pair<Gradient, Double> pair = rbm.gradientAndScore();
    INDArray hprob = sigmoid(input.mmul(rbm.getParam(PretrainParamInitializer.WEIGHT_KEY)).addiRowVector(rbm.getParam(PretrainParamInitializer.BIAS_KEY)));
    INDArray vprob = sigmoid(hprob.mmul(rbm.getParam(PretrainParamInitializer.WEIGHT_KEY).transpose()).addiRowVector(rbm.getParam(PretrainParamInitializer.VISIBLE_BIAS_KEY)));
    Distribution dist = Nd4j.getDistributions().createBinomial(1, vprob);
    dist.reseedRandomGenerator(42);
    INDArray vSample = dist.sample(vprob.shape());
    //double expectedScore = LossFunctions.LossFunction.MSE.getILossFunction().computeScore(input, vSample, "sigmoid", null, false);
    double expectedScore = LossFunctions.LossFunction.MSE.getILossFunction().computeScore(input, vSample, new ActivationSigmoid(), null, false);
    assertEquals(expectedScore, pair.getSecond(), 1e-8);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Distribution(org.nd4j.linalg.api.rng.distribution.Distribution) NormalDistribution(org.deeplearning4j.nn.conf.distribution.NormalDistribution) ActivationSigmoid(org.nd4j.linalg.activations.impl.ActivationSigmoid) Test(org.junit.Test)
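The test above recomputes the RBM's reconstruction by hand: hidden probabilities from a sigmoid of the input pushed through the weights, visible probabilities from the transposed weights, then the MSE score against a binomial sample of the reconstruction passed through a sigmoid output activation. A toy plain-Java sketch of the same arithmetic for a single visible/hidden unit pair; the values and names are illustrative only, and the sampling and averaging details of the real ILossFunction are omitted:

public class RbmScoreSketch {

    static double sigmoid(double x) {
        return 1.0 / (1.0 + Math.exp(-x));
    }

    public static void main(String[] args) {
        double v = 0.8, w = 0.5, hBias = -0.1, vBias = 0.2; // one visible unit, one hidden unit
        double hProb = sigmoid(v * w + hBias);              // hidden probability
        double vProb = sigmoid(hProb * w + vBias);          // reconstruction probability
        double vSample = vProb > 0.5 ? 1.0 : 0.0;           // stand-in for the binomial sample
        double out = sigmoid(vSample);                      // sigmoid output activation, as in computeScore
        double mse = (v - out) * (v - out);                 // squared-error contribution
        System.out.printf("hProb=%.4f vProb=%.4f score~%.4f%n", hProb, vProb, mse);
    }
}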

Example 4 with ActivationSigmoid

use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.

the class LSTMHelpers method activateHelper.

/**
     * Returns FwdPassReturn object with activations/INDArrays. Allows activateHelper to be used for forward pass, backward pass
     * and rnnTimeStep whilst being reasonably efficient for all
     */
public static FwdPassReturn activateHelper(final Layer layer, final NeuralNetConfiguration conf,
final IActivation gateActivationFn, //Activation function for the gates - sigmoid or hard sigmoid (must be in range 0 to 1)
final INDArray input,
final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray originalInputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
final INDArray biases, //Shape: [4,hiddenLayerSize]; order: [bi,bf,bo,bg]^T
final boolean training, final INDArray originalPrevOutputActivations, final INDArray originalPrevMemCellState, boolean forBackprop, boolean forwards, final String inputWeightKey,
INDArray maskArray //Input mask: should only be used with bidirectional RNNs + variable length
) {
    //Data has shape [m,nIn,T]. Layer activations/output has shape [m,nHiddenUnits,T]
    if (input == null || input.length() == 0)
        throw new IllegalArgumentException("Invalid input: not set or 0 length");
    INDArray inputWeights = originalInputWeights;
    INDArray prevOutputActivations = originalPrevOutputActivations;
    //Edge case of T=1, may have shape [m,nIn], equiv. to [m,nIn,1]
    boolean is2dInput = input.rank() < 3;
    int timeSeriesLength = (is2dInput ? 1 : input.size(2));
    int hiddenLayerSize = recurrentWeights.size(0);
    int miniBatchSize = input.size(0);
    INDArray prevMemCellState;
    if (originalPrevMemCellState == null) {
        prevMemCellState = Nd4j.create(new int[] { miniBatchSize, hiddenLayerSize }, 'f');
    } else {
        prevMemCellState = originalPrevMemCellState.dup('f');
    }
    INDArray recurrentWeightsIFOG = recurrentWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize)).dup('f');
    //Apply dropconnect to input (not recurrent) weights only:
    if (conf.isUseDropConnect() && training && conf.getLayer().getDropOut() > 0) {
        inputWeights = Dropout.applyDropConnect(layer, inputWeightKey);
    }
    INDArray wFFTranspose = recurrentWeights.get(NDArrayIndex.all(), interval(4 * hiddenLayerSize, 4 * hiddenLayerSize + 1)).transpose();
    INDArray wOOTranspose = recurrentWeights.get(NDArrayIndex.all(), interval(4 * hiddenLayerSize + 1, 4 * hiddenLayerSize + 2)).transpose();
    INDArray wGGTranspose = recurrentWeights.get(NDArrayIndex.all(), interval(4 * hiddenLayerSize + 2, 4 * hiddenLayerSize + 3)).transpose();
    if (timeSeriesLength > 1 || forBackprop) {
        wFFTranspose = Shape.toMmulCompatible(wFFTranspose);
        wOOTranspose = Shape.toMmulCompatible(wOOTranspose);
        wGGTranspose = Shape.toMmulCompatible(wGGTranspose);
    }
    //Allocate arrays for activations:
    boolean sigmoidGates = gateActivationFn instanceof ActivationSigmoid;
    IActivation afn = conf.getLayer().getActivationFn();
    INDArray outputActivations = null;
    FwdPassReturn toReturn = new FwdPassReturn();
    if (forBackprop) {
        toReturn.fwdPassOutputAsArrays = new INDArray[timeSeriesLength];
        toReturn.memCellState = new INDArray[timeSeriesLength];
        toReturn.memCellActivations = new INDArray[timeSeriesLength];
        toReturn.iz = new INDArray[timeSeriesLength];
        toReturn.ia = new INDArray[timeSeriesLength];
        toReturn.fa = new INDArray[timeSeriesLength];
        toReturn.oa = new INDArray[timeSeriesLength];
        toReturn.ga = new INDArray[timeSeriesLength];
        if (!sigmoidGates) {
            toReturn.fz = new INDArray[timeSeriesLength];
            toReturn.oz = new INDArray[timeSeriesLength];
            toReturn.gz = new INDArray[timeSeriesLength];
        }
    } else {
        //F order to keep time steps together
        outputActivations = Nd4j.create(new int[] { miniBatchSize, hiddenLayerSize, timeSeriesLength }, 'f');
        toReturn.fwdPassOutput = outputActivations;
    }
    Level1 l1BLAS = Nd4j.getBlasWrapper().level1();
    //Input validation: check input data matches nIn
    if (input.size(1) != inputWeights.size(0)) {
        throw new DL4JInvalidInputException("Received input with size(1) = " + input.size(1) + " (input array shape = " + Arrays.toString(input.shape()) + "); input.size(1) must match layer nIn size (nIn = " + inputWeights.size(0) + ")");
    }
    //These can be different if user forgets to call rnnClearPreviousState() between calls of rnnTimeStep
    if (prevOutputActivations != null && prevOutputActivations.size(0) != input.size(0)) {
        throw new DL4JInvalidInputException("Previous activations (stored state) number of examples = " + prevOutputActivations.size(0) + " but input array number of examples = " + input.size(0) + ". Possible cause: using rnnTimeStep() without calling" + " rnnClearPreviousState() between different sequences?");
    }
    //initialize prevOutputActivations to zeroes
    if (prevOutputActivations == null) {
        prevOutputActivations = Nd4j.zeros(new int[] { miniBatchSize, hiddenLayerSize });
    }
    for (int iTimeIndex = 0; iTimeIndex < timeSeriesLength; iTimeIndex++) {
        int time = iTimeIndex;
        if (!forwards) {
            time = timeSeriesLength - iTimeIndex - 1;
        }
        //[Expected shape: [m,nIn]. Also deals with edge case of T=1, with 'time series' data of shape [m,nIn], equiv. to [m,nIn,1]
        INDArray miniBatchData = (is2dInput ? input : input.tensorAlongDimension(time, 1, 0));
        miniBatchData = Shape.toMmulCompatible(miniBatchData);
        //Calculate activations for: network input + forget, output, input modulation gates. Next 3 lines are first part of those
        //Shape: [miniBatch,4*layerSize]
        INDArray ifogActivations = miniBatchData.mmul(inputWeights);
        Nd4j.gemm(prevOutputActivations, recurrentWeightsIFOG, ifogActivations, false, false, 1.0, 1.0);
        ifogActivations.addiRowVector(biases);
        INDArray inputActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
        if (forBackprop)
            toReturn.iz[time] = inputActivations.dup('f');
        conf.getLayer().getActivationFn().getActivation(inputActivations, training);
        if (forBackprop)
            toReturn.ia[time] = inputActivations;
        INDArray forgetGateActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(hiddenLayerSize, 2 * hiddenLayerSize));
        INDArray pmcellWFF = prevMemCellState.dup('f').muliRowVector(wFFTranspose);
        //y = a*x + y i.e., forgetGateActivations.addi(pmcellWFF)
        l1BLAS.axpy(pmcellWFF.length(), 1.0, pmcellWFF, forgetGateActivations);
        //Above line: treats matrix as a vector. Can only do this because we're sure both pmcellWFF and forgetGateActivations are f order, offset 0 and have same strides
        if (forBackprop && !sigmoidGates) {
            //Forget gate pre-out (z)
            toReturn.fz[time] = forgetGateActivations.dup('f');
        }
        gateActivationFn.getActivation(forgetGateActivations, training);
        if (forBackprop)
            toReturn.fa[time] = forgetGateActivations;
        INDArray inputModGateActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(3 * hiddenLayerSize, 4 * hiddenLayerSize));
        INDArray pmcellWGG = prevMemCellState.dup('f').muliRowVector(wGGTranspose);
        //inputModGateActivations.addi(pmcellWGG)
        l1BLAS.axpy(pmcellWGG.length(), 1.0, pmcellWGG, inputModGateActivations);
        if (forBackprop && !sigmoidGates) {
            //Input modulation gate pre-out (z)
            toReturn.gz[time] = inputModGateActivations.dup('f');
        }
        gateActivationFn.getActivation(inputModGateActivations, training);
        if (forBackprop)
            toReturn.ga[time] = inputModGateActivations;
        //Memory cell state
        INDArray currentMemoryCellState;
        INDArray inputModMulInput;
        if (forBackprop) {
            currentMemoryCellState = prevMemCellState.dup('f').muli(forgetGateActivations);
            inputModMulInput = inputModGateActivations.dup('f').muli(inputActivations);
        } else {
            currentMemoryCellState = forgetGateActivations.muli(prevMemCellState);
            inputModMulInput = inputModGateActivations.muli(inputActivations);
        }
        //currentMemoryCellState.addi(inputModMulInput)
        l1BLAS.axpy(currentMemoryCellState.length(), 1.0, inputModMulInput, currentMemoryCellState);
        INDArray outputGateActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 3 * hiddenLayerSize));
        INDArray pmcellWOO = currentMemoryCellState.dup('f').muliRowVector(wOOTranspose);
        //outputGateActivations.addi(pmcellWOO)
        l1BLAS.axpy(pmcellWOO.length(), 1.0, pmcellWOO, outputGateActivations);
        if (forBackprop && !sigmoidGates) {
            //Output gate activations
            toReturn.oz[time] = outputGateActivations.dup('f');
        }
        gateActivationFn.getActivation(outputGateActivations, training);
        if (forBackprop)
            toReturn.oa[time] = outputGateActivations;
        //LSTM unit outputs:
        INDArray currMemoryCellActivation = afn.getActivation(currentMemoryCellState.dup('f'), training);
        INDArray currHiddenUnitActivations;
        if (forBackprop) {
            //Expected shape: [m,hiddenLayerSize]
            currHiddenUnitActivations = currMemoryCellActivation.dup('f').muli(outputGateActivations);
        } else {
            //Expected shape: [m,hiddenLayerSize]
            currHiddenUnitActivations = currMemoryCellActivation.muli(outputGateActivations);
        }
        if (maskArray != null) {
            //Mask array is present: bidirectional RNN -> need to zero out these activations to avoid
            // incorrectly using activations from masked time steps (i.e., want 0 initialization in both directions)
            //We *also* need to apply this to the memory cells, as they are carried forward
            //Mask array has shape [minibatch, timeSeriesLength] -> get column
            INDArray timeStepMaskColumn = maskArray.getColumn(time);
            currHiddenUnitActivations.muliColumnVector(timeStepMaskColumn);
            currentMemoryCellState.muliColumnVector(timeStepMaskColumn);
        }
        if (forBackprop) {
            toReturn.fwdPassOutputAsArrays[time] = currHiddenUnitActivations;
            toReturn.memCellState[time] = currentMemoryCellState;
            toReturn.memCellActivations[time] = currMemoryCellActivation;
        } else {
            outputActivations.tensorAlongDimension(time, 1, 0).assign(currHiddenUnitActivations);
        }
        prevOutputActivations = currHiddenUnitActivations;
        prevMemCellState = currentMemoryCellState;
        toReturn.lastAct = currHiddenUnitActivations;
        toReturn.lastMemCell = currentMemoryCellState;
    }
    return toReturn;
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) ActivationSigmoid(org.nd4j.linalg.activations.impl.ActivationSigmoid) Level1(org.nd4j.linalg.api.blas.Level1) IActivation(org.nd4j.linalg.activations.IActivation) DL4JInvalidInputException(org.deeplearning4j.exception.DL4JInvalidInputException) NDArrayIndex.point(org.nd4j.linalg.indexing.NDArrayIndex.point)
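In the notation of the code above, each iteration of the time loop implements the peephole-LSTM forward pass. A sketch that omits masking and dropconnect, writing σ_g for gateActivationFn, σ_h for afn, and ⊙ for the element-wise product:

z^t = x^t W + h^{t-1} W_R + b, split column-wise into [z_i^t, z_f^t, z_o^t, z_g^t]
a_i^t = σ_h(z_i^t)
a_f^t = σ_g(z_f^t + c^{t-1} ⊙ w_FF)
a_g^t = σ_g(z_g^t + c^{t-1} ⊙ w_GG)
c^t = a_f^t ⊙ c^{t-1} + a_g^t ⊙ a_i^t
a_o^t = σ_g(z_o^t + c^t ⊙ w_OO)
h^t = a_o^t ⊙ σ_h(c^t)

The forBackprop branches only change whether intermediate arrays (iz, ia, fa, oa, ga, plus the pre-activations fz, oz, gz for non-sigmoid gates) are duplicated and cached for the backward pass; the activations themselves are the same, which is exactly what Example 2 asserts.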

Aggregations

ActivationSigmoid (org.nd4j.linalg.activations.impl.ActivationSigmoid): 4
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 4
Gradient (org.deeplearning4j.nn.gradient.Gradient): 2
Test (org.junit.Test): 2
IActivation (org.nd4j.linalg.activations.IActivation): 2
Level1 (org.nd4j.linalg.api.blas.Level1): 2
NDArrayIndex.point (org.nd4j.linalg.indexing.NDArrayIndex.point): 2
Pair (org.deeplearning4j.berkeley.Pair): 1
DL4JInvalidInputException (org.deeplearning4j.exception.DL4JInvalidInputException): 1
NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration): 1
NormalDistribution (org.deeplearning4j.nn.conf.distribution.NormalDistribution): 1
UniformDistribution (org.deeplearning4j.nn.conf.distribution.UniformDistribution): 1
DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient): 1
TimesOneMinus (org.nd4j.linalg.api.ops.impl.transforms.TimesOneMinus): 1
MulOp (org.nd4j.linalg.api.ops.impl.transforms.arithmetic.MulOp): 1
Distribution (org.nd4j.linalg.api.rng.distribution.Distribution): 1