Use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.
From the LSTMHelpers class, the backpropGradientHelper method:
public static Pair<Gradient, INDArray> backpropGradientHelper(final NeuralNetConfiguration conf,
final IActivation gateActivationFn,
final INDArray input,
final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray inputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength, final FwdPassReturn fwdPass, final boolean forwards, final String inputWeightKey, final String recurrentWeightKey, final String biasWeightKey,
final Map<String, INDArray> gradientViews,
INDArray maskArray) { //Input mask: should only be used with bidirectional RNNs + variable length
//Expect errors to have shape: [miniBatchSize,n^(L+1),timeSeriesLength]
int hiddenLayerSize = recurrentWeights.size(0); //i.e., n^L
int prevLayerSize = inputWeights.size(0); //n^(L-1)
int miniBatchSize = epsilon.size(0);
//Edge case: T=1 may have shape [miniBatchSize,n^(L+1)], equiv. to [miniBatchSize,n^(L+1),1]
boolean is2dInput = epsilon.rank() < 3;
int timeSeriesLength = (is2dInput ? 1 : epsilon.size(2));
INDArray wFFTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize)).transpose();
INDArray wOOTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize + 1)).transpose();
INDArray wGGTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize + 2)).transpose();
INDArray wIFOG = recurrentWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize));
//F order here so that the content for each time step is contiguous in memory
//i.e., what would be W^L*(delta^L)^T. Shape: [m,n^(L-1),T]
INDArray epsilonNext = Nd4j.create(new int[] { miniBatchSize, prevLayerSize, timeSeriesLength }, 'f');
INDArray nablaCellStateNext = null;
INDArray deltaifogNext = Nd4j.create(new int[] { miniBatchSize, 4 * hiddenLayerSize }, 'f');
INDArray deltaiNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
INDArray deltafNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(hiddenLayerSize, 2 * hiddenLayerSize));
INDArray deltaoNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 3 * hiddenLayerSize));
INDArray deltagNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(3 * hiddenLayerSize, 4 * hiddenLayerSize));
Level1 l1BLAS = Nd4j.getBlasWrapper().level1();
int endIdx = 0;
if (truncatedBPTT) {
endIdx = Math.max(0, timeSeriesLength - tbpttBackwardLength);
}
//Get gradients. Note that we have to manually zero these, as they might not be initialized (or may still have data from the last iteration)
//Also note that they are in f order (as per param initializer) so can be used in gemm etc
INDArray iwGradientsOut = gradientViews.get(inputWeightKey);
//Order: {I,F,O,G,FF,OO,GG}
INDArray rwGradientsOut = gradientViews.get(recurrentWeightKey);
INDArray bGradientsOut = gradientViews.get(biasWeightKey);
iwGradientsOut.assign(0);
rwGradientsOut.assign(0);
bGradientsOut.assign(0);
INDArray rwGradientsIFOG = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize));
INDArray rwGradientsFF = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize));
INDArray rwGradientsOO = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize + 1));
INDArray rwGradientsGG = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize + 2));
boolean sigmoidGates = gateActivationFn instanceof ActivationSigmoid;
IActivation afn = conf.getLayer().getActivationFn();
INDArray timeStepMaskColumn = null;
for (int iTimeIndex = timeSeriesLength - 1; iTimeIndex >= endIdx; iTimeIndex--) {
int time = iTimeIndex;
int inext = 1;
if (!forwards) {
time = timeSeriesLength - iTimeIndex - 1;
inext = -1;
}
//First: calculate the components of nablaCellState that rely on the next time step's deltas, so we can overwrite the deltas
INDArray nablaCellState;
if (iTimeIndex != timeSeriesLength - 1) {
nablaCellState = deltafNext.dup('f').muliRowVector(wFFTranspose);
l1BLAS.axpy(nablaCellState.length(), 1.0, deltagNext.dup('f').muliRowVector(wGGTranspose), nablaCellState);
} else {
nablaCellState = Nd4j.create(new int[] { miniBatchSize, hiddenLayerSize }, 'f');
}
INDArray prevMemCellState = (iTimeIndex == 0 ? null : fwdPass.memCellState[time - inext]);
INDArray prevHiddenUnitActivation = (iTimeIndex == 0 ? null : fwdPass.fwdPassOutputAsArrays[time - inext]);
INDArray currMemCellState = fwdPass.memCellState[time];
//LSTM unit output errors (dL/d(a_out)); not to be confused with \delta=dL/d(z_out)
//(w^{L+1}*(delta^{(L+1)t})^T)^T or equiv.
INDArray epsilonSlice = (is2dInput ? epsilon : epsilon.tensorAlongDimension(time, 1, 0));
//Shape: [m,n^L]
INDArray nablaOut = Shape.toOffsetZeroCopy(epsilonSlice, 'f');
if (iTimeIndex != timeSeriesLength - 1) {
//if t == timeSeriesLength-1 then deltaiNext etc are zeros
Nd4j.gemm(deltaifogNext, wIFOG, nablaOut, false, true, 1.0, 1.0);
}
//Output gate deltas:
INDArray sigmahOfS = fwdPass.memCellActivations[time];
INDArray ao = fwdPass.oa[time];
//Normally would use zo.dup() in above line, but won't be using zo again (for this time step). Ditto for zf, zg, zi
INDArray deltao = deltaoNext;
Nd4j.getExecutioner().exec(new MulOp(nablaOut, sigmahOfS, deltao));
if (sigmoidGates) {
//Equivalent to sigmoid deriv on zo
INDArray sigmaoPrimeOfZo = Nd4j.getExecutioner().execAndReturn(new TimesOneMinus(ao.dup('f')));
deltao.muli(sigmaoPrimeOfZo);
} else {
//Deltao needs to be modified in-place
deltao.assign(gateActivationFn.backprop(fwdPass.oz[time], deltao).getFirst());
//TODO: optimize (no assign)
}
//Memory cell error:
//TODO activation functions with params
INDArray temp = afn.backprop(currMemCellState.dup('f'), ao.muli(nablaOut)).getFirst();
l1BLAS.axpy(nablaCellState.length(), 1.0, temp, nablaCellState);
INDArray deltaMulRowWOO = deltao.dup('f').muliRowVector(wOOTranspose);
//nablaCellState.addi(deltao.mulRowVector(wOOTranspose));
l1BLAS.axpy(nablaCellState.length(), 1.0, deltaMulRowWOO, nablaCellState);
if (iTimeIndex != timeSeriesLength - 1) {
INDArray nextForgetGateAs = fwdPass.fa[time + inext];
int length = nablaCellState.length();
//nablaCellState.addi(nextForgetGateAs.mul(nablaCellStateNext))
l1BLAS.axpy(length, 1.0, nextForgetGateAs.muli(nablaCellStateNext), nablaCellState);
}
//Store for use in next iteration
nablaCellStateNext = nablaCellState;
//Forget gate delta:
INDArray af = fwdPass.fa[time];
INDArray deltaf = null;
if (iTimeIndex > 0) {
deltaf = deltafNext;
if (sigmoidGates) {
Nd4j.getExecutioner().exec(new TimesOneMinus(af, deltaf));
deltaf.muli(nablaCellState);
deltaf.muli(prevMemCellState);
} else {
INDArray temp2 = nablaCellState.mul(prevMemCellState);
//deltaf needs to be modified in-place
deltaf.assign(gateActivationFn.backprop(fwdPass.fz[time].dup('f'), temp2).getFirst());
//TODO activation functions with params
}
}
//Shape: [m,n^L]
//Input modulation gate delta:
INDArray ag = fwdPass.ga[time];
INDArray ai = fwdPass.ia[time];
INDArray deltag = deltagNext;
if (sigmoidGates) {
//Equivalent to sigmoid deriv on zg
Nd4j.getExecutioner().exec(new TimesOneMinus(ag, deltag));
deltag.muli(ai);
deltag.muli(nablaCellState);
} else {
INDArray temp2 = Nd4j.getExecutioner().execAndReturn(new MulOp(ai, nablaCellState, Nd4j.createUninitialized(ai.shape(), 'f')));
deltag.assign(gateActivationFn.backprop(fwdPass.gz[time], temp2).getFirst());
//TODO activation functions with params; optimize (no assign)
}
//Shape: [m,n^L]
//Network input delta:
INDArray zi = fwdPass.iz[time];
INDArray deltai = deltaiNext;
temp = Nd4j.getExecutioner().execAndReturn(new MulOp(ag, nablaCellState, Nd4j.createUninitialized(deltai.shape(), 'f')));
deltai.assign(afn.backprop(zi, temp).getFirst());
//Handle masking
if (maskArray != null) {
//Mask array is present: bidirectional RNN -> need to zero out these errors to avoid using errors from a masked time step
// to calculate the parameter gradients. Mask array has shape [minibatch, timeSeriesLength] -> get column(this time step)
timeStepMaskColumn = maskArray.getColumn(time);
deltaifogNext.muliColumnVector(timeStepMaskColumn);
//Later, the deltaifogNext is used to calculate: input weight gradients, recurrent weight gradients, bias gradients
}
INDArray prevLayerActivationSlice = Shape.toMmulCompatible(is2dInput ? input : input.tensorAlongDimension(time, 1, 0));
if (iTimeIndex > 0) {
//Again, deltaifog_current == deltaifogNext at this point... same array
Nd4j.gemm(prevLayerActivationSlice, deltaifogNext, iwGradientsOut, true, false, 1.0, 1.0);
} else {
INDArray iwGradients_i = iwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
Nd4j.gemm(prevLayerActivationSlice, deltai, iwGradients_i, true, false, 1.0, 1.0);
INDArray iwGradients_og = iwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
INDArray deltaog = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
Nd4j.gemm(prevLayerActivationSlice, deltaog, iwGradients_og, true, false, 1.0, 1.0);
}
if (iTimeIndex > 0) {
//If t==0, then prevHiddenUnitActivation==zeros([m,n^L]), so dL/dW for recurrent weights will end up as 0 anyway
//At this point: deltaifog and deltaifogNext are the same thing...
//So what we are actually doing here is sum of (prevAct^transpose * deltaifog_current)
Nd4j.gemm(prevHiddenUnitActivation, deltaifogNext, rwGradientsIFOG, true, false, 1.0, 1.0);
//Shape: [1,n^L]. sum(0) is sum over examples in mini-batch.
//Can use axpy here because result of sum and rwGradients[4 to 6] have order Nd4j.order(), via Nd4j.create()
//mul not mmul because these weights are from unit j->j only (whereas other recurrent weights are i->j for all i,j)
INDArray dLdwFF = deltaf.dup('f').muli(prevMemCellState).sum(0);
//rwGradients[4].addi(dLdwFF); //dL/dw_{FF}
l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwFF, rwGradientsFF);
INDArray dLdwGG = deltag.dup('f').muli(prevMemCellState).sum(0);
//rwGradients[6].addi(dLdwGG);
l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwGG, rwGradientsGG);
}
//Expected shape: [1,n^L]. sum(0) is sum over examples in mini-batch.
INDArray dLdwOO = deltao.dup('f').muli(currMemCellState).sum(0);
//rwGradients[5].addi(dLdwOO); //dL/dw_{OOxy}
l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwOO, rwGradientsOO);
if (iTimeIndex > 0) {
l1BLAS.axpy(4 * hiddenLayerSize, 1.0, deltaifogNext.sum(0), bGradientsOut);
} else {
//Sneaky way to do bGradients_i += deltai.sum(0)
l1BLAS.axpy(hiddenLayerSize, 1.0, deltai.sum(0), bGradientsOut);
INDArray ogBiasToAdd = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize)).sum(0);
INDArray ogBiasGrad = bGradientsOut.get(NDArrayIndex.point(0), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
l1BLAS.axpy(2 * hiddenLayerSize, 1.0, ogBiasToAdd, ogBiasGrad);
}
//Calculate epsilonNext - i.e., equiv. to what would be (w^L*(d^(Lt))^T)^T in a normal network
//But here, need to add 4 weights * deltas for the IFOG gates
//This slice: f order and contiguous, due to epsilonNext being defined as f order.
INDArray epsilonNextSlice = epsilonNext.tensorAlongDimension(time, 1, 0);
if (iTimeIndex > 0) {
Nd4j.gemm(deltaifogNext, inputWeights, epsilonNextSlice, false, true, 1.0, 1.0);
} else {
//No contribution from forget gate at t=0
INDArray wi = inputWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
Nd4j.gemm(deltai, wi, epsilonNextSlice, false, true, 1.0, 1.0);
INDArray deltaog = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
INDArray wog = inputWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
//epsilonNextSlice.addi(deltao.mmul(woTranspose)).addi(deltag.mmul(wgTranspose));
Nd4j.gemm(deltaog, wog, epsilonNextSlice, false, true, 1.0, 1.0);
}
if (maskArray != null) {
//Mask array is present: bidirectional RNN -> need to zero out these errors to avoid sending anything
// but 0s to the layer below at this time step (for the given example)
epsilonNextSlice.muliColumnVector(timeStepMaskColumn);
}
}
Gradient retGradient = new DefaultGradient();
retGradient.gradientForVariable().put(inputWeightKey, iwGradientsOut);
retGradient.gradientForVariable().put(recurrentWeightKey, rwGradientsOut);
retGradient.gradientForVariable().put(biasWeightKey, bGradientsOut);
return new Pair<>(retGradient, epsilonNext);
}
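For reference, a sketch of the per-time-step relations that the loop above implements (derived from the code on this page rather than quoted from DL4J documentation; \odot denotes element-wise multiplication, \sigma_g the gate activation, \phi the layer activation, and w_{FF}, w_{OO}, w_{GG} the per-unit peephole weights; the \sigma_g' terms correspond to the sigmoid-gate branch, while the general case goes through gateActivationFn.backprop as in the code):
\nabla_{a_t} = \epsilon_t + \Delta^{ifog}_{t+1} W_{IFOG}^{\top}
\delta^o_t = \sigma_g'(z^o_t) \odot \nabla_{a_t} \odot \phi(c_t)
\nabla_{c_t} = \delta^f_{t+1} \odot w_{FF} + \delta^g_{t+1} \odot w_{GG} + \phi'(c_t) \odot a^o_t \odot \nabla_{a_t} + \delta^o_t \odot w_{OO} + a^f_{t+1} \odot \nabla_{c_{t+1}}
\delta^f_t = \sigma_g'(z^f_t) \odot \nabla_{c_t} \odot c_{t-1}, \qquad \delta^g_t = \sigma_g'(z^g_t) \odot \nabla_{c_t} \odot a^i_t, \qquad \delta^i_t = \phi'(z^i_t) \odot \nabla_{c_t} \odot a^g_t
with \Delta^{ifog}_t = [\delta^i_t, \delta^f_t, \delta^o_t, \delta^g_t] and all t+1 terms taken as zero at the last time step. The accumulated gradients and the epsilon passed to the layer below are then
dL/dW_{input} \mathrel{+}= x_t^{\top} \Delta^{ifog}_t, \qquad dL/dW_{IFOG} \mathrel{+}= a_{t-1}^{\top} \Delta^{ifog}_t, \qquad dL/db \mathrel{+}= \sum_{batch} \Delta^{ifog}_t
dL/dw_{FF} \mathrel{+}= \sum_{batch} \delta^f_t \odot c_{t-1}, \qquad dL/dw_{GG} \mathrel{+}= \sum_{batch} \delta^g_t \odot c_{t-1}, \qquad dL/dw_{OO} \mathrel{+}= \sum_{batch} \delta^o_t \odot c_t
\epsilon^{(L-1)}_t = \Delta^{ifog}_t W_{input}^{\top}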
Use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.
From the GravesBidirectionalLSTMTest class, the testGravesBidirectionalLSTMForwardPassHelper method:
@Test
public void testGravesBidirectionalLSTMForwardPassHelper() throws Exception {
//GravesBidirectionalLSTM.activateHelper() has different behaviour (due to optimizations) when forBackprop==true vs false
//But should otherwise provide identical activations
Nd4j.getRandom().setSeed(12345);
final int nIn = 10;
final int layerSize = 15;
final int miniBatchSize = 4;
final int timeSeriesLength = 7;
final NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().layer(new org.deeplearning4j.nn.conf.layers.GravesBidirectionalLSTM.Builder().nIn(nIn).nOut(layerSize).weightInit(WeightInit.DISTRIBUTION).dist(new UniformDistribution(0, 1)).activation(Activation.TANH).build()).build();
int numParams = conf.getLayer().initializer().numParams(conf);
INDArray params = Nd4j.create(1, numParams);
final GravesBidirectionalLSTM lstm = (GravesBidirectionalLSTM) conf.getLayer().instantiate(conf, null, 0, params, true);
final INDArray input = Nd4j.rand(new int[] { miniBatchSize, nIn, timeSeriesLength });
lstm.setInput(input);
final INDArray fwdPassFalse = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(), lstm.input(), lstm.getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null, false, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null).fwdPassOutput;
final INDArray[] fwdPassTrue = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(), lstm.input(), lstm.getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null, true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null).fwdPassOutputAsArrays;
//I have no idea what the heck this does --Ben
for (int i = 0; i < timeSeriesLength; i++) {
final INDArray sliceFalse = fwdPassFalse.tensorAlongDimension(i, 1, 0);
final INDArray sliceTrue = fwdPassTrue[i];
assertTrue(sliceFalse.equals(sliceTrue));
}
}
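The FwdPassReturn produced with forBackprop == true is the object that backpropGradientHelper (shown at the top of this page) consumes. Below is a minimal, hypothetical sketch of that wiring, continuing the test above and reusing its lstm, nIn, layerSize, miniBatchSize and timeSeriesLength variables; the epsilon array and the gradient-view map (and its shapes) are illustrative assumptions derived from how the helper indexes them, not code taken from DL4J, and imports are omitted here as elsewhere on this page.
// Hypothetical continuation of the test above -- not part of the DL4J test suite.
// Gradient views: shapes follow how backpropGradientHelper indexes them; assumed here for illustration.
final Map<String, INDArray> gradientViews = new HashMap<>();
gradientViews.put(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, Nd4j.create(new int[] { nIn, 4 * layerSize }, 'f'));
gradientViews.put(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS, Nd4j.create(new int[] { layerSize, 4 * layerSize + 3 }, 'f'));
gradientViews.put(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, Nd4j.create(new int[] { 1, 4 * layerSize }, 'f'));
// Same forward pass as in the test, but keeping the whole FwdPassReturn (it caches iz/ia/fa/oa/ga, memCellState, etc.):
final FwdPassReturn fwd = LSTMHelpers.activateHelper(lstm, lstm.conf(), new ActivationSigmoid(), lstm.input(), lstm.getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS), false, null, null, true, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, null);
// Stand-in for dL/d(activations) arriving from the layer above; shape [miniBatchSize, layerSize, timeSeriesLength]:
final INDArray epsilon = Nd4j.rand(new int[] { miniBatchSize, layerSize, timeSeriesLength });
final Pair<Gradient, INDArray> backprop = LSTMHelpers.backpropGradientHelper(lstm.conf(), new ActivationSigmoid(), lstm.input(), lstm.getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS), lstm.getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon, false, -1, fwd, true, GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS, GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS, GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, gradientViews, null);
// backprop.getFirst(): per-parameter gradients keyed by the three keys above (views on the arrays in gradientViews)
// backprop.getSecond(): dL/d(activations) for the layer below, shape [miniBatchSize, nIn, timeSeriesLength]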
Use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.
From the RBMTests class, the testComputeGradientAndScore method:
@Test
public void testComputeGradientAndScore() {
INDArray input = Nd4j.linspace(1, 10, 10);
INDArray params = getStandardParams(10, 5);
RBM rbm = getRBMLayer(10, 5, HiddenUnit.BINARY, VisibleUnit.BINARY, params, true, false, 1, LossFunctions.LossFunction.MSE);
rbm.setInput(input);
rbm.computeGradientAndScore();
Pair<Gradient, Double> pair = rbm.gradientAndScore();
INDArray hprob = sigmoid(input.mmul(rbm.getParam(PretrainParamInitializer.WEIGHT_KEY)).addiRowVector(rbm.getParam(PretrainParamInitializer.BIAS_KEY)));
INDArray vprob = sigmoid(hprob.mmul(rbm.getParam(PretrainParamInitializer.WEIGHT_KEY).transpose()).addiRowVector(rbm.getParam(PretrainParamInitializer.VISIBLE_BIAS_KEY)));
Distribution dist = Nd4j.getDistributions().createBinomial(1, vprob);
dist.reseedRandomGenerator(42);
INDArray vSample = dist.sample(vprob.shape());
//double expectedScore = LossFunctions.LossFunction.MSE.getILossFunction().computeScore(input, vSample, "sigmoid", null, false);
double expectedScore = LossFunctions.LossFunction.MSE.getILossFunction().computeScore(input, vSample, new ActivationSigmoid(), null, false);
assertEquals(expectedScore, pair.getSecond(), 1e-8);
}
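For reference, the expected score assembled in this test follows the reconstruction sketched below (derived from the test code above, with \sigma the logistic sigmoid; the final sigmoid is applied by computeScore, which receives vSample as pre-output together with an ActivationSigmoid instance):
h = \sigma(x W + b_{hidden})
v_{prob} = \sigma(h W^{\top} + b_{visible})
v_{sample} \sim \mathrm{Binomial}(1, v_{prob})
expectedScore = \mathrm{MSE}(x, \sigma(v_{sample}))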
Use of org.nd4j.linalg.activations.impl.ActivationSigmoid in project deeplearning4j by deeplearning4j.
From the LSTMHelpers class, the activateHelper method:
/**
* Returns FwdPassReturn object with activations/INDArrays. Allows activateHelper to be used for forward pass, backward pass
* and rnnTimeStep whilst being reasonably efficient for all
*/
public static FwdPassReturn activateHelper(final Layer layer, final NeuralNetConfiguration conf,
final IActivation gateActivationFn, //Activation function for the gates - sigmoid or hard sigmoid (must be in range 0 to 1)
final INDArray input,
final INDArray recurrentWeights, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray originalInputWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
final INDArray biases, //Shape: [4,hiddenLayerSize]; order: [bi,bf,bo,bg]^T
final boolean training, final INDArray originalPrevOutputActivations, final INDArray originalPrevMemCellState, boolean forBackprop, boolean forwards,
final String inputWeightKey,
INDArray maskArray) { //Input mask: should only be used with bidirectional RNNs + variable length
//Data has shape [m,nIn,T]. Layer activations/output has shape [m,nHiddenUnits,T]
if (input == null || input.length() == 0)
throw new IllegalArgumentException("Invalid input: not set or 0 length");
INDArray inputWeights = originalInputWeights;
INDArray prevOutputActivations = originalPrevOutputActivations;
//Edge case of T=1, may have shape [m,nIn], equiv. to [m,nIn,1]
boolean is2dInput = input.rank() < 3;
int timeSeriesLength = (is2dInput ? 1 : input.size(2));
int hiddenLayerSize = recurrentWeights.size(0);
int miniBatchSize = input.size(0);
INDArray prevMemCellState;
if (originalPrevMemCellState == null) {
prevMemCellState = Nd4j.create(new int[] { miniBatchSize, hiddenLayerSize }, 'f');
} else {
prevMemCellState = originalPrevMemCellState.dup('f');
}
INDArray recurrentWeightsIFOG = recurrentWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize)).dup('f');
//Apply dropconnect to input (not recurrent) weights only:
if (conf.isUseDropConnect() && training && conf.getLayer().getDropOut() > 0) {
inputWeights = Dropout.applyDropConnect(layer, inputWeightKey);
}
INDArray wFFTranspose = recurrentWeights.get(NDArrayIndex.all(), interval(4 * hiddenLayerSize, 4 * hiddenLayerSize + 1)).transpose();
INDArray wOOTranspose = recurrentWeights.get(NDArrayIndex.all(), interval(4 * hiddenLayerSize + 1, 4 * hiddenLayerSize + 2)).transpose();
INDArray wGGTranspose = recurrentWeights.get(NDArrayIndex.all(), interval(4 * hiddenLayerSize + 2, 4 * hiddenLayerSize + 3)).transpose();
if (timeSeriesLength > 1 || forBackprop) {
wFFTranspose = Shape.toMmulCompatible(wFFTranspose);
wOOTranspose = Shape.toMmulCompatible(wOOTranspose);
wGGTranspose = Shape.toMmulCompatible(wGGTranspose);
}
//Allocate arrays for activations:
boolean sigmoidGates = gateActivationFn instanceof ActivationSigmoid;
IActivation afn = conf.getLayer().getActivationFn();
INDArray outputActivations = null;
FwdPassReturn toReturn = new FwdPassReturn();
if (forBackprop) {
toReturn.fwdPassOutputAsArrays = new INDArray[timeSeriesLength];
toReturn.memCellState = new INDArray[timeSeriesLength];
toReturn.memCellActivations = new INDArray[timeSeriesLength];
toReturn.iz = new INDArray[timeSeriesLength];
toReturn.ia = new INDArray[timeSeriesLength];
toReturn.fa = new INDArray[timeSeriesLength];
toReturn.oa = new INDArray[timeSeriesLength];
toReturn.ga = new INDArray[timeSeriesLength];
if (!sigmoidGates) {
toReturn.fz = new INDArray[timeSeriesLength];
toReturn.oz = new INDArray[timeSeriesLength];
toReturn.gz = new INDArray[timeSeriesLength];
}
} else {
//F order to keep time steps together
outputActivations = Nd4j.create(new int[] { miniBatchSize, hiddenLayerSize, timeSeriesLength }, 'f');
toReturn.fwdPassOutput = outputActivations;
}
Level1 l1BLAS = Nd4j.getBlasWrapper().level1();
//Input validation: check input data matches nIn
if (input.size(1) != inputWeights.size(0)) {
throw new DL4JInvalidInputException("Received input with size(1) = " + input.size(1) + " (input array shape = " + Arrays.toString(input.shape()) + "); input.size(1) must match layer nIn size (nIn = " + inputWeights.size(0) + ")");
}
//These can be different if user forgets to call rnnClearPreviousState() between calls of rnnTimeStep
if (prevOutputActivations != null && prevOutputActivations.size(0) != input.size(0)) {
throw new DL4JInvalidInputException("Previous activations (stored state) number of examples = " + prevOutputActivations.size(0) + " but input array number of examples = " + input.size(0) + ". Possible cause: using rnnTimeStep() without calling" + " rnnClearPreviousState() between different sequences?");
}
//initialize prevOutputActivations to zeroes
if (prevOutputActivations == null) {
prevOutputActivations = Nd4j.zeros(new int[] { miniBatchSize, hiddenLayerSize });
}
for (int iTimeIndex = 0; iTimeIndex < timeSeriesLength; iTimeIndex++) {
int time = iTimeIndex;
if (!forwards) {
time = timeSeriesLength - iTimeIndex - 1;
}
//Expected shape: [m,nIn]. Also deals with edge case of T=1, with 'time series' data of shape [m,nIn], equiv. to [m,nIn,1]
INDArray miniBatchData = (is2dInput ? input : input.tensorAlongDimension(time, 1, 0));
miniBatchData = Shape.toMmulCompatible(miniBatchData);
//Calculate activations for: network input + forget, output, input modulation gates. Next 3 lines are first part of those
//Shape: [miniBatch,4*layerSize]
INDArray ifogActivations = miniBatchData.mmul(inputWeights);
Nd4j.gemm(prevOutputActivations, recurrentWeightsIFOG, ifogActivations, false, false, 1.0, 1.0);
ifogActivations.addiRowVector(biases);
INDArray inputActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
if (forBackprop)
toReturn.iz[time] = inputActivations.dup('f');
conf.getLayer().getActivationFn().getActivation(inputActivations, training);
if (forBackprop)
toReturn.ia[time] = inputActivations;
INDArray forgetGateActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(hiddenLayerSize, 2 * hiddenLayerSize));
INDArray pmcellWFF = prevMemCellState.dup('f').muliRowVector(wFFTranspose);
//y = a*x + y i.e., forgetGateActivations.addi(pmcellWFF)
l1BLAS.axpy(pmcellWFF.length(), 1.0, pmcellWFF, forgetGateActivations);
//Above line: treats matrix as a vector. Can only do this because we're sure both pmcellWFF and forgetGateActivations are f order, offset 0 and have same strides
if (forBackprop && !sigmoidGates) {
//Forget gate pre-out (z)
toReturn.fz[time] = forgetGateActivations.dup('f');
}
gateActivationFn.getActivation(forgetGateActivations, training);
if (forBackprop)
toReturn.fa[time] = forgetGateActivations;
INDArray inputModGateActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(3 * hiddenLayerSize, 4 * hiddenLayerSize));
INDArray pmcellWGG = prevMemCellState.dup('f').muliRowVector(wGGTranspose);
//inputModGateActivations.addi(pmcellWGG)
l1BLAS.axpy(pmcellWGG.length(), 1.0, pmcellWGG, inputModGateActivations);
if (forBackprop && !sigmoidGates) {
//Input modulation gate pre-out (z)
toReturn.gz[time] = inputModGateActivations.dup('f');
}
gateActivationFn.getActivation(inputModGateActivations, training);
if (forBackprop)
toReturn.ga[time] = inputModGateActivations;
//Memory cell state
INDArray currentMemoryCellState;
INDArray inputModMulInput;
if (forBackprop) {
currentMemoryCellState = prevMemCellState.dup('f').muli(forgetGateActivations);
inputModMulInput = inputModGateActivations.dup('f').muli(inputActivations);
} else {
currentMemoryCellState = forgetGateActivations.muli(prevMemCellState);
inputModMulInput = inputModGateActivations.muli(inputActivations);
}
//currentMemoryCellState.addi(inputModMulInput)
l1BLAS.axpy(currentMemoryCellState.length(), 1.0, inputModMulInput, currentMemoryCellState);
INDArray outputGateActivations = ifogActivations.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 3 * hiddenLayerSize));
INDArray pmcellWOO = currentMemoryCellState.dup('f').muliRowVector(wOOTranspose);
//outputGateActivations.addi(pmcellWOO)
l1BLAS.axpy(pmcellWOO.length(), 1.0, pmcellWOO, outputGateActivations);
if (forBackprop && !sigmoidGates) {
//Output gate pre-out (z)
toReturn.oz[time] = outputGateActivations.dup('f');
}
gateActivationFn.getActivation(outputGateActivations, training);
if (forBackprop)
toReturn.oa[time] = outputGateActivations;
//LSTM unit outputs:
INDArray currMemoryCellActivation = afn.getActivation(currentMemoryCellState.dup('f'), training);
INDArray currHiddenUnitActivations;
if (forBackprop) {
//Expected shape: [m,hiddenLayerSize]
currHiddenUnitActivations = currMemoryCellActivation.dup('f').muli(outputGateActivations);
} else {
//Expected shape: [m,hiddenLayerSize]
currHiddenUnitActivations = currMemoryCellActivation.muli(outputGateActivations);
}
if (maskArray != null) {
//Mask array is present: bidirectional RNN -> need to zero out these activations to avoid
// incorrectly using activations from masked time steps (i.e., want 0 initialization in both directions)
//We *also* need to apply this to the memory cells, as they are carried forward
//Mask array has shape [minibatch, timeSeriesLength] -> get column
INDArray timeStepMaskColumn = maskArray.getColumn(time);
currHiddenUnitActivations.muliColumnVector(timeStepMaskColumn);
currentMemoryCellState.muliColumnVector(timeStepMaskColumn);
}
if (forBackprop) {
toReturn.fwdPassOutputAsArrays[time] = currHiddenUnitActivations;
toReturn.memCellState[time] = currentMemoryCellState;
toReturn.memCellActivations[time] = currMemoryCellActivation;
} else {
outputActivations.tensorAlongDimension(time, 1, 0).assign(currHiddenUnitActivations);
}
prevOutputActivations = currHiddenUnitActivations;
prevMemCellState = currentMemoryCellState;
toReturn.lastAct = currHiddenUnitActivations;
toReturn.lastMemCell = currentMemoryCellState;
}
return toReturn;
}
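For reference, the quantities computed in the time-step loop above correspond to a Graves-style LSTM with peephole connections; the following sketch is derived from the code itself (\phi is the layer activation, \sigma_g the gate activation, \odot element-wise multiplication, W_* the column blocks of the input weights, R_* the first 4*hiddenLayerSize columns of the recurrent weights, w_{FF}, w_{OO}, w_{GG} its three trailing peephole columns, and a_0 = c_0 = 0 unless a previous state is supplied):
a^i_t = \phi(x_t W_i + a_{t-1} R_i + b_i)
a^f_t = \sigma_g(x_t W_f + a_{t-1} R_f + w_{FF} \odot c_{t-1} + b_f)
a^g_t = \sigma_g(x_t W_g + a_{t-1} R_g + w_{GG} \odot c_{t-1} + b_g)
c_t = a^f_t \odot c_{t-1} + a^g_t \odot a^i_t
a^o_t = \sigma_g(x_t W_o + a_{t-1} R_o + w_{OO} \odot c_t + b_o)
a_t = a^o_t \odot \phi(c_t)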