Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
From the class VariationalAutoencoder, method backpropGradient:
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    if (!zeroedPretrainParamGradients) {
        for (Map.Entry<String, INDArray> entry : gradientViews.entrySet()) {
            if (isPretrainParam(entry.getKey())) {
                entry.getValue().assign(0);
            }
        }
        zeroedPretrainParamGradients = true;
    }
    Gradient gradient = new DefaultGradient();
    VAEFwdHelper fwd = doForward(true, true);
    INDArray currentDelta = pzxActivationFn.backprop(fwd.pzxMeanPreOut, epsilon).getFirst();

    //Finally, calculate mean value:
    INDArray meanW = params.get(VariationalAutoencoderParamInitializer.PZX_MEAN_W);
    //f order
    INDArray dLdMeanW = gradientViews.get(VariationalAutoencoderParamInitializer.PZX_MEAN_W);
    INDArray lastEncoderActivation = fwd.encoderActivations[fwd.encoderActivations.length - 1];
    Nd4j.gemm(lastEncoderActivation, currentDelta, dLdMeanW, true, false, 1.0, 0.0);
    INDArray dLdMeanB = gradientViews.get(VariationalAutoencoderParamInitializer.PZX_MEAN_B);
    //TODO: do this without the assign
    dLdMeanB.assign(currentDelta.sum(0));

    gradient.gradientForVariable().put(VariationalAutoencoderParamInitializer.PZX_MEAN_W, dLdMeanW);
    gradient.gradientForVariable().put(VariationalAutoencoderParamInitializer.PZX_MEAN_B, dLdMeanB);

    epsilon = meanW.mmul(currentDelta.transpose()).transpose();

    int nEncoderLayers = encoderLayerSizes.length;
    IActivation afn = conf().getLayer().getActivationFn();
    for (int i = nEncoderLayers - 1; i >= 0; i--) {
        String wKey = "e" + i + WEIGHT_KEY_SUFFIX;
        String bKey = "e" + i + BIAS_KEY_SUFFIX;
        INDArray weights = params.get(wKey);
        INDArray dLdW = gradientViews.get(wKey);
        INDArray dLdB = gradientViews.get(bKey);
        INDArray preOut = fwd.encoderPreOuts[i];
        currentDelta = afn.backprop(preOut, epsilon).getFirst();
        INDArray actInput;
        if (i == 0) {
            actInput = input;
        } else {
            actInput = fwd.encoderActivations[i - 1];
        }
        Nd4j.gemm(actInput, currentDelta, dLdW, true, false, 1.0, 0.0);
        //TODO: do this without the assign
        dLdB.assign(currentDelta.sum(0));
        gradient.gradientForVariable().put(wKey, dLdW);
        gradient.gradientForVariable().put(bKey, dLdB);
        epsilon = weights.mmul(currentDelta.transpose()).transpose();
    }
    return new Pair<>(gradient, epsilon);
}
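The pattern above repeats for every parameter: compute the weight gradient directly into a pre-allocated view array with Nd4j.gemm (dL/dW = activations^T * delta), sum the deltas along dimension 0 for the bias, and register both arrays in the DefaultGradient under their parameter keys. Below is a minimal, standalone sketch of that pattern; the array shapes, the "W"/"b" keys, and the GradientViewSketch class name are illustrative, not taken from the project.

import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GradientViewSketch {
    public static void main(String[] args) {
        int miniBatch = 4, nIn = 3, nOut = 2;

        // Illustrative stand-ins for the layer activations and the back-propagated delta
        INDArray activations = Nd4j.rand(miniBatch, nIn);
        INDArray delta = Nd4j.rand(miniBatch, nOut);

        // Pre-allocated arrays standing in for the gradientViews entries
        INDArray dLdW = Nd4j.create(nIn, nOut);
        INDArray dLdB = Nd4j.create(1, nOut);

        // dL/dW = activations^T * delta, written directly into the view (alpha = 1, beta = 0)
        Nd4j.gemm(activations, delta, dLdW, true, false, 1.0, 0.0);
        // dL/db = column-wise sum of delta
        dLdB.assign(delta.sum(0));

        // Register both arrays under their parameter keys, as the layer code does
        Gradient gradient = new DefaultGradient();
        gradient.gradientForVariable().put("W", dLdW);
        gradient.gradientForVariable().put("b", dLdB);

        System.out.println(gradient.gradientForVariable().keySet());
    }
}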
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
From the class GravesBidirectionalLSTM, method backpropGradientHelper:
private Pair<Gradient, INDArray> backpropGradientHelper(final INDArray epsilon, final boolean truncatedBPTT,
                final int tbpttBackwardLength) {
    if (truncatedBPTT) {
        throw new UnsupportedOperationException(
                        "you can not time step a bidirectional RNN, it has to run on a batch of data all at once");
    }
    final FwdPassReturn fwdPass = activateHelperDirectional(true, null, null, true, true);
    final Pair<Gradient, INDArray> forwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
                    this.layerConf().getGateActivationFn(), this.input,
                    getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS),
                    getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS), epsilon,
                    truncatedBPTT, tbpttBackwardLength, fwdPass, true,
                    GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_FORWARDS,
                    GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_FORWARDS,
                    GravesBidirectionalLSTMParamInitializer.BIAS_KEY_FORWARDS, gradientViews, maskArray);
    final FwdPassReturn backPass = activateHelperDirectional(true, null, null, true, false);
    final Pair<Gradient, INDArray> backwardsGradient = LSTMHelpers.backpropGradientHelper(this.conf,
                    this.layerConf().getGateActivationFn(), this.input,
                    getParam(GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS),
                    getParam(GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS), epsilon,
                    truncatedBPTT, tbpttBackwardLength, backPass, false,
                    GravesBidirectionalLSTMParamInitializer.INPUT_WEIGHT_KEY_BACKWARDS,
                    GravesBidirectionalLSTMParamInitializer.RECURRENT_WEIGHT_KEY_BACKWARDS,
                    GravesBidirectionalLSTMParamInitializer.BIAS_KEY_BACKWARDS, gradientViews, maskArray);

    //merge the gradient, which is key value pair of String,INDArray
    //the keys for forwards and backwards should be different
    final Gradient combinedGradient = new DefaultGradient();
    for (Map.Entry<String, INDArray> entry : forwardsGradient.getFirst().gradientForVariable().entrySet()) {
        combinedGradient.setGradientFor(entry.getKey(), entry.getValue());
    }
    for (Map.Entry<String, INDArray> entry : backwardsGradient.getFirst().gradientForVariable().entrySet()) {
        combinedGradient.setGradientFor(entry.getKey(), entry.getValue());
    }
    final Gradient correctOrderedGradient = new DefaultGradient();
    for (final String key : params.keySet()) {
        correctOrderedGradient.setGradientFor(key, combinedGradient.getGradientFor(key));
    }
    final INDArray forwardEpsilon = forwardsGradient.getSecond();
    final INDArray backwardsEpsilon = backwardsGradient.getSecond();
    final INDArray combinedEpsilon = forwardEpsilon.addi(backwardsEpsilon);
    //sum the errors that were back-propagated
    return new Pair<>(correctOrderedGradient, combinedEpsilon);
}
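The merging step above relies on the forward and backward directions using disjoint parameter keys, so the two gradient maps can simply be copied into one DefaultGradient and then re-inserted in the order of the network's parameter map. Here is a small sketch of that merge-then-reorder idea; the keys ("WF", "bF", "WB", "bB"), shapes, and the MergeGradientsSketch class are invented stand-ins for the real per-direction gradients.

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class MergeGradientsSketch {
    public static void main(String[] args) {
        // Two per-direction gradients with disjoint (hypothetical) keys
        Gradient forwards = new DefaultGradient();
        forwards.setGradientFor("WF", Nd4j.rand(3, 3));
        forwards.setGradientFor("bF", Nd4j.rand(1, 3));

        Gradient backwards = new DefaultGradient();
        backwards.setGradientFor("WB", Nd4j.rand(3, 3));
        backwards.setGradientFor("bB", Nd4j.rand(1, 3));

        // Merge: since the keys do not collide, copy both maps into one gradient
        Gradient combined = new DefaultGradient();
        for (Map.Entry<String, INDArray> e : forwards.gradientForVariable().entrySet())
            combined.setGradientFor(e.getKey(), e.getValue());
        for (Map.Entry<String, INDArray> e : backwards.gradientForVariable().entrySet())
            combined.setGradientFor(e.getKey(), e.getValue());

        // Re-insert in the parameter order expected by the network (illustrative order)
        List<String> paramOrder = Arrays.asList("WF", "bF", "WB", "bB");
        Gradient ordered = new DefaultGradient();
        for (String key : paramOrder)
            ordered.setGradientFor(key, combined.getGradientFor(key));

        System.out.println(ordered.gradientForVariable().keySet());
    }
}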
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
From the class CenterLossOutputLayer, method getGradientsAndDelta:
/** Returns tuple: {Gradient, Delta} given preOut */
private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray preOut) {
    ILossFunction lossFunction = layerConf().getLossFn();
    INDArray labels2d = getLabels2d();
    if (labels2d.size(1) != preOut.size(1)) {
        throw new DL4JInvalidInputException("Labels array numColumns (size(1) = " + labels2d.size(1)
                        + ") does not match output layer number of outputs (nOut = " + preOut.size(1) + ")");
    }
    INDArray delta = lossFunction.computeGradient(labels2d, preOut, layerConf().getActivationFn(), maskArray);

    Gradient gradient = new DefaultGradient();
    INDArray weightGradView = gradientViews.get(CenterLossParamInitializer.WEIGHT_KEY);
    INDArray biasGradView = gradientViews.get(CenterLossParamInitializer.BIAS_KEY);
    INDArray centersGradView = gradientViews.get(CenterLossParamInitializer.CENTER_KEY);

    // centers delta
    double alpha = layerConf().getAlpha();
    INDArray centers = params.get(CenterLossParamInitializer.CENTER_KEY);
    INDArray centersForExamples = labels.mmul(centers);
    INDArray diff = centersForExamples.sub(input).muli(alpha);
    INDArray numerator = labels.transpose().mmul(diff);
    INDArray denominator = labels.sum(0).addi(1.0).transpose();

    INDArray deltaC;
    if (layerConf().getGradientCheck()) {
        double lambda = layerConf().getLambda();
        //For gradient checks: need to multiply dLc/dcj by lambda to get dL/dcj
        deltaC = numerator.muli(lambda);
    } else {
        deltaC = numerator.diviColumnVector(denominator);
    }
    centersGradView.assign(deltaC);

    // other standard calculations
    //Equivalent to: weightGradView.assign(input.transpose().mmul(delta));
    Nd4j.gemm(input, delta, weightGradView, true, false, 1.0, 0.0);
    biasGradView.assign(delta.sum(0));

    gradient.gradientForVariable().put(CenterLossParamInitializer.WEIGHT_KEY, weightGradView);
    gradient.gradientForVariable().put(CenterLossParamInitializer.BIAS_KEY, biasGradView);
    gradient.gradientForVariable().put(CenterLossParamInitializer.CENTER_KEY, centersGradView);
    return new Pair<>(gradient, delta);
}
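The center update in the snippet can be read as: for each class, sum the alpha-scaled differences between that class's center and the examples assigned to it, then divide by (class count + 1), the +1 keeping the division safe for classes with no examples in the mini batch. A self-contained sketch of just that computation follows; the mini-batch size, class count, feature dimension, and the CenterUpdateSketch class name are made-up values, not taken from CenterLossOutputLayer.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class CenterUpdateSketch {
    public static void main(String[] args) {
        int miniBatch = 4, nClasses = 3, nFeatures = 5;
        double alpha = 0.1;

        // One-hot labels, feature-space inputs, and current class centers (illustrative values)
        INDArray labels = Nd4j.zeros(miniBatch, nClasses);
        for (int i = 0; i < miniBatch; i++) labels.putScalar(new int[] {i, i % nClasses}, 1.0);
        INDArray input = Nd4j.rand(miniBatch, nFeatures);
        INDArray centers = Nd4j.rand(nClasses, nFeatures);

        // Select each example's class center, then scale the (center - input) difference by alpha
        INDArray centersForExamples = labels.mmul(centers);
        INDArray diff = centersForExamples.sub(input).muli(alpha);

        // Accumulate the differences per class and normalise by (class count + 1),
        // mirroring the numerator/denominator in getGradientsAndDelta
        INDArray numerator = labels.transpose().mmul(diff);
        INDArray denominator = labels.sum(0).addi(1.0).transpose();
        INDArray deltaC = numerator.diviColumnVector(denominator);

        System.out.println(deltaC);
    }
}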
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
From the class MultiLayerNetwork, method truncatedBPTTGradient:
/** Equivalent to backprop(), but calculates gradient for truncated BPTT instead. */
protected void truncatedBPTTGradient() {
    if (flattenedGradients == null)
        initGradientsView();
    String multiGradientKey;
    gradient = new DefaultGradient();
    Layer currLayer;

    if (!(getOutputLayer() instanceof IOutputLayer)) {
        log.warn("Warning: final layer isn't output layer. You cannot use backprop (truncated BPTT) without an output layer.");
        return;
    }

    IOutputLayer outputLayer = (IOutputLayer) getOutputLayer();
    if (labels == null)
        throw new IllegalStateException("No labels found");
    if (outputLayer.conf().getLayer().getWeightInit() == WeightInit.ZERO) {
        throw new IllegalStateException("Output layer weights cannot be initialized to zero when using backprop.");
    }
    outputLayer.setLabels(labels);

    //calculate and apply the backward gradient for every layer
    int numLayers = getnLayers();
    //Store gradients in a list; used to ensure iteration order in the DefaultGradient linked hash map, i.e., layer 0 first instead of output layer
    LinkedList<Pair<String, INDArray>> gradientList = new LinkedList<>();

    Pair<Gradient, INDArray> currPair = outputLayer.backpropGradient(null);

    for (Map.Entry<String, INDArray> entry : currPair.getFirst().gradientForVariable().entrySet()) {
        multiGradientKey = String.valueOf(numLayers - 1) + "_" + entry.getKey();
        gradientList.addLast(new Pair<>(multiGradientKey, entry.getValue()));
    }

    if (getLayerWiseConfigurations().getInputPreProcess(numLayers - 1) != null)
        currPair = new Pair<>(currPair.getFirst(), this.layerWiseConfigurations.getInputPreProcess(numLayers - 1)
                        .backprop(currPair.getSecond(), getInputMiniBatchSize()));

    // Calculate gradients for previous layers (the output layer has already been handled above)
    for (int j = numLayers - 2; j >= 0; j--) {
        currLayer = getLayer(j);
        if (currLayer instanceof RecurrentLayer) {
            currPair = ((RecurrentLayer) currLayer).tbpttBackpropGradient(currPair.getSecond(),
                            layerWiseConfigurations.getTbpttBackLength());
        } else {
            currPair = currLayer.backpropGradient(currPair.getSecond());
        }

        LinkedList<Pair<String, INDArray>> tempList = new LinkedList<>();
        for (Map.Entry<String, INDArray> entry : currPair.getFirst().gradientForVariable().entrySet()) {
            multiGradientKey = String.valueOf(j) + "_" + entry.getKey();
            tempList.addFirst(new Pair<>(multiGradientKey, entry.getValue()));
        }

        for (Pair<String, INDArray> pair : tempList)
            gradientList.addFirst(pair);

        //Pass epsilon through input processor before passing to next layer (if applicable)
        if (getLayerWiseConfigurations().getInputPreProcess(j) != null)
            currPair = new Pair<>(currPair.getFirst(), getLayerWiseConfigurations().getInputPreProcess(j)
                            .backprop(currPair.getSecond(), getInputMiniBatchSize()));
    }

    //Add gradients to Gradient, in correct order
    for (Pair<String, INDArray> pair : gradientList)
        gradient.setGradientFor(pair.getFirst(), pair.getSecond());
}
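The LinkedList bookkeeping above exists only to control key order: output-layer gradients are appended, earlier layers are prepended, and the resulting "layerIndex_paramName" keys are then inserted into the DefaultGradient so its backing linked hash map iterates from layer 0 upwards. Below is a reduced sketch of that ordering trick with two hypothetical layers; the key names, shapes, and the GradientKeyOrderSketch class are invented for illustration.

import java.util.AbstractMap.SimpleEntry;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;

import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GradientKeyOrderSketch {
    public static void main(String[] args) {
        // Hypothetical per-layer gradient maps, visited from the output layer (index 1) down to layer 0
        Map<String, INDArray> outputLayerGrads = new LinkedHashMap<>();
        outputLayerGrads.put("W", Nd4j.rand(3, 2));
        outputLayerGrads.put("b", Nd4j.rand(1, 2));
        Map<String, INDArray> hiddenLayerGrads = new LinkedHashMap<>();
        hiddenLayerGrads.put("W", Nd4j.rand(4, 3));
        hiddenLayerGrads.put("b", Nd4j.rand(1, 3));

        LinkedList<Map.Entry<String, INDArray>> gradientList = new LinkedList<>();

        // Output layer entries go to the back of the list...
        for (Map.Entry<String, INDArray> e : outputLayerGrads.entrySet())
            gradientList.addLast(new SimpleEntry<>("1_" + e.getKey(), e.getValue()));

        // ...while earlier layers are prepended, so layer 0 ends up first
        LinkedList<Map.Entry<String, INDArray>> tempList = new LinkedList<>();
        for (Map.Entry<String, INDArray> e : hiddenLayerGrads.entrySet())
            tempList.addFirst(new SimpleEntry<>("0_" + e.getKey(), e.getValue()));
        for (Map.Entry<String, INDArray> e : tempList)
            gradientList.addFirst(e);

        // Insert into DefaultGradient in that order; its backing linked hash map preserves it
        Gradient gradient = new DefaultGradient();
        for (Map.Entry<String, INDArray> e : gradientList)
            gradient.setGradientFor(e.getKey(), e.getValue());

        System.out.println(gradient.gradientForVariable().keySet()); // [0_W, 0_b, 1_W, 1_b]
    }
}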
Use of org.deeplearning4j.nn.gradient.DefaultGradient in project deeplearning4j by deeplearning4j.
From the class MultiLayerNetwork, method calcBackpropGradients:
/** Calculate gradients and errors. Used in two places:
 * (a) backprop (for standard multi layer network learning)
 * (b) backpropGradient (layer method, for when MultiLayerNetwork is used as a layer)
 * @param epsilon Errors (technically errors .* activations). Not used if withOutputLayer = true
 * @param withOutputLayer if true: assume last layer is output layer, and calculate errors based on labels. In this
 *                        case, the epsilon input is not used (may/should be null).
 *                        If false: calculate backprop gradients
 * @return Gradients and the error (epsilon) at the input
 */
protected Pair<Gradient, INDArray> calcBackpropGradients(INDArray epsilon, boolean withOutputLayer) {
    if (flattenedGradients == null)
        initGradientsView();
    String multiGradientKey;
    Gradient gradient = new DefaultGradient(flattenedGradients);
    Layer currLayer;

    //calculate and apply the backward gradient for every layer
    /**
     * Skip the output layer for the indexing and just loop backwards, updating the coefficients for each layer
     * (when withOutputLayer == true).
     *
     * Activate applies the activation function for each layer and sets that as the input for the following layer.
     *
     * Typical literature covers the most trivial case for the error calculation: wT * weights.
     * This implementation transposes a few things to handle the mini batch, because ND4J organizes params as rows vs columns.
     */
    int numLayers = getnLayers();
    //Store gradients in a list; used to ensure iteration order in the DefaultGradient linked hash map, i.e., layer 0 first instead of output layer
    LinkedList<Triple<String, INDArray, Character>> gradientList = new LinkedList<>();

    int layerFrom;
    Pair<Gradient, INDArray> currPair;
    if (withOutputLayer) {
        if (!(getOutputLayer() instanceof IOutputLayer)) {
            log.warn("Warning: final layer isn't output layer. You cannot use backprop without an output layer.");
            return null;
        }

        IOutputLayer outputLayer = (IOutputLayer) getOutputLayer();
        if (labels == null)
            throw new IllegalStateException("No labels found");
        outputLayer.setLabels(labels);
        currPair = outputLayer.backpropGradient(null);

        for (Map.Entry<String, INDArray> entry : currPair.getFirst().gradientForVariable().entrySet()) {
            String origName = entry.getKey();
            multiGradientKey = String.valueOf(numLayers - 1) + "_" + origName;
            gradientList.addLast(new Triple<>(multiGradientKey, entry.getValue(),
                            currPair.getFirst().flatteningOrderForVariable(origName)));
        }
        if (getLayerWiseConfigurations().getInputPreProcess(numLayers - 1) != null)
            currPair = new Pair<>(currPair.getFirst(), this.layerWiseConfigurations.getInputPreProcess(numLayers - 1)
                            .backprop(currPair.getSecond(), getInputMiniBatchSize()));

        layerFrom = numLayers - 2;
    } else {
        currPair = new Pair<>(null, epsilon);
        layerFrom = numLayers - 1;
    }

    // Calculate gradients for previous layers (the output layer has already been handled above)
    for (int j = layerFrom; j >= 0; j--) {
        currLayer = getLayer(j);
        if (currLayer instanceof FrozenLayer)
            break;
        currPair = currLayer.backpropGradient(currPair.getSecond());

        LinkedList<Triple<String, INDArray, Character>> tempList = new LinkedList<>();
        for (Map.Entry<String, INDArray> entry : currPair.getFirst().gradientForVariable().entrySet()) {
            String origName = entry.getKey();
            multiGradientKey = String.valueOf(j) + "_" + origName;
            tempList.addFirst(new Triple<>(multiGradientKey, entry.getValue(),
                            currPair.getFirst().flatteningOrderForVariable(origName)));
        }
        for (Triple<String, INDArray, Character> triple : tempList)
            gradientList.addFirst(triple);

        //Pass epsilon through input processor before passing to next layer (if applicable)
        if (getLayerWiseConfigurations().getInputPreProcess(j) != null)
            currPair = new Pair<>(currPair.getFirst(), getLayerWiseConfigurations().getInputPreProcess(j)
                            .backprop(currPair.getSecond(), getInputMiniBatchSize()));
    }

    //Add gradients to Gradient (map), in correct order
    for (Triple<String, INDArray, Character> triple : gradientList) {
        gradient.setGradientFor(triple.getFirst(), triple.getSecond(), triple.getThird());
    }

    return new Pair<>(gradient, currPair.getSecond());
}
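Unlike truncatedBPTTGradient, this method wraps the pre-allocated flattenedGradients view in the DefaultGradient constructor and records a flattening order ('f' or 'c') alongside each per-variable gradient via the three-argument setGradientFor, which flatteningOrderForVariable later reads back. A minimal sketch of that registration call follows; the keys, shapes, and the FlatteningOrderSketch class are invented for illustration rather than taken from MultiLayerNetwork.

import org.deeplearning4j.nn.gradient.DefaultGradient;
import org.deeplearning4j.nn.gradient.Gradient;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class FlatteningOrderSketch {
    public static void main(String[] args) {
        Gradient gradient = new DefaultGradient();

        // Register each per-variable gradient together with the order ('f' or 'c') in which it
        // should be flattened, as calcBackpropGradients does with its Triple entries
        INDArray dLdW = Nd4j.rand(3, 2);
        INDArray dLdB = Nd4j.rand(1, 2);
        gradient.setGradientFor("0_W", dLdW, 'f');
        gradient.setGradientFor("0_b", dLdB, 'f');

        // The recorded order can be queried back per variable
        System.out.println(gradient.flatteningOrderForVariable("0_W")); // f
    }
}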