
Example 1 with Level1

use of org.nd4j.linalg.api.blas.Level1 in project deeplearning4j by deeplearning4j.

the class BackTrackLineSearch method optimize.

// returns fraction of step size if found a good step
// returns 0.0 if could not step in direction
// step == alam and score == f in book
/**
     * @param parameters      the parameters to optimize
     * @param gradients       the gradients (rate of change) at those parameters
     * @param searchDirection the direction in which to perform the line search
     * @return the next step size
     * @throws InvalidStepException if a valid step cannot be taken in the given direction
     */
@Override
public double optimize(INDArray parameters, INDArray gradients, INDArray searchDirection) throws InvalidStepException {
    double test, stepMin, step, step2, oldStep, tmpStep;
    double rhs1, rhs2, a, b, disc, score, scoreAtStart, score2;
    minObjectiveFunction = (stepFunction instanceof NegativeDefaultStepFunction || stepFunction instanceof NegativeGradientStepFunction);
    Level1 l1Blas = Nd4j.getBlasWrapper().level1();
    double sum = l1Blas.nrm2(searchDirection);
    double slope = -1f * Nd4j.getBlasWrapper().dot(searchDirection, gradients);
    log.debug("slope = {}", slope);
    INDArray maxOldParams = abs(parameters);
    Nd4j.getExecutioner().exec(new ScalarSetValue(maxOldParams, 1));
    INDArray testMatrix = abs(gradients).divi(maxOldParams);
    test = testMatrix.max(Integer.MAX_VALUE).getDouble(0);
    // initially, step = 1.0, i.e. take full Newton step
    step = 1.0;
    // relative convergence tolerance
    stepMin = relTolx / test;
    oldStep = 0.0;
    step2 = 0.0;
    score = score2 = scoreAtStart = layer.score();
    double bestScore = score;
    double bestStepSize = 1.0;
    if (log.isTraceEnabled()) {
        double norm1 = l1Blas.asum(searchDirection);
        int infNormIdx = l1Blas.iamax(searchDirection);
        double infNorm = FastMath.max(Float.NEGATIVE_INFINITY, searchDirection.getDouble(infNormIdx));
        log.trace("ENTERING BACKTRACK\n");
        log.trace("Entering BackTrackLineSearch, value = " + scoreAtStart + ",\ndirection.oneNorm:" + norm1 + "  direction.infNorm:" + infNorm);
    }
    if (sum > stepMax) {
        log.warn("Attempted step too big. scaling: sum= {}, stepMax= {}", sum, stepMax);
        searchDirection.muli(stepMax / sum);
    }
    //        if (slope >= 0.0) {
    //            throw new InvalidStepException("Slope " + slope + " is >= 0.0. Expect slope < 0.0 when minimizing objective function");
    //        }
    // find maximum lambda
    // converge when (delta x) / x < REL_TOLX for all coordinates.
    // the largest step size that triggers this threshold is precomputed and saved in stepMin
    // look for step size in direction given by "line"
    INDArray candidateParameters = null;
    for (int iteration = 0; iteration < maxIterations; iteration++) {
        if (log.isTraceEnabled()) {
            log.trace("BackTrack loop iteration {} : step={}, oldStep={}", iteration, step, oldStep);
            log.trace("before step, x.1norm: {} \nstep: {} \noldStep: {}", parameters.norm1(Integer.MAX_VALUE), step, oldStep);
        }
        if (step == oldStep)
            throw new IllegalArgumentException("Current step == oldStep");
        // step
        candidateParameters = parameters.dup('f');
        stepFunction.step(candidateParameters, searchDirection, step);
        oldStep = step;
        if (log.isTraceEnabled()) {
            double norm1 = l1Blas.asum(candidateParameters);
            log.trace("after step, x.1norm: " + norm1);
        }
        // check for convergence on delta x
        if ((step < stepMin) || Nd4j.getExecutioner().execAndReturn(new Eps(parameters, candidateParameters, Shape.toOffsetZeroCopy(candidateParameters, 'f'), candidateParameters.length())).sum(Integer.MAX_VALUE).getDouble(0) == candidateParameters.length()) {
            score = setScoreFor(parameters);
            log.debug("EXITING BACKTRACK: Jump too small (stepMin = {}). Exiting and using original params. Score = {}", stepMin, score);
            return 0.0;
        }
        score = setScoreFor(candidateParameters);
        log.debug("Model score after step = {}", score);
        //Score best step size for use if we terminate on maxIterations
        if ((minObjectiveFunction && score < bestScore) || (!minObjectiveFunction && score > bestScore)) {
            bestScore = score;
            bestStepSize = step;
        }
        //Sufficient decrease in cost/loss function (Wolfe condition / Armijo condition)
        if (minObjectiveFunction && score <= scoreAtStart + ALF * step * slope) {
            log.debug("Sufficient decrease (Wolfe cond.), exiting backtrack on iter {}: score={}, scoreAtStart={}", iteration, score, scoreAtStart);
            if (score > scoreAtStart)
                throw new IllegalStateException("Function did not decrease: score = " + score + " > " + scoreAtStart + " = oldScore");
            return step;
        }
        //Sufficient increase in cost/loss function (Wolfe condition / Armijo condition)
        if (!minObjectiveFunction && score >= scoreAtStart + ALF * step * slope) {
            log.debug("Sufficient increase (Wolfe cond.), exiting backtrack on iter {}: score={}, bestScore={}", iteration, score, scoreAtStart);
            if (score < scoreAtStart)
                throw new IllegalStateException("Function did not increase: score = " + score + " < " + scoreAtStart + " = scoreAtStart");
            return step;
        } else if (Double.isInfinite(score) || Double.isInfinite(score2) || Double.isNaN(score) || Double.isNaN(score2)) {
            // if value is infinite or NaN, i.e. we've jumped to unstable territory, then scale down jump
            log.warn("Value is infinite or NaN after jump. oldStep={}. score={}, score2={}. Scaling back step size...", oldStep, score, score2);
            tmpStep = .2 * step;
            if (step < stepMin) {
                //convergence on delta x
                score = setScoreFor(parameters);
                log.warn("EXITING BACKTRACK: Jump too small (step={} < stepMin={}). Exiting and using previous parameters. Value={}", step, stepMin, score);
                return 0.0;
            }
        } else if (minObjectiveFunction) {
            if (step == 1.0) // first time through
                tmpStep = -slope / (2.0 * (score - scoreAtStart - slope));
            else {
                rhs1 = score - scoreAtStart - step * slope;
                rhs2 = score2 - scoreAtStart - step2 * slope;
                if (step == step2)
                    throw new IllegalStateException("FAILURE: dividing by step-step2 which equals 0. step=" + step);
                double stepSquared = step * step;
                double step2Squared = step2 * step2;
                a = (rhs1 / stepSquared - rhs2 / step2Squared) / (step - step2);
                b = (-step2 * rhs1 / stepSquared + step * rhs2 / step2Squared) / (step - step2);
                if (a == 0.0)
                    tmpStep = -slope / (2.0 * b);
                else {
                    disc = b * b - 3.0 * a * slope;
                    if (disc < 0.0) {
                        tmpStep = 0.5 * step;
                    } else if (b <= 0.0)
                        tmpStep = (-b + FastMath.sqrt(disc)) / (3.0 * a);
                    else
                        tmpStep = -slope / (b + FastMath.sqrt(disc));
                }
                if (tmpStep > 0.5 * step)
                    // lambda <= 0.5 lambda_1
                    tmpStep = 0.5 * step;
            }
        } else {
            if (step == 1.0) // first time through
                tmpStep = -slope / (2.0 * (scoreAtStart - score - slope));
            else {
                rhs1 = scoreAtStart - score - step * slope;
                rhs2 = scoreAtStart - score2 - step2 * slope;
                if (step == step2)
                    throw new IllegalStateException("FAILURE: dividing by step-step2 which equals 0. step=" + step);
                double stepSquared = step * step;
                double step2Squared = step2 * step2;
                a = (rhs1 / stepSquared - rhs2 / step2Squared) / (step - step2);
                b = (-step2 * rhs1 / stepSquared + step * rhs2 / step2Squared) / (step - step2);
                if (a == 0.0)
                    tmpStep = -slope / (2.0 * b);
                else {
                    disc = b * b - 3.0 * a * slope;
                    if (disc < 0.0) {
                        tmpStep = 0.5 * step;
                    } else if (b <= 0.0)
                        tmpStep = (-b + FastMath.sqrt(disc)) / (3.0 * a);
                    else
                        tmpStep = -slope / (b + FastMath.sqrt(disc));
                }
                if (tmpStep > 0.5 * step)
                    // lambda <= 0.5 lambda_1
                    tmpStep = 0.5 * step;
            }
        }
        step2 = step;
        score2 = score;
        log.debug("tmpStep: {}", tmpStep);
        // lambda >= .1*Lambda_1
        step = Math.max(tmpStep, .1f * step);
    }
    if (minObjectiveFunction && bestScore < scoreAtStart) {
        //Return best step size
        log.debug("Exited line search after maxIterations termination condition; bestStepSize={}, bestScore={}, scoreAtStart={}", bestStepSize, bestScore, scoreAtStart);
        return bestStepSize;
    } else if (!minObjectiveFunction && bestScore > scoreAtStart) {
        //Return best step size
        log.debug("Exited line search after maxIterations termination condition; bestStepSize={}, bestScore={}, scoreAtStart={}", bestStepSize, bestScore, scoreAtStart);
        return bestStepSize;
    } else {
        log.debug("Exited line search after maxIterations termination condition; score did not improve (bestScore={}, scoreAtStart={}). Resetting parameters", bestScore, scoreAtStart);
        setScoreFor(parameters);
        return 0.0;
    }
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) ScalarSetValue(org.nd4j.linalg.api.ops.impl.scalar.comparison.ScalarSetValue) Eps(org.nd4j.linalg.api.ops.impl.transforms.comparison.Eps) Level1(org.nd4j.linalg.api.blas.Level1) NegativeDefaultStepFunction(org.deeplearning4j.optimize.stepfunctions.NegativeDefaultStepFunction) NegativeGradientStepFunction(org.deeplearning4j.nn.conf.stepfunctions.NegativeGradientStepFunction)
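
The loop above accepts a step as soon as it passes the sufficient-decrease (Armijo) test, score <= scoreAtStart + ALF * step * slope, and otherwise shrinks the step via quadratic/cubic interpolation. Below is a minimal, self-contained sketch of just that acceptance test on a toy 1-D objective f(x) = x^2; the ALF value and the plain halving backtrack are illustrative assumptions standing in for the class's actual constants and interpolation logic.

// Sketch: backtracking with the Armijo sufficient-decrease condition on a toy
// 1-D objective f(x) = x^2, starting at x = 2 with search direction d = -f'(x).
// ALF and the 0.5 shrink factor are assumed values; BackTrackLineSearch uses its
// own constants and cubic/quadratic interpolation instead of plain halving.
public class ArmijoSketch {
    static double f(double x) { return x * x; }
    static double grad(double x) { return 2 * x; }

    public static void main(String[] args) {
        double alf = 1e-4;                  // sufficient-decrease constant (assumed value)
        double x = 2.0;
        double direction = -grad(x);        // descent direction
        double slope = grad(x) * direction; // directional derivative, negative when descending
        double scoreAtStart = f(x);
        double step = 1.0;                  // start with the full step, as in the code above
        while (f(x + step * direction) > scoreAtStart + alf * step * slope) {
            step *= 0.5;                    // plain halving stands in for the cubic backtrack
        }
        System.out.println("accepted step = " + step + ", new score = " + f(x + step * direction));
    }
}

Running this accepts step = 0.5: the full step jumps from x = 2 to x = -2 with no decrease, while the halved step lands at x = 0 and satisfies the test.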

Example 2 with Level1

use of org.nd4j.linalg.api.blas.Level1 in project deeplearning4j by deeplearning4j.

the class InMemoryGraphLookupTable method iterate.

@Override
public void iterate(int first, int second) {
    //Get vectors and gradients
    //vecAndGrads[0][0] is vector of vertex(first); vecAndGrads[1][0] is corresponding gradient
    INDArray[][] vecAndGrads = vectorsAndGradients(first, second);
    Level1 l1 = Nd4j.getBlasWrapper().level1();
    for (int i = 0; i < vecAndGrads[0].length; i++) {
        //Update: v = v - lr * gradient
        l1.axpy(vecAndGrads[0][i].length(), -learningRate, vecAndGrads[1][i], vecAndGrads[0][i]);
    }
}
Also used : Level1(org.nd4j.linalg.api.blas.Level1)
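
Level1.axpy performs the BLAS update y := alpha * x + y in place on y, so the call above is exactly the SGD-style update v = v - learningRate * gradient. A small standalone sketch of the same update on throwaway arrays (the values and learning rate below are made up for illustration):

import org.nd4j.linalg.api.blas.Level1;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class AxpyUpdateSketch {
    public static void main(String[] args) {
        double learningRate = 0.1;                                    // illustrative value
        INDArray vector = Nd4j.create(new double[] {1.0, 2.0, 3.0});  // parameter vector v
        INDArray gradient = Nd4j.create(new double[] {0.5, 0.5, 0.5});

        Level1 l1 = Nd4j.getBlasWrapper().level1();
        // v := v - lr * gradient, the same update used in iterate() above
        l1.axpy(vector.length(), -learningRate, gradient, vector);

        System.out.println(vector);   // expect [0.95, 1.95, 2.95]
    }
}

The in-place axpy avoids allocating an intermediate array for learningRate * gradient, unlike the equivalent vector.subi(gradient.mul(learningRate)).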

Example 3 with Level1

use of org.nd4j.linalg.api.blas.Level1 in project deeplearning4j by deeplearning4j.

the class LSTMHelpers method backpropGradientHelper.

public static Pair<Gradient, INDArray> backpropGradientHelper(final NeuralNetConfiguration conf, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final IActivation gateActivationFn, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray input, //Shape: [hiddenLayerSize,4*hiddenLayerSize+3]; order: [wI,wF,wO,wG,wFF,wOO,wGG]
final INDArray recurrentWeights, //Shape: [n^(L-1),4*hiddenLayerSize]; order: [wi,wf,wo,wg]
final INDArray inputWeights, final INDArray epsilon, final boolean truncatedBPTT, final int tbpttBackwardLength, final FwdPassReturn fwdPass, final boolean forwards, final String inputWeightKey, final String recurrentWeightKey, final String biasWeightKey, //Input mask: should only be used with bidirectional RNNs + variable length
final Map<String, INDArray> gradientViews, //Input mask: should only be used with bidirectional RNNs + variable length
INDArray maskArray) {
    //Expect errors to have shape: [miniBatchSize,n^(L+1),timeSeriesLength]
    //i.e., n^L
    int hiddenLayerSize = recurrentWeights.size(0);
    //n^(L-1)
    int prevLayerSize = inputWeights.size(0);
    int miniBatchSize = epsilon.size(0);
    //Edge case: T=1 may have shape [miniBatchSize,n^(L+1)], equiv. to [miniBatchSize,n^(L+1),1]
    boolean is2dInput = epsilon.rank() < 3;
    int timeSeriesLength = (is2dInput ? 1 : epsilon.size(2));
    INDArray wFFTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize)).transpose();
    INDArray wOOTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize + 1)).transpose();
    INDArray wGGTranspose = recurrentWeights.get(NDArrayIndex.all(), point(4 * hiddenLayerSize + 2)).transpose();
    INDArray wIFOG = recurrentWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize));
    //F order here so that content for time steps are together
    //i.e., what would be W^L*(delta^L)^T. Shape: [m,n^(L-1),T]
    INDArray epsilonNext = Nd4j.create(new int[] { miniBatchSize, prevLayerSize, timeSeriesLength }, 'f');
    INDArray nablaCellStateNext = null;
    INDArray deltaifogNext = Nd4j.create(new int[] { miniBatchSize, 4 * hiddenLayerSize }, 'f');
    INDArray deltaiNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
    INDArray deltafNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(hiddenLayerSize, 2 * hiddenLayerSize));
    INDArray deltaoNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 3 * hiddenLayerSize));
    INDArray deltagNext = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(3 * hiddenLayerSize, 4 * hiddenLayerSize));
    Level1 l1BLAS = Nd4j.getBlasWrapper().level1();
    int endIdx = 0;
    if (truncatedBPTT) {
        endIdx = Math.max(0, timeSeriesLength - tbpttBackwardLength);
    }
    //Get gradients. Note that we have to manually zero these, as they might not be initialized (or still has data from last iteration)
    //Also note that they are in f order (as per param initializer) so can be used in gemm etc
    INDArray iwGradientsOut = gradientViews.get(inputWeightKey);
    //Order: {I,F,O,G,FF,OO,GG}
    INDArray rwGradientsOut = gradientViews.get(recurrentWeightKey);
    INDArray bGradientsOut = gradientViews.get(biasWeightKey);
    iwGradientsOut.assign(0);
    rwGradientsOut.assign(0);
    bGradientsOut.assign(0);
    INDArray rwGradientsIFOG = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(0, 4 * hiddenLayerSize));
    INDArray rwGradientsFF = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize));
    INDArray rwGradientsOO = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize + 1));
    INDArray rwGradientsGG = rwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.point(4 * hiddenLayerSize + 2));
    boolean sigmoidGates = gateActivationFn instanceof ActivationSigmoid;
    IActivation afn = conf.getLayer().getActivationFn();
    INDArray timeStepMaskColumn = null;
    for (int iTimeIndex = timeSeriesLength - 1; iTimeIndex >= endIdx; iTimeIndex--) {
        int time = iTimeIndex;
        int inext = 1;
        if (!forwards) {
            time = timeSeriesLength - iTimeIndex - 1;
            inext = -1;
        }
        //First: calculate the components of nablaCellState that rely on the next time step deltas, so we can overwrite the deltas
        INDArray nablaCellState;
        if (iTimeIndex != timeSeriesLength - 1) {
            nablaCellState = deltafNext.dup('f').muliRowVector(wFFTranspose);
            l1BLAS.axpy(nablaCellState.length(), 1.0, deltagNext.dup('f').muliRowVector(wGGTranspose), nablaCellState);
        } else {
            nablaCellState = Nd4j.create(new int[] { miniBatchSize, hiddenLayerSize }, 'f');
        }
        INDArray prevMemCellState = (iTimeIndex == 0 ? null : fwdPass.memCellState[time - inext]);
        INDArray prevHiddenUnitActivation = (iTimeIndex == 0 ? null : fwdPass.fwdPassOutputAsArrays[time - inext]);
        INDArray currMemCellState = fwdPass.memCellState[time];
        //LSTM unit output errors (dL/d(a_out)); not to be confused with \delta=dL/d(z_out)
        //(w^{L+1}*(delta^{(L+1)t})^T)^T or equiv.
        INDArray epsilonSlice = (is2dInput ? epsilon : epsilon.tensorAlongDimension(time, 1, 0));
        //Shape: [m,n^L]
        INDArray nablaOut = Shape.toOffsetZeroCopy(epsilonSlice, 'f');
        if (iTimeIndex != timeSeriesLength - 1) {
            //if t == timeSeriesLength-1 then deltaiNext etc are zeros
            Nd4j.gemm(deltaifogNext, wIFOG, nablaOut, false, true, 1.0, 1.0);
        }
        //Output gate deltas:
        INDArray sigmahOfS = fwdPass.memCellActivations[time];
        INDArray ao = fwdPass.oa[time];
        //Normally would use zo.dup() in above line, but won't be using zo again (for this time step). Ditto for zf, zg, zi
        INDArray deltao = deltaoNext;
        Nd4j.getExecutioner().exec(new MulOp(nablaOut, sigmahOfS, deltao));
        if (sigmoidGates) {
            //Equivalent to sigmoid deriv on zo
            INDArray sigmaoPrimeOfZo = Nd4j.getExecutioner().execAndReturn(new TimesOneMinus(ao.dup('f')));
            deltao.muli(sigmaoPrimeOfZo);
        } else {
            //Deltao needs to be modified in-place
            deltao.assign(gateActivationFn.backprop(fwdPass.oz[time], deltao).getFirst());
        //TODO: optimize (no assign)
        }
        //Memory cell error:
        //TODO activation functions with params
        INDArray temp = afn.backprop(currMemCellState.dup('f'), ao.muli(nablaOut)).getFirst();
        l1BLAS.axpy(nablaCellState.length(), 1.0, temp, nablaCellState);
        INDArray deltaMulRowWOO = deltao.dup('f').muliRowVector(wOOTranspose);
        //nablaCellState.addi(deltao.mulRowVector(wOOTranspose));
        l1BLAS.axpy(nablaCellState.length(), 1.0, deltaMulRowWOO, nablaCellState);
        if (iTimeIndex != timeSeriesLength - 1) {
            INDArray nextForgetGateAs = fwdPass.fa[time + inext];
            int length = nablaCellState.length();
            //nablaCellState.addi(nextForgetGateAs.mul(nablaCellStateNext))
            l1BLAS.axpy(length, 1.0, nextForgetGateAs.muli(nablaCellStateNext), nablaCellState);
        }
        //Store for use in next iteration
        nablaCellStateNext = nablaCellState;
        //Forget gate delta:
        INDArray af = fwdPass.fa[time];
        INDArray deltaf = null;
        if (iTimeIndex > 0) {
            deltaf = deltafNext;
            if (sigmoidGates) {
                Nd4j.getExecutioner().exec(new TimesOneMinus(af, deltaf));
                deltaf.muli(nablaCellState);
                deltaf.muli(prevMemCellState);
            } else {
                INDArray temp2 = nablaCellState.mul(prevMemCellState);
                //deltaf needs to be modified in-place
                deltaf.assign(gateActivationFn.backprop(fwdPass.fz[time].dup('f'), temp2).getFirst());
            //TODO activation functions with params
            }
        }
        //Shape: [m,n^L]
        //Input modulation gate delta:
        INDArray ag = fwdPass.ga[time];
        INDArray ai = fwdPass.ia[time];
        INDArray deltag = deltagNext;
        if (sigmoidGates) {
            //Equivalent to sigmoid deriv on zg
            Nd4j.getExecutioner().exec(new TimesOneMinus(ag, deltag));
            deltag.muli(ai);
            deltag.muli(nablaCellState);
        } else {
            INDArray temp2 = Nd4j.getExecutioner().execAndReturn(new MulOp(ai, nablaCellState, Nd4j.createUninitialized(ai.shape(), 'f')));
            deltag.assign(gateActivationFn.backprop(fwdPass.gz[time], temp2).getFirst());
        //TODO activation functions with params; optimize (no assign)
        }
        //Shape: [m,n^L]
        //Network input delta:
        INDArray zi = fwdPass.iz[time];
        INDArray deltai = deltaiNext;
        temp = Nd4j.getExecutioner().execAndReturn(new MulOp(ag, nablaCellState, Nd4j.createUninitialized(deltai.shape(), 'f')));
        deltai.assign(afn.backprop(zi, temp).getFirst());
        //Handle masking
        if (maskArray != null) {
            //Mask array is present: bidirectional RNN -> need to zero out these errors to avoid using errors from a masked time step
            // to calculate the parameter gradients.  Mask array has shape [minibatch, timeSeriesLength] -> get column(this time step)
            timeStepMaskColumn = maskArray.getColumn(time);
            deltaifogNext.muliColumnVector(timeStepMaskColumn);
        //Later, the deltaifogNext is used to calculate: input weight gradients, recurrent weight gradients, bias gradients
        }
        INDArray prevLayerActivationSlice = Shape.toMmulCompatible(is2dInput ? input : input.tensorAlongDimension(time, 1, 0));
        if (iTimeIndex > 0) {
            //Again, deltaifog_current == deltaifogNext at this point... same array
            Nd4j.gemm(prevLayerActivationSlice, deltaifogNext, iwGradientsOut, true, false, 1.0, 1.0);
        } else {
            INDArray iwGradients_i = iwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
            Nd4j.gemm(prevLayerActivationSlice, deltai, iwGradients_i, true, false, 1.0, 1.0);
            INDArray iwGradients_og = iwGradientsOut.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            INDArray deltaog = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            Nd4j.gemm(prevLayerActivationSlice, deltaog, iwGradients_og, true, false, 1.0, 1.0);
        }
        if (iTimeIndex > 0) {
            //If t==0, then prevHiddenUnitActivation==zeros(n^L,n^L), so dL/dW for recurrent weights will end up as 0 anyway
            //At this point: deltaifog and deltaifogNext are the same thing...
            //So what we are actually doing here is sum of (prevAct^transpose * deltaifog_current)
            Nd4j.gemm(prevHiddenUnitActivation, deltaifogNext, rwGradientsIFOG, true, false, 1.0, 1.0);
            //Shape: [1,n^L]. sum(0) is sum over examples in mini-batch.
            //Can use axpy here because result of sum and rwGradients[4 to 6] have order Nd4j.order(), via Nd4j.create()
            //mul not mmul because these weights are from unit j->j only (whereas other recurrent weights are i->j for all i,j)
            INDArray dLdwFF = deltaf.dup('f').muli(prevMemCellState).sum(0);
            //rwGradients[4].addi(dLdwFF);    //dL/dw_{FF}
            l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwFF, rwGradientsFF);
            INDArray dLdwGG = deltag.dup('f').muli(prevMemCellState).sum(0);
            //rwGradients[6].addi(dLdwGG);
            l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwGG, rwGradientsGG);
        }
        //Expected shape: [1,n^L]. sum(0) is sum over examples in mini-batch.
        INDArray dLdwOO = deltao.dup('f').muli(currMemCellState).sum(0);
        //rwGradients[5].addi(dLdwOO);    //dL/dw_{OOxy}
        l1BLAS.axpy(hiddenLayerSize, 1.0, dLdwOO, rwGradientsOO);
        if (iTimeIndex > 0) {
            l1BLAS.axpy(4 * hiddenLayerSize, 1.0, deltaifogNext.sum(0), bGradientsOut);
        } else {
            //Sneaky way to do bGradients_i += deltai.sum(0)
            l1BLAS.axpy(hiddenLayerSize, 1.0, deltai.sum(0), bGradientsOut);
            INDArray ogBiasToAdd = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize)).sum(0);
            INDArray ogBiasGrad = bGradientsOut.get(NDArrayIndex.point(0), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            l1BLAS.axpy(2 * hiddenLayerSize, 1.0, ogBiasToAdd, ogBiasGrad);
        }
        //Calculate epsilonNext - i.e., equiv. to what would be (w^L*(d^(Lt))^T)^T in a normal network
        //But here, need to add 4 weights * deltas for the IFOG gates
        //This slice: f order and contiguous, due to epsilonNext being defined as f order.
        INDArray epsilonNextSlice = epsilonNext.tensorAlongDimension(time, 1, 0);
        if (iTimeIndex > 0) {
            Nd4j.gemm(deltaifogNext, inputWeights, epsilonNextSlice, false, true, 1.0, 1.0);
        } else {
            //No contribution from forget gate at t=0
            INDArray wi = inputWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(0, hiddenLayerSize));
            Nd4j.gemm(deltai, wi, epsilonNextSlice, false, true, 1.0, 1.0);
            INDArray deltaog = deltaifogNext.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            INDArray wog = inputWeights.get(NDArrayIndex.all(), NDArrayIndex.interval(2 * hiddenLayerSize, 4 * hiddenLayerSize));
            //epsilonNextSlice.addi(deltao.mmul(woTranspose)).addi(deltag.mmul(wgTranspose));
            Nd4j.gemm(deltaog, wog, epsilonNextSlice, false, true, 1.0, 1.0);
        }
        if (maskArray != null) {
            //Mask array is present: bidirectional RNN -> need to zero out these errors to avoid sending anything
            // but 0s to the layer below at this time step (for the given example)
            epsilonNextSlice.muliColumnVector(timeStepMaskColumn);
        }
    }
    Gradient retGradient = new DefaultGradient();
    retGradient.gradientForVariable().put(inputWeightKey, iwGradientsOut);
    retGradient.gradientForVariable().put(recurrentWeightKey, rwGradientsOut);
    retGradient.gradientForVariable().put(biasWeightKey, bGradientsOut);
    return new Pair<>(retGradient, epsilonNext);
}
Also used : Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) INDArray(org.nd4j.linalg.api.ndarray.INDArray) TimesOneMinus(org.nd4j.linalg.api.ops.impl.transforms.TimesOneMinus) MulOp(org.nd4j.linalg.api.ops.impl.transforms.arithmetic.MulOp) ActivationSigmoid(org.nd4j.linalg.activations.impl.ActivationSigmoid) Level1(org.nd4j.linalg.api.blas.Level1) IActivation(org.nd4j.linalg.activations.IActivation) NDArrayIndex.point(org.nd4j.linalg.indexing.NDArrayIndex.point) Pair(org.deeplearning4j.berkeley.Pair)
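
A pattern that recurs throughout the time loop above is Nd4j.gemm(a, b, c, transposeA, transposeB, 1.0, 1.0): with beta = 1.0, the product alpha * op(a) * op(b) is accumulated into c rather than overwriting it, which is how per-time-step contributions sum directly into the pre-zeroed gradient views (iwGradientsOut, rwGradientsIFOG, epsilonNextSlice). A minimal sketch of that accumulation on small made-up matrices (shapes and values are illustrative only; the result buffer is created in 'f' order, as the gradient views above are):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GemmAccumulateSketch {
    public static void main(String[] args) {
        // Pretend "activations" [miniBatch=2, nIn=3] and "deltas" [miniBatch=2, nOut=2]
        INDArray act = Nd4j.create(new double[][] {{1, 2, 3}, {4, 5, 6}});
        INDArray delta = Nd4j.create(new double[][] {{1, 0}, {0, 1}});

        // Gradient buffer in 'f' order, starting at zero
        INDArray grad = Nd4j.create(new int[] {3, 2}, 'f');

        // Two "time steps": each call adds act^T * delta into grad (beta = 1.0 accumulates)
        Nd4j.gemm(act, delta, grad, true, false, 1.0, 1.0);
        Nd4j.gemm(act, delta, grad, true, false, 1.0, 1.0);

        System.out.println(grad);   // each entry is twice the corresponding entry of act^T * delta
    }
}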

Example 4 with Level1

use of org.nd4j.linalg.api.blas.Level1 in project deeplearning4j by deeplearning4j.

the class GraphVectorsImpl method verticesNearest.

@Override
public int[] verticesNearest(int vertexIdx, int top) {
    INDArray vec = lookupTable.getVector(vertexIdx).dup();
    double norm2 = vec.norm2Number().doubleValue();
    PriorityQueue<Pair<Double, Integer>> pq = new PriorityQueue<>(lookupTable.getNumVertices(), new PairComparator());
    Level1 l1 = Nd4j.getBlasWrapper().level1();
    for (int i = 0; i < numVertices(); i++) {
        if (i == vertexIdx)
            continue;
        INDArray other = lookupTable.getVector(i);
        double cosineSim = l1.dot(vec.length(), 1.0, vec, other) / (norm2 * other.norm2Number().doubleValue());
        pq.add(new Pair<>(cosineSim, i));
    }
    int[] out = new int[top];
    for (int i = 0; i < top; i++) {
        out[i] = pq.remove().getSecond();
    }
    return out;
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) Level1(org.nd4j.linalg.api.blas.Level1) PriorityQueue(java.util.PriorityQueue) Pair(org.deeplearning4j.berkeley.Pair)
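
The loop computes cosine similarity explicitly: a dot product from Level1.dot divided by the product of the two vectors' 2-norms. A small standalone check of that formula on made-up vectors (Transforms.cosineSim is used only as a cross-check and is assumed to be available in this nd4j version):

import org.nd4j.linalg.api.blas.Level1;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

public class CosineSimSketch {
    public static void main(String[] args) {
        INDArray a = Nd4j.create(new double[] {1.0, 2.0, 2.0});
        INDArray b = Nd4j.create(new double[] {2.0, 0.0, 1.0});

        Level1 l1 = Nd4j.getBlasWrapper().level1();
        double dot = l1.dot(a.length(), 1.0, a, b);   // a . b = 4
        double cosine = dot / (a.norm2Number().doubleValue() * b.norm2Number().doubleValue());

        System.out.println(cosine);                   // 4 / (3 * sqrt(5)) ~= 0.596
        System.out.println(Transforms.cosineSim(a, b));   // same value via the helper
    }
}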

Example 5 with Level1

use of org.nd4j.linalg.api.blas.Level1 in project deeplearning4j by deeplearning4j.

the class InMemoryGraphLookupTable method vectorsAndGradients.

/** Returns vertex vector and vector gradients, plus inner node vectors and inner node gradients<br>
     * Specifically, out[0] are vectors, out[1] are gradients for the corresponding vectors<br>
     * out[0][0] is vector for first vertex; out[1][0] is gradient for this vertex vector<br>
     * out[0][i] (i>0) is the inner node vector along path to second vertex; out[1][i] is gradient for inner node vertex<br>
     * This design is used primarily to aid in testing (numerical gradient checks)
     * @param first first (input) vertex index
     * @param second second (output) vertex index
     */
public INDArray[][] vectorsAndGradients(int first, int second) {
    //Input vertex vector gradients are composed of the inner node gradients
    //Get vector for first vertex, as well as code for second:
    INDArray vec = vertexVectors.getRow(first);
    int codeLength = tree.getCodeLength(second);
    long code = tree.getCode(second);
    int[] innerNodesForVertex = tree.getPathInnerNodes(second);
    INDArray[][] out = new INDArray[2][innerNodesForVertex.length + 1];
    Level1 l1 = Nd4j.getBlasWrapper().level1();
    INDArray accumError = Nd4j.create(vec.shape());
    for (int i = 0; i < codeLength; i++) {
        //Inner node:
        int innerNodeIdx = innerNodesForVertex[i];
        //left or right?
        boolean path = getBit(code, i);
        INDArray innerNodeVector = outWeights.getRow(innerNodeIdx);
        double sigmoidDot = sigmoid(Nd4j.getBlasWrapper().dot(innerNodeVector, vec));
        //Calculate gradient for inner node + accumulate error:
        INDArray innerNodeGrad;
        if (path) {
            innerNodeGrad = vec.mul(sigmoidDot - 1);
            l1.axpy(vec.length(), sigmoidDot - 1, innerNodeVector, accumError);
        } else {
            innerNodeGrad = vec.mul(sigmoidDot);
            l1.axpy(vec.length(), sigmoidDot, innerNodeVector, accumError);
        }
        out[0][i + 1] = innerNodeVector;
        out[1][i + 1] = innerNodeGrad;
    }
    out[0][0] = vec;
    out[1][0] = accumError;
    return out;
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) Level1(org.nd4j.linalg.api.blas.Level1)
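
In the loop above, each inner node on the path contributes (sigmoid(w . v) - bit) * w to the accumulated error on the vertex vector, where bit is 1 if the code bit at that depth is set and 0 otherwise; axpy adds that contribution in place. A compact sketch of the same accumulation for a hypothetical two-node path (the vectors, code bits, and local sigmoid helper are made-up stand-ins; the real class reads them from its tree and its vertexVectors/outWeights tables):

import org.nd4j.linalg.api.blas.Level1;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class PathGradientSketch {
    static double sigmoid(double x) { return 1.0 / (1.0 + Math.exp(-x)); }

    public static void main(String[] args) {
        INDArray vec = Nd4j.create(new double[] {0.1, -0.2, 0.3});        // vertex vector (made up)
        INDArray[] innerNodes = {
                Nd4j.create(new double[] {0.05, 0.10, -0.05}),            // hypothetical inner node vectors
                Nd4j.create(new double[] {-0.20, 0.00, 0.15})
        };
        boolean[] pathBits = {true, false};                               // hypothetical left/right code bits

        Level1 l1 = Nd4j.getBlasWrapper().level1();
        INDArray accumError = Nd4j.create(vec.shape());                   // starts at zero

        for (int i = 0; i < innerNodes.length; i++) {
            double sigmoidDot = sigmoid(Nd4j.getBlasWrapper().dot(innerNodes[i], vec));
            double coeff = pathBits[i] ? sigmoidDot - 1 : sigmoidDot;     // same branch as vectorsAndGradients
            // accumError += coeff * innerNodeVector
            l1.axpy(vec.length(), coeff, innerNodes[i], accumError);
        }
        System.out.println(accumError);   // gradient on the vertex vector w.r.t. this path
    }
}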

Aggregations

Level1 (org.nd4j.linalg.api.blas.Level1) 7
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 6
IActivation (org.nd4j.linalg.activations.IActivation) 3
Pair (org.deeplearning4j.berkeley.Pair) 2
DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient) 2
Gradient (org.deeplearning4j.nn.gradient.Gradient) 2
ActivationSigmoid (org.nd4j.linalg.activations.impl.ActivationSigmoid) 2
NDArrayIndex.point (org.nd4j.linalg.indexing.NDArrayIndex.point) 2
PriorityQueue (java.util.PriorityQueue) 1
DL4JInvalidInputException (org.deeplearning4j.exception.DL4JInvalidInputException) 1
NegativeGradientStepFunction (org.deeplearning4j.nn.conf.stepfunctions.NegativeGradientStepFunction) 1
TrainingListener (org.deeplearning4j.optimize.api.TrainingListener) 1
NegativeDefaultStepFunction (org.deeplearning4j.optimize.stepfunctions.NegativeDefaultStepFunction) 1
ScalarSetValue (org.nd4j.linalg.api.ops.impl.scalar.comparison.ScalarSetValue) 1
TimesOneMinus (org.nd4j.linalg.api.ops.impl.transforms.TimesOneMinus) 1
MulOp (org.nd4j.linalg.api.ops.impl.transforms.arithmetic.MulOp) 1
Eps (org.nd4j.linalg.api.ops.impl.transforms.comparison.Eps) 1