Search in sources :

Example 16 with GridExecutioner

use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project deeplearning4j by deeplearning4j.

the class CudnnSubsamplingHelper method activate.

/**
 * Runs the cuDNN 2d pooling forward pass over {@code input}.
 *
 * <p>The output size is obtained from {@code ConvolutionUtils.getOutputSize}, which also validates
 * the configuration. In {@code ConvolutionMode.Same} the supplied {@code pad} is replaced by the
 * top-left padding derived from that output size. The result is stored in the {@code reduced}
 * field, which is also the returned array.
 *
 * @param input           activations; dimensions 0..3 are read as minibatch, depth, height, width
 * @param training        not read by this method
 * @param kernel          kernel size; elements 0 and 1 are used
 * @param strides         strides; elements 0 and 1 are used
 * @param pad             padding; elements 0 and 1 are used (recomputed in Same mode)
 * @param poolingType     AVG or MAX selects the cuDNN pooling mode; NONE returns {@code input} as is
 * @param convolutionMode mode passed to the output-size computation
 * @return the pooled activations, {@code input} itself for NONE, or {@code null} for any other
 *         pooling type
 */
@Override
public INDArray activate(INDArray input, boolean training, int[] kernel, int[] strides, int[] pad, PoolingType poolingType, ConvolutionMode convolutionMode) {
    final int batch = input.size(0);
    final int channels = input.size(1);
    final int height = input.size(2);
    final int width = input.size(3);

    // getOutputSize doubles as validation of the kernel/stride/padding configuration.
    final int[] pooledSize;
    if (convolutionMode != ConvolutionMode.Same) {
        pooledSize = ConvolutionUtils.getOutputSize(input, kernel, strides, pad, convolutionMode);
    } else {
        // Same mode: no explicit padding goes in; the effective padding is derived afterwards.
        pooledSize = ConvolutionUtils.getOutputSize(input, kernel, strides, null, convolutionMode);
        pad = ConvolutionUtils.getSameModeTopLeftPadding(pooledSize, new int[] { height, width }, kernel, strides);
    }
    final int pooledH = pooledSize[0];
    final int pooledW = pooledSize[1];

    int cudnnMode;
    switch (poolingType) {
        case NONE:
            return input;
        case MAX:
            cudnnMode = CUDNN_POOLING_MAX;
            break;
        case AVG:
            cudnnMode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
            break;
        default:
            return null;
    }

    // Flush any ops still queued in a grid executioner before issuing cuDNN calls.
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }

    final int[] inStrides = input.stride();
    checkCudnn(cudnnSetPooling2dDescriptor(cudnnContext.poolingDesc, cudnnMode, CUDNN_PROPAGATE_NAN, kernel[0], kernel[1], pad[0], pad[1], strides[0], strides[1]));
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, batch, channels, height, width, inStrides[0], inStrides[1], inStrides[2], inStrides[3]));

    reduced = Nd4j.createUninitialized(new int[] { batch, channels, pooledH, pooledW }, 'c');
    final int[] outStrides = reduced.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, batch, channels, pooledH, pooledW, outStrides[0], outStrides[1], outStrides[2], outStrides[3]));

    final Allocator allocator = AtomicAllocator.getInstance();
    final CudaContext cudaContext = allocator.getFlowController().prepareAction(input, reduced);
    final Pointer inputPtr = allocator.getPointer(input, cudaContext);
    final Pointer outputPtr = allocator.getPointer(reduced, cudaContext);

    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(cudaContext.getOldStream())));
    checkCudnn(cudnnPoolingForward(cudnnContext, cudnnContext.poolingDesc, alpha, cudnnContext.srcTensorDesc, inputPtr, beta, cudnnContext.dstTensorDesc, outputPtr));

    allocator.registerAction(cudaContext, input, reduced);
    return reduced;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) Allocator(org.nd4j.jita.allocator.Allocator) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) ShortPointer(org.bytedeco.javacpp.ShortPointer) Pointer(org.bytedeco.javacpp.Pointer)

Example 17 with GridExecutioner

use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project deeplearning4j by deeplearning4j.

the class CudnnBatchNormalizationHelper method preOutput.

/**
 * Runs the cuDNN batch-normalization forward pass over {@code x}.
 *
 * <p>When {@code training} is true the training variant is invoked and the {@code meanCache} /
 * {@code varCache} fields are (re)allocated if they are too small for {@code mean} / {@code var};
 * otherwise the inference variant is invoked.
 *
 * @param x        input activations; dimensions 0..3 are read as minibatch, depth, height, width
 * @param training selects the training or the inference cuDNN call
 * @param shape    shape used for the gamma/beta tensor descriptor; entries beyond index 1 default to 1
 * @param gamma    scale parameters
 * @param beta     shift parameters (distinct from the {@code this.beta} scaling field)
 * @param mean     mean array handed to cuDNN
 * @param var      variance array handed to cuDNN
 * @param decay    passed to the training call only
 * @param eps      epsilon; must not be below {@code CUDNN_BN_MIN_EPSILON}
 * @return a newly created 'c'-order array with the same shape as {@code x} holding the result
 * @throws IllegalArgumentException if {@code eps < CUDNN_BN_MIN_EPSILON}
 */
@Override
public INDArray preOutput(INDArray x, boolean training, int[] shape, INDArray gamma, INDArray beta, INDArray mean, INDArray var, double decay, double eps) {
    if (eps < CUDNN_BN_MIN_EPSILON) {
        throw new IllegalArgumentException("Error: eps < CUDNN_BN_MIN_EPSILON (" + eps + " < " + CUDNN_BN_MIN_EPSILON + ")");
    }
    int miniBatch = x.size(0);
    int inDepth = x.size(1);
    int inH = x.size(2);
    int inW = x.size(3);
    int[] srcStride = x.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, miniBatch, inDepth, inH, inW, srcStride[0], srcStride[1], srcStride[2], srcStride[3]));
    INDArray activations = Nd4j.createUninitialized(new int[] { miniBatch, inDepth, inH, inW }, 'c');
    int[] dstStride = activations.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, miniBatch, inDepth, inH, inW, dstStride[0], dstStride[1], dstStride[2], dstStride[3]));
    // The gamma/beta descriptor is built from 'shape'; missing trailing dimensions are treated as 1.
    checkCudnn(cudnnSetTensor4dDescriptor(cudnnContext.gammaBetaTensorDesc, tensorFormat, dataType, shape[0], shape[1], shape.length > 2 ? shape[2] : 1, shape.length > 3 ? shape[3] : 1));
    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareActionAllWrite(x, activations, gamma, beta, mean, var);
    Pointer srcData = allocator.getPointer(x, context);
    Pointer dstData = allocator.getPointer(activations, context);
    Pointer gammaData = allocator.getPointer(gamma, context);
    Pointer betaData = allocator.getPointer(beta, context);
    Pointer meanData = allocator.getPointer(mean, context);
    Pointer varData = allocator.getPointer(var, context);
    // Flush any ops still queued in a grid executioner before issuing cuDNN calls.
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }
    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(context.getOldStream())));
    if (training) {
        // Grow the caches when they cannot hold the mean / variance buffers (size in bytes).
        if (meanCache.capacity() < mean.data().length() * mean.data().getElementSize()) {
            meanCache.deallocate();
            meanCache = new Cache(mean.data().length() * mean.data().getElementSize());
        }
        // Size the variance cache from var's own buffer, not from mean's element size.
        if (varCache.capacity() < var.data().length() * var.data().getElementSize()) {
            varCache.deallocate();
            varCache = new Cache(var.data().length() * var.data().getElementSize());
        }
        checkCudnn(cudnnBatchNormalizationForwardTraining(cudnnContext, batchNormMode, this.alpha, this.beta, cudnnContext.srcTensorDesc, srcData, cudnnContext.dstTensorDesc, dstData, cudnnContext.gammaBetaTensorDesc, gammaData, betaData, decay, meanData, varData, eps, meanCache, varCache));
    } else {
        checkCudnn(cudnnBatchNormalizationForwardInference(cudnnContext, batchNormMode, this.alpha, this.beta, cudnnContext.srcTensorDesc, srcData, cudnnContext.dstTensorDesc, dstData, cudnnContext.gammaBetaTensorDesc, gammaData, betaData, meanData, varData, eps));
    }
    allocator.getFlowController().registerActionAllWrite(context, x, activations, gamma, beta, mean, var);
    return activations;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) Allocator(org.nd4j.jita.allocator.Allocator) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) ShortPointer(org.bytedeco.javacpp.ShortPointer) Pointer(org.bytedeco.javacpp.Pointer)

Example 18 with GridExecutioner

use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project deeplearning4j by deeplearning4j.

the class CudnnLocalResponseNormalizationHelper method activate.

/**
 * Runs the cuDNN cross-channel local response normalization forward pass over {@code input}.
 *
 * <p>The result is stored in the {@code activations} field, which is also the returned array.
 *
 * @param input    input activations; dimensions 0..3 are read as minibatch, depth, height, width
 * @param training not read by this method
 * @param k        LRN k value; must be at least {@code CUDNN_LRN_MIN_K}
 * @param n        LRN n value (truncated to int for the descriptor); must lie within
 *                 {@code [CUDNN_LRN_MIN_N, CUDNN_LRN_MAX_N]}
 * @param alpha    LRN alpha value, written into the LRN descriptor
 * @param beta     LRN beta value; must be at least {@code CUDNN_LRN_MIN_BETA}
 * @return a newly created 'c'-order array with the same shape as {@code input}
 * @throws IllegalArgumentException if {@code n}, {@code k} or {@code beta} is out of range
 */
@Override
public INDArray activate(INDArray input, boolean training, double k, double n, double alpha, double beta) {
    // Range checks, in the same order as the limits are reported.
    if (n < CUDNN_LRN_MIN_N) {
        throw new IllegalArgumentException("Error: n < CUDNN_LRN_MIN_N (" + n + " < " + CUDNN_LRN_MIN_N + ")");
    }
    if (n > CUDNN_LRN_MAX_N) {
        throw new IllegalArgumentException("Error: n > CUDNN_LRN_MAX_N (" + n + " > " + CUDNN_LRN_MAX_N + ")");
    }
    if (k < CUDNN_LRN_MIN_K) {
        throw new IllegalArgumentException("Error: k < CUDNN_LRN_MIN_K (" + k + " < " + CUDNN_LRN_MIN_K + ")");
    }
    if (beta < CUDNN_LRN_MIN_BETA) {
        throw new IllegalArgumentException("Error: beta < CUDNN_LRN_MIN_BETA (" + beta + " < " + CUDNN_LRN_MIN_BETA + ")");
    }

    final int batch = input.size(0);
    final int channels = input.size(1);
    final int height = input.size(2);
    final int width = input.size(3);

    final int[] inStrides = input.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, batch, channels, height, width, inStrides[0], inStrides[1], inStrides[2], inStrides[3]));

    activations = Nd4j.createUninitialized(new int[] { batch, channels, height, width }, 'c');
    final int[] outStrides = activations.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, batch, channels, height, width, outStrides[0], outStrides[1], outStrides[2], outStrides[3]));

    // The method parameters configure the LRN descriptor; this.alpha / this.beta below are fields.
    checkCudnn(cudnnSetLRNDescriptor(cudnnContext.lrnDesc, (int) n, alpha, beta, k));

    final Allocator allocator = AtomicAllocator.getInstance();
    final CudaContext cudaContext = allocator.getFlowController().prepareActionAllWrite(input, activations);
    final Pointer inputPtr = allocator.getPointer(input, cudaContext);
    final Pointer outputPtr = allocator.getPointer(activations, cudaContext);

    // Flush any ops still queued in a grid executioner before issuing the cuDNN call.
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }

    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(cudaContext.getOldStream())));
    checkCudnn(cudnnLRNCrossChannelForward(cudnnContext, cudnnContext.lrnDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, this.alpha, cudnnContext.srcTensorDesc, inputPtr, this.beta, cudnnContext.dstTensorDesc, outputPtr));

    allocator.getFlowController().registerActionAllWrite(cudaContext, input, activations);
    return activations;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) Allocator(org.nd4j.jita.allocator.Allocator) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) ShortPointer(org.bytedeco.javacpp.ShortPointer) Pointer(org.bytedeco.javacpp.Pointer)

Example 19 with GridExecutioner

use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project deeplearning4j by deeplearning4j.

the class CudnnLocalResponseNormalizationHelper method backpropGradient.

/**
 * Runs the cuDNN cross-channel local response normalization backward pass.
 *
 * <p>Reads the {@code activations} field in addition to the arguments. The returned gradient
 * object is a fresh {@code DefaultGradient} to which this method adds nothing.
 *
 * @param input   input activations; dimensions 0..3 are read as minibatch, depth, height, width
 * @param epsilon incoming gradient; duplicated first if
 *                {@code Shape.strideDescendingCAscendingF} rejects its strides
 * @param k       LRN k value; must be at least {@code CUDNN_LRN_MIN_K}
 * @param n       LRN n value (truncated to int for the descriptor); must lie within
 *                {@code [CUDNN_LRN_MIN_N, CUDNN_LRN_MAX_N]}
 * @param alpha   LRN alpha value, written into the LRN descriptor
 * @param beta    LRN beta value; must be at least {@code CUDNN_LRN_MIN_BETA}
 * @return pair of the (empty) gradient and a newly created 'c'-order array shaped like
 *         {@code input} that receives the backward result
 * @throws IllegalArgumentException if {@code n}, {@code k} or {@code beta} is out of range
 */
@Override
public Pair<Gradient, INDArray> backpropGradient(INDArray input, INDArray epsilon, double k, double n, double alpha, double beta) {
    // Range checks, in the same order as the limits are reported.
    if (n < CUDNN_LRN_MIN_N) {
        throw new IllegalArgumentException("Error: n < CUDNN_LRN_MIN_N (" + n + " < " + CUDNN_LRN_MIN_N + ")");
    }
    if (n > CUDNN_LRN_MAX_N) {
        throw new IllegalArgumentException("Error: n > CUDNN_LRN_MAX_N (" + n + " > " + CUDNN_LRN_MAX_N + ")");
    }
    if (k < CUDNN_LRN_MIN_K) {
        throw new IllegalArgumentException("Error: k < CUDNN_LRN_MIN_K (" + k + " < " + CUDNN_LRN_MIN_K + ")");
    }
    if (beta < CUDNN_LRN_MIN_BETA) {
        throw new IllegalArgumentException("Error: beta < CUDNN_LRN_MIN_BETA (" + beta + " < " + CUDNN_LRN_MIN_BETA + ")");
    }

    final int batch = input.size(0);
    final int channels = input.size(1);
    final int height = input.size(2);
    final int width = input.size(3);

    if (!Shape.strideDescendingCAscendingF(epsilon)) {
        // Such stride layouts are apparently not supported by cuDNN, so work on a copy.
        epsilon = epsilon.dup();
    }

    final int[] inStrides = input.stride();
    final int[] epsStrides = epsilon.stride();

    // Flush any ops still queued in a grid executioner before issuing cuDNN calls.
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }

    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.srcTensorDesc, dataType, batch, channels, height, width, inStrides[0], inStrides[1], inStrides[2], inStrides[3]));
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.deltaTensorDesc, dataType, batch, channels, height, width, epsStrides[0], epsStrides[1], epsStrides[2], epsStrides[3]));
    // The method parameters configure the LRN descriptor; this.alpha / this.beta below are fields.
    checkCudnn(cudnnSetLRNDescriptor(cudnnContext.lrnDesc, (int) n, alpha, beta, k));

    final INDArray epsilonOut = Nd4j.createUninitialized(new int[] { batch, channels, height, width }, 'c');
    final int[] outStrides = epsilonOut.stride();
    checkCudnn(cudnnSetTensor4dDescriptorEx(cudnnContext.dstTensorDesc, dataType, batch, channels, height, width, outStrides[0], outStrides[1], outStrides[2], outStrides[3]));

    final Allocator allocator = AtomicAllocator.getInstance();
    final CudaContext cudaContext = allocator.getFlowController().prepareActionAllWrite(input, epsilon, activations, epsilonOut);
    final Pointer inputPtr = allocator.getPointer(input, cudaContext);
    final Pointer epsilonPtr = allocator.getPointer(epsilon, cudaContext);
    final Pointer activationsPtr = allocator.getPointer(activations, cudaContext);
    final Pointer epsilonOutPtr = allocator.getPointer(epsilonOut, cudaContext);

    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(cudaContext.getOldStream())));
    // NOTE(review): the activations pointer is described with deltaTensorDesc, which was built
    // from epsilon's strides - confirm the two layouts always agree.
    checkCudnn(cudnnLRNCrossChannelBackward(cudnnContext, cudnnContext.lrnDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, this.alpha, cudnnContext.deltaTensorDesc, activationsPtr, cudnnContext.deltaTensorDesc, epsilonPtr, cudnnContext.srcTensorDesc, inputPtr, this.beta, cudnnContext.dstTensorDesc, epsilonOutPtr));

    allocator.getFlowController().registerActionAllWrite(cudaContext, input, epsilon, activations, epsilonOut);

    final Gradient emptyGradient = new DefaultGradient();
    return new Pair<>(emptyGradient, epsilonOut);
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) Allocator(org.nd4j.jita.allocator.Allocator) Gradient(org.deeplearning4j.nn.gradient.Gradient) DefaultGradient(org.deeplearning4j.nn.gradient.DefaultGradient) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) ShortPointer(org.bytedeco.javacpp.ShortPointer) Pointer(org.bytedeco.javacpp.Pointer) Pair(org.deeplearning4j.berkeley.Pair)

Example 20 with GridExecutioner

use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project deeplearning4j by deeplearning4j.

the class FeedForwardWithKeyFunctionAdapter method call.

/**
 * Runs the broadcast network over every (key, features) pair of the partition and returns the
 * (key, output) pairs in the same order.
 *
 * <p>Features are concatenated along dimension 0 into batches of at most {@code batchSize}
 * examples (a batch may exceed it by the size of its last array). If any array differs from the
 * first one in a dimension other than 0, batches are additionally cut wherever the shape changes
 * so that only equally-shaped arrays are concatenated.
 *
 * @param iterator the (key, features) pairs of this partition
 * @return one (key, network output) pair per input pair; empty if the iterator is empty
 * @throws IllegalStateException if the broadcast parameter count does not match the network
 */
@Override
public Iterable<Tuple2<K, INDArray>> call(Iterator<Tuple2<K, INDArray>> iterator) throws Exception {
    if (!iterator.hasNext()) {
        return Collections.emptyList();
    }
    MultiLayerNetwork network = new MultiLayerNetwork(MultiLayerConfiguration.fromJson(jsonConfig.getValue()));
    network.init();
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false)) {
        throw new IllegalStateException("Network did not have same number of parameters as the broadcasted set parameters");
    }
    network.setParameters(val);
    //Issue: for 2d data (MLPs etc) we can just stack the examples.
    //But: for 3d and 4d: in principle the data sizes could be different
    //We could handle that with mask arrays - but it gets messy. The approach used here is simpler but less efficient
    List<INDArray> featuresList = new ArrayList<>(batchSize);
    List<K> keyList = new ArrayList<>(batchSize);
    List<Integer> origSizeList = new ArrayList<>();
    int[] firstShape = null;
    boolean sizesDiffer = false;
    int tupleCount = 0;
    while (iterator.hasNext()) {
        Tuple2<K, INDArray> t2 = iterator.next();
        INDArray features = t2._2();
        if (firstShape == null) {
            firstShape = features.shape();
        } else if (!sizesDiffer) {
            // Compare the CURRENT example against the first one (dimension 0 may differ freely).
            // Checking the previously stored element instead would never inspect the last example.
            for (int i = 1; i < firstShape.length; i++) {
                if (firstShape[i] != features.size(i)) {
                    sizesDiffer = true;
                    break;
                }
            }
        }
        featuresList.add(features);
        keyList.add(t2._1());
        origSizeList.add(features.size(0));
        tupleCount++;
    }
    if (tupleCount == 0) {
        return Collections.emptyList();
    }
    List<Tuple2<K, INDArray>> output = new ArrayList<>(tupleCount);
    int currentArrayIndex = 0;
    while (currentArrayIndex < featuresList.size()) {
        int firstIdx = currentArrayIndex;
        int nextIdx = currentArrayIndex;
        int examplesInBatch = 0;
        List<INDArray> toMerge = new ArrayList<>();
        firstShape = null;
        // Collect arrays for one batch: stop at batchSize examples or, when shapes vary, at the
        // first array whose shape differs from the batch's first array.
        while (nextIdx < featuresList.size() && examplesInBatch < batchSize) {
            if (firstShape == null) {
                firstShape = featuresList.get(nextIdx).shape();
            } else if (sizesDiffer) {
                boolean breakWhile = false;
                for (int i = 1; i < firstShape.length; i++) {
                    if (firstShape[i] != featuresList.get(nextIdx).size(i)) {
                        //Next example has a different size. So: don't add it to the current batch, just process what we have
                        breakWhile = true;
                        break;
                    }
                }
                if (breakWhile) {
                    break;
                }
            }
            INDArray f = featuresList.get(nextIdx++);
            toMerge.add(f);
            examplesInBatch += f.size(0);
        }
        INDArray batchFeatures = Nd4j.concat(0, toMerge.toArray(new INDArray[toMerge.size()]));
        INDArray out = network.output(batchFeatures, false);
        // Split the batch output back into one slice per original input array.
        examplesInBatch = 0;
        for (int i = firstIdx; i < nextIdx; i++) {
            int numExamples = origSizeList.get(i);
            INDArray outputSubset = getSubset(examplesInBatch, examplesInBatch + numExamples, out);
            examplesInBatch += numExamples;
            output.add(new Tuple2<>(keyList.get(i), outputSubset));
        }
        currentArrayIndex += (nextIdx - firstIdx);
    }
    // Make sure queued ops have completed before the results are handed back.
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();
    }
    return output;
}
Also used : GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Tuple2(scala.Tuple2) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork)

Aggregations

GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)28 INDArray (org.nd4j.linalg.api.ndarray.INDArray)22 ArrayList (java.util.ArrayList)9 ComputationGraph (org.deeplearning4j.nn.graph.ComputationGraph)9 Allocator (org.nd4j.jita.allocator.Allocator)9 AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator)9 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)9 Tuple2 (scala.Tuple2)8 MultiLayerNetwork (org.deeplearning4j.nn.multilayer.MultiLayerNetwork)7 DoublePointer (org.bytedeco.javacpp.DoublePointer)6 FloatPointer (org.bytedeco.javacpp.FloatPointer)6 Pointer (org.bytedeco.javacpp.Pointer)6 ShortPointer (org.bytedeco.javacpp.ShortPointer)6 SparkTrainingStats (org.deeplearning4j.spark.api.stats.SparkTrainingStats)5 DataSet (org.nd4j.linalg.dataset.DataSet)5 Persistable (org.deeplearning4j.api.storage.Persistable)4 StorageMetaData (org.deeplearning4j.api.storage.StorageMetaData)4 Pair (org.deeplearning4j.berkeley.Pair)4 DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient)4 Gradient (org.deeplearning4j.nn.gradient.Gradient)4