
Example 26 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method batchNormalizationForwardInference.

/**
	 * Performs the forward BatchNormalization layer computation for inference
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName name of the instruction
	 * @param image input image
	 * @param scale scale (as per cuDNN), i.e. gamma in the original paper: shape [1, C, 1, 1]
	 * @param bias bias (as per cuDNN), i.e. beta in the original paper: shape [1, C, 1, 1]
	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
	 * @param ret normalized input
	 * @param epsilon epsilon value used in the batch normalization formula
	 * @throws DMLRuntimeException if an error occurs
	 */
public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, double epsilon) throws DMLRuntimeException {
    LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
    int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
    int N = (int) image.getNumRows();
    int C = (int) scale.getNumColumns();
    long CHW = image.getNumColumns();
    validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
    // Allocate descriptors
    cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW, new MatrixObject[] { image }, new MatrixObject[] { ret });
    cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
    // Get underlying dense pointers
    Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
    Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
    Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
    Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
    Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
    Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
    checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, scaleTensorDesc, scalePtr, biasPtr, runningMeanPtr, runningVarPtr, epsilon));
}
Also used: cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor), CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer), Pointer (jcuda.Pointer)
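In all of these examples, getDensePointer is a SystemML helper that returns a jcuda.Pointer to the matrix's dense device buffer. As a reminder of what such a device Pointer is at the JCuda level, here is a minimal, self-contained sketch that allocates device memory and copies a host array into it; it only illustrates the generic JCuda runtime API and is not SystemML's getDensePointer implementation.

import jcuda.Pointer;
import jcuda.Sizeof;
import static jcuda.runtime.JCuda.*;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;

public class DevicePointerSketch {
    public static void main(String[] args) {
        double[] hostData = { 1.0, 2.0, 3.0, 4.0 };
        // Allocate device memory; devicePtr becomes a handle to it
        Pointer devicePtr = new Pointer();
        cudaMalloc(devicePtr, hostData.length * Sizeof.DOUBLE);
        // Copy the host array to the device; Pointer.to(...) wraps host memory
        cudaMemcpy(devicePtr, Pointer.to(hostData),
                hostData.length * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
        // ... a pointer like this is what gets passed to the cuDNN/cuBLAS calls above ...
        cudaFree(devicePtr);
    }
}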

Example 27 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method relu.

/**
	 * Performs the relu operation on the GPU.
	 * @param ec currently active {@link ExecutionContext}
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param in input matrix
	 * @param outputName	name of the output matrix
	 * @throws DMLRuntimeException	if an error occurs
	 */
public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same as the one used to run this LibMatrixCUDA function");
    long N = in.getNumRows();
    long CHW = in.getNumColumns();
    MatrixObject output = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
    long t0 = 0;
    cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor();
    if (N * CHW >= numDoublesIn2GB || srcTensorDesc == null) {
        LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
        // Invokes relu(double* A,  double* ret, int rlen, int clen)
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        Pointer dstData = getDensePointer(gCtx, output, instName);
        // TODO: FIXME: Add sparse kernel support for relu
        Pointer srcData = getDensePointer(gCtx, in, instName);
        getCudaKernels(gCtx).launchKernel("relu", ExecutionConfig.getConfigForSimpleMatrixOperations((int) N, (int) CHW), srcData, dstData, (int) N, (int) CHW);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
    } else {
        performCuDNNReLU(gCtx, instName, in, getDensePointer(gCtx, output, true, instName), srcTensorDesc);
    }
}
Also used: MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor), CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer), Pointer (jcuda.Pointer), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
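The cuDNN branch above delegates to performCuDNNReLU with the destination pointer and the cached tensor descriptor. As a rough, hedged illustration of what that path looks like at the raw JCudnn level (a sketch of the descriptor-based cuDNN activation API, not SystemML's actual performCuDNNReLU):

import jcuda.Pointer;
import jcuda.jcudnn.cudnnActivationDescriptor;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnTensorDescriptor;
import static jcuda.jcudnn.JCudnn.*;
import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;

public class CudnnReluSketch {
    // Computes dst = relu(src); one/zero are pointers to the scaling factors 1.0 and 0.0
    static void reluForward(cudnnHandle handle, cudnnTensorDescriptor desc,
            Pointer srcData, Pointer dstData, Pointer one, Pointer zero) {
        cudnnActivationDescriptor activationDesc = new cudnnActivationDescriptor();
        cudnnCreateActivationDescriptor(activationDesc);
        cudnnSetActivationDescriptor(activationDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0);
        cudnnActivationForward(handle, activationDesc, one, desc, srcData, zero, desc, dstData);
        cudnnDestroyActivationDescriptor(activationDesc);
    }
}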

Example 28 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method unaryOp.

/**
	 * A helper function for all unary ops (sqrt, abs, sin, etc.)
	 * @param ec valid execution context
	 * @param gCtx a valid {@link GPUContext}
	 * @param in1 input matrix
	 * @param kernel name of CUDA kernel for the unary op to execute
	 * @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block
	 * @param outputName output matrix name
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param kernelTimer the name of the timer to measure the kernel invocation
	 * @throws DMLRuntimeException if an error occurs
	 */
private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same as the one used to run this LibMatrixCUDA function");
    GPUObject in = in1.getGPUObject(gCtx);
    boolean isSparseAndEmpty = in.isSparseAndEmpty();
    long t1 = 0;
    if (isSparseAndEmpty) {
        MatrixObject out = ec.getMatrixObject(outputName);
        ec.allocateGPUMatrixObject(outputName);
        out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
    } else {
        // Dense
        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer output = getDensePointer(gCtx, out, instName);
        Pointer input = getDensePointer(gCtx, in1, instName);
        int size = (int) (in1.getNumColumns() * in1.getNumRows());
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1);
    }
}
Also used: MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer), Pointer (jcuda.Pointer), GPUObject (org.apache.sysml.runtime.instructions.gpu.context.GPUObject), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
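Since unaryOp is private, it is reached through thin per-op wrappers in the same class. A minimal sketch of such a wrapper for sqrt follows; the kernel name "matrix_sqrt" and the timer constant MISC_TIMER_SQRT_KERNEL are illustrative assumptions, not verified against the SystemML source.

// Inside LibMatrixCUDA (sketch): sqrt(0) == 0, so a sparse-and-empty input
// can be filled with 0.0 directly without launching a kernel.
public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName,
        MatrixObject in1, String outputName) throws DMLRuntimeException {
    unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName,
            GPUInstruction.MISC_TIMER_SQRT_KERNEL);
}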

Example 29 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method conv2dBackwardFilter.

/**
	 * This method computes the backpropagation errors for the filter of the convolution operation
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param image input image
	 * @param dout errors from next layer
	 * @param outputBlock  output errors
	 * @param N number of images
	 * @param C number of channels
	 * @param H height
	 * @param W width
	 * @param K number of filters
	 * @param R filter height
	 * @param S filter width
	 * @param pad_h pad height
	 * @param pad_w pad width
	 * @param stride_h stride height
	 * @param stride_w stride width
	 * @param P output activation height
	 * @param Q output activation width
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
    cudnnFilterDescriptor dwDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    try {
        long t1 = 0, t2 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Allocate descriptors
        cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
        cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
        dwDesc = allocateFilterDescriptor(K, C, R, S);
        // Allocate data
        Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
        Pointer doutPointer = getDensePointer(gCtx, dout, true, instName);
        Pointer dwPointer = getDensePointer(gCtx, outputBlock, true, instName);
        int[] padding = { pad_h, pad_w };
        int[] strides = { stride_h, stride_w };
        convDesc = allocateConvolutionDescriptor(padding, strides);
        long[] sizeInBytesArray = { 0 };
        // TODO: Select the best algorithm depending on the data and supported CUDA
        int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        workSpace = new Pointer();
        cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx), xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer, doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2dBackwardFilter in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        if (workSpace != null && sizeInBytes != 0)
            gCtx.cudaFreeHelper(instName, workSpace);
        if (dwDesc != null)
            cudnnDestroyFilterDescriptor(dwDesc);
        if (convDesc != null)
            cudnnDestroyConvolutionDescriptor(convDesc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
    }
}
Also used: cudnnConvolutionDescriptor (jcuda.jcudnn.cudnnConvolutionDescriptor), CudaException (jcuda.CudaException), cudnnFilterDescriptor (jcuda.jcudnn.cudnnFilterDescriptor), cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor), CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer), Pointer (jcuda.Pointer), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
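Note that the workspace size queried above is never used to allocate device memory: sizeInBytes stays 0 and the fresh Pointer is passed straight to cudnnConvolutionBackwardFilter, which works because CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 needs no workspace. For other algorithms, the usual cuDNN pattern is to allocate the queried amount first; a minimal hedged sketch using the generic JCuda allocator (not SystemML's code, which routes allocations through GPUContext):

import jcuda.Pointer;
import jcuda.jcudnn.cudnnConvolutionDescriptor;
import jcuda.jcudnn.cudnnFilterDescriptor;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnTensorDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
import static jcuda.runtime.JCuda.cudaMalloc;

public class ConvWorkspaceSketch {
    // Queries the workspace a backward-filter algorithm needs and allocates it;
    // sizeOut[0] holds the required size in bytes after the query.
    static Pointer allocateWorkspace(cudnnHandle handle, cudnnTensorDescriptor xDesc,
            cudnnTensorDescriptor dyDesc, cudnnConvolutionDescriptor convDesc,
            cudnnFilterDescriptor dwDesc, int algo, long[] sizeOut) {
        cudnnGetConvolutionBackwardFilterWorkspaceSize(handle, xDesc, dyDesc, convDesc, dwDesc, algo, sizeOut);
        Pointer workSpace = new Pointer();
        if (sizeOut[0] > 0)
            cudaMalloc(workSpace, sizeOut[0]);
        return workSpace;
    }
}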

Example 30 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method sparseMatrixVectorMult.

/**
	 * Does a sparse matrix-vector multiply.
	 * C = op(A) x B, where A is a sparse matrix and B is a sparse vector with numCols = 1.
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName      the invoking instruction's name for recording {@link Statistics}.
	 * @param output        allocated output object C to which the GPU output matrix will be attached
	 * @param isATranposed  if A is to be transposed or not (the op in op(A))
	 * @param m             number of rows in A (not op(A))
	 * @param n             number of cols in A (not op(A))
	 * @param k             number of rows in B (cols in B is assumed to be 1)
	 * @param A             left sparse matrix on GPU
	 * @param B             right sparse vector on GPU
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k, CSRPointer A, CSRPointer B) throws DMLRuntimeException {
    long t0 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
    sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDenseVector, isATranposed, m, k);
}
Also used: CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer), Pointer (jcuda.Pointer)
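After the sparse vector B has been densified, sparseMatrixDenseVectorMult performs a CSR matrix times dense vector product. In raw JCusparse terms that operation corresponds to cusparseDcsrmv; a minimal hedged sketch follows, assuming the classic csrmv API of the cuSPARSE versions JCuda wraps here, and standing in for (not reproducing) SystemML's sparseMatrixDenseVectorMult.

import jcuda.Pointer;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
import static jcuda.jcusparse.cusparseOperation.*;

public class CsrMvSketch {
    // y = op(A) * x, where A is an m-by-n CSR matrix with nnz nonzeros and x is dense.
    // one/zero are pointers to the scalars 1.0 and 0.0 (alpha and beta).
    static void csrMatVec(cusparseHandle handle, boolean transposeA, int m, int n, int nnz,
            cusparseMatDescr descrA, Pointer csrVal, Pointer csrRowPtr, Pointer csrColInd,
            Pointer x, Pointer y, Pointer one, Pointer zero) {
        int op = transposeA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
        cusparseDcsrmv(handle, op, m, n, nnz, one, descrA, csrVal, csrRowPtr, csrColInd, x, zero, y);
    }
}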

Aggregations

Pointer (jcuda.Pointer): 44
CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer): 33
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 26
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 12
cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor): 11
CudaException (jcuda.CudaException): 6
cudnnConvolutionDescriptor (jcuda.jcudnn.cudnnConvolutionDescriptor): 3
cudnnFilterDescriptor (jcuda.jcudnn.cudnnFilterDescriptor): 3
HashMap (java.util.HashMap): 2
cudnnPoolingDescriptor (jcuda.jcudnn.cudnnPoolingDescriptor): 2
GPUObject (org.apache.sysml.runtime.instructions.gpu.context.GPUObject): 2
LeftScalarOperator (org.apache.sysml.runtime.matrix.operators.LeftScalarOperator): 2
RightScalarOperator (org.apache.sysml.runtime.matrix.operators.RightScalarOperator): 2
LinkedList (java.util.LinkedList): 1
Map (java.util.Map): 1
CUfunction (jcuda.driver.CUfunction): 1
cudnnActivationDescriptor (jcuda.jcudnn.cudnnActivationDescriptor): 1
cusparseMatDescr (jcuda.jcusparse.cusparseMatDescr): 1
Builtin (org.apache.sysml.runtime.functionobjects.Builtin): 1
CM (org.apache.sysml.runtime.functionobjects.CM): 1