
Example 1 with CudaException

Use of jcuda.CudaException in project incubator-systemml by apache.

From the class LibMatrixCUDA, method maxpoolingBackward.

/**
	 * Performs maxpoolingBackward on the GPU by exploiting cudnnPoolingBackward(...).
	 * This method computes the backpropagation errors for the previous layer of the max-pooling operation.
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}
	 * @param image image as matrix object
	 * @param dout			delta matrix, i.e. the errors propagated back from the next layer
	 * @param outputBlock output matrix
	 * @param N				batch size
	 * @param C				number of channels
	 * @param H				height of image
	 * @param W				width of image
	 * @param K				number of filters
	 * @param R				height of filter
	 * @param S				width of filter
	 * @param pad_h			vertical padding
	 * @param pad_w			horizontal padding
	 * @param stride_h		vertical stride
	 * @param stride_w		horizontal stride
	 * @param P				output height, (H + 2*pad_h - R)/stride_h + 1
	 * @param Q				output width, (W + 2*pad_w - S)/stride_w + 1
	 * @throws DMLRuntimeException if a DMLRuntimeException occurs
	 */
public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
    Pointer y = null;
    cudnnPoolingDescriptor poolingDesc = null;
    try {
        long t1 = 0, t2 = 0, t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Allocate descriptors
        cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
        cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
        cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, H, W);
        cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
        poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
        // Calling PoolForward first, y is one of the inputs for poolBackward
        // TODO: Remove calling poolForward after necessary changes at language level for poolBackward
        // N*C*P*Q can overflow int for large inputs, so force the multiplication into long arithmetic
        long numBytes = ((long) N) * C * P * Q * Sizeof.DOUBLE;
        y = gCtx.allocate(numBytes);
        // Allocate data
        Pointer x = getDensePointer(gCtx, image, true, instName);
        Pointer dx = getDensePointer(gCtx, outputBlock, true, instName);
        Pointer dy = getDensePointer(gCtx, dout, true, instName);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        status = cudnnPoolingBackward(getCudnnHandle(gCtx), poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t4 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t4 = System.nanoTime();
        if (y != null)
            gCtx.cudaFreeHelper(instName, y);
        if (poolingDesc != null)
            cudnnDestroyPoolingDescriptor(poolingDesc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
    }
}
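The P and Q parameters documented above are the pooled output dimensions. As a minimal sketch of the standard cuDNN output-size arithmetic (plain Java for illustration, not part of LibMatrixCUDA; integer division performs the floor):

public class PoolDims {
    // floor((in + 2*pad - window) / stride) + 1
    static int outDim(int in, int window, int pad, int stride) {
        return (in + 2 * pad - window) / stride + 1;
    }

    public static void main(String[] args) {
        // e.g. a 28x28 image with a 2x2 pooling window, no padding, stride 2
        int P = outDim(28, 2, 0, 2); // (28 + 0 - 2) / 2 + 1 = 14
        int Q = outDim(28, 2, 0, 2); // 14
        System.out.println("P=" + P + ", Q=" + Q);
    }
}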
Also used: jcuda.jcudnn.cudnnPoolingDescriptor, jcuda.CudaException, jcuda.jcudnn.cudnnTensorDescriptor, org.apache.sysml.runtime.instructions.gpu.context.CSRPointer, jcuda.Pointer, org.apache.sysml.runtime.DMLRuntimeException

Example 2 with CudaException

Use of jcuda.CudaException in project incubator-systemml by apache.

From the class LibMatrixCUDA, method performCuDNNReLU.

private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
    long t0 = 0;
    cudnnActivationDescriptor activationDescriptor = null;
    try {
        LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
        cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
        Pointer srcData = getDensePointer(gCtx, in, true, instName);
        activationDescriptor = new cudnnActivationDescriptor();
        cudnnCreateActivationDescriptor(activationDescriptor);
        // The coefficient argument only matters for clipped activations; it is ignored for plain ReLU
        double dummy = -1;
        cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor, one(), srcTensorDesc, srcData, zero(), dstTensorDesc, dstData);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in performCuDNNReLU in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t1 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Destroy the activation descriptor so the cuDNN resource is not leaked
        if (activationDescriptor != null)
            cudnnDestroyActivationDescriptor(activationDescriptor);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
    }
}
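The one() and zero() helpers passed to the cuDNN calls throughout this class supply the alpha/beta scaling factors (cuDNN computes y = alpha*op(x) + beta*y). A minimal sketch of how such helpers can be written with JCuda, assuming double precision as used in this class; the actual LibMatrixCUDA implementation may differ:

import jcuda.Pointer;

public class ScalingFactors {
    // cuDNN expects alpha/beta as host pointers matching the data type,
    // so for double-precision data they can wrap host-side constants.
    private static final double[] ONE = { 1.0 };
    private static final double[] ZERO = { 0.0 };

    static Pointer one() {
        return Pointer.to(ONE);
    }

    static Pointer zero() {
        return Pointer.to(ZERO);
    }
}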
Also used: jcuda.jcudnn.cudnnActivationDescriptor, jcuda.CudaException, jcuda.jcudnn.cudnnTensorDescriptor, org.apache.sysml.runtime.instructions.gpu.context.CSRPointer, jcuda.Pointer, org.apache.sysml.runtime.DMLRuntimeException

Example 3 with CudaException

Use of jcuda.CudaException in project incubator-systemml by apache.

From the class LibMatrixCUDA, method conv2dBackwardData.

/**
	 * This method computes the backpropagation errors for the previous layer of the convolution operation
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
	 * @param filter filter used in conv2d
	 * @param dout errors from next layer
	 * @param output  output errors
	 * @param N number of images
	 * @param C number of channels
	 * @param H height
	 * @param W width
	 * @param K number of filters
	 * @param R filter height
	 * @param S filter width
	 * @param pad_h pad height
	 * @param pad_w pad width
	 * @param stride_h stride height
	 * @param stride_w stride width
	 * @param P output activation height
	 * @param Q output activation width
	 * @throws DMLRuntimeException if a DMLRuntimeException occurs
	 */
public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout, MatrixObject output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
    cudnnFilterDescriptor wDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    try {
        long t1 = 0, t2 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Allocate descriptors
        wDesc = allocateFilterDescriptor(K, C, R, S);
        cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
        cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, output, N, C, H, W);
        // Allocate data
        Pointer w = getDensePointer(gCtx, filter, true, instName);
        Pointer dy = getDensePointer(gCtx, dout, true, instName);
        Pointer dx = getDensePointer(gCtx, output, true, instName);
        int[] padding = { pad_h, pad_w };
        int[] strides = { stride_h, stride_w };
        convDesc = allocateConvolutionDescriptor(padding, strides);
        long[] sizeInBytesArray = { 0 };
        // TODO: Select the best algorithm depending on the data and supported CUDA
        int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        workSpace = new Pointer();
        cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx), wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
        // Allocate the workspace if the chosen algorithm requires one (ALGO_0 typically does not)
        if (sizeInBytesArray[0] != 0)
            workSpace = gCtx.allocate(sizeInBytesArray[0]);
        sizeInBytes = sizeInBytesArray[0];
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w, dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        if (workSpace != null && sizeInBytes != 0)
            gCtx.cudaFreeHelper(instName, workSpace);
        if (wDesc != null)
            cudnnDestroyFilterDescriptor(wDesc);
        if (convDesc != null)
            cudnnDestroyConvolutionDescriptor(convDesc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
    }
}
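The workspace handling above follows cuDNN's query-then-allocate pattern: ask the library how many scratch bytes the chosen algorithm needs, allocate only if the answer is nonzero, and free the buffer in the finally block. A condensed, hypothetical helper capturing that pattern (gCtx.allocate and the descriptor types are taken from the surrounding code; this is a sketch, not SystemML's actual API):

import jcuda.Pointer;
import jcuda.jcudnn.*;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;

public class WorkspaceHelper {
    // Queries the workspace size for a backward-data algorithm and allocates
    // it via the GPUContext memory pool; an empty Pointer is valid when the
    // algorithm needs no workspace. sizeOut[0] receives the byte count.
    static Pointer allocBwdDataWorkspace(GPUContext gCtx, cudnnHandle handle,
            cudnnFilterDescriptor wDesc, cudnnTensorDescriptor dyDesc,
            cudnnConvolutionDescriptor convDesc, cudnnTensorDescriptor dxDesc,
            int algo, long[] sizeOut) throws DMLRuntimeException {
        cudnnGetConvolutionBackwardDataWorkspaceSize(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeOut);
        return sizeOut[0] != 0 ? gCtx.allocate(sizeOut[0]) : new Pointer();
    }
}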
Also used: jcuda.jcudnn.cudnnConvolutionDescriptor, jcuda.CudaException, jcuda.jcudnn.cudnnFilterDescriptor, jcuda.jcudnn.cudnnTensorDescriptor, org.apache.sysml.runtime.instructions.gpu.context.CSRPointer, jcuda.Pointer, org.apache.sysml.runtime.DMLRuntimeException

Example 4 with CudaException

Use of jcuda.CudaException in project incubator-systemml by apache.

From the class LibMatrixCUDA, method conv2d.

public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
    cudnnFilterDescriptor filterDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    try {
        long t1 = 0, t2 = 0;
        // Allocate descriptors
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
        cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
        filterDesc = allocateFilterDescriptor(K, C, R, S);
        int[] padding = { pad_h, pad_w };
        int[] strides = { stride_h, stride_w };
        convDesc = allocateConvolutionDescriptor(padding, strides);
        // Select the best algorithm depending on the data and supported CUDA
        int algo = -1;
        workSpace = new Pointer();
        if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
            algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
            // TODO: Look into FFT, Winograd, etc.
            // Also ensure that the GPU has enough memory to allocate the workspace
            long[] sizeInBytesArray = { 0 };
            // cudnnGetConvolutionForwardAlgorithm returns a status code and writes the
            // selected algorithm into its output array, so read the result from there
            int[] chosenAlgo = { -1 };
            jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, CONVOLUTION_PREFERENCE, sizeInBytesArray[0], chosenAlgo);
            algo = chosenAlgo[0];
            cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
            if (sizeInBytesArray[0] != 0)
                workSpace = gCtx.allocate(sizeInBytesArray[0]);
            sizeInBytes = sizeInBytesArray[0];
        } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
            throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
        } else {
            throw new DMLRuntimeException("Unsupported preference criteria for convolution");
        }
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(), srcTensorDesc, image, filterDesc, filter, convDesc, algo, workSpace, sizeInBytes, zero(), dstTensorDesc, output);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
        if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        if (filterDesc != null)
            cudnnDestroyFilterDescriptor(filterDesc);
        if (convDesc != null)
            cudnnDestroyConvolutionDescriptor(convDesc);
        if (workSpace != null && sizeInBytes != 0)
            gCtx.cudaFreeHelper(instName, workSpace);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
    }
}
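Note the calling convention fixed above: cudnnGetConvolutionForwardAlgorithm returns a cuDNN status code and writes the selected algorithm into its final int[] argument. A minimal sketch of that convention under the PREFER_FASTEST preference (descriptors assumed already configured as in conv2d; error handling simplified for illustration):

import jcuda.jcudnn.*;
import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm;
import static jcuda.jcudnn.cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
import static jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS;

public class AlgoSelection {
    static int pickForwardAlgo(cudnnHandle handle, cudnnTensorDescriptor xDesc,
            cudnnFilterDescriptor wDesc, cudnnConvolutionDescriptor convDesc,
            cudnnTensorDescriptor yDesc) {
        int[] algo = { -1 }; // output slot, not a list of candidates
        int status = cudnnGetConvolutionForwardAlgorithm(handle, xDesc, wDesc,
                convDesc, yDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                0 /* memory limit is ignored under PREFER_FASTEST */, algo);
        if (status != CUDNN_STATUS_SUCCESS)
            throw new RuntimeException("cudnnGetConvolutionForwardAlgorithm failed: " + cudnnStatus.stringFor(status));
        return algo[0];
    }
}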
Also used: jcuda.jcudnn.cudnnConvolutionDescriptor, jcuda.CudaException, jcuda.jcudnn.cudnnFilterDescriptor, jcuda.jcudnn.cudnnTensorDescriptor, org.apache.sysml.runtime.instructions.gpu.context.CSRPointer, jcuda.Pointer, org.apache.sysml.runtime.DMLRuntimeException

Example 5 with CudaException

Use of jcuda.CudaException in project incubator-systemml by apache.

From the class LibMatrixCUDA, method conv2dBackwardFilter.

/**
	 * This method computes the backpropagation errors for the filter of the convolution operation
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
	 * @param image input image
	 * @param dout errors from next layer
	 * @param outputBlock  output errors
	 * @param N number of images
	 * @param C number of channels
	 * @param H height
	 * @param W width
	 * @param K number of filters
	 * @param R filter height
	 * @param S filter width
	 * @param pad_h pad height
	 * @param pad_w pad width
	 * @param stride_h stride height
	 * @param stride_w stride width
	 * @param P output activation height
	 * @param Q output activation width
	 * @throws DMLRuntimeException if a DMLRuntimeException occurs
	 */
public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
    cudnnFilterDescriptor dwDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    try {
        long t1 = 0, t2 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Allocate descriptors
        cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
        cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
        dwDesc = allocateFilterDescriptor(K, C, R, S);
        // Allocate data
        Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
        Pointer doutPointer = getDensePointer(gCtx, dout, true, instName);
        Pointer dwPointer = getDensePointer(gCtx, outputBlock, true, instName);
        int[] padding = { pad_h, pad_w };
        int[] strides = { stride_h, stride_w };
        convDesc = allocateConvolutionDescriptor(padding, strides);
        long[] sizeInBytesArray = { 0 };
        // TODO: Select the best algorithm depending on the data and supported CUDA
        int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        workSpace = new Pointer();
        cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx), xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
        // Allocate the workspace if the chosen algorithm requires one (ALGO_0 typically does not)
        if (sizeInBytesArray[0] != 0)
            workSpace = gCtx.allocate(sizeInBytesArray[0]);
        sizeInBytes = sizeInBytesArray[0];
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer, doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        if (workSpace != null && sizeInBytes != 0)
            gCtx.cudaFreeHelper(instName, workSpace);
        if (dwDesc != null)
            cudnnDestroyFilterDescriptor(dwDesc);
        if (convDesc != null)
            cudnnDestroyConvolutionDescriptor(convDesc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
    }
}
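Taken together, conv2d, conv2dBackwardData, and conv2dBackwardFilter cover the three cuDNN convolution primitives needed for one training step of a convolutional layer. A hedged usage sketch follows; all matrix objects, pointers, and dimensions are assumed to be prepared by the caller, and note that conv2d takes raw Pointers while the backward variants take MatrixObjects:

import jcuda.Pointer;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;

public class ConvTrainingStep {
    // Hypothetical wiring of the three methods above into one training step.
    static void step(GPUContext gCtx, String instName,
            MatrixObject image, MatrixObject filter, MatrixObject dout,
            Pointer imagePtr, Pointer filterPtr, Pointer outPtr,
            MatrixObject dFilter, MatrixObject dImage,
            int N, int C, int H, int W, int K, int R, int S,
            int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
            throws DMLRuntimeException {
        // Forward pass: out = conv(image, filter)
        LibMatrixCUDA.conv2d(gCtx, instName, imagePtr, filterPtr, outPtr,
                N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
        // Gradient w.r.t. the filter: dFilter = conv2dBackwardFilter(image, dout)
        LibMatrixCUDA.conv2dBackwardFilter(gCtx, instName, image, dout, dFilter,
                N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
        // Gradient w.r.t. the input: dImage = conv2dBackwardData(filter, dout)
        LibMatrixCUDA.conv2dBackwardData(gCtx, instName, filter, dout, dImage,
                N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
    }
}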
Also used: jcuda.jcudnn.cudnnConvolutionDescriptor, jcuda.CudaException, jcuda.jcudnn.cudnnFilterDescriptor, jcuda.jcudnn.cudnnTensorDescriptor, org.apache.sysml.runtime.instructions.gpu.context.CSRPointer, jcuda.Pointer, org.apache.sysml.runtime.DMLRuntimeException

Aggregations

jcuda.CudaException: 6 usages
jcuda.Pointer: 6 usages
jcuda.jcudnn.cudnnTensorDescriptor: 6 usages
org.apache.sysml.runtime.DMLRuntimeException: 6 usages
org.apache.sysml.runtime.instructions.gpu.context.CSRPointer: 6 usages
jcuda.jcudnn.cudnnConvolutionDescriptor: 3 usages
jcuda.jcudnn.cudnnFilterDescriptor: 3 usages
jcuda.jcudnn.cudnnPoolingDescriptor: 2 usages
jcuda.jcudnn.cudnnActivationDescriptor: 1 usage