
Example 16 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method biasMultiply.

/**
	 * Performs the operation corresponding to the DML script:
	 * ones = matrix(1, rows=1, cols=Hout*Wout)
	 * output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
	 * This operation is often followed by conv2d and hence we have introduced the bias_multiply(input, bias) built-in function
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param input input image
	 * @param bias bias
	 * @param outputBlock output
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
    LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
    if (isInSparseFormat(gCtx, input)) {
        input.getGPUObject(gCtx).sparseToDense(instName);
    }
    if (isInSparseFormat(gCtx, bias)) {
        bias.getGPUObject(gCtx).sparseToDense(instName);
    }
    long rows = input.getNumRows();
    long cols = input.getNumColumns();
    long K = bias.getNumRows();
    long PQ = cols / K;
    if (bias.getNumColumns() != 1 || cols % K != 0) {
        throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
    }
    Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
    Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
    Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
    long t1 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t1 = System.nanoTime();
    getCudaKernels(gCtx).launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations((int) rows, (int) cols), imagePointer, biasPointer, outputPointer, (int) rows, (int) cols, (int) PQ);
    // Note: the misc-timer constant below reuses the relu-backward label for this kernel's timing.
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
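
For reference, the semantics the "bias_multiply" kernel is launched to compute can be sketched on the CPU. The helper below is hypothetical (it is not part of LibMatrixCUDA, and the actual CUDA kernel ships separately with SystemML); it is a minimal Java sketch, assuming row-major dense arrays, of output[r, c] = input[r, c] * bias[c / PQ]:

// Hypothetical CPU reference for the bias_multiply kernel's element-wise semantics:
// each column block of width PQ is scaled by the corresponding bias entry.
static void biasMultiplyReference(double[] input, double[] bias, double[] output,
                                  int rows, int cols, int PQ) {
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            // bias has cols/PQ entries; entry c/PQ applies to this column
            output[r * cols + c] = input[r * cols + c] * bias[c / PQ];
        }
    }
}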

Example 17 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method conv2dBackwardData.

/**
	 * This method computes the backpropagation errors for the previous layer of the convolution operation
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param filter filter used in conv2d
	 * @param dout errors from next layer
	 * @param output  output errors
	 * @param N number of images
	 * @param C number of channels
	 * @param H height
	 * @param W width
	 * @param K number of filters
	 * @param R filter height
	 * @param S filter width
	 * @param pad_h pad height
	 * @param pad_w pad width
	 * @param stride_h stride height
	 * @param stride_w stride width
	 * @param P output activation height
	 * @param Q output activation width
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout, MatrixObject output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
    cudnnFilterDescriptor wDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    try {
        long t1 = 0, t2 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Allocate descriptors
        wDesc = allocateFilterDescriptor(K, C, R, S);
        cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
        cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, output, N, C, H, W);
        // Allocate data
        Pointer w = getDensePointer(gCtx, filter, true, instName);
        Pointer dy = getDensePointer(gCtx, dout, true, instName);
        Pointer dx = getDensePointer(gCtx, output, true, instName);
        int[] padding = { pad_h, pad_w };
        int[] strides = { stride_h, stride_w };
        convDesc = allocateConvolutionDescriptor(padding, strides);
        long[] sizeInBytesArray = { 0 };
        // TODO: Select the best algorithm depending on the data and supported CUDA
        int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        workSpace = new Pointer();
        cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx), wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
        // Note: sizeInBytes is never updated from sizeInBytesArray, so no workspace is allocated here;
        // ALGO_0 typically requires none, and a zero size is passed to the convolution call below.
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w, dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        if (workSpace != null && sizeInBytes != 0)
            gCtx.cudaFreeHelper(instName, workSpace);
        if (wDesc != null)
            cudnnDestroyFilterDescriptor(wDesc);
        if (convDesc != null)
            cudnnDestroyConvolutionDescriptor(convDesc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
    }
}
Also used : jcuda.jcudnn.cudnnConvolutionDescriptor(jcuda.jcudnn.cudnnConvolutionDescriptor) CudaException(jcuda.CudaException) jcuda.jcudnn.cudnnFilterDescriptor(jcuda.jcudnn.cudnnFilterDescriptor) jcuda.jcudnn.cudnnTensorDescriptor(jcuda.jcudnn.cudnnTensorDescriptor) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
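
The P and Q arguments are the output-activation dimensions of the forward convolution, which the backward-data pass consumes via dyDesc. As a sanity check, they follow the standard convolution output-size arithmetic; the helper below is a hypothetical illustration, not code from LibMatrixCUDA:

// Standard convolution output-size arithmetic (integer floor division);
// hypothetical helper for illustration only.
static int convOutputDim(int inDim, int pad, int filterDim, int stride) {
    return (inDim + 2 * pad - filterDim) / stride + 1;
}
// e.g. P = convOutputDim(H, pad_h, R, stride_h);
//      Q = convOutputDim(W, pad_w, S, stride_w);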

Example 18 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method sparseMatrixDenseVectorMult.

/**
	 * C = op(A) x B
	 * A is a sparse matrix, B is a dense vector
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param output	allocated output on the host, to which the GPU output C will be attached
	 * @param A			sparse matrix A on the GPU
	 * @param B_dense	dense matrix/vector B on the GPU
	 * @param isATransposed	op for A, transposed or not
	 * @param m			number of rows in A (not op(A))
	 * @param k			number of cols in A or number of rows in B (not op(A) or op(B))
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATransposed, int m, int k) throws DMLRuntimeException {
    LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
    int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    long size = m * Sizeof.DOUBLE;
    if (isATransposed) {
        size = k * Sizeof.DOUBLE;
        transA = CUSPARSE_OPERATION_TRANSPOSE;
    }
    Pointer C_dense = gCtx.allocate(instName, (int) size);
    long t1 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t1 = System.nanoTime();
    cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int) A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
    //cudaDeviceSynchronize; 	// Since cusparseDcsrmv is asynchronously executed
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
    output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer)
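
cusparseDcsrmv computes C = alpha * op(A) * B + beta * C over the CSR arrays held in the CSRPointer. For the non-transposed case with alpha = 1 and beta = 0, the equivalent CPU computation is a plain CSR matrix-vector product; the helper below is a hypothetical Java sketch assuming the 0-based CSR indexing used here:

// Hypothetical CPU reference for y = A * x with an m-row CSR matrix,
// 0-based indexing: rowPtr has m+1 entries, val/colInd have nnz entries.
static void csrMatVecReference(double[] val, int[] rowPtr, int[] colInd,
                               double[] x, double[] y, int m) {
    for (int i = 0; i < m; i++) {
        double sum = 0;
        for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
            sum += val[j] * x[colInd[j]];
        }
        y[i] = sum;
    }
}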

Example 19 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method denseDenseMatmult.

/**
	 * Dense-dense matrix multiply
	 * C = op(A) * op(B), A and B are dense matrices
	 * On the host, the matrices are in row-major format; cuBLAS expects them in column-major format.
	 * Viewed column-major, the inputs therefore appear as t(A) and t(B), where t(X) = transpose of X.
	 * We compute t(B) %*% t(A) to get t(C).
	 * If we were to calculate t(t(C)), we would get the resultant matrix C, but in column-major format.
	 * What we really want is t(C), which we already have as the result of t(B) %*% t(A).
	 * @param gCtx               a valid {@link GPUContext}
	 * @param instName           name of the invoking instruction for recording {@link Statistics}.
	 * @param output             output allocated on GPU in column major format
	 * @param leftRows1          number of rows in A
	 * @param leftCols1          number of cols in A
	 * @param rightRows1         number of rows in B
	 * @param rightCols1         number of cols in B
	 * @param isLeftTransposed1  op for A, transposed or not
	 * @param isRightTransposed1 op for B, transposed or not
	 * @param leftPtr            A allocated on the GPU in row-major format
	 * @param rightPtr           B allocated on the GPU in row-major format
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException {
    LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
    Pointer A = rightPtr;
    Pointer B = leftPtr;
    // To compensate for the input matrices being in row-major format instead of column-major (the way cublas expects)
    int leftRows = rightCols1;
    int leftCols = rightRows1;
    int rightRows = leftCols1;
    int rightCols = leftRows1;
    boolean isLeftTransposed = isRightTransposed1;
    boolean isRightTransposed = isLeftTransposed1;
    // Note: the dimensions are swapped
    int m = isLeftTransposed ? leftCols : leftRows;
    int n = isRightTransposed ? rightRows : rightCols;
    int k = isLeftTransposed ? leftRows : leftCols;
    int k1 = isRightTransposed ? rightCols : rightRows;
    if (k != k1)
        throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
    if (m == -1 || n == -1 || k == -1)
        throw new DMLRuntimeException("Incorrect dimensions");
    double[] one = { 1 };
    double[] zero = { 0 };
    int lda = isLeftTransposed ? k : m;
    int ldb = isRightTransposed ? n : k;
    int ldc = m;
    int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
    int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
    long t0 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    Pointer C = output;
    if (m == 1 && n == 1) {
        // Vector product
        LOG.debug(" GPU Dense-dense Vector Product");
        double[] result = { 0 };
        JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(result));
        // By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST.
        // This means that scalar values passed are on host (as opposed to on device).
        // The result is copied from the host back to the device so that the rest of
        // infrastructure can treat it uniformly.
        cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
    } else if (m == 1) {
        // Vector-matrix multiply
        LOG.debug(" GPU Dense Vector-Matrix Multiply");
        transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
        JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0);
    } else if (n == 1) {
        // Matrix-vector multiply
        LOG.debug(" GPU Dense Matrix-Vector Multiply");
        JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0);
    } else {
        LOG.debug(" GPU Dense-Dense Matrix Multiply ");
        JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0);
    }
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
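
The row-major/column-major trick in the javadoc rests on the identity t(A %*% B) = t(B) %*% t(A): a row-major m x n array is bit-identical to a column-major n x m array holding the transpose, so handing cuBLAS the operands swapped and reversed yields C already laid out row-major, with no explicit transposes. A small self-contained check of the identity (plain Java for illustration, not SystemML code):

// Verifies t(B) %*% t(A) == t(A %*% B) on a tiny example, which is why the
// method above swaps and reverses its operands for column-major cuBLAS.
static double[][] mul(double[][] a, double[][] b) {
    double[][] c = new double[a.length][b[0].length];
    for (int i = 0; i < a.length; i++)
        for (int p = 0; p < b.length; p++)
            for (int j = 0; j < b[0].length; j++)
                c[i][j] += a[i][p] * b[p][j];
    return c;
}
static double[][] t(double[][] a) {
    double[][] r = new double[a[0].length][a.length];
    for (int i = 0; i < a.length; i++)
        for (int j = 0; j < a[0].length; j++)
            r[j][i] = a[i][j];
    return r;
}
// For A = {{1,2},{3,4}} and B = {{5,6},{7,8}}, both mul(t(B), t(A)) and
// t(mul(A, B)) give {{19,43},{22,50}}.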

Example 20 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method reduceAll.

/**
	 * Do a simple reduction, the output of which is a single value
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param kernelFunction name of the kernel function to invoke
	 * @param in {@link Pointer} to matrix in device memory
	 * @param n size of array
	 * @return	the reduced value
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
    LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
    int[] tmp = getKernelParamsForReduceAll(gCtx, n);
    int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
    Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
    long t1 = 0, t2 = 0, t3 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t1 = System.nanoTime();
    getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
    //cudaDeviceSynchronize;
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
    int s = blocks;
    while (s > 1) {
        tmp = getKernelParamsForReduceAll(gCtx, s);
        blocks = tmp[0];
        threads = tmp[1];
        sharedMem = tmp[2];
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
        s = (s + (threads * 2 - 1)) / (threads * 2);
    }
    double[] result = { -1 };
    if (GPUStatistics.DISPLAY_STATISTICS)
        t3 = System.nanoTime();
    cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
    gCtx.cudaFreeHelper(instName, tempOut);
    return result[0];
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) ExecutionConfig(org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig)
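
Each pass of the loop above shrinks the array by a factor of 2 * threads, because every thread reads two elements before the shared-memory tree reduction, so the loop ends once a single block covers the remainder. The helper below is a hypothetical sketch of the resulting launch count, assuming a fixed thread count per pass (getKernelParamsForReduceAll may in fact vary it):

// Hypothetical illustration of how many kernel launches the reduction performs:
// each pass shrinks s by a factor of 2 * threads (two elements per thread).
static int reductionPasses(int n, int threads) {
    int passes = 0;
    for (int s = n; s > 1; s = (s + (threads * 2 - 1)) / (threads * 2)) {
        passes++;
    }
    return passes;
}
// e.g. reductionPasses(1_000_000, 256) == 3: 1,000,000 -> 1954 -> 4 -> 1.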

Aggregations

Pointer (jcuda.Pointer) 44
CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) 33
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 26
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject) 12
jcuda.jcudnn.cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor) 11
CudaException (jcuda.CudaException) 6
jcuda.jcudnn.cudnnConvolutionDescriptor (jcuda.jcudnn.cudnnConvolutionDescriptor) 3
jcuda.jcudnn.cudnnFilterDescriptor (jcuda.jcudnn.cudnnFilterDescriptor) 3
HashMap (java.util.HashMap) 2
jcuda.jcudnn.cudnnPoolingDescriptor (jcuda.jcudnn.cudnnPoolingDescriptor) 2
GPUObject (org.apache.sysml.runtime.instructions.gpu.context.GPUObject) 2
LeftScalarOperator (org.apache.sysml.runtime.matrix.operators.LeftScalarOperator) 2
RightScalarOperator (org.apache.sysml.runtime.matrix.operators.RightScalarOperator) 2
LinkedList (java.util.LinkedList) 1
Map (java.util.Map) 1
CUfunction (jcuda.driver.CUfunction) 1
jcuda.jcudnn.cudnnActivationDescriptor (jcuda.jcudnn.cudnnActivationDescriptor) 1
jcuda.jcusparse.cusparseMatDescr (jcuda.jcusparse.cusparseMatDescr) 1
Builtin (org.apache.sysml.runtime.functionobjects.Builtin) 1
CM (org.apache.sysml.runtime.functionobjects.CM) 1