Search in sources :

Example 1 with CSRPointer

use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.

the class LibMatrixCUDA method bothSparseMatmult.

/**
	 * Sparse C = Sparse op(A) * Sparse op(B)
	 * Reroutes call to sparse matrix-vector mult if needed
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for record {@link Statistics}.
	 * @param output ?
	 * @param instName name of the invoking instruction to record{@link Statistics}.
	 * @param left ?
	 * @param right ?
	 * @param isLeftTransposed ?
	 * @param isRightTransposed ?
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
    int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
    int n = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
    int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
    int k1 = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
    if (k != k1)
        throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
    if (m == -1 || n == -1 || k == -1)
        throw new DMLRuntimeException("Incorrect dimensions");
    CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
    CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
    if (!isRightTransposed && right.getNumColumns() == 1) {
        // Matrix-Vector multiplication
        sparseMatrixVectorMult(gCtx, instName, output, isLeftTransposed, (int) left.getNumRows(), (int) left.getNumColumns(), (int) right.getNumRows(), A, B);
    } else {
        // Matrix-Matrix multiplication
        sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
    }
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 2 with CSRPointer

use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.

the class LibMatrixCUDA method dgeam.

/**
	 * Performs sparse and dense dgeam given two input matrices
	 * C = alpha* op( A ) + beta* op ( B )
	 * where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
	 * To indicate a transpose operation, make sure in1 == in2 and isLeftTransposed == isRightTransposed == true
	 * @param ec execution context
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for record {@link Statistics}.
	 * @param in1 left input matrix
	 * @param in2 right input matrix
	 * @param outputName output variable name
	 * @param isLeftTransposed true if left matrix is transposed
	 * @param isRightTransposed true if right matrix is transposed
	 * @param alpha alpha
	 * @param beta beta
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
    Pointer alphaPtr = pointerTo(alpha);
    Pointer betaPtr = pointerTo(beta);
    int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    MatrixObject out = ec.getMatrixObject(outputName);
    boolean isSparse1 = isInSparseFormat(gCtx, in1);
    boolean isSparse2 = isInSparseFormat(gCtx, in2);
    long t0 = 0, t1 = 0;
    // TODO: Implement sparse-dense matrix cublasDgeam kernel
    if (isSparse1 || isSparse2) {
        int m = (int) in1.getNumRows();
        int n = (int) in1.getNumColumns();
        // Perform sparse-sparse dgeam
        if (!isInSparseFormat(gCtx, in1)) {
            if (GPUStatistics.DISPLAY_STATISTICS)
                t0 = System.nanoTime();
            in1.getGPUObject(gCtx).denseToSparse();
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        if (!isInSparseFormat(gCtx, in2)) {
            if (GPUStatistics.DISPLAY_STATISTICS)
                t0 = System.nanoTime();
            in2.getGPUObject(gCtx).denseToSparse();
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        ec.allocateGPUMatrixObject(outputName);
        if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
            // Special case for transpose
            int nnz = (int) A.nnz;
            CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
        } else {
            // TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a tranpose
            if (isLeftTransposed || isRightTransposed) {
                throw new DMLRuntimeException("Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
            }
            if (GPUStatistics.DISPLAY_STATISTICS)
                t1 = System.nanoTime();
            CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            //long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
            if (GPUStatistics.DISPLAY_STATISTICS)
                t0 = System.nanoTime();
            JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, betaPtr, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
            //cudaDeviceSynchronize;
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
        }
    } else {
        // Dense-Dense dgeam
        int lda = (int) in1.getNumColumns();
        int ldb = (int) in2.getNumColumns();
        int m = (int) in1.getNumColumns();
        int n = (int) in2.getNumRows();
        if (isLeftTransposed && isRightTransposed) {
            m = (int) in1.getNumRows();
            n = (int) in2.getNumColumns();
        } else if (isLeftTransposed) {
            m = (int) in1.getNumRows();
        } else if (isRightTransposed) {
            n = (int) in2.getNumColumns();
        }
        int ldc = m;
        Pointer A = getDensePointer(gCtx, in1, instName);
        Pointer B = getDensePointer(gCtx, in2, instName);
        // Allocated the dense output matrix
        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer C = getDensePointer(gCtx, out, instName);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        JCublas2.cublasDgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 3 with CSRPointer

use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.

the class LibMatrixCUDA method sparseSparseMatmult.

/**
	 * Does a sparse-sparse Matrix multiply
	 * C = op(A) x op(B), A, B are sparse matrices
	 * @param gCtx              a valid {@link GPUContext}
	 * @param instName          the invoking instruction's name for record {@link Statistics}.
	 * @param A                 left sparse matrix on GPU
	 * @param B                 right sparse matrix on GPU
	 * @param output            allocated output object on host to which the GPU output matrix will be attached
	 * @param isLeftTransposed  op for A - to be transposed or not
	 * @param isRightTransposed op for B
	 * @param m                 number of rows in op(A)
	 * @param n                 number of cols in op(B)
	 * @param k                 number of cols in op(A) or rows in op(B)
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
    LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
    int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
    int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
    long t0 = 0, t1 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB, m, n, k);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
    output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
    if (GPUStatistics.DISPLAY_STATISTICS)
        t1 = System.nanoTime();
    cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
    //cudaDeviceSynchronize;
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)

Example 4 with CSRPointer

use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.

the class LibMatrixCUDA method denseSparseMatmult.

/**
	 * C = op(A) * op(B) where A is dense and B is sparse
	 * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
	 * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for record {@link Statistics}.
	 * @param left {@link MatrixObject} of A
	 * @param right {@link MatrixObject} of B
	 * @param output {@link MatrixObject} of the output matrix C
	 * @param isLeftTransposed whether matrix A needs to be transposed
	 * @param isRightTransposed whether matrix B needs to be transposed
	 * @param m ?
	 * @param n ?
	 * @param k ?
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
    // right sparse, left dense
    CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
    Pointer ADense = getDensePointer(gCtx, left, instName);
    if (B.isUltraSparse(k, n)) {
        LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
        // Convert left to CSR and do cuSparse matmul
        int rowsA = (int) left.getNumRows();
        int colsA = (int) left.getNumColumns();
        long t0 = 0, t1 = 0, t2 = 0;
        if (DMLScript.STATISTICS)
            t0 = System.nanoTime();
        Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
        sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        A.deallocate();
        gCtx.cudaFreeHelper(AT);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
    } else {
        LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
        // Convert right to dense and do a cuBlas matmul
        // BDenseTransposed is a column major matrix
        // Note the arguments to denseDenseMatmult to accommodate for this.
        long t0 = 0, t1 = 0;
        if (DMLScript.STATISTICS)
            t0 = System.nanoTime();
        Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int) right.getNumRows(), (int) right.getNumColumns());
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // To allocate the dense matrix
        boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
        if (GPUStatistics.DISPLAY_STATISTICS && allocated)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
        Pointer C = getDensePointer(gCtx, output, instName);
        denseDenseMatmult(gCtx, instName, C, (int) left.getNumRows(), (int) left.getNumColumns(), (int) right.getNumColumns(), (int) right.getNumRows(), isLeftTransposed, !isRightTransposed, ADense, BDenseTransposed);
        gCtx.cudaFreeHelper(instName, BDenseTransposed);
    }
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)

Example 5 with CSRPointer

use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.

the class LibMatrixCUDA method sparseMatrixDenseVectorMult.

/**
	 * C = op(A) x B
	 * A is a sparse matrix, B is a dense vector
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for record {@link Statistics}.
	 * @param output	allocated output on the host, to which the GPU output C will be attached
	 * @param A			sparse matrix A on the GPU
	 * @param B_dense	dense matrix/vector B on the GPU
	 * @param isATranposed	op for A, tranposed or not
	 * @param m			number of rows in A (not op(A))
	 * @param k			number of cols in A or number of rows in B (not op(A) or op(B))
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed, int m, int k) throws DMLRuntimeException {
    LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
    int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    long size = m * Sizeof.DOUBLE;
    if (isATranposed) {
        size = k * Sizeof.DOUBLE;
        transA = CUSPARSE_OPERATION_TRANSPOSE;
    }
    Pointer C_dense = gCtx.allocate(instName, (int) size);
    long t1 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t1 = System.nanoTime();
    cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int) A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
    //cudaDeviceSynchronize; 	// Since cusparseDcsrmv is asynchronously executed
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
    output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer)

Aggregations

CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)7 Pointer (jcuda.Pointer)5 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)2 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)1