Use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method bothSparseMatmult.
/**
* Sparse C = Sparse op(A) * Sparse op(B)
* Reroutes the call to sparse matrix-vector multiplication if needed.
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name, for recording {@link Statistics}
* @param output the output matrix object to which the GPU result is attached
* @param left left input matrix (sparse, on the GPU)
* @param right right input matrix (sparse, on the GPU)
* @param isLeftTransposed true if the left matrix is to be transposed
* @param isRightTransposed true if the right matrix is to be transposed
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
int n = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
int k1 = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
if (k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
if (m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
if (!isRightTransposed && right.getNumColumns() == 1) {
// Matrix-Vector multiplication
sparseMatrixVectorMult(gCtx, instName, output, isLeftTransposed, (int) left.getNumRows(), (int) left.getNumColumns(), (int) right.getNumRows(), A, B);
} else {
// Matrix-Matrix multiplication
sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
}
}
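The transpose-aware dimension bookkeeping at the top of bothSparseMatmult is easy to get wrong, so here is a minimal CPU-only sketch of the same check for C = op(A) %*% op(B). The class and method names are hypothetical; the logic simply mirrors the m/n/k validation above.

// Hypothetical helper mirroring the dimension checks in bothSparseMatmult.
// Returns {m, n, k} for C = op(A) %*% op(B), or throws on a mismatch.
public final class MatmultDims {
    public static int[] resolve(long rowsA, long colsA, boolean transA,
                                long rowsB, long colsB, boolean transB) {
        int m  = (int) (transA ? colsA : rowsA);   // rows of op(A) and of C
        int k  = (int) (transA ? rowsA : colsA);   // cols of op(A)
        int k1 = (int) (transB ? colsB : rowsB);   // rows of op(B)
        int n  = (int) (transB ? rowsB : colsB);   // cols of op(B) and of C
        if (k != k1)
            throw new IllegalArgumentException("Dimension mismatch: " + k + " != " + k1);
        if (m <= 0 || n <= 0 || k <= 0)
            throw new IllegalArgumentException("Incorrect dimensions");
        return new int[] { m, n, k };
    }
}

With isLeftTransposed = true, for example, m is taken from A's column count, exactly as in the method above.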
Use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method dgeam.
/**
* Performs sparse and dense dgeam given two input matrices
* C = alpha* op( A ) + beta* op ( B )
* where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
* To express a pure transpose, pass in1 == in2 and set both isLeftTransposed and isRightTransposed to true.
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name, for recording {@link Statistics}
* @param in1 left input matrix
* @param in2 right input matrix
* @param outputName output variable name
* @param isLeftTransposed true if left matrix is transposed
* @param isRightTransposed true if right matrix is transposed
* @param alpha alpha
* @param beta beta
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
if (ec.getGPUContext() != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
Pointer alphaPtr = pointerTo(alpha);
Pointer betaPtr = pointerTo(beta);
int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
MatrixObject out = ec.getMatrixObject(outputName);
boolean isSparse1 = isInSparseFormat(gCtx, in1);
boolean isSparse2 = isInSparseFormat(gCtx, in2);
long t0 = 0, t1 = 0;
// TODO: Implement sparse-dense matrix cublasDgeam kernel
if (isSparse1 || isSparse2) {
int m = (int) in1.getNumRows();
int n = (int) in1.getNumColumns();
// Perform sparse-sparse dgeam
if (!isInSparseFormat(gCtx, in1)) {
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
in1.getGPUObject(gCtx).denseToSparse();
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
}
CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
if (!isInSparseFormat(gCtx, in2)) {
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
in2.getGPUObject(gCtx).denseToSparse();
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
}
CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
ec.allocateGPUMatrixObject(outputName);
if (in1 == in2 && isLeftTransposed && isRightTransposed) {
// Special case for transpose
int nnz = (int) A.nnz;
CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
} else {
// TODO: to support transposed inputs in the sparse dgeam path, the matrices would need to be converted to CSC, which is effectively a transpose
if (isLeftTransposed || isRightTransposed) {
throw new DMLRuntimeException("Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
}
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
//long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, betaPtr, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
//cudaDeviceSynchronize;
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
}
} else {
// Dense-Dense dgeam
int lda = (int) in1.getNumColumns();
int ldb = (int) in2.getNumColumns();
int m = (int) in1.getNumColumns();
int n = (int) in2.getNumRows();
if (isLeftTransposed && isRightTransposed) {
m = (int) in1.getNumRows();
n = (int) in2.getNumColumns();
} else if (isLeftTransposed) {
m = (int) in1.getNumRows();
} else if (isRightTransposed) {
n = (int) in2.getNumColumns();
}
int ldc = m;
Pointer A = getDensePointer(gCtx, in1, instName);
Pointer B = getDensePointer(gCtx, in2, instName);
// Allocate the dense output matrix
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
Pointer C = getDensePointer(gCtx, out, instName);
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
JCublas2.cublasDgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
}
}
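To see how the dgeam formula C = alpha*op(A) + beta*op(B) covers the common cases, the following CPU reference (purely illustrative, not SystemML code, row-major dense arrays) computes the same result the cuSPARSE/cuBLAS paths above produce. Matrix addition is alpha = beta = 1 with no transposes; a plain transpose is the special case handled above, with in1 == in2, both transpose flags set, alpha = 1 and beta = 0.

// CPU reference of what dgeam computes: C = alpha*op(A) + beta*op(B).
// Matrices are dense, row-major double[rows*cols]; m, n are the dimensions of C.
public final class DgeamReference {
    public static double[] dgeam(double[] a, double[] b, int m, int n,
                                 boolean transA, boolean transB,
                                 double alpha, double beta) {
        double[] c = new double[m * n];
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
                // if transposed, the input is stored as an n x m row-major matrix
                double av = transA ? a[j * m + i] : a[i * n + j];
                double bv = transB ? b[j * m + i] : b[i * n + j];
                c[i * n + j] = alpha * av + beta * bv;
            }
        }
        return c;
    }
}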
Use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method sparseSparseMatmult.
/**
* Does a sparse-sparse matrix multiplication
* C = op(A) x op(B), where A and B are sparse matrices
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name, for recording {@link Statistics}
* @param A left sparse matrix on GPU
* @param B right sparse matrix on GPU
* @param output allocated output object on host to which the GPU output matrix will be attached
* @param isLeftTransposed op for A - to be transposed or not
* @param isRightTransposed op for B
* @param m number of rows in op(A)
* @param n number of cols in op(B)
* @param k number of cols in op(A) or rows in op(B)
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
long t0 = 0, t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB, m, n, k);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
//cudaDeviceSynchronize;
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
}
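The cusparseDcsrgemm call consumes the three arrays that every CSRPointer carries: val, rowPtr and colInd. As a reminder of that layout, here is a small host-side example for a 3x4 matrix; the arrays only mirror the CSRPointer field names and hold no GPU state.

// Host-side CSR representation of the 3x4 matrix
//   [ 1 0 0 2 ]
//   [ 0 0 3 0 ]
//   [ 0 4 0 0 ]
// val    : non-zero values in row-major order
// rowPtr : index into val/colInd where each row starts (length rows+1)
// colInd : column index of each non-zero
double[] val    = { 1.0, 2.0, 3.0, 4.0 };
int[]    rowPtr = { 0, 2, 3, 4 };
int[]    colInd = { 0, 3, 2, 1 };
// nnz of row i = rowPtr[i+1] - rowPtr[i]; total nnz = rowPtr[rows] = 4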
Use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method denseSparseMatmult.
/**
* C = op(A) * op(B) where A is dense and B is sparse
* If B is ultra-sparse, A is converted to a sparse matrix and {@code sparseSparseMatmult} is invoked;
* otherwise B is converted to a dense matrix and {@code denseDenseMatmult} is invoked.
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name, for recording {@link Statistics}
* @param left {@link MatrixObject} of A
* @param right {@link MatrixObject} of B
* @param output {@link MatrixObject} of the output matrix C
* @param isLeftTransposed whether matrix A needs to be transposed
* @param isRightTransposed whether matrix B needs to be transposed
* @param m number of rows in op(A) and in the output C
* @param n number of columns in op(B) and in the output C
* @param k number of columns in op(A), equal to the number of rows in op(B)
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
// right sparse, left dense
CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
Pointer ADense = getDensePointer(gCtx, left, instName);
if (B.isUltraSparse(k, n)) {
LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
// Convert left to CSR and do cuSparse matmul
int rowsA = (int) left.getNumRows();
int colsA = (int) left.getNumColumns();
long t0 = 0, t1 = 0, t2 = 0;
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
if (DMLScript.STATISTICS)
GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
if (GPUStatistics.DISPLAY_STATISTICS)
t2 = System.nanoTime();
A.deallocate();
gCtx.cudaFreeHelper(AT);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
} else {
LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
// Convert right to dense and do a cuBlas matmul
// BDenseTransposed is a column major matrix
// Note the arguments to denseDenseMatmult to accommodate for this.
long t0 = 0, t1 = 0;
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int) right.getNumRows(), (int) right.getNumColumns());
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseToDenseCount.getAndAdd(1);
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
// To allocate the dense matrix
boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
if (GPUStatistics.DISPLAY_STATISTICS && allocated)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
Pointer C = getDensePointer(gCtx, output, instName);
denseDenseMatmult(gCtx, instName, C, (int) left.getNumRows(), (int) left.getNumColumns(), (int) right.getNumColumns(), (int) right.getNumRows(), isLeftTransposed, !isRightTransposed, ADense, BDenseTransposed);
gCtx.cudaFreeHelper(instName, BDenseTransposed);
}
}
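The branch taken above is decided by B.isUltraSparse(k, n). A rough sketch of that kind of test is shown below; the 0.0004 cutoff and the class name are placeholders chosen for illustration, not the constant SystemML actually uses.

// Hypothetical illustration of an "ultra-sparse" test like CSRPointer.isUltraSparse(rows, cols).
// The real cutoff used by SystemML may differ; 0.0004 here is only a placeholder.
public final class SparsityCheck {
    static final double ULTRA_SPARSITY_TURN_POINT = 0.0004; // assumed placeholder

    public static boolean isUltraSparse(long nnz, long rows, long cols) {
        double sparsity = ((double) nnz) / rows / cols;   // fraction of non-zeros
        return sparsity < ULTRA_SPARSITY_TURN_POINT;
    }
}

When the matrix is ultra-sparse, converting the dense operand to CSR and running a sparse-sparse product avoids materializing a mostly empty dense copy of B; otherwise densifying B and calling cuBLAS, as in the else branch above, is the cheaper option.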
Use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method sparseMatrixDenseVectorMult.
/**
* C = op(A) x B
* A is a sparse matrix, B is a dense vector
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name, for recording {@link Statistics}
* @param output allocated output on the host, to which the GPU output C will be attached
* @param A sparse matrix A on the GPU
* @param B_dense dense matrix/vector B on the GPU
* @param isATranposed op for A, transposed or not
* @param m number of rows in A (not op(A))
* @param k number of cols in A or number of rows in B (not op(A) or op(B))
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed, int m, int k) throws DMLRuntimeException {
LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
long size = m * Sizeof.DOUBLE;
if (isATranposed) {
size = k * Sizeof.DOUBLE;
transA = CUSPARSE_OPERATION_TRANSPOSE;
}
Pointer C_dense = gCtx.allocate(instName, (int) size);
long t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int) A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
//cudaDeviceSynchronize; // Since cusparseDcsrmv is asynchronously executed
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
}
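For reference, with one() and zero() as alpha and beta the cusparseDcsrmv call reduces to y = op(A) * x. The CPU sketch below computes the non-transposed case from the same CSR arrays; it is an illustration only, not SystemML code.

// CPU reference for y = A * x with A in CSR form (non-transposed case).
// val/rowPtr/colInd follow the same layout as CSRPointer; m = number of rows in A.
public static double[] csrMatVec(double[] val, int[] rowPtr, int[] colInd,
                                 double[] x, int m) {
    double[] y = new double[m];
    for (int i = 0; i < m; i++) {
        double sum = 0;
        for (int p = rowPtr[i]; p < rowPtr[i + 1]; p++) {
            sum += val[p] * x[colInd[p]];     // accumulate row i against x
        }
        y[i] = sum;
    }
    return y;
}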