use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.
the class LibMatrixCUDA method sparseMatrixVectorMult.
/**
 * Does a sparse matrix-vector multiply.
 * C = op(A) x B, A is a sparse matrix, B is a sparse vector with numCols = 1.
 * <p>
 * B is first expanded into a temporary dense vector on the device, the product is
 * delegated to {@code sparseMatrixDenseVectorMult}, and the temporary is released afterwards.
 *
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param output allocated output object C to which the GPU output matrix will be attached
 * @param isATranposed if A is to be transposed or not (the op in op(A))
 * @param m number of rows in A (not op(A))
 * @param n number of cols in A (not op(A))
 * @param k number of rows in B, (cols in B is assumed to be 1)
 * @param A left sparse matrix on GPU
 * @param B right sparse vector on GPU
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k, CSRPointer A, CSRPointer B) throws DMLRuntimeException {
    long t0 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    // Expand the k x 1 sparse vector into a dense device buffer.
    Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
    // The delegate is given the dimensions of A itself (rows, cols), exactly as the other
    // call site in sparseDenseMatmult does. Using k (rows of B) for the column count is only
    // right when A is not transposed; with a transposed A, k equals m rather than n.
    sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDenseVector, isATranposed, m, n);
    // Release the temporary dense copy of B; previously it was never freed.
    gCtx.cudaFreeHelper(instName, BDenseVector);
}
use of org.apache.sysml.runtime.instructions.gpu.context.CSRPointer in project incubator-systemml by apache.
the class LibMatrixCUDA method sparseDenseMatmult.
/**
 * C = op(A) * op(B) where A is sparse and B is dense.
 * <ul>
 * <li>If {@code n == 1}, a sparse matrix - dense vector multiply is performed via
 * {@code sparseMatrixDenseVectorMult}.</li>
 * <li>If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked.</li>
 * <li>Otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.</li>
 * </ul>
 * Temporary device buffers created for the conversions are freed before returning.
 *
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param output the output matrix object
 * @param left matrix A
 * @param right matrix B
 * @param isLeftTransposed if A needs to be transposed
 * @param isRightTransposed if B needs to be transposed
 * @param m presumably the number of rows of op(A); used in the ultra-sparsity check and forwarded to sparseSparseMatmult (TODO confirm against caller)
 * @param n presumably the number of columns of op(B); {@code n == 1} selects the matrix-vector path (TODO confirm against caller)
 * @param k presumably the shared dimension of op(A) and op(B); used in the ultra-sparsity check and forwarded to sparseSparseMatmult (TODO confirm against caller)
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
    CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
    Pointer BDense = getDensePointer(gCtx, right, instName);
    if (n == 1) {
        // Sparse Matrix - Dense Vector multiply
        sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int) left.getNumRows(), (int) left.getNumColumns());
    } else {
        // t0 feeds both the per-instruction timers (DISPLAY_STATISTICS) and the global
        // counters (STATISTICS), so it must be captured when either flag is on.
        final boolean needStartTime = DMLScript.STATISTICS || GPUStatistics.DISPLAY_STATISTICS;
        long t0 = 0, t1 = 0, t2 = 0;
        // Sparse Matrix Dense Matrix multiply
        if (A.isUltraSparse(m, k)) {
            LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
            // Convert right to CSR and do cuSparse matmul
            int rowsB = (int) right.getNumRows();
            int colsB = (int) right.getNumColumns();
            if (needStartTime)
                t0 = System.nanoTime();
            Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
            if (GPUStatistics.DISPLAY_STATISTICS)
                t1 = System.nanoTime();
            CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
            if (DMLScript.STATISTICS) {
                // The global dense-to-sparse time includes the transpose (measured from t0).
                GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
                GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
            }
            sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
            if (GPUStatistics.DISPLAY_STATISTICS)
                t2 = System.nanoTime();
            // Release both temporaries: the CSR copy of B and the transposed dense buffer.
            B.deallocate();
            gCtx.cudaFreeHelper(BT);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
        } else {
            LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
            // Convert left to dense. The conversion produces a column-major buffer, which
            // (presumably, given the variable name and the flipped transpose flag below) is
            // treated as the transpose of A.
            // Note the arguments to denseDenseMatmult to accommodate for this.
            if (needStartTime)
                t0 = System.nanoTime();
            Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int) left.getNumRows(), (int) left.getNumColumns());
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
            if (DMLScript.STATISTICS) {
                GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
                // Count one conversion; previously the elapsed time was added to the counter.
                GPUStatistics.cudaSparseToDenseCount.getAndAdd(1);
            }
            if (GPUStatistics.DISPLAY_STATISTICS)
                t1 = System.nanoTime();
            // To allocate the dense matrix
            boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
            if (allocated && GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
            Pointer C = getDensePointer(gCtx, output, instName);
            // Rows/cols of the left operand are swapped and its transpose flag inverted
            // because the dense buffer holds A in transposed layout.
            denseDenseMatmult(gCtx, instName, C, (int) left.getNumColumns(), (int) left.getNumRows(), (int) right.getNumRows(), (int) right.getNumColumns(), !isLeftTransposed, isRightTransposed, ADenseTransposed, BDense);
            gCtx.cudaFreeHelper(instName, ADenseTransposed);
        }
    }
}
Aggregations