use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUObject method denseRowMajorToColumnMajor.
/**
 * Convenience method. Replaces this object's dense device buffer, which is
 * treated as row-major, with its column-major equivalent. The conversion is
 * done by transposing into a new buffer, releasing the previous buffer and
 * installing the new pointer.
 *
 * @throws DMLRuntimeException if the data is not allocated or the transpose fails
 */
public void denseRowMajorToColumnMajor() throws DMLRuntimeException {
    LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
    final int rows = toIntExact(mat.getNumRows());
    final int cols = toIntExact(mat.getNumColumns());
    if (!isAllocated()) {
        throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
    }
    // Leading dimensions handed to transpose: lda = cols, ldc = rows.
    final Pointer columnMajor = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), rows, cols, cols, rows);
    // The old buffer is released only after the transpose has produced its replacement.
    final Pointer rowMajor = getJcudaDenseMatrixPtr();
    cudaFreeHelper(rowMajor);
    setDenseMatrixCudaPointer(columnMajor);
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUContext method clearTemporaryMemory.
/**
 * Clears up the memory used to optimize cudaMalloc/cudaFree calls.
 * <p>
 * Every pointer currently held in {@code freeCUDASpaceMap} is released and
 * both {@code freeCUDASpaceMap} and {@code cudaBlockSizeMap} are emptied.
 * Size entries for pointers that still belong to an object in
 * {@code allocatedGPUObjects} are saved beforehand and restored afterwards.
 * Only pointers that are non-null and actually tracked in
 * {@code cudaBlockSizeMap} are carried over, so no {@code null} key or
 * {@code null} size is ever written back.
 */
public void clearTemporaryMemory() {
    // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up.
    HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
    for (GPUObject o : allocatedGPUObjects) {
        if (o.isSparse()) {
            CSRPointer p = o.getSparseMatrixCudaPointer();
            if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) {
                tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr));
            }
            if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) {
                tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd));
            }
            if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) {
                tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val));
            }
        } else {
            Pointer p = o.getJcudaDenseMatrixPtr();
            // Same guard as the sparse pointers: an unguarded put would store a
            // null key and/or a null size that is later restored into cudaBlockSizeMap.
            if (p != null && cudaBlockSizeMap.containsKey(p)) {
                tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
            }
        }
    }
    // garbage collect all temporarily allocated spaces
    for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
        for (Pointer p : l) {
            cudaFreeHelper(p, true);
        }
    }
    cudaBlockSizeMap.clear();
    freeCUDASpaceMap.clear();
    // Restore only those entries for which there are still blocks on the GPU
    cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUObject method transpose.
/**
 * Transposes a dense matrix on the GPU by calling the cublasDgeam operation.
 * The result is written into a buffer freshly allocated through {@code gCtx};
 * the input buffer is not released here.
 *
 * @param gCtx a valid {@link GPUContext}
 * @param densePtr pointer to the dense matrix on the GPU
 * @param m number of rows passed to cublasDgeam (rows of the output matrix)
 * @param n number of columns passed to cublasDgeam (columns of the output matrix)
 * @param lda leading dimension passed to cublasDgeam for the input matrix
 * @param ldc leading dimension passed to cublasDgeam for the output matrix
 * @return pointer to the newly allocated transposed matrix
 * @throws DMLRuntimeException if operation failed
 */
public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
    LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx);
    // Host-side scalars for geam: result = 1.0 * op(A) + 0.0 * op(B).
    final Pointer scaleA = Pointer.to(new double[] { 1.0 });
    final Pointer scaleB = Pointer.to(new double[] { 0.0 });
    final long outputBytes = ((long) m) * getDoubleSizeOf(n);
    final Pointer transposed = gCtx.allocate(outputBytes);
    // B is weighted by zero, so an empty Pointer is handed in as its placeholder.
    JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n,
            scaleA, densePtr, lda,
            scaleB, new Pointer(), lda,
            transposed, ldc);
    return transposed;
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method matmultTSMM.
//********************************************************************/
//************* End of DEEP LEARNING Operators ***********************/
//********************************************************************/
//********************************************************************/
//********** TRANSPOSE SELF MATRIX MULTIPLY Functions ****************/
//********************************************************************/
/**
 * Transpose-self matrix multiply (tsmm) on the GPU, i.e. A %*% A' or A' %*% A.
 * Sparse input is delegated to {@code matmult}; dense input is handled with
 * cublasDsyrk(...) followed by {@code copyUpperToLowerTriangle} on the output.
 *
 * @param ec execution context
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param left the input matrix A; in a tsmm expression only the left operand needs inspecting to know which side is transposed
 * @param outputName output matrix name
 * @param isLeftTransposed if true, the left operand is the transposed one (A' %*% A)
 * @throws DMLRuntimeException if the GPU context does not match, the dimensions are incorrect, or input/output is not allocated
 */
public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName, boolean isLeftTransposed) throws DMLRuntimeException {
    LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
    if (ec.getGPUContext() != gCtx) {
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    }

    // For sparse TSMM, invoke matmult (TODO: possible performance improvement)
    if (isInSparseFormat(gCtx, left)) {
        matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
        return;
    }

    // Dense path: cublasDsyrk(...) plus a custom kernel that mirrors the triangle.
    MatrixObject output = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);

    // CuBLAS expects column-major inputs, so the operation flag and the
    // dimensions are swapped relative to the row-major view.
    final int transa;
    final int m;
    final int k;
    if (isLeftTransposed) {
        transa = cublasOperation.CUBLAS_OP_N;
        m = (int) left.getNumColumns();
        k = (int) left.getNumRows();
    } else {
        transa = cublasOperation.CUBLAS_OP_T;
        m = (int) left.getNumRows();
        k = (int) left.getNumColumns();
    }
    if (m == -1) {
        throw new DMLRuntimeException("Incorrect dimensions");
    }
    final int lda = isLeftTransposed ? m : k;
    final int ldc = m;

    if (!left.getGPUObject(gCtx).isAllocated()) {
        throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated());
    }
    if (!output.getGPUObject(gCtx).isAllocated()) {
        throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject(gCtx).isAllocated());
    }

    final Pointer inputPtr = getDensePointer(gCtx, left, instName);
    final Pointer outputPtr = getDensePointer(gCtx, output, instName);

    long syrkStart = 0;
    long mirrorStart = 0;
    if (GPUStatistics.DISPLAY_STATISTICS) {
        syrkStart = System.nanoTime();
    }
    JCublas2.cublasDsyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER, transa, m, k, one(), inputPtr, lda, zero(), outputPtr, ldc);
    if (GPUStatistics.DISPLAY_STATISTICS) {
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - syrkStart);
        mirrorStart = System.nanoTime();
    }
    copyUpperToLowerTriangle(gCtx, instName, output);
    if (GPUStatistics.DISPLAY_STATISTICS) {
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - mirrorStart);
    }
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method solve.
/**
 * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1).
 * <p>
 * Only dense inputs are supported. The system is solved through a QR
 * factorization (geqrf), followed by computing Q^T*B (ormqr) and the
 * triangular solve x = R \ Q^T*B (trsm). The computation runs on
 * column-major clones of both inputs; the first n entries of the result are
 * copied into the output matrix and all temporary device buffers (workspace,
 * tau, devInfo and the clones) are released before returning normally.
 *
 * @param ec a valid {@link ExecutionContext}
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in1 input matrix A
 * @param in2 input matrix B (must have as many rows as A and exactly one column)
 * @param outputName name of the output matrix
 * @throws DMLRuntimeException if the GPU context does not match, an input is sparse, the dimensions are
 *         incompatible, or geqrf/ormqr report an error
 */
public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");

    // x = solve(A, b)
    LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);

    // Only the dense/dense combination is implemented; any sparse operand is rejected.
    if (isInSparseFormat(gCtx, in1) || isInSparseFormat(gCtx, in2))
        throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");

    long t0 = -1;

    GPUObject Aobj = in1.getGPUObject(gCtx);
    GPUObject bobj = in2.getGPUObject(gCtx);
    int m = (int) in1.getNumRows();
    int n = (int) in1.getNumColumns();
    if ((int) in2.getNumRows() != m)
        throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
    if ((int) in2.getNumColumns() != 1)
        throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");

    // Work on column-major clones of A and b: the library calls below overwrite
    // the buffers they are given, so the original inputs must not be passed in.
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    GPUObject ATobj = (GPUObject) Aobj.clone();
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    ATobj.denseRowMajorToColumnMajor();
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
    Pointer A = ATobj.getJcudaDenseMatrixPtr();

    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    GPUObject bTobj = (GPUObject) bobj.clone();
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    bTobj.denseRowMajorToColumnMajor();
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
    Pointer b = bTobj.getJcudaDenseMatrixPtr();

    // step 3: query working space of geqrf and ormqr
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    int[] lwork = { 0 };
    JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);

    // step 4: compute QR factorization
    Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
    // geqrf needs at least min(m, n) entries in tau; m entries always satisfies that.
    Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
    // Single int on the device that geqrf/ormqr use to report their status.
    Pointer devInfo = gCtx.allocate(Sizeof.INT);
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
    int[] qrError = { -1 };
    cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
    if (qrError[0] != 0) {
        throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
    }

    // step 5: compute Q^T*B
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
    cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
    if (qrError[0] != 0) {
        throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
    }

    // step 6: compute x = R \ Q^T*B
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    JCublas2.cublasDtrsm(gCtx.getCublasHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, n, 1, pointerTo(1.0), A, m, b, m);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);

    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    bTobj.denseColumnMajorToRowMajor();
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);

    // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
    // There is an avoidable copy happening here
    MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
    // Only the first n values of the solved right-hand side form the n x 1 result.
    cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);

    // Release every temporary device buffer, including devInfo.
    gCtx.cudaFreeHelper(instName, work);
    gCtx.cudaFreeHelper(instName, tau);
    gCtx.cudaFreeHelper(instName, devInfo);
    ATobj.clearData();
    bTobj.clearData();
}
Aggregations