use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUObject method denseColumnMajorToRowMajor.
/**
 * Convenience method. Converts a column-major dense matrix to a row-major dense matrix.
 * @throws DMLRuntimeException if error
 */
public void denseColumnMajorToRowMajor() throws DMLRuntimeException {
    LOG.trace("GPU : dense Ptr col-major -> row-major on " + this + ", GPUContext=" + getGPUContext());
    int n = toIntExact(mat.getNumRows());
    int m = toIntExact(mat.getNumColumns());
    int lda = n;
    int ldc = m;
    if (!isAllocated()) {
        throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
    }
    Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc);
    cudaFreeHelper(getJcudaDenseMatrixPtr());
    setDenseMatrixCudaPointer(tmp);
}
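The transpose helper used above is not shown in this snippet. Sketched against the cuBLAS API that JCuda exposes (jcuda.jcublas.JCublas2, jcuda.jcublas.cublasOperation), the core of such a transpose is a single cublasDgeam call with alpha = 1 and beta = 0; the names handle, in and out below are illustrative, not SystemML's actual implementation:

// A minimal sketch, assuming a valid jcuda.jcublas.cublasHandle 'handle';
// 'in' is the n x m column-major input (n = rows, m = cols).
Pointer out = getGPUContext().allocate(getDoubleSizeOf((long) m * n));
Pointer one = Pointer.to(new double[] { 1.0 });
Pointer zero = Pointer.to(new double[] { 0.0 });
// out = 1.0 * in^T + 0.0 * out; geam permits B == C when transb is CUBLAS_OP_N and ldb == ldc
JCublas2.cublasDgeam(handle, cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_N,
        m, n, one, in, n, zero, out, m, out, m);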
use of jcuda.Pointer in project incubator-systemml by apache.
the class JCudaKernels method launchKernel.
/**
 * Sets up the kernel parameters and launches the kernel using the cuLaunchKernel API.
 * This function supports three-dimensional grids and blocks.
 *
 * @param name name of the kernel
 * @param config execution configuration
 * @param arguments can be of type Pointer, long, double, float and int
 * @throws DMLRuntimeException if the kernel cannot be found in the module, or an argument is null or of an unsupported type
 */
public void launchKernel(String name, ExecutionConfig config, Object... arguments) throws DMLRuntimeException {
    CUfunction function = kernels.get(name);
    if (function == null) {
        function = new CUfunction();
        checkResult(cuModuleGetFunction(function, module, name));
        // Cache the function handle so subsequent launches avoid the module lookup.
        kernels.put(name, function);
    }
    // Set up parameters: each argument is boxed behind an extra level of indirection,
    // since cuLaunchKernel expects an array of pointers to the actual argument values.
    Pointer[] kernelParams = new Pointer[arguments.length];
    for (int i = 0; i < arguments.length; i++) {
        if (arguments[i] == null) {
            throw new DMLRuntimeException("The argument to the kernel cannot be null.");
        } else if (arguments[i] instanceof Pointer) {
            kernelParams[i] = Pointer.to((Pointer) arguments[i]);
        } else if (arguments[i] instanceof Integer) {
            kernelParams[i] = Pointer.to(new int[] { (Integer) arguments[i] });
        } else if (arguments[i] instanceof Double) {
            kernelParams[i] = Pointer.to(new double[] { (Double) arguments[i] });
        } else if (arguments[i] instanceof Long) {
            kernelParams[i] = Pointer.to(new long[] { (Long) arguments[i] });
        } else if (arguments[i] instanceof Float) {
            kernelParams[i] = Pointer.to(new float[] { (Float) arguments[i] });
        } else {
            throw new DMLRuntimeException("The argument of type " + arguments[i].getClass() + " is not supported.");
        }
    }
    // Launch the kernel using CUDA's driver API.
    checkResult(cuLaunchKernel(function,
            config.gridDimX, config.gridDimY, config.gridDimZ,
            config.blockDimX, config.blockDimY, config.blockDimZ,
            config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null));
    JCuda.cudaDeviceSynchronize();
}
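The boxing above means callers pass device pointers and raw scalars directly. A hypothetical invocation for a one-dimensional kernel; the kernel name "relu", the pointers inPtr/outPtr, and the ExecutionConfig(gridDimX, blockDimX) constructor are assumptions for illustration:

// Hypothetical usage sketch; not an actual SystemML call site.
int n = 1000;
int blockDim = 256;
int gridDim = (n + blockDim - 1) / blockDim; // ceiling division so all n elements are covered
kernels.launchKernel("relu", new ExecutionConfig(gridDim, blockDim), inPtr, outPtr, n);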
use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUObject method allocateDenseMatrixOnDevice.
void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
    LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
    assert !isAllocated() : "Internal error - trying to allocate a dense matrix on a GPUObject that is already allocated";
    long rows = mat.getNumRows();
    long cols = mat.getNumColumns();
    assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix";
    assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix";
    long size = getDoubleSizeOf(rows * cols);
    Pointer tmp = allocate(size);
    setDenseMatrixCudaPointer(tmp);
}
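The allocate helper goes through SystemML's GPU memory manager. Reduced to the raw JCuda runtime API, the allocation amounts to roughly the following (a sketch only; GPUContext adds bookkeeping, eviction, and statistics on top of this):

// Sketch of the underlying allocation: reserve 'size' bytes on the device and zero them.
Pointer tmp = new Pointer();
JCuda.cudaMalloc(tmp, size);
JCuda.cudaMemset(tmp, 0, size);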
use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUObject method columnMajorDenseToRowMajorSparse.
/**
 * Convenience method to convert a column-major dense matrix on the GPU to a CSR (sparse) matrix.
 * Since the allocated matrix is temporary, bookkeeping is not updated.
 * Note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT.
 * Caller is responsible for deallocating memory on the GPU.
 * @param gCtx a valid {@link GPUContext}
 * @param cusparseHandle handle to the cusparse library
 * @param densePtr [in] dense matrix pointer on the GPU in column major format
 * @param rows number of rows
 * @param cols number of columns
 * @return CSR (compressed sparse row) pointer
 * @throws DMLRuntimeException if the number of non-zeros cannot be computed
 */
public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cusparseHandle cusparseHandle, Pointer densePtr, int rows, int cols) throws DMLRuntimeException {
    cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
    Pointer nnzPerRowPtr = null;
    Pointer nnzTotalDevHostPtr = null;
    gCtx.ensureFreeSpace(getIntSizeOf(rows + 1));
    nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows));
    nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
    // Count the non-zeros per row (and the total) of the dense input.
    cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr);
    //cudaDeviceSynchronize();
    int[] nnzC = { -1 };
    long t2 = 0;
    if (DMLScript.STATISTICS)
        t2 = System.nanoTime();
    cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaFromDevCount.addAndGet(1);
    if (nnzC[0] == -1) {
        throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz for the dense matrix on the GPU");
    }
    LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse with nnz = " + nnzC[0] + ", GPUContext=" + gCtx);
    CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
    cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd);
    //cudaDeviceSynchronize();
    gCtx.cudaFreeHelper(nnzPerRowPtr);
    gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
    return C;
}
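Because this method does no bookkeeping, the caller owns the returned CSRPointer and must free it. A hypothetical call site; gCtx, handle and densePtr are assumed to be set up elsewhere, and deallocate() is assumed to be CSRPointer's cleanup entry point:

// Hypothetical usage sketch.
CSRPointer csr = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, handle, densePtr, rows, cols);
// ... use csr.val, csr.rowPtr, csr.colInd in subsequent cuSPARSE calls ...
csr.deallocate(); // caller-owned cleanup; the conversion updates no bookkeeping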