use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method biasMultiply.
/**
* Performs the operation corresponding to the DML script:
* ones = matrix(1, rows=1, cols=Hout*Wout)
* output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
* This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param bias bias
* @param outputBlock output
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
if (isInSparseFormat(gCtx, input)) {
input.getGPUObject(gCtx).sparseToDense(instName);
}
if (isInSparseFormat(gCtx, bias)) {
bias.getGPUObject(gCtx).sparseToDense(instName);
}
long rows = input.getNumRows();
long cols = input.getNumColumns();
long K = bias.getNumRows();
long PQ = cols / K;
if (bias.getNumColumns() != 1 || cols % K != 0) {
throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
}
Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
long t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
getCudaKernels(gCtx).launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations((int) rows, (int) cols), imagePointer, biasPointer, outputPointer, (int) rows, (int) cols, (int) PQ);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method conv2dBackwardData.
/**
* This method computes the backpropogation errors for previous layer of convolution operation
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param filter filter used in conv2d
* @param dout errors from next layer
* @param output output errors
* @param N number of images
* @param C number of channels
* @param H height
* @param W width
* @param K number of filters
* @param R filter height
* @param S filter width
* @param pad_h pad height
* @param pad_w pad width
* @param stride_h stride height
* @param stride_w stride width
* @param P output activation height
* @param Q output activation width
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout, MatrixObject output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
cudnnFilterDescriptor wDesc = null;
cudnnConvolutionDescriptor convDesc = null;
Pointer workSpace = null;
long sizeInBytes = 0;
try {
long t1 = 0, t2 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
// Allocate descriptors
wDesc = allocateFilterDescriptor(K, C, R, S);
cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, output, N, C, H, W);
// Allocate data
Pointer w = getDensePointer(gCtx, filter, true, instName);
Pointer dy = getDensePointer(gCtx, dout, true, instName);
Pointer dx = getDensePointer(gCtx, output, true, instName);
int[] padding = { pad_h, pad_w };
int[] strides = { stride_h, stride_w };
convDesc = allocateConvolutionDescriptor(padding, strides);
long[] sizeInBytesArray = { 0 };
// TODO: Select the best algorithm depending on the data and supported CUDA
int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
workSpace = new Pointer();
cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx), wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS)
t2 = System.nanoTime();
int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w, dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
} finally {
long t3 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t3 = System.nanoTime();
if (workSpace != null && sizeInBytes != 0)
gCtx.cudaFreeHelper(instName, workSpace);
if (wDesc != null)
cudnnDestroyFilterDescriptor(wDesc);
if (convDesc != null)
cudnnDestroyConvolutionDescriptor(convDesc);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
}
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method sparseMatrixDenseVectorMult.
/**
* C = op(A) x B
* A is a sparse matrix, B is a dense vector
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output allocated output on the host, to which the GPU output C will be attached
* @param A sparse matrix A on the GPU
* @param B_dense dense matrix/vector B on the GPU
* @param isATranposed op for A, tranposed or not
* @param m number of rows in A (not op(A))
* @param k number of cols in A or number of rows in B (not op(A) or op(B))
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed, int m, int k) throws DMLRuntimeException {
LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
long size = m * Sizeof.DOUBLE;
if (isATranposed) {
size = k * Sizeof.DOUBLE;
transA = CUSPARSE_OPERATION_TRANSPOSE;
}
Pointer C_dense = gCtx.allocate(instName, (int) size);
long t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int) A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
//cudaDeviceSynchronize; // Since cusparseDcsrmv is asynchronously executed
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method denseDenseMatmult.
/**
* Dense-dense matrix multiply
* C = op(A) * op(B), A and B are dense matrices
* On the host, the matrices are in row-major format; cuBLAS expects them in column-major format.
* What we have as input is t(A) and t(B), t(X) = transpose of X.
* We do t(B) %*% t(A) to get t(C);
* If we were to calculate t(t(C), we would get the resultant matrix C, but this would be in column-major format.
* What we really want is t(C). This we already have as the result of t(B) %*% t(A).
* @param gCtx a valid {@link GPUContext}
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param output output allocated on GPU in column major format
* @param leftRows1 number of rows in A
* @param leftCols1 number of cols in A
* @param rightRows1 number of rows in B
* @param rightCols1 number of cols in B
* @param isLeftTransposed1 op for A, transposed or not
* @param isRightTransposed1 op for B, transposed or not
* @param leftPtr A allocated on the GPU in row-major format
* @param rightPtr B allocated on the GPU in row-major format
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException {
LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
Pointer A = rightPtr;
Pointer B = leftPtr;
// To compensate for the input matrices being in row-major format instead of column-major (the way cublas expects)
int leftRows = rightCols1;
int leftCols = rightRows1;
int rightRows = leftCols1;
int rightCols = leftRows1;
boolean isLeftTransposed = isRightTransposed1;
boolean isRightTransposed = isLeftTransposed1;
// Note: the dimensions are swapped
int m = isLeftTransposed ? leftCols : leftRows;
int n = isRightTransposed ? rightRows : rightCols;
int k = isLeftTransposed ? leftRows : leftCols;
int k1 = isRightTransposed ? rightCols : rightRows;
if (k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
if (m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
double[] one = { 1 };
double[] zero = { 0 };
//int lda = leftRows;
//int ldb = leftCols;
int lda = isLeftTransposed ? k : m;
int ldb = isRightTransposed ? n : k;
int ldc = m;
int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
long t0 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
Pointer C = output;
if (m == 1 && n == 1) {
// Vector product
LOG.debug(" GPU Dense-dense Vector Product");
double[] result = { 0 };
JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(result));
// By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST.
// This means that scalar values passed are on host (as opposed to on device).
// The result is copied from the host back to the device so that the rest of
// infrastructure can treat it uniformly.
cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
} else if (m == 1) {
// Vector-matrix multiply
LOG.debug(" GPU Dense Vector-Matrix Multiply");
transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0);
} else if (n == 1) {
// Matrix-vector multiply
LOG.debug(" GPU Dense Matrix-Vector Multiply");
JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0);
} else {
LOG.debug(" GPU Dense-Dense Matrix Multiply ");
JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0);
}
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method reduceAll.
/**
* Do a simple reduction, the output of which is a single value
* @param gCtx a valid {@link GPUContext}
* @param kernelFunction name of the kernel function to invoke
* @param in {@link Pointer} to matrix in device memory
* @param n size of array
* @return the reduced value
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
int[] tmp = getKernelParamsForReduceAll(gCtx, n);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
long t1 = 0, t2 = 0, t3 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
//cudaDeviceSynchronize;
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
int s = blocks;
while (s > 1) {
tmp = getKernelParamsForReduceAll(gCtx, s);
blocks = tmp[0];
threads = tmp[1];
sharedMem = tmp[2];
if (GPUStatistics.DISPLAY_STATISTICS)
t2 = System.nanoTime();
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
s = (s + (threads * 2 - 1)) / (threads * 2);
}
double[] result = { -1f };
if (GPUStatistics.DISPLAY_STATISTICS)
t3 = System.nanoTime();
cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
gCtx.cudaFreeHelper(instName, tempOut);
return result[0];
}
Aggregations