Use of jcuda.Pointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method biasAdd.
/**
 * Performs the operation corresponding to the DML script:
 *   ones = matrix(1, rows=1, cols=Hout*Wout)
 *   output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 * This operation is often followed by conv2d and hence we have introduced the bias_add(input, bias) built-in function.
 *
 * @param gCtx        a valid {@link GPUContext}
 * @param instName    the invoking instruction's name, for recording {@link Statistics}
 * @param input       input image
 * @param bias        bias
 * @param outputBlock output
 * @throws DMLRuntimeException if a DMLRuntimeException occurs
 */
public static void biasAdd(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
    Pointer imagePointer = getDensePointer(gCtx, input, instName);
    Pointer biasPointer = getDensePointer(gCtx, bias, instName);
    Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
    int rows = (int) input.getNumRows();
    int cols = (int) input.getNumColumns();
    int K = (int) bias.getNumRows();
    if (bias.getNumColumns() != 1 || cols % K != 0) {
        throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
    }
    biasAdd(gCtx, instName, imagePointer, biasPointer, outputPointer, rows, cols, K);
}
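As a rough CPU-side illustration of what the underlying bias_add kernel computes (a hypothetical reference, not code from LibMatrixCUDA), each of the K bias values is broadcast over a contiguous block of cols/K columns of the row-major input:

public static double[] biasAddReference(double[] input, double[] bias, int rows, int cols, int K) {
    // Hypothetical reference implementation; assumes a dense row-major input of shape rows x cols
    // and a K x 1 bias with cols % K == 0, mirroring the checks in biasAdd above.
    int blockSize = cols / K; // = Hout * Wout in the DML script
    double[] output = new double[rows * cols];
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            output[i * cols + j] = input[i * cols + j] + bias[j / blockSize];
        }
    }
    return output;
}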
Use of jcuda.Pointer in project incubator-systemml by apache.
The class CSRPointer, method toColumnMajorDenseMatrix.
/**
 * Copies this CSR matrix on the GPU to a dense column-major matrix
 * on the GPU. This is a temporary matrix for operations such as
 * cusparseDcsrmv.
 * Since the allocated matrix is temporary, bookkeeping is not updated.
 * The caller is responsible for calling "free" on the returned Pointer object.
 *
 * @param cusparseHandle a valid {@link cusparseHandle}
 * @param cublasHandle   a valid {@link cublasHandle}
 * @param rows           number of rows in this CSR matrix
 * @param cols           number of columns in this CSR matrix
 * @return a {@link Pointer} to the allocated dense matrix (in column-major format)
 * @throws DMLRuntimeException if a DMLRuntimeException occurs
 */
public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException {
    LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext());
    long size = ((long) rows) * getDoubleSizeOf((long) cols);
    Pointer A = allocate(size);
    // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
    if (val != null && rowPtr != null && colInd != null && nnz > 0) {
        // Note: the cusparseDcsr2dense method cannot handle empty blocks
        cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
        //cudaDeviceSynchronize;
    } else {
        LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
    }
    return A;
}
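For intuition, the layout produced on the device corresponds to the following CPU-side conversion of the CSR arrays (val, rowPtr, colInd) into a zero-initialized column-major buffer. This is a hypothetical sketch of the data movement only; the real conversion is performed by cusparseDcsr2dense on GPU memory:

public static double[] csrToColumnMajorDense(int[] rowPtr, int[] colInd, double[] val, int rows, int cols) {
    // Hypothetical CPU sketch, not project code: scatter each stored CSR entry into a
    // column-major dense array; untouched cells stay zero, matching the empty-block case above.
    double[] dense = new double[rows * cols];
    for (int i = 0; i < rows; i++) {
        for (int k = rowPtr[i]; k < rowPtr[i + 1]; k++) {
            int j = colInd[k];
            dense[j * rows + i] = val[k]; // column-major: element (i, j) lives at index j * rows + i
        }
    }
    return dense;
}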
Use of jcuda.Pointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method conv2d.
public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
    cudnnFilterDescriptor filterDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    try {
        long t1 = 0, t2 = 0;
        // Allocate descriptors
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
        cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
        filterDesc = allocateFilterDescriptor(K, C, R, S);
        int[] padding = { pad_h, pad_w };
        int[] strides = { stride_h, stride_w };
        convDesc = allocateConvolutionDescriptor(padding, strides);
        // Select the best algorithm depending on the data and supported CUDA
        int algo = -1;
        workSpace = new Pointer();
        if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
            algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
            int[] algos = { jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_GEMM, jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM };
            // TODO: Look into FFT, Winograd, etc.
            // Also ensure that the GPU has enough memory to allocate
            long[] sizeInBytesArray = { 0 };
            algo = jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
            cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
            if (sizeInBytesArray[0] != 0)
                workSpace = gCtx.allocate(sizeInBytesArray[0]);
            sizeInBytes = sizeInBytesArray[0];
        } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
            throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
        } else {
            throw new DMLRuntimeException("Unsupported preference criteria for convolution");
        }
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(), srcTensorDesc, image, filterDesc, filter, convDesc, algo, workSpace, sizeInBytes, zero(), dstTensorDesc, output);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
        if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not execute cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        if (filterDesc != null)
            cudnnDestroyFilterDescriptor(filterDesc);
        if (convDesc != null)
            cudnnDestroyConvolutionDescriptor(convDesc);
        if (workSpace != null && sizeInBytes != 0)
            gCtx.cudaFreeHelper(instName, workSpace);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
    }
}
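The caller is expected to pass output dimensions P and Q that are consistent with the input, filter, padding, and stride arguments. Assuming the usual convolution arithmetic (a hypothetical helper, not part of LibMatrixCUDA):

public static int convOutputDim(int inputDim, int filterDim, int pad, int stride) {
    // Hypothetical helper illustrating the standard output-size formula for a strided,
    // zero-padded convolution; e.g. P = convOutputDim(H, R, pad_h, stride_h)
    // and Q = convOutputDim(W, S, pad_w, stride_w).
    return (inputDim + 2 * pad - filterDim) / stride + 1;
}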
Use of jcuda.Pointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method matrixScalarOp.
/**
 * Utility to launch the matrix-scalar operation kernel.
 *
 * @param ec                execution context
 * @param gCtx              a valid {@link GPUContext}
 * @param instName          the invoking instruction's name, for recording {@link Statistics}
 * @param in                input matrix
 * @param outputName        output variable name
 * @param isInputTransposed true if the input is transposed
 * @param op                operator
 * @throws DMLRuntimeException if a DMLRuntimeException occurs
 */
private static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (isInputTransposed)
        throw new DMLRuntimeException("Transposing the input is not supported");
    int rlenA = (int) in.getNumRows();
    int clenA = (int) in.getNumColumns();
    // TODO: FIXME: Implement sparse binCellSparseScalarOp kernel
    Pointer A = getDensePointer(gCtx, in, instName);
    double scalar = op.getConstant();
    MatrixObject out = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
    Pointer C = getDensePointer(gCtx, out, instName);
    matrixScalarOp(gCtx, instName, A, scalar, rlenA, clenA, C, op);
}
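The dense kernel launched by the overloaded matrixScalarOp applies the scalar operator to every cell of A. A hypothetical CPU reference of that effect, written against a generic java.util.function.DoubleBinaryOperator rather than SystemML's ScalarOperator:

public static double[] matrixScalarReference(double[] A, double scalar, int rlen, int clen,
        java.util.function.DoubleBinaryOperator fn) {
    // Hypothetical reference, not project code: combine each cell with the scalar constant,
    // e.g. fn = (x, s) -> x * s for a multiply operator.
    double[] C = new double[rlen * clen];
    for (int i = 0; i < rlen * clen; i++) {
        C[i] = fn.applyAsDouble(A[i], scalar);
    }
    return C;
}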
Use of jcuda.Pointer in project incubator-systemml by apache.
The class LibMatrixCUDA, method compareAndSet.
@SuppressWarnings("unused")
private static void compareAndSet(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, double compareVal, double tolerance, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    // TODO: FIXME: Implement sparse kernel
    Pointer A = getDensePointer(gCtx, in, instName);
    MatrixObject out = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
    Pointer ret = getDensePointer(gCtx, out, instName);
    int rlen = (int) out.getNumRows();
    int clen = (int) out.getNumColumns();
    // out.getMatrixCharacteristics().setNonZeros(rlen*clen);
    // compareAndSet(double* A, double* ret, int rlen, int clen, double compareVal, double ifEqualsVal, double ifNotEqualsVal)
    long t0 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    getCudaKernels(gCtx).launchKernel("compare_and_set", ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen), A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COMPARE_AND_SET_KERNEL, System.nanoTime() - t0);
}
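Based on the kernel's parameter names, its element-wise semantics can be sketched on the CPU as follows (a hypothetical reference, not the actual compare_and_set kernel):

public static double[] compareAndSetReference(double[] A, int rlen, int clen, double compareVal,
        double tolerance, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) {
    // Hypothetical reference, not project code: cells within 'tolerance' of compareVal map to
    // ifEqualsVal, strictly smaller cells to ifLessThanVal, and larger cells to ifGreaterThanVal.
    double[] ret = new double[rlen * clen];
    for (int i = 0; i < rlen * clen; i++) {
        if (Math.abs(A[i] - compareVal) <= tolerance)
            ret[i] = ifEqualsVal;
        else if (A[i] < compareVal)
            ret[i] = ifLessThanVal;
        else
            ret[i] = ifGreaterThanVal;
    }
    return ret;
}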