use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method batchNormalizationForwardInference.
/**
 * Runs the inference-time forward pass of a batch normalization layer via cuDNN
 * (spatial batch-norm mode) and writes the normalized result into {@code ret}.
 *
 * @param gCtx a valid {@link GPUContext}
 * @param instName name of the instruction
 * @param image input image
 * @param scale scale (cuDNN terminology), i.e. gamma in the original paper: shape [1, C, 1, 1]
 * @param bias bias (cuDNN terminology), i.e. beta in the original paper: shape [1, C, 1, 1]
 * @param runningMean running mean accumulated during the training phase: shape [1, C, 1, 1]
 * @param runningVar running variance accumulated during the training phase: shape [1, C, 1, 1]
 * @param ret normalized input (output of this call)
 * @param epsilon epsilon value used in the batch normalization formula
 * @throws DMLRuntimeException if error occurs
 */
public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
        MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
        MatrixObject ret, double epsilon) throws DMLRuntimeException {
    LOG.trace("GPU : batchNormalizationForwardInference, GPUContext=" + gCtx);
    int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;

    // The number of channels is taken from the scale vector, not from the image.
    int numImages = (int) image.getNumRows();
    int numChannels = (int) scale.getNumColumns();
    long colsPerImage = image.getNumColumns();
    validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, numChannels);

    // Tensor descriptors: one shared by input and output, one for the per-channel parameters.
    MatrixObject[] inputs = new MatrixObject[] { image };
    MatrixObject[] outputs = new MatrixObject[] { ret };
    cudnnTensorDescriptor dataDesc = allocateNCHWDescriptors(gCtx, numImages, numChannels, colsPerImage, inputs, outputs);
    cudnnTensorDescriptor paramDesc = allocateTensorDescriptor(gCtx, scale, 1, numChannels, 1, 1);

    // Dense device pointers, fetched in the same order as before.
    Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
    Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
    Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
    Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
    Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
    Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);

    int status = cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
            dataDesc, imagePtr, dataDesc, retPtr,
            paramDesc, scalePtr, biasPtr, runningMeanPtr, runningVarPtr, epsilon);
    checkStatus(status);
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method relu.
/**
 * Applies the relu operation to {@code in} on the GPU and stores the result in the
 * matrix registered under {@code outputName}.
 * <p>
 * The cuDNN path ({@code performCuDNNReLU}) is taken when the input has a tensor
 * descriptor and fewer than {@code numDoublesIn2GB} cells; otherwise the custom
 * "relu" CUDA kernel is launched.
 *
 * @param ec currently active {@link ExecutionContext}
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in input matrix
 * @param outputName name of the output matrix
 * @throws DMLRuntimeException if an error occurs
 */
public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx) {
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    }

    long numRows = in.getNumRows();
    long numCols = in.getNumColumns();
    MatrixObject result = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);

    cudnnTensorDescriptor inputDesc = in.getGPUObject(gCtx).getTensorDescriptor();
    boolean useCuDNN = inputDesc != null && numRows * numCols < numDoublesIn2GB;
    if (useCuDNN) {
        Pointer resultPtr = getDensePointer(gCtx, result, true, instName);
        performCuDNNReLU(gCtx, instName, in, resultPtr, inputDesc);
        return;
    }

    // Fallback: relu(double* A, double* ret, int rlen, int clen)
    LOG.trace("GPU : relu custom kernel, GPUContext=" + gCtx);
    long startNanos = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
    Pointer resultPtr = getDensePointer(gCtx, result, instName);
    // TODO: FIXME: Add sparse kernel support for relu
    Pointer inputPtr = getDensePointer(gCtx, in, instName);
    int rlen = (int) numRows;
    int clen = (int) numCols;
    getCudaKernels(gCtx).launchKernel("relu", ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
            inputPtr, resultPtr, rlen, clen);
    if (GPUStatistics.DISPLAY_STATISTICS) {
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - startNanos);
    }
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method unaryOp.
/**
 * A helper function for all Unary ops (sqrt, abs, sin.. etc).
 * <p>
 * If the input is sparse and empty, the output is allocated and filled with
 * {@code sparseAndEmptyFillValue} without launching a kernel. Otherwise the named
 * kernel is launched over the dense input with one element count argument.
 *
 * @param ec valid execution context
 * @param gCtx a valid {@link GPUContext}
 * @param in1 input matrix
 * @param kernel name of CUDA kernel for the unary op to execute
 * @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block
 * @param outputName output matrix name
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param kernelTimer the name of the timer to measure the kernel invocation
 * @throws DMLRuntimeException if {@code gCtx} is not the context held by {@code ec}, or if the
 *         number of cells of {@code in1} does not fit into an {@code int}
 */
private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    GPUObject in = in1.getGPUObject(gCtx);
    boolean isSparseAndEmpty = in.isSparseAndEmpty();
    long t1 = 0;
    if (isSparseAndEmpty) {
        MatrixObject out = ec.getMatrixObject(outputName);
        ec.allocateGPUMatrixObject(outputName);
        out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
    } else {
        // Dense
        // The kernel takes its element count as an int. Validate the range up front instead of
        // silently truncating with a cast, which would launch the kernel over the wrong size.
        long numCells = in1.getNumColumns() * in1.getNumRows();
        if (numCells < 0 || numCells > Integer.MAX_VALUE)
            throw new DMLRuntimeException("GPU : Exceeded supported size " + numCells + " for unary operation " + kernel);
        int size = (int) numCells;
        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer output = getDensePointer(gCtx, out, instName);
        Pointer input = getDensePointer(gCtx, in1, instName);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1);
    }
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method conv2dBackwardFilter.
/**
 * This method computes the backpropagation errors for the filter of a convolution operation.
 * <p>
 * The workspace size reported by cuDNN for the chosen algorithm is honoured: a workspace of
 * that size is allocated before the call and released in the {@code finally} block, together
 * with the filter and convolution descriptors created here.
 *
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param image input image
 * @param dout errors from next layer
 * @param outputBlock output errors
 * @param N number of images
 * @param C number of channels
 * @param H height
 * @param W width
 * @param K number of filters
 * @param R filter height
 * @param S filter width
 * @param pad_h pad height
 * @param pad_w pad width
 * @param stride_h stride height
 * @param stride_w stride width
 * @param P output activation height
 * @param Q output activation width
 * @throws DMLRuntimeException if a cuDNN call reports a non-success status or a CudaException is raised
 */
public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
    cudnnFilterDescriptor dwDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    // Stays 0 unless a workspace was actually allocated; the finally block relies on this.
    long sizeInBytes = 0;
    try {
        long t1 = 0, t2 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Allocate descriptors
        cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
        cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
        dwDesc = allocateFilterDescriptor(K, C, R, S);
        // Allocate data
        Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
        Pointer doutPointer = getDensePointer(gCtx, dout, true, instName);
        Pointer dwPointer = getDensePointer(gCtx, outputBlock, true, instName);
        int[] padding = { pad_h, pad_w };
        int[] strides = { stride_h, stride_w };
        convDesc = allocateConvolutionDescriptor(padding, strides);
        long[] sizeInBytesArray = { 0 };
        // TODO: Select the best algorithm depending on the data and supported CUDA
        int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        workSpace = new Pointer();
        int sizeStatus = cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx), xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
        if (sizeStatus != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not query workspace size for cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(sizeStatus));
        }
        if (sizeInBytesArray[0] != 0) {
            // NOTE(review): assumes GPUContext exposes allocate(String instName, long bytes) as the
            // counterpart of cudaFreeHelper(instName, ptr) used below - confirm against GPUContext.
            workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
            // Record the size only after a successful allocation so that cleanup never tries to
            // free the empty placeholder pointer.
            sizeInBytes = sizeInBytesArray[0];
        }
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer, doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2dBackwardFilter in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        if (workSpace != null && sizeInBytes != 0)
            gCtx.cudaFreeHelper(instName, workSpace);
        if (dwDesc != null)
            cudnnDestroyFilterDescriptor(dwDesc);
        if (convDesc != null)
            cudnnDestroyConvolutionDescriptor(convDesc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
    }
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method sparseMatrixVectorMult.
/**
 * Multiplies a sparse matrix with a sparse vector: C = op(A) x B.
 * <p>
 * B is first converted to a column-major dense vector of {@code k} rows and one column;
 * the actual product is then delegated to {@code sparseMatrixDenseVectorMult}.
 *
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param output allocated output object C to which the GPU output matrix will be attached
 * @param isATranposed if A is to be transposed or not (the op in op(A))
 * @param m number of rows in A (not op(A))
 * @param n number of cols in A (not op(A)); not read by this method
 * @param k number of rows in B, (cols in B is assumed to be 1)
 * @param A left sparse matrix on GPU
 * @param B right sparse vector on GPU
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k, CSRPointer A, CSRPointer B) throws DMLRuntimeException {
    final long conversionStart = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;

    // Densify B (k x 1) so the sparse-matrix / dense-vector routine can be reused.
    Pointer denseB = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);

    if (GPUStatistics.DISPLAY_STATISTICS) {
        long elapsedNanos = System.nanoTime() - conversionStart;
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, elapsedNanos);
    }

    sparseMatrixDenseVectorMult(gCtx, instName, output, A, denseB, isATranposed, m, k);
}
Aggregations