Example 6 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method denseDenseMatmult.

/**
	 * Dense dense matrix multiply
	 * C = op(A) * op(B), A and B are dense matrices
	 * @param gCtx              a valid {@link GPUContext}
	 * @param instName          name of the invoking instruction, to record {@link Statistics}.
	 * @param output            output object C on host with GPU data allocated
	 * @param left              left matrix A (in row-major order)
	 * @param right             right matrix B (in row-major order)
	 * @param isLeftTransposed  op for A, transposed or not
	 * @param isRightTransposed op for B, transposed or not
	 * @throws DMLRuntimeException if a DMLRuntimeException occurs
	 */
private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
    Pointer leftPtr = getDensePointer(gCtx, left, instName);
    Pointer rightPtr = getDensePointer(gCtx, right, instName);
    int leftRows = (int) left.getNumRows();
    int leftCols = (int) left.getNumColumns();
    int rightRows = (int) right.getNumRows();
    int rightCols = (int) right.getNumColumns();
    Pointer C = getDensePointer(gCtx, output, instName);
    denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed, leftPtr, rightPtr);
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer)
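
The pointer-level overload invoked on the last line is backed by cuBLAS. One subtlety worth sketching: cuBLAS assumes column-major storage, while the dense matrices here are row-major, so a row-major product C = A * B can be computed without any explicit transpose by asking cuBLAS for the column-major product C^T = B^T * A^T, i.e. by swapping the operand order. A minimal sketch with JCublas2, assuming non-transposed inputs (the helper name is mine, not the project's):

import jcuda.Pointer;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;

// Row-major C[m x n] = A[m x k] * B[k x n], expressed as the
// column-major product C^T = B^T * A^T so no transpose kernel is needed.
static void rowMajorDgemm(cublasHandle handle, Pointer A, Pointer B, Pointer C, int m, int k, int n) {
    Pointer one = Pointer.to(new double[] { 1.0 });  // host-side alpha
    Pointer zero = Pointer.to(new double[] { 0.0 }); // host-side beta
    JCublas2.cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
        n, m, k,      // dimensions of the transposed problem
        one, B, n,    // row-major B (k x n) reads as column-major B^T (n x k), ld = n
        A, k,         // row-major A (m x k) reads as column-major A^T (k x m), ld = k
        zero, C, n);  // row-major C (m x n) is filled as column-major C^T (n x m), ld = n
}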

Example 7 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method maxpoolingBackward.

/**
	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
	 * This method computes the backpropagation errors for the previous layer of the max-pooling operation
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
	 * @param image image as matrix object
	 * @param dout			delta matrix, output of previous layer
	 * @param outputBlock output matrix
	 * @param N				batch size
	 * @param C				number of channels
	 * @param H				height of image
	 * @param W				width of image
	 * @param K				number of filters
	 * @param R				height of filter
	 * @param S				width of filter
	 * @param pad_h			vertical padding
	 * @param pad_w			horizontal padding
	 * @param stride_h		vertical stride
	 * @param stride_w		horizontal stride
	 * @param P				output height, (H + 2*pad_h - R)/stride_h + 1
	 * @param Q				output width, (W + 2*pad_w - S)/stride_w + 1
	 * @throws DMLRuntimeException if a DMLRuntimeException occurs
	 */
public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
    LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
    Pointer y = null;
    cudnnPoolingDescriptor poolingDesc = null;
    try {
        long t1 = 0, t2 = 0, t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // Allocate descriptors
        cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
        cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
        cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, H, W);
        cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
        poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
        // Call cudnnPoolingForward first; its output y is one of the inputs of cudnnPoolingBackward
        // TODO: Remove calling poolForward after necessary changes at language level for poolBackward
        long numBytes = (long) N * C * P * Q * Sizeof.DOUBLE; // cast to long to avoid int overflow
        y = gCtx.allocate(numBytes);
        // Allocate data
        Pointer x = getDensePointer(gCtx, image, true, instName);
        Pointer dx = getDensePointer(gCtx, outputBlock, true, instName);
        Pointer dy = getDensePointer(gCtx, dout, true, instName);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        status = cudnnPoolingBackward(getCudnnHandle(gCtx), poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
        if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
            throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t4 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t4 = System.nanoTime();
        if (y != null)
            gCtx.cudaFreeHelper(instName, y);
        if (poolingDesc != null)
            cudnnDestroyPoolingDescriptor(poolingDesc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
    }
}
Also used : jcuda.jcudnn.cudnnPoolingDescriptor(jcuda.jcudnn.cudnnPoolingDescriptor) CudaException(jcuda.CudaException) jcuda.jcudnn.cudnnTensorDescriptor(jcuda.jcudnn.cudnnTensorDescriptor) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
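
allocateTensorDescriptor and allocatePoolingDescriptor above are private LibMatrixCUDA helpers. In terms of the raw JCudnn API, the max-pooling descriptor for an R x S window with the given padding and strides can plausibly be built as follows (a hedged sketch of what such a helper does, not the project's exact code):

import jcuda.jcudnn.cudnnPoolingDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;

// Max-pooling over an R x S window with (pad_h, pad_w) padding
// and (stride_h, stride_w) strides, propagating NaNs like the ReLU path.
static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, int S, int pad_h, int pad_w, int stride_h, int stride_w) {
    cudnnPoolingDescriptor desc = new cudnnPoolingDescriptor();
    cudnnCreatePoolingDescriptor(desc);
    cudnnSetPooling2dDescriptor(desc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN,
        R, S, pad_h, pad_w, stride_h, stride_w);
    return desc;
}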

Example 8 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method performCuDNNReLU.

private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
    long t0 = 0;
    cudnnActivationDescriptor activationDescriptor = null;
    try {
        LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
        cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
        Pointer srcData = getDensePointer(gCtx, in, true, instName);
        activationDescriptor = new cudnnActivationDescriptor();
        cudnnCreateActivationDescriptor(activationDescriptor);
        // the coefficient argument is ignored for CUDNN_ACTIVATION_RELU, so any value suffices
        double dummy = -1;
        cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor, one(), srcTensorDesc, srcData, zero(), dstTensorDesc, dstData);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
    } catch (CudaException e) {
        throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
    } finally {
        long t1 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // destroy the activation descriptor created above so it is not leaked
        if (activationDescriptor != null)
            cudnnDestroyActivationDescriptor(activationDescriptor);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
    }
}
Also used : jcuda.jcudnn.cudnnActivationDescriptor(jcuda.jcudnn.cudnnActivationDescriptor) CudaException(jcuda.CudaException) jcuda.jcudnn.cudnnTensorDescriptor(jcuda.jcudnn.cudnnTensorDescriptor) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
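
The srcTensorDesc argument is prepared by the caller before this method runs. For reference, a minimal NCHW double-precision tensor descriptor in raw JCudnn looks roughly like this (an illustrative sketch; the helper name is hypothetical):

import jcuda.jcudnn.cudnnTensorDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;

// Describes an N x C x H x W tensor of doubles in NCHW layout,
// matching the double-precision pointers used throughout LibMatrixCUDA.
static cudnnTensorDescriptor createNCHWDescriptor(int N, int C, int H, int W) {
    cudnnTensorDescriptor desc = new cudnnTensorDescriptor();
    cudnnCreateTensorDescriptor(desc);
    cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
    return desc;
}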

Example 9 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method batchNormalizationForwardTraining.

/**
	 * Performs the forward BatchNormalization layer computation for training
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName name of the instruction
	 * @param image input image
	 * @param scale scale (in cuDNN terms; gamma in the original paper): shape [1, C, 1, 1]
	 * @param bias bias (in cuDNN terms; beta in the original paper): shape [1, C, 1, 1]
	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
	 * @param ret (output) normalized input
	 * @param retRunningMean (output) running mean accumulated during training phase: shape [1, C, 1, 1]
	 * @param retRunningVar (output) running variance accumulated during training phase: shape [1, C, 1, 1]
	 * @param epsilon epsilon value used in the batch normalization formula
	 * @param exponentialAverageFactor factor used in the moving average computation
	 * @throws DMLRuntimeException if error occurs
	 */
public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
    LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
    int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
    int N = (int) image.getNumRows();
    int C = (int) scale.getNumColumns();
    long CHW = image.getNumColumns();
    validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
    // Allocate descriptors
    cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW, new MatrixObject[] { image }, new MatrixObject[] { ret });
    cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
    // Get underlying dense pointer
    Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
    Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
    Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
    Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
    Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
    Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
    // To allow for copy-on-write
    Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName);
    Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName);
    cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
    cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
    // ignoring resultSaveMean and resultSaveVariance as they require state management
    checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor, retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
}
Also used : jcuda.jcudnn.cudnnTensorDescriptor(jcuda.jcudnn.cudnnTensorDescriptor) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer)
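
Per the cuDNN documentation, exponentialAverageFactor weights the current mini-batch statistic against the accumulated running value, so the update that cudnnBatchNormalizationForwardTraining writes into retRunningMeanPtr and retRunningVarPtr is equivalent to the following host-side reference (a sketch of the semantics only, not code from the project):

// cuDNN running-statistics update for batch normalization:
// factor = 1 keeps only the current batch, factor = 0 keeps only history.
static double updateRunningStat(double running, double batchStat, double factor) {
    return (1.0 - factor) * running + factor * batchStat;
}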

Example 10 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method matrixMatrixOp.

/**
	 * Utility to launch the binary cellwise matrix-matrix operation CUDA kernel
	 * @param ec execution context
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
	 * @param in1 left input matrix
	 * @param in2 right input matrix
	 * @param outputName output variable name
	 * @param isLeftTransposed true if left matrix is transposed
	 * @param isRightTransposed true if right matrix is transposed
	 * @param op operator
	 * @throws DMLRuntimeException if a DMLRuntimeException occurs
	 */
private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    boolean isEmpty1 = isSparseAndEmpty(gCtx, in1);
    boolean isEmpty2 = isSparseAndEmpty(gCtx, in2);
    int rlenA = (int) in1.getNumRows();
    int rlenB = (int) in2.getNumRows();
    int clenA = (int) in1.getNumColumns();
    int clenB = (int) in2.getNumColumns();
    int vecStatusA = getVectorStatus(rlenA, clenA).code();
    int vecStatusB = getVectorStatus(rlenB, clenB).code();
    if (isEmpty1 && isEmpty2) {
        MatrixObject out = ec.getMatrixObject(outputName);
        ec.allocateGPUMatrixObject(outputName);
        // When both inputs are empty, the output is empty too (except in the case of division)
        if (op.fn instanceof Divide) {
            out.getGPUObject(gCtx).allocateAndFillDense(Double.NaN);
        } else {
            out.getGPUObject(gCtx).allocateSparseAndEmpty();
        }
    } else if (isEmpty1 && clenB != 1 && rlenB != 1) {
        // M1 is empty and M2 is not a vector (vectors fall through to the general case):
        // C = empty_in1 op in2 ==> C = 0.0 op in2
        matrixScalarArithmetic(ec, gCtx, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
    } else if (isEmpty2 && clenA != 1 && rlenA != 1) {
        // M2 is empty and M1 is not a vector (vectors fall through to the general case):
        // C = in1 op empty_in2 ==> C = in1 op 0.0
        matrixScalarArithmetic(ec, gCtx, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
    } else {
        // TODO: FIXME: Implement sparse binCellSparseOp kernel
        Pointer A = getDensePointer(gCtx, in1, instName);
        // TODO: FIXME: Implement sparse binCellSparseOp kernel
        Pointer B = getDensePointer(gCtx, in2, instName);
        MatrixObject out = ec.getMatrixObject(outputName);
        // Allocate the dense output matrix
        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer C = getDensePointer(gCtx, out, instName);
        int maxRlen = Math.max(rlenA, rlenB);
        int maxClen = Math.max(clenA, clenB);
        matrixMatrixOp(gCtx, instName, A, B, maxRlen, maxClen, vecStatusA, vecStatusB, C, op);
    }
}
Also used : Divide(org.apache.sysml.runtime.functionobjects.Divide) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) LeftScalarOperator(org.apache.sysml.runtime.matrix.operators.LeftScalarOperator) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) RightScalarOperator(org.apache.sysml.runtime.matrix.operators.RightScalarOperator) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
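
The vecStatusA and vecStatusB codes tell the CUDA kernel how to broadcast a row or column vector against a full maxRlen x maxClen matrix. A host-side reference of that broadcasting rule might look like this (hypothetical helper names, for illustration only):

// An input with a single row repeats that row across all rows; an input
// with a single column repeats that column; a full matrix is indexed directly.
static double readBroadcast(double[] data, int rlen, int clen, int i, int j) {
    int r = (rlen == 1) ? 0 : i;
    int c = (clen == 1) ? 0 : j;
    return data[r * clen + c];
}

// Dense binary cellwise op with vector broadcasting, mirroring what the
// matrixMatrixOp kernel computes on the GPU.
static void cellwiseOp(double[] a, int rlenA, int clenA, double[] b, int rlenB, int clenB,
        double[] c, int maxRlen, int maxClen, java.util.function.DoubleBinaryOperator op) {
    for (int i = 0; i < maxRlen; i++)
        for (int j = 0; j < maxClen; j++)
            c[i * maxClen + j] = op.applyAsDouble(
                readBroadcast(a, rlenA, clenA, i, j),
                readBroadcast(b, rlenB, clenB, i, j));
}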

Aggregations

Pointer (jcuda.Pointer): 44 usages
CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer): 33 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 26 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 12 usages
cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor): 11 usages
CudaException (jcuda.CudaException): 6 usages
cudnnConvolutionDescriptor (jcuda.jcudnn.cudnnConvolutionDescriptor): 3 usages
cudnnFilterDescriptor (jcuda.jcudnn.cudnnFilterDescriptor): 3 usages
HashMap (java.util.HashMap): 2 usages
cudnnPoolingDescriptor (jcuda.jcudnn.cudnnPoolingDescriptor): 2 usages
GPUObject (org.apache.sysml.runtime.instructions.gpu.context.GPUObject): 2 usages
LeftScalarOperator (org.apache.sysml.runtime.matrix.operators.LeftScalarOperator): 2 usages
RightScalarOperator (org.apache.sysml.runtime.matrix.operators.RightScalarOperator): 2 usages
LinkedList (java.util.LinkedList): 1 usage
Map (java.util.Map): 1 usage
CUfunction (jcuda.driver.CUfunction): 1 usage
cudnnActivationDescriptor (jcuda.jcudnn.cudnnActivationDescriptor): 1 usage
cusparseMatDescr (jcuda.jcusparse.cusparseMatDescr): 1 usage
Builtin (org.apache.sysml.runtime.functionobjects.Builtin): 1 usage
CM (org.apache.sysml.runtime.functionobjects.CM): 1 usage