Search in sources :

Example 1 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class GPUObject method denseRowMajorToColumnMajor.

/**
 * Convenience method. Swaps this object's dense device buffer from row-major
 * to column-major layout.
 * <p>
 * The current dense pointer is passed to {@code transpose(...)}, the old pointer is
 * then handed to {@code cudaFreeHelper} and the transposed pointer is installed
 * as the new dense pointer.
 *
 * @throws DMLRuntimeException if the data is not allocated or the transpose fails
 */
public void denseRowMajorToColumnMajor() throws DMLRuntimeException {
    LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
    final int rows = toIntExact(mat.getNumRows());
    final int cols = toIntExact(mat.getNumColumns());
    if (!isAllocated()) {
        throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
    }
    // Leading dimensions: the input is addressed with lda = cols, the output with ldc = rows.
    final Pointer columnMajor = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), rows, cols, cols, rows);
    cudaFreeHelper(getJcudaDenseMatrixPtr());
    setDenseMatrixCudaPointer(columnMajor);
}
Also used : Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 2 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class GPUContext method clearTemporaryMemory.

/**
 * Clears up the memory used to optimize cudaMalloc/cudaFree calls.
 * <p>
 * Every pointer queued in {@code freeCUDASpaceMap} is passed to
 * {@code cudaFreeHelper(p, true)}, then both {@code freeCUDASpaceMap} and
 * {@code cudaBlockSizeMap} are cleared. Size entries belonging to pointers that are
 * still referenced by an object in {@code allocatedGPUObjects} are saved beforehand
 * and restored afterwards.
 */
public void clearTemporaryMemory() {
    // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up.
    HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
    for (GPUObject o : allocatedGPUObjects) {
        if (o.isSparse()) {
            CSRPointer p = o.getSparseMatrixCudaPointer();
            if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) {
                tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr));
            }
            if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) {
                tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd));
            }
            if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) {
                tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val));
            }
        } else {
            Pointer p = o.getJcudaDenseMatrixPtr();
            // Same guard as the sparse branch: without it a missing pointer or an
            // untracked pointer would insert a null key / null size that is then
            // restored into cudaBlockSizeMap below.
            if (p != null && cudaBlockSizeMap.containsKey(p)) {
                tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
            }
        }
    }
    // garbage collect all temporarily allocated spaces
    for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
        for (Pointer p : l) {
            cudaFreeHelper(p, true);
        }
    }
    cudaBlockSizeMap.clear();
    freeCUDASpaceMap.clear();
    // Restore only those entries for which there are still blocks on the GPU
    cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
}
Also used : HashMap(java.util.HashMap) Pointer(jcuda.Pointer)

Example 3 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class GPUObject method transpose.

/**
 * Transposes a dense matrix on the GPU by calling the cublasDgeam operation.
 * <p>
 * The call uses alpha = 1 and beta = 0 with an empty placeholder pointer as the
 * second input operand, and writes into a newly allocated buffer sized for
 * {@code m * n} doubles.
 *
 * @param gCtx     a valid {@link GPUContext}
 * @param densePtr pointer to the dense matrix on the GPU
 * @param m        rows in output matrix
 * @param n        columns in output matrix
 * @param lda      leading dimension passed to cublasDgeam for the input matrix
 * @param ldc      leading dimension passed to cublasDgeam for the output matrix
 * @return pointer to the newly allocated transposed matrix
 * @throws DMLRuntimeException if operation failed
 */
public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
    LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx);
    final Pointer alphaOne = Pointer.to(new double[] { 1.0 });
    final Pointer betaZero = Pointer.to(new double[] { 0.0 });
    final Pointer transposed = gCtx.allocate(((long) m) * getDoubleSizeOf(n));
    // C = 1.0 * A^T + 0.0 * B^T; the B operand is only an empty placeholder pointer.
    JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n,
            alphaOne, densePtr, lda,
            betaZero, new Pointer(), lda,
            transposed, ldc);
    return transposed;
}
Also used : Pointer(jcuda.Pointer)

Example 4 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method matmultTSMM.

//********************************************************************/
//************* End of DEEP LEARNING Operators ***********************/
//********************************************************************/
//********************************************************************/
//********** TRANSPOSE SELF MATRIX MULTIPLY Functions ****************/
//********************************************************************/
/**
 * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...)
 * <p>
 * Sparse inputs are delegated to {@code matmult(...)}. For dense inputs the result of
 * cublasDsyrk is completed by {@code copyUpperToLowerTriangle(...)}.
 *
 * @param ec execution context
 * @param gCtx   a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param left input matrix; in a tsmm expression like A %*% A' or A' %*% A only the left operand needs to be inspected for transposition
 * @param outputName output matrix name
 * @param isLeftTransposed if true, left transposed
 * @throws DMLRuntimeException if the GPU context is inconsistent, the dimensions are unknown or
 *                             do not fit into an int, or input/output are not allocated
 */
public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName, boolean isLeftTransposed) throws DMLRuntimeException {
    LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (isInSparseFormat(gCtx, left)) {
        // For sparse TSMM, invoke matmult (TODO: possible performance improvement)
        matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
        return;
    }
    // For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix
    MatrixObject output = ec.getMatrixObject(outputName);
    // Allocated the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
    // Since CuBLAS expects inputs in column-major format,
    // reverse the order of matrix-multiplication and take care of dimension mismatch.
    int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
    // Note: the dimensions are swapped
    long mDim = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
    long kDim = isLeftTransposed ? left.getNumRows() : left.getNumColumns();
    // Validate on the long values: a plain (int) cast would silently wrap large
    // dimensions and hand bogus sizes to cuBLAS.
    if (mDim < 0 || kDim < 0)
        throw new DMLRuntimeException("Incorrect dimensions");
    if (mDim > Integer.MAX_VALUE || kDim > Integer.MAX_VALUE)
        throw new DMLRuntimeException("GPU : Dimensions [" + mDim + "," + kDim + "] of the tsmm input exceed the integer range supported by cuBLAS");
    int m = (int) mDim;
    int k = (int) kDim;
    int lda = isLeftTransposed ? m : k;
    int ldc = m;
    if (!left.getGPUObject(gCtx).isAllocated())
        throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated());
    if (!output.getGPUObject(gCtx).isAllocated())
        throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject(gCtx).isAllocated());
    Pointer A = getDensePointer(gCtx, left, instName);
    Pointer C = getDensePointer(gCtx, output, instName);
    long t0 = 0, t1 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    JCublas2.cublasDsyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER, transa, m, k, one(), A, lda, zero(), C, ldc);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0);
    if (GPUStatistics.DISPLAY_STATISTICS)
        t1 = System.nanoTime();
    // syrk only fills one triangle of the symmetric result; mirror it into the other half.
    copyUpperToLowerTriangle(gCtx, instName, output);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1);
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 5 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method solve.

/**
 * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
 * <p>
 * Only dense inputs are supported; any sparse input is rejected. The system is solved
 * through a QR factorization: geqrf factorizes A, ormqr computes Q^T*B and trsm solves
 * R*x = Q^T*B. The routines operate on column-major clones of both inputs, and the
 * first n values of the solved right-hand side are copied into the output matrix.
 * All temporary device buffers and both clones are released on success and on failure.
 *
 * @param ec         a valid {@link ExecutionContext}
 * @param gCtx       a valid {@link GPUContext}
 * @param instName   the invoking instruction's name for record {@link Statistics}.
 * @param in1        input matrix A
 * @param in2        input matrix B
 * @param outputName name of the output matrix
 * @throws DMLRuntimeException if an input is sparse, the dimensions do not match, or a solver step reports an error
 */
public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    // x = solve(A, b)
    LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
    // Only the dense/dense combination is implemented; every combination that
    // involves a sparse input is rejected with the same error.
    if (isInSparseFormat(gCtx, in1) || isInSparseFormat(gCtx, in2))
        throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");

    // Both dense
    GPUObject Aobj = in1.getGPUObject(gCtx);
    GPUObject bobj = in2.getGPUObject(gCtx);
    int m = (int) in1.getNumRows();
    int n = (int) in1.getNumColumns();
    if ((int) in2.getNumRows() != m)
        throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
    if ((int) in2.getNumColumns() != 1)
        throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");

    long t0 = -1;
    // Everything below is a temporary that must be released again, also when a step fails.
    GPUObject ATobj = null;
    GPUObject bTobj = null;
    Pointer work = null;
    Pointer tau = null;
    Pointer devInfo = null;
    try {
        // steps 1 and 2: clone the inputs and convert the clones to column-major,
        // because the solver routines expect column-major data
        // and are destructive to the original input
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        ATobj = (GPUObject) Aobj.clone();
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        ATobj.denseRowMajorToColumnMajor();
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
        Pointer A = ATobj.getJcudaDenseMatrixPtr();
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        bTobj = (GPUObject) bobj.clone();
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        bTobj.denseRowMajorToColumnMajor();
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
        Pointer b = bTobj.getJcudaDenseMatrixPtr();
        // step 3: query working space of geqrf and ormqr
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        int[] lwork = { 0 };
        JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
        // step 4: compute QR factorization
        // Byte sizes are computed in long arithmetic so large workspaces cannot overflow an int.
        work = gCtx.allocate(instName, (long) lwork[0] * Sizeof.DOUBLE);
        // tau holds one scalar per elementary reflector, i.e. min(m, n) entries
        tau = gCtx.allocate(instName, (long) Math.min(m, n) * Sizeof.DOUBLE);
        devInfo = gCtx.allocate(Sizeof.INT);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
        int[] qrError = { -1 };
        cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
        if (qrError[0] != 0) {
            throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
        }
        // step 5: compute Q^T*B
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
        cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
        if (qrError[0] != 0) {
            throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
        }
        // step 6: compute x = R \ Q^T*B
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        JCublas2.cublasDtrsm(gCtx.getCublasHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, n, 1, pointerTo(1.0), A, m, b, m);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        bTobj.denseColumnMajorToRowMajor();
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
        // TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
        // There is an avoidable copy happening here
        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), (long) n * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
    } finally {
        // Release the temporaries on every path; devInfo was previously never freed
        // and nothing was released when one of the solver steps failed.
        if (work != null)
            gCtx.cudaFreeHelper(instName, work);
        if (tau != null)
            gCtx.cudaFreeHelper(instName, tau);
        if (devInfo != null)
            gCtx.cudaFreeHelper(instName, devInfo);
        if (ATobj != null)
            ATobj.clearData();
        if (bTobj != null)
            bTobj.clearData();
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) GPUObject(org.apache.sysml.runtime.instructions.gpu.context.GPUObject) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

Pointer (jcuda.Pointer) 44, CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) 33, DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 26, MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject) 12, jcuda.jcudnn.cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor) 11, CudaException (jcuda.CudaException) 6, jcuda.jcudnn.cudnnConvolutionDescriptor (jcuda.jcudnn.cudnnConvolutionDescriptor) 3, jcuda.jcudnn.cudnnFilterDescriptor (jcuda.jcudnn.cudnnFilterDescriptor) 3, HashMap (java.util.HashMap) 2, jcuda.jcudnn.cudnnPoolingDescriptor (jcuda.jcudnn.cudnnPoolingDescriptor) 2, GPUObject (org.apache.sysml.runtime.instructions.gpu.context.GPUObject) 2, LeftScalarOperator (org.apache.sysml.runtime.matrix.operators.LeftScalarOperator) 2, RightScalarOperator (org.apache.sysml.runtime.matrix.operators.RightScalarOperator) 2, LinkedList (java.util.LinkedList) 1, Map (java.util.Map) 1, CUfunction (jcuda.driver.CUfunction) 1, jcuda.jcudnn.cudnnActivationDescriptor (jcuda.jcudnn.cudnnActivationDescriptor) 1, jcuda.jcusparse.cusparseMatDescr (jcuda.jcusparse.cusparseMatDescr) 1, Builtin (org.apache.sysml.runtime.functionobjects.Builtin) 1, CM (org.apache.sysml.runtime.functionobjects.CM) 1