
Example 36 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method axpy.

/**
	 * Performs daxpy operation
	 *
	 * @param ec execution context
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param in1 input matrix 1
	 * @param in2 input matrix 2
	 * @param outputName output matrix name
	 * @param constant scalar constant (alpha) by which {@code in2} is scaled
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, double constant) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    Pointer A = getDensePointer(gCtx, in1, instName);
    Pointer B = getDensePointer(gCtx, in2, instName);
    MatrixObject out = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
    Pointer C = getDensePointer(gCtx, out, instName);
    long t1 = 0, t2 = 0;
    if (in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
        LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
        // Matrix-Matrix daxpy
        // Since A is always a matrix
        long n = in1.getNumRows() * in2.getNumColumns();
        Pointer alphaPtr = pointerTo(constant);
        // C <- alpha*B + C
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        cudaMemcpy(C, A, n * ((long) jcuda.Sizeof.DOUBLE), cudaMemcpyDeviceToDevice);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        JCublas2.cublasDaxpy(getCublasHandle(gCtx), (int) n, alphaPtr, B, 1, C, 1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
    } else {
        LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx);
        // daxpy_matrix_vector(double* A,  double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        int rlenA = (int) in1.getNumRows();
        int clenA = (int) in1.getNumColumns();
        int rlenB = (int) in2.getNumRows();
        int clenB = (int) in2.getNumColumns();
        getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), A, B, constant, C, rlenA, clenA, rlenB, clenB);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1);
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
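
For reference, a minimal CPU-side sketch (not from SystemML; class and method names are illustrative) of what the matrix-matrix branch above computes: the cudaMemcpy copies A into C and cublasDaxpy then accumulates alpha*B, so the result is C = alpha*B + A element-wise.

public final class DaxpyReference {
    /** Returns ret = alpha*b + a element-wise, mirroring the copy-then-axpy pattern above. */
    public static double[] axpy(double[] a, double[] b, double alpha) {
        if (a.length != b.length)
            throw new IllegalArgumentException("dimension mismatch");
        double[] ret = new double[a.length];
        // step 1: copy A into the output (the cudaMemcpy above)
        System.arraycopy(a, 0, ret, 0, a.length);
        // step 2: accumulate alpha*B into the output (the cublasDaxpy above)
        for (int i = 0; i < b.length; i++)
            ret[i] += alpha * b[i];
        return ret;
    }
}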

Example 37 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method sparseDenseMatmult.

/**
	 * C = op(A) * op(B) where A is sparse and B is dense
	 * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
	 * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param output the output matrix object
	 * @param left matrix A
	 * @param right matrix B
	 * @param isLeftTransposed if A needs to be transposed
	 * @param isRightTransposed if B needs to be transposed
	 * @param m ?
	 * @param n ?
	 * @param k ?
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
    CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
    Pointer BDense = getDensePointer(gCtx, right, instName);
    if (n == 1) {
        // Sparse Matrix - Dense Vector multiply
        sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int) left.getNumRows(), (int) left.getNumColumns());
    } else {
        long t0 = 0, t1 = 0, t2 = 0;
        // Sparse Matrix Dense Matrix multiply
        if (A.isUltraSparse(m, k)) {
            LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
            // Convert right to CSR and do cuSparse matmul
            int rowsB = (int) right.getNumRows();
            int colsB = (int) right.getNumColumns();
            if (DMLScript.STATISTICS)
                t0 = System.nanoTime();
            Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
            if (GPUStatistics.DISPLAY_STATISTICS)
                t1 = System.nanoTime();
            CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
            if (DMLScript.STATISTICS)
                GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
            if (DMLScript.STATISTICS)
                GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
            sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
            if (GPUStatistics.DISPLAY_STATISTICS)
                t2 = System.nanoTime();
            B.deallocate();
            gCtx.cudaFreeHelper(BT);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
        } else {
            LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
            // A is converted to a column-major dense matrix (i.e., transposed in row-major terms);
            // note the arguments to denseDenseMatmult below accommodate this.
            if (DMLScript.STATISTICS)
                t0 = System.nanoTime();
            Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int) left.getNumRows(), (int) left.getNumColumns());
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
            if (DMLScript.STATISTICS)
                GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
            if (DMLScript.STATISTICS)
                GPUStatistics.cudaSparseToDenseCount.getAndAdd(1);
            if (GPUStatistics.DISPLAY_STATISTICS)
                t1 = System.nanoTime();
            // To allocate the dense matrix
            boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
            if (allocated && GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
            Pointer C = getDensePointer(gCtx, output, instName);
            denseDenseMatmult(gCtx, instName, C, (int) left.getNumColumns(), (int) left.getNumRows(), (int) right.getNumRows(), (int) right.getNumColumns(), !isLeftTransposed, isRightTransposed, ADenseTransposed, BDense);
            gCtx.cudaFreeHelper(instName, ADenseTransposed);
        }
    }
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer)
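
As a point of reference for the conversions above, a minimal sketch (illustrative, not part of SystemML) of a sparse-times-dense multiply performed directly on a CSR representation, the format that CSRPointer mirrors on the device:

public final class CsrDenseMatmulReference {
    /**
     * Computes C = A %*% B where A is an m x k CSR matrix (rowPtr, colIdx, vals)
     * and B is a dense k x n matrix in row-major order.
     */
    public static double[] multiply(int[] rowPtr, int[] colIdx, double[] vals,
                                    double[] b, int m, int n) {
        double[] c = new double[m * n];
        for (int i = 0; i < m; i++) {
            for (int pos = rowPtr[i]; pos < rowPtr[i + 1]; pos++) {
                double aij = vals[pos];
                int j = colIdx[pos];
                // add aij * (row j of B) into row i of C
                for (int col = 0; col < n; col++)
                    c[i * n + col] += aij * b[j * n + col];
            }
        }
        return c;
    }
}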

Example 38 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class GPUContext method cudaFreeHelper.

/**
   * Does cudaFree calls, lazily
   *
   * @param instructionName name of the instruction for which to record per-instruction free time, or null to skip recording
   * @param toFree          {@link Pointer} instance to be freed
   * @param eager           true if to be done eagerly
   */
public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
    Pointer dummy = new Pointer();
    // trying to free a null pointer
    if (toFree == dummy)
        return;
    long t0 = 0;
    assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it is trying to free up";
    long size = cudaBlockSizeMap.get(toFree);
    if (eager) {
        LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + this);
        if (DMLScript.STATISTICS)
            t0 = System.nanoTime();
        cudaFree(toFree);
        cudaBlockSizeMap.remove(toFree);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaDeAllocCount.addAndGet(1);
        if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
    } else {
        LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
        LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
        if (freeList == null) {
            freeList = new LinkedList<Pointer>();
            freeCUDASpaceMap.put(size, freeList);
        }
        if (freeList.contains(toFree))
            throw new RuntimeException("GPU : Internal state corrupted, double free");
        freeList.add(toFree);
    }
}
Also used : DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) Pointer(jcuda.Pointer)
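
A minimal host-side sketch (names are illustrative, not SystemML API) of the lazy-versus-eager pattern above: an eager free releases the block immediately, while a lazy free parks it on a size-keyed free list for later reuse and guards against double frees.

import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

public final class LazyFreeSketch {
    private final Map<Object, Long> blockSizes = new HashMap<>();          // stands in for cudaBlockSizeMap
    private final Map<Long, LinkedList<Object>> freeMap = new HashMap<>(); // stands in for freeCUDASpaceMap

    public void register(Object block, long size) {
        blockSizes.put(block, size);
    }

    public void free(Object block, boolean eager) {
        Long size = blockSizes.get(block);
        if (size == null)
            throw new IllegalStateException("unknown block");
        if (eager) {
            // immediate release: forget the block entirely (the cudaFree branch above)
            blockSizes.remove(block);
        } else {
            // lazy release: park the block on a size-keyed free list for reuse
            LinkedList<Object> freeList = freeMap.computeIfAbsent(size, s -> new LinkedList<>());
            if (freeList.contains(block))
                throw new IllegalStateException("double free");
            freeList.add(block);
        }
    }
}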

Example 39 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class GPUContext method allocate.

/**
   * Allocates temporary space on the device.
   * Does not update bookkeeping.
   * The caller is responsible for freeing up after usage.
   *
   * @param instructionName name of instruction for which to record per-instruction performance statistics, or null to skip recording
   * @param size            Size of data (in bytes) to allocate
   * @param statsCount      amount to increment the cudaAllocCount by
   * @return jcuda Pointer
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException {
    long t0 = 0, t1 = 0, end = 0;
    Pointer A;
    if (freeCUDASpaceMap.containsKey(size)) {
        LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this);
        if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
        A = freeList.pop();
        if (freeList.isEmpty())
            freeCUDASpaceMap.remove(size);
        if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
    } else {
        LOG.trace("GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this);
        if (DMLScript.STATISTICS)
            t0 = System.nanoTime();
        ensureFreeSpace(instructionName, size);
        A = new Pointer();
        cudaMalloc(A, size);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
        if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
    }
    // Set all elements to 0 since newly allocated space will contain garbage
    if (DMLScript.STATISTICS)
        t1 = System.nanoTime();
    LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this);
    cudaMemset(A, 0, size);
    if (DMLScript.STATISTICS)
        end = System.nanoTime();
    if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaMemSet0Count.getAndAdd(1);
    cudaBlockSizeMap.put(A, size);
    return A;
}
Also used : Pointer(jcuda.Pointer)
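
The allocation side of the same scheme, as a minimal sketch (plain Java arrays stand in for device pointers; nothing here is SystemML API): reuse a parked block of exactly the requested size if one exists, otherwise allocate fresh, and zero the block either way since it may hold stale data.

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;

public final class ReusingAllocatorSketch {
    private final Map<Integer, Deque<double[]>> freeMap = new HashMap<>();

    public double[] allocate(int size) {
        Deque<double[]> freeList = freeMap.get(size);
        double[] block;
        if (freeList != null && !freeList.isEmpty()) {
            // reuse a previously released block of exactly this size (the free-list hit above)
            block = freeList.pop();
            if (freeList.isEmpty())
                freeMap.remove(size);
        } else {
            // no reusable block of this size: allocate a fresh one (the cudaMalloc branch above)
            block = new double[size];
        }
        // zero the block, mirroring the cudaMemset above
        Arrays.fill(block, 0.0);
        return block;
    }

    public void release(int size, double[] block) {
        freeMap.computeIfAbsent(size, s -> new ArrayDeque<>()).push(block);
    }
}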

Example 40 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class GPUContext method evict.

/**
   * Tries to free up memory on the GPU until either a chunk of the needed size is available
   * or the attempt fails.
   * First, the set of reusable blocks is freed up. If that is not enough, the set of allocated matrix
   * blocks with zero locks on them is freed up.
   * The process cycles through the list of allocated {@link GPUObject} instances, sorted in
   * reverse order by the number of (read) locks obtained on them. It repeatedly frees up
   * blocks with zero locks until the required size has been freed up.
   * // TODO: update it with hybrid policy
   *
   * @param instructionName name of the instruction for which performance measurements are made
   * @param neededSize      desired size to be freed up on the GPU
   * @throws DMLRuntimeException if there are no reusable memory blocks to free up, or not enough matrix blocks with zero locks on them
   */
protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
    LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
    GPUStatistics.cudaEvictionCount.addAndGet(1);
    // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap
    // to free up space
    LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
    while (lruCacheMap.size() > 0) {
        if (neededSize <= getAvailableMemory())
            break;
        Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
        LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
        Long size = toFreeListPair.getKey();
        Pointer toFree = toFreeList.pop();
        if (toFreeList.isEmpty())
            lruCacheMap.remove(size);
        cudaFreeHelper(instructionName, toFree, true);
    }
    if (neededSize <= getAvailableMemory())
        return;
    if (allocatedGPUObjects.size() == 0) {
        throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
    }
    Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {

        @Override
        public int compare(GPUObject p1, GPUObject p2) {
            long p1Val = p1.locks.get();
            long p2Val = p2.locks.get();
            if (p1Val > 0 && p2Val > 0) {
                // Both are locked, so don't sort
                return 0;
            } else if (p1Val > 0 || p2Val > 0) {
                // Put the unlocked one to RHS
                return Long.compare(p2Val, p1Val);
            } else {
                if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
                    long p1Size = 0;
                    long p2Size = 0;
                    try {
                        p1Size = p1.getSizeOnDevice() - neededSize;
                        p2Size = p2.getSizeOnDevice() - neededSize;
                    } catch (DMLRuntimeException e) {
                        throw new RuntimeException(e);
                    }
                    if (p1Size >= 0 && p2Size >= 0) {
                        return Long.compare(p2Size, p1Size);
                    } else {
                        return Long.compare(p1Size, p2Size);
                    }
                } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
                    return Long.compare(p2.timestamp.get(), p1.timestamp.get());
                } else {
                    throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
                }
            }
        }
    });
    while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
        GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
        if (toBeRemoved.locks.get() > 0) {
            throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
        }
        if (toBeRemoved.dirty) {
            toBeRemoved.copyFromDeviceToHost();
        }
        toBeRemoved.clearData(true);
    }
}
Also used : Pointer(jcuda.Pointer) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LRUCacheMap(org.apache.sysml.utils.LRUCacheMap) HashMap(java.util.HashMap) Map(java.util.Map)
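
A condensed sketch of the two-phase eviction above (illustrative names only; the MIN_EVICT size-based tie-breaking among unlocked blocks is omitted for brevity): first drain the size-keyed free lists, then evict unlocked cached blocks from the tail of the sorted list, writing dirty ones back to the host first.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Deque;
import java.util.List;

public final class EvictionSketch {
    static final class CachedBlock {
        long locks;      // number of read locks currently held
        boolean dirty;   // true if the device copy is newer than the host copy
        long size;       // size in bytes

        void writeBackToHost() { /* stands in for copyFromDeviceToHost() */ }
        void clearData()       { /* stands in for clearData(true) */ }
    }

    private long availableMemory = 0;
    private final Deque<CachedBlock> reusableBlocks = new ArrayDeque<>();
    private final List<CachedBlock> cached = new ArrayList<>();

    void evict(long neededSize) {
        // phase 1: give back blocks that were only parked for reuse on the free lists
        while (!reusableBlocks.isEmpty() && neededSize > availableMemory)
            availableMemory += reusableBlocks.pop().size;  // stands in for an eager cudaFreeHelper
        if (neededSize <= availableMemory)
            return;
        // phase 2: evict cached matrix blocks; locked blocks sink to the front, candidates to the tail
        cached.sort(Comparator.comparingLong((CachedBlock b) -> -b.locks));
        while (neededSize > availableMemory && !cached.isEmpty()) {
            CachedBlock victim = cached.remove(cached.size() - 1);
            if (victim.locks > 0)
                throw new IllegalStateException("not enough memory: remaining blocks are locked");
            if (victim.dirty)
                victim.writeBackToHost();
            victim.clearData();
            availableMemory += victim.size;
        }
    }
}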

Aggregations

Pointer (jcuda.Pointer)44 CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)33 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)26 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)12 jcuda.jcudnn.cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor)11 CudaException (jcuda.CudaException)6 jcuda.jcudnn.cudnnConvolutionDescriptor (jcuda.jcudnn.cudnnConvolutionDescriptor)3 jcuda.jcudnn.cudnnFilterDescriptor (jcuda.jcudnn.cudnnFilterDescriptor)3 HashMap (java.util.HashMap)2 jcuda.jcudnn.cudnnPoolingDescriptor (jcuda.jcudnn.cudnnPoolingDescriptor)2 GPUObject (org.apache.sysml.runtime.instructions.gpu.context.GPUObject)2 LeftScalarOperator (org.apache.sysml.runtime.matrix.operators.LeftScalarOperator)2 RightScalarOperator (org.apache.sysml.runtime.matrix.operators.RightScalarOperator)2 LinkedList (java.util.LinkedList)1 Map (java.util.Map)1 CUfunction (jcuda.driver.CUfunction)1 jcuda.jcudnn.cudnnActivationDescriptor (jcuda.jcudnn.cudnnActivationDescriptor)1 jcuda.jcusparse.cusparseMatDescr (jcuda.jcusparse.cusparseMatDescr)1 Builtin (org.apache.sysml.runtime.functionobjects.Builtin)1 CM (org.apache.sysml.runtime.functionobjects.CM)1