use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method axpy.
/**
* Performs daxpy operation
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix 1
* @param in2 input matrix 2
* @param outputName output matrix name
* @param constant pointer constant
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, double constant) throws DMLRuntimeException {
if (ec.getGPUContext() != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
Pointer A = getDensePointer(gCtx, in1, instName);
Pointer B = getDensePointer(gCtx, in2, instName);
MatrixObject out = ec.getMatrixObject(outputName);
// Allocated the dense output matrix
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
Pointer C = getDensePointer(gCtx, out, instName);
long t1 = 0, t2 = 0;
if (in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
// Matrix-Matrix daxpy
// Since A is always a matrix
long n = in1.getNumRows() * in2.getNumColumns();
Pointer alphaPtr = pointerTo(constant);
// C <- alpha*B + C
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
cudaMemcpy(C, A, n * ((long) jcuda.Sizeof.DOUBLE), cudaMemcpyDeviceToDevice);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS)
t2 = System.nanoTime();
JCublas2.cublasDaxpy(getCublasHandle(gCtx), (int) n, alphaPtr, B, 1, C, 1);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
} else {
LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx);
// daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
int rlenA = (int) in1.getNumRows();
int clenA = (int) in1.getNumColumns();
int rlenB = (int) in2.getNumRows();
int clenB = (int) in2.getNumColumns();
getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), A, B, constant, C, rlenA, clenA, rlenB, clenB);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1);
}
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class LibMatrixCUDA method sparseDenseMatmult.
/**
* * C = op(A) * op(B) where A is sparse and B is dense
* If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
* otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output the output matrix object
* @param left matrix A
* @param right matrix B
* @param isLeftTransposed if A needs to be transposed
* @param isRightTransposed if B needs to be transposed
* @param m ?
* @param n ?
* @param k ?
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
Pointer BDense = getDensePointer(gCtx, right, instName);
if (n == 1) {
// Sparse Matrix - Dense Vector multiply
sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int) left.getNumRows(), (int) left.getNumColumns());
} else {
long t0 = 0, t1 = 0, t2 = 0;
// Sparse Matrix Dense Matrix multiply
if (A.isUltraSparse(m, k)) {
LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
// Convert right to CSR and do cuSparse matmul
int rowsB = (int) right.getNumRows();
int colsB = (int) right.getNumColumns();
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
if (DMLScript.STATISTICS)
GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
if (GPUStatistics.DISPLAY_STATISTICS)
t2 = System.nanoTime();
B.deallocate();
gCtx.cudaFreeHelper(BT);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
} else {
LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
// Note the arguments to denseDenseMatmult to accommodate for this.
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int) left.getNumRows(), (int) left.getNumColumns());
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
// To allocate the dense matrix
boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
if (allocated && GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
Pointer C = getDensePointer(gCtx, output, instName);
denseDenseMatmult(gCtx, instName, C, (int) left.getNumColumns(), (int) left.getNumRows(), (int) right.getNumRows(), (int) right.getNumColumns(), !isLeftTransposed, isRightTransposed, ADenseTransposed, BDense);
gCtx.cudaFreeHelper(instName, ADenseTransposed);
}
}
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUContext method cudaFreeHelper.
/**
* Does cudaFree calls, lazily
*
* @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
* @param toFree {@link Pointer} instance to be freed
* @param eager true if to be done eagerly
*/
public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
Pointer dummy = new Pointer();
if (// trying to free a null pointer
toFree == dummy)
return;
long t0 = 0;
assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up";
long size = cudaBlockSizeMap.get(toFree);
if (eager) {
LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + this);
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
cudaFree(toFree);
cudaBlockSizeMap.remove(toFree);
if (DMLScript.STATISTICS)
GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaDeAllocCount.addAndGet(1);
if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
} else {
LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
if (freeList == null) {
freeList = new LinkedList<Pointer>();
freeCUDASpaceMap.put(size, freeList);
}
if (freeList.contains(toFree))
throw new RuntimeException("GPU : Internal state corrupted, double free");
freeList.add(toFree);
}
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUContext method allocate.
/**
* Allocates temporary space on the device.
* Does not update bookkeeping.
* The caller is responsible for freeing up after usage.
*
* @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
* @param size Size of data (in bytes) to allocate
* @param statsCount amount to increment the cudaAllocCount by
* @return jcuda Pointer
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException {
long t0 = 0, t1 = 0, end = 0;
Pointer A;
if (freeCUDASpaceMap.containsKey(size)) {
LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this);
if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
A = freeList.pop();
if (freeList.isEmpty())
freeCUDASpaceMap.remove(size);
if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
} else {
LOG.trace("GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this);
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
ensureFreeSpace(instructionName, size);
A = new Pointer();
cudaMalloc(A, size);
if (DMLScript.STATISTICS)
GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
}
// Set all elements to 0 since newly allocated space will contain garbage
if (DMLScript.STATISTICS)
t1 = System.nanoTime();
LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this);
cudaMemset(A, 0, size);
if (DMLScript.STATISTICS)
end = System.nanoTime();
if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
if (DMLScript.STATISTICS)
GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
if (DMLScript.STATISTICS)
GPUStatistics.cudaMemSet0Count.getAndAdd(1);
cudaBlockSizeMap.put(A, size);
return A;
}
use of jcuda.Pointer in project incubator-systemml by apache.
the class GPUContext method evict.
/**
* Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up
* or it fails.
* First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix
* blocks with zero locks on them is freed up.
* The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on
* number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up
* blocks on which there are zero locks until the required size has been freed up.
* // TODO: update it with hybrid policy
*
* @param instructionName name of the instruction for which performance measurements are made
* @param neededSize desired size to be freed up on the GPU
* @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them.
*/
protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
GPUStatistics.cudaEvictionCount.addAndGet(1);
// Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap
// to free up space
LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
while (lruCacheMap.size() > 0) {
if (neededSize <= getAvailableMemory())
break;
Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
Long size = toFreeListPair.getKey();
Pointer toFree = toFreeList.pop();
if (toFreeList.isEmpty())
lruCacheMap.remove(size);
cudaFreeHelper(instructionName, toFree, true);
}
if (neededSize <= getAvailableMemory())
return;
if (allocatedGPUObjects.size() == 0) {
throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
}
Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
@Override
public int compare(GPUObject p1, GPUObject p2) {
long p1Val = p1.locks.get();
long p2Val = p2.locks.get();
if (p1Val > 0 && p2Val > 0) {
// Both are locked, so don't sort
return 0;
} else if (p1Val > 0 || p2Val > 0) {
// Put the unlocked one to RHS
return Long.compare(p2Val, p1Val);
} else {
if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
long p1Size = 0;
long p2Size = 0;
try {
p1Size = p1.getSizeOnDevice() - neededSize;
p2Size = p2.getSizeOnDevice() - neededSize;
} catch (DMLRuntimeException e) {
throw new RuntimeException(e);
}
if (p1Size >= 0 && p2Size >= 0) {
return Long.compare(p2Size, p1Size);
} else {
return Long.compare(p1Size, p2Size);
}
} else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
return Long.compare(p2.timestamp.get(), p1.timestamp.get());
} else {
throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
}
}
}
});
while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
if (toBeRemoved.locks.get() > 0) {
throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
}
if (toBeRemoved.dirty) {
toBeRemoved.copyFromDeviceToHost();
}
toBeRemoved.clearData(true);
}
}
Aggregations