Use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method relocate().
/**
 * Copies a specific chunk of memory from one storage to another.
 *
 * Possible directions: HOST -> DEVICE, DEVICE -> HOST
 *
 * @param currentStatus location the data currently resides in (HOST or DEVICE)
 * @param targetStatus  location the data should be moved to
 * @param point         allocation descriptor of the chunk being moved
 * @param shape         shape used to derive the number of bytes to copy
 * @param context       CUDA context providing the streams for the transfer
 */
@Override
public void relocate(AllocationStatus currentStatus, AllocationStatus targetStatus, AllocationPoint point, AllocationShape shape, CudaContext context) {
    if (currentStatus == AllocationStatus.DEVICE && targetStatus == AllocationStatus.HOST) {
        // DEVICE -> HOST
        DataBuffer targetBuffer = point.getBuffer();
        if (targetBuffer == null)
            throw new IllegalStateException("Target buffer is NULL!");

        // NOTE: the device pointer is resolved here, but this branch performs no copy itself
        Pointer devicePointer = new CudaPointer(point.getPointers().getDevicePointer().address());
    } else if (currentStatus == AllocationStatus.HOST && targetStatus == AllocationStatus.DEVICE) {
        // TODO: this probably should be removed
        if (point.isConstant()) {
            // log.info("Skipping relocation for constant");
            return;
        }

        if (point.getPointers().getDevicePointer() == null) {
            throw new IllegalStateException("devicePointer is NULL!");
        }

        // HOST -> DEVICE: async copy on the special stream, then block until it completes
        if (nativeOps.memcpyAsync(point.getPointers().getDevicePointer(), point.getPointers().getHostPointer(),
                        AllocationUtils.getRequiredMemory(shape), CudaConstants.cudaMemcpyHostToDevice,
                        context.getSpecialStream()) == 0)
            throw new IllegalStateException("MemcpyAsync relocate H2D failed: [" + point.getHostPointer().address()
                            + "] -> [" + point.getDevicePointer().address() + "]");

        flowController.commitTransfer(context.getSpecialStream());
    } else
        throw new UnsupportedOperationException("Can't relocate data in requested direction: [" + currentStatus + "] -> [" + targetStatus + "]");
}
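A minimal sketch of how a handler-side caller might drive this method. The ensureOnDevice helper is hypothetical (not part of nd4j) and assumes the surrounding CudaZeroHandler fields plus AllocationPoint's staleness-tracking API:

// Hypothetical helper (sketch only): push a chunk to the device before a GPU op,
// but only when the device copy is stale. Relies on AllocationPoint's
// isActualOnDeviceSide()/tickDeviceRead() bookkeeping.
private void ensureOnDevice(AllocationPoint point, AllocationShape shape, CudaContext context) {
    if (!point.isActualOnDeviceSide()) {
        relocate(AllocationStatus.HOST, AllocationStatus.DEVICE, point, shape, context);
        point.tickDeviceRead();   // mark the device copy as fresh
    }
}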
Use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
Class CudaWorkspace, method init().
@Override
protected void init() {
    if (workspaceConfiguration.getPolicyLocation() == LocationPolicy.MMAP) {
        throw new ND4JIllegalStateException("CUDA doesn't support MMAP workspaces yet");
    }

    super.init();

    if (currentSize.get() > 0) {
        // log.info("Allocating {} bytes at DEVICE & HOST space...", currentSize.get());
        isInit.set(true);

        long bytes = currentSize.get();

        if (isDebug.get()) {
            log.info("Allocating [{}] workspace on device_{}, {} bytes...", id, Nd4j.getAffinityManager().getDeviceForCurrentThread(), bytes);
            Nd4j.getWorkspaceManager().printAllocationStatisticsForCurrentThread();
        }

        // the host buffer is always allocated; the device buffer is skipped for HOST_ONLY mirroring
        Pointer ptr = memoryManager.allocate((bytes + SAFETY_OFFSET), MemoryKind.HOST, false);
        if (ptr == null)
            throw new ND4JIllegalStateException("Can't allocate memory for workspace");

        workspace.setHostPointer(new PagedPointer(ptr));

        if (workspaceConfiguration.getPolicyMirroring() != MirroringPolicy.HOST_ONLY)
            workspace.setDevicePointer(new PagedPointer(memoryManager.allocate((bytes + SAFETY_OFFSET), MemoryKind.DEVICE, false)));
        // log.info("Workspace [{}] initialized successfully", id);
    }
}
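A short usage sketch (workspace id and sizes illustrative): a configuration this init() path accepts, avoiding the rejected MMAP location and pre-sizing the workspace so the allocation branch above actually runs:

// Sketch: configure and enter a CUDA-friendly workspace. LocationPolicy.MMAP
// would make init() throw above, so the location stays at RAM.
WorkspaceConfiguration cfg = WorkspaceConfiguration.builder()
        .initialSize(10 * 1024 * 1024L)         // non-zero, so currentSize > 0 in init()
        .policyLocation(LocationPolicy.RAM)
        .policyMirroring(MirroringPolicy.FULL)  // allocate both HOST and DEVICE buffers
        .build();

try (MemoryWorkspace ws = Nd4j.getWorkspaceManager().getAndActivateWorkspace(cfg, "CUDA_WS")) {
    INDArray scoped = Nd4j.create(128);         // backed by the workspace memory allocated above
}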
Use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
Class CudaMemoryManager, method memcpy().
/**
 * This method provides basic memcpy functionality with respect to the target environment.
 *
 * @param dstBuffer destination buffer
 * @param srcBuffer source buffer
 */
@Override
public void memcpy(DataBuffer dstBuffer, DataBuffer srcBuffer) {
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

    if (dstBuffer instanceof CompressedDataBuffer && !(srcBuffer instanceof CompressedDataBuffer)) {
        // destination is compressed, source isn't
        AllocationPoint srcPoint = AtomicAllocator.getInstance().getAllocationPoint(srcBuffer);
        long size = srcBuffer.getElementSize() * srcBuffer.length();
        if (!srcPoint.isActualOnHostSide()) {
            // make sure the source is up to date on the host before the host-side copy
            AtomicAllocator.getInstance().synchronizeHostData(srcBuffer);
        }

        // copying host -> host
        Pointer src = AtomicAllocator.getInstance().getHostPointer(srcBuffer);
        Pointer.memcpy(dstBuffer.addressPointer(), src, size);
    } else if (!(dstBuffer instanceof CompressedDataBuffer) && srcBuffer instanceof CompressedDataBuffer) {
        // destination is NOT compressed, source is compressed
        AllocationPoint dstPoint = AtomicAllocator.getInstance().getAllocationPoint(dstBuffer);
        long size = srcBuffer.getElementSize() * srcBuffer.length();

        Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), size);
        dstPoint.tickHostWrite();   // host copy changed; the device copy is now stale
    } else if (dstBuffer instanceof CompressedDataBuffer && srcBuffer instanceof CompressedDataBuffer) {
        // both buffers are compressed, just fire memcpy
        Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), srcBuffer.length() * srcBuffer.getElementSize());
    } else {
        // both buffers are NOT compressed: delegate to the allocator-aware copy
        AtomicAllocator.getInstance().memcpy(dstBuffer, srcBuffer);
    }
}
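A minimal usage sketch (buffer contents illustrative): two uncompressed buffers fall through to the final branch and the allocator-aware copy. On the CUDA backend, Nd4j.getMemoryManager() resolves to the CudaMemoryManager shown above:

// Sketch: copy between two uncompressed buffers.
DataBuffer src = Nd4j.createBuffer(new float[] {1f, 2f, 3f, 4f});
DataBuffer dst = Nd4j.createBuffer(new float[4]);

Nd4j.getMemoryManager().memcpy(dst, src);   // both uncompressed -> AtomicAllocator.memcpy(...)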
Use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
Class CudaFullCachingProvider, method malloc().
/**
 * This method provides a PointersPair to a memory chunk specified by AllocationShape.
 *
 * PLEASE NOTE: this method may ignore the malloc request and hand out a previously
 * cached free memory chunk with an equal shape instead.
 *
 * @param shape    shape of the desired memory chunk
 * @param point    target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return pair of pointers for the allocated (or cache-reused) chunk
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.DEVICE && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumDeviceAllocation()) {
        int deviceId = AtomicAllocator.getInstance().getDeviceId();
        ensureDeviceCacheHolder(deviceId, shape);

        CacheHolder cache = deviceCache.get(deviceId).get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                // cache hit: reuse a previously freed chunk of the same shape
                cacheDeviceHit.incrementAndGet();
                deviceCachedAmount.get(deviceId).addAndGet(-1 * reqMemory);

                PointersPair pair = new PointersPair();
                pair.setDevicePointer(pointer);

                point.setAllocationStatus(AllocationStatus.DEVICE);
                point.setDeviceId(deviceId);
                return pair;
            }
        }
        // cache miss: fall through to a real allocation
        cacheDeviceMiss.incrementAndGet();
        return super.malloc(shape, point, location);
    }
    return super.malloc(shape, point, location);
}
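The caching discipline above cuts down on cudaMalloc/cudaFree traffic. Here is a self-contained sketch of the same idea with hypothetical names (ChunkCache and allocateRaw are illustrative stand-ins, not nd4j API):

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

// Sketch (hypothetical): one free-list per chunk size, polled on malloc and
// offered back on free, mirroring the CacheHolder poll() usage above.
class ChunkCache {
    private final Queue<Long> freeChunks = new ConcurrentLinkedQueue<>();

    long acquire(long bytes) {
        Long cached = freeChunks.poll();      // cache hit: skip the real allocation
        return cached != null ? cached : allocateRaw(bytes);
    }

    void release(long address) {
        freeChunks.offer(address);            // keep the chunk for the next request
    }

    private long allocateRaw(long bytes) {
        // stand-in for the real backend allocation (e.g. cudaMalloc); returns an opaque address
        return System.nanoTime();
    }
}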
Use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
Class JcublasLapack, method sgeqrf().
// =========================
// Q R DECOMP
// =========================
@Override
public void sgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
    INDArray a = A;
    INDArray r = R;

    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT sgeqrf called in DOUBLE environment");

    // cuSolver expects column-major ('f') ordering
    if (A.ordering() == 'c')
        a = A.dup('f');
    if (R != null && R.ordering() == 'c')
        r = R.dup('f');

    INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createFloat(N),
                    Nd4j.getShapeInfoProvider().createShapeInformation(new int[] {1, N}).getFirst());

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();

    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);

    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new IllegalStateException("solverSetStream failed");

        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        CublasPointer xTauPointer = new CublasPointer(tau, ctx);

        // this output indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSgeqrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M,
                        // we intentionally use a host pointer here
                        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);

        // Now allocate memory for the workspace and a return code
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());

        // Do the actual QR decomp
        stat = cusolverDnSgeqrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M,
                        (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(),
                        worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf failed", stat);
        }

        allocator.registerAction(ctx, a);
        allocator.registerAction(ctx, INFO);

        if (INFO.getInt(0) != 0) {
            throw new BlasException("cusolverDnSgeqrf failed on INFO", INFO.getInt(0));
        }

        // Copy R (the upper-triangular factor) into the result, then zero its sub-diagonal
        if (r != null) {
            r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
            INDArrayIndex[] ix = new INDArrayIndex[2];
            for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
                ix[0] = NDArrayIndex.point(i);
                ix[1] = NDArrayIndex.interval(0, i);
                r.put(ix, 0);
            }
        }

        // Second pass: generate the explicit Q from the Householder reflectors left in a
        stat = cusolverDnSorgqr_bufferSize(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M,
                        (FloatPointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSorgqr_bufferSize failed", stat);
        }
        worksize = worksizeBuffer.getInt(0);
        workspace = new Workspace(worksize * Nd4j.sizeOfDataType());

        stat = cusolverDnSorgqr(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M,
                        (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(),
                        worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSorgqr failed", stat);
        }
    }

    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);

    if (a != A)
        A.assign(a);
    if (r != null && r != R)
        R.assign(r);

    log.info("A: {}", A);
    if (R != null)
        log.info("R: {}", R);
}
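A usage sketch for this routine through the public entry point (shapes illustrative; geqrf on the Lapack interface dispatches to sgeqrf or dgeqrf depending on the configured data type):

// Sketch: QR-factor a 4x3 matrix. After the call, A holds the explicit Q
// (produced by the cusolverDnSorgqr pass above) and R holds the triangular factor.
INDArray A = Nd4j.rand(4, 3);
INDArray R = Nd4j.create(3, 3);

Nd4j.getBlasWrapper().lapack().geqrf(A, R);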