use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class JcublasLapack method sgetrf.
    @Override
    public void sgetrf(int M, int N, INDArray A, INDArray IPIV, INDArray INFO) {
        INDArray a = A;

        if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
            log.warn("FLOAT getrf called in DOUBLE environment");

        if (A.ordering() == 'c')
            a = A.dup('f');

        if (Nd4j.getExecutioner() instanceof GridExecutioner)
            ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

        // Get context for current thread
        CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();

        // setup the solver handles for cuSolver calls
        cusolverDnHandle_t handle = ctx.getSolverHandle();
        cusolverDnContext solverDn = new cusolverDnContext(handle);

        // synchronized on the solver
        synchronized (handle) {
            int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
            if (result != 0)
                throw new BlasException("solverSetStream failed");

            // transfer the INDArray into GPU memory
            CublasPointer xAPointer = new CublasPointer(a, ctx);

            // this output indicates how much workspace memory we'll need for the real operation
            DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);

            int stat = cusolverDnSgetrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M,
                            (IntPointer) worksizeBuffer.addressPointer() // we intentionally use a host pointer here
            );
            if (stat != CUSOLVER_STATUS_SUCCESS) {
                throw new BlasException("cusolverDnSgetrf_bufferSize failed", stat);
            }

            int worksize = worksizeBuffer.getInt(0);

            // Now allocate memory for the workspace, the permutation matrix and a return code
            Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());

            // Do the actual LU decomposition
            stat = cusolverDnSgetrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M,
                            new CudaPointer(workspace).asFloatPointer(),
                            new CudaPointer(allocator.getPointer(IPIV, ctx)).asIntPointer(),
                            new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
            if (stat != CUSOLVER_STATUS_SUCCESS) {
                throw new BlasException("cusolverDnSgetrf failed", stat);
            }
        }
        allocator.registerAction(ctx, a);
        allocator.registerAction(ctx, INFO);
        allocator.registerAction(ctx, IPIV);

        if (a != A)
            A.assign(a);
    }
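A minimal usage sketch for the routine above, assuming the CUDA backend is active so that Nd4j.getBlasWrapper().lapack() resolves to this JcublasLapack implementation; the shapes and helper calls below are illustrative, not taken from the project:

    int M = 3, N = 3;
    // column-major ('f') input, as the LAPACK-style routine expects
    INDArray A = Nd4j.rand(new int[] {M, N}).dup('f');
    INDArray IPIV = Nd4j.createUninitialized(new int[] {1, Math.min(M, N)}); // pivot output
    INDArray INFO = Nd4j.createUninitialized(new int[] {1, 1});              // status output
    Nd4j.getBlasWrapper().lapack().sgetrf(M, N, A, IPIV, INFO);
    // on success A holds the combined L/U factors, IPIV the pivot indices, and INFO reads 0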
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class CudaAffinityManager method replicateToDevice.
    /**
     * This method replicates the given INDArray and places the copy on the target device.
     *
     * @param deviceId target deviceId
     * @param array    INDArray to replicate
     * @return copy of the array backed by memory on the target device
     */
    @Override
    public synchronized INDArray replicateToDevice(Integer deviceId, INDArray array) {
        if (array == null)
            return null;

        if (array.isView())
            throw new UnsupportedOperationException("It's impossible to replicate View");

        int[] shape = array.shape();
        int[] stride = array.stride();
        int elementWiseStride = array.elementWiseStride();
        char ordering = array.ordering();
        int length = array.length();

        // we use this call to get device memory updated
        AtomicAllocator.getInstance().getPointer(array,
                        (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext());

        int currentDeviceId = getDeviceForCurrentThread();

        NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(deviceId));
        attachThreadToDevice(Thread.currentThread().getId(), deviceId);

        DataBuffer newDataBuffer = replicateToDevice(deviceId, array.data());
        DataBuffer newShapeBuffer = Nd4j.getShapeInfoProvider()
                        .createShapeInformation(shape, stride, 0, elementWiseStride, ordering).getFirst();
        INDArray result = Nd4j.createArrayFromShapeBuffer(newDataBuffer, newShapeBuffer);

        attachThreadToDevice(Thread.currentThread().getId(), currentDeviceId);
        NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(currentDeviceId));

        return result;
    }
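A short usage sketch, assuming at least two CUDA devices are visible (device id 1 is illustrative):

    INDArray source = Nd4j.create(new float[] {1f, 2f, 3f, 4f}, new int[] {2, 2});
    INDArray onDevice1 = Nd4j.getAffinityManager().replicateToDevice(1, source);
    // onDevice1 is a full copy backed by memory on device 1; the calling thread's
    // device affinity is restored before the method returns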
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class CudaAffinityManager method replicateToDevice.
    /**
     * This method replicates the given DataBuffer and places the copy on the target device.
     *
     * @param deviceId target deviceId
     * @param buffer   DataBuffer to replicate
     * @return copy of the buffer backed by memory on the target device
     */
    @Override
    public DataBuffer replicateToDevice(Integer deviceId, DataBuffer buffer) {
        if (buffer == null)
            return null;

        int currentDeviceId = AtomicAllocator.getInstance().getDeviceId();

        if (currentDeviceId != deviceId) {
            NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(deviceId));
            Nd4j.getAffinityManager().attachThreadToDevice(Thread.currentThread().getId(), deviceId);
        }

        DataBuffer dstBuffer = Nd4j.createBuffer(buffer.length(), false);
        AtomicAllocator.getInstance().memcpy(dstBuffer, buffer);

        if (currentDeviceId != deviceId) {
            NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(currentDeviceId));
            Nd4j.getAffinityManager().attachThreadToDevice(Thread.currentThread().getId(), currentDeviceId);
        }

        return dstBuffer;
    }
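The DataBuffer overload can be exercised the same way; the buffer contents and device id below are illustrative:

    DataBuffer src = Nd4j.createBuffer(new float[] {1f, 2f, 3f});
    DataBuffer onDevice1 = Nd4j.getAffinityManager().replicateToDevice(1, src);
    // the copy is filled via AtomicAllocator.memcpy() and lives on device 1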
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class CudaAffinityManager method getDeviceForThread.
    /**
     * This method returns the deviceId for the given thread, identified by threadId.
     *
     * If no device was assigned to this thread before this call, it'll be assigned here.
     *
     * @param threadId id of the thread to look up
     * @return deviceId assigned to that thread
     */
    @Override
    public Integer getDeviceForThread(long threadId) {
        if (getNumberOfDevices() == 1)
            return 0;

        Integer aff = affinityMap.get(threadId);

        if (aff == null) {
            Integer deviceId = getNextDevice(threadId);
            affinityMap.put(threadId, deviceId);
            affiliated.set(new AtomicBoolean(false));

            if (threadId == Thread.currentThread().getId()) {
                NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(deviceId));
                // logger.error("setDevice({}) called for thread {}", deviceId, Thread.currentThread().getName());
                affiliated.get().set(true);
            }

            return deviceId;
        } else {
            if (threadId == Thread.currentThread().getId()) {
                if (affiliated.get() == null)
                    affiliated.set(new AtomicBoolean(false));

                if (!affiliated.get().get()) {
                    NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(aff));
                    // logger.error("SCARY setDevice({}) called for thread {}", aff, threadId);
                    affiliated.get().set(true);
                    return aff;
                }
            }

            return aff;
        }
        /*
        return affinityMap.get(threadId);
        */
        // return 0;
    }
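A small sketch of how callers typically reach this lookup, through the global affinity manager (variable names are illustrative):

    long tid = Thread.currentThread().getId();
    Integer device = Nd4j.getAffinityManager().getDeviceForThread(tid);
    // convenience form for the calling thread:
    Integer sameDevice = Nd4j.getAffinityManager().getDeviceForCurrentThread();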
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class BasicContextPool method getDeviceBuffers.
    /**
     * This method is used to allocate the per-context device buffers (reduction, allocation,
     * scalar and special buffers) for the given device.
     *
     * @param context  CudaContext to attach the buffers to
     * @param deviceId device to allocate the buffers on
     */
    protected void getDeviceBuffers(CudaContext context, int deviceId) {
        // ((CudaExecutioner) Nd4j.getExecutioner()).getNativeOps();
        NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();

        // we hardcode sizeOf to sizeOf(double)
        int sizeOf = 8;

        Pointer reductionPointer = nativeOps.mallocDevice(16385 * sizeOf * 2, new CudaPointer(deviceId), 0);
        if (reductionPointer == null)
            throw new IllegalStateException("Can't allocate [DEVICE] reduction buffer memory!");

        nativeOps.memsetAsync(reductionPointer, 0, 16385 * sizeOf * 2, 0, context.getOldStream());
        context.syncOldStream();

        Pointer allocationPointer = nativeOps.mallocDevice(1024 * 1024, new CudaPointer(deviceId), 0);
        if (allocationPointer == null)
            throw new IllegalStateException("Can't allocate [DEVICE] allocation buffer memory!");

        Pointer scalarPointer = nativeOps.mallocHost(1 * sizeOf, 0);
        if (scalarPointer == null)
            throw new IllegalStateException("Can't allocate [HOST] scalar buffer memory!");

        context.setBufferScalar(scalarPointer);
        context.setBufferAllocation(allocationPointer);
        context.setBufferReduction(reductionPointer);

        Pointer specialPointer = nativeOps.mallocDevice(1024 * 1024 * sizeOf, new CudaPointer(deviceId), 0);
        if (specialPointer == null)
            throw new IllegalStateException("Can't allocate [DEVICE] special buffer memory!");

        nativeOps.memsetAsync(specialPointer, 0, 65536 * sizeOf, 0, context.getOldStream());

        context.setBufferSpecial(specialPointer);
    }