
Example 41 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class JcublasLapack method sgetrf.

@Override
public void sgetrf(int M, int N, INDArray A, INDArray IPIV, INDArray INFO) {
    INDArray a = A;
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT getrf called in DOUBLE environment");
    if (A.ordering() == 'c')
        a = A.dup('f');
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new BlasException("solverSetStream failed");
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSgetrf_bufferSize(solverDn, M, N,
                        (FloatPointer) xAPointer.getDevicePointer(), M,
                        // we intentionally pass a host pointer here: cuSolver writes the worksize back to host memory
                        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgetrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace, the permutation matrix and a return code
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        // Do the actual LU decomp
        stat = cusolverDnSgetrf(solverDn, M, N,
                        (FloatPointer) xAPointer.getDevicePointer(), M,
                        new CudaPointer(workspace).asFloatPointer(),
                        new CudaPointer(allocator.getPointer(IPIV, ctx)).asIntPointer(),
                        new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgetrf failed", stat);
        }
    }
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);
    allocator.registerAction(ctx, IPIV);
    if (a != A)
        A.assign(a);
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)
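
As a follow-up, here is a minimal sketch of how this routine is typically reached from user code, assuming the CUDA backend is on the classpath and the array uses the default FLOAT data type; Nd4j.getBlasWrapper().lapack().getrf(...) is the usual convenience wrapper, though the exact entry point may differ between nd4j versions.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class LuDecompositionSketch {
    public static void main(String[] args) {
        // Random 4x4 FLOAT matrix; getrf factorizes it in place
        INDArray A = Nd4j.rand(4, 4);
        // Returns the pivot indices (IPIV); A is overwritten with its LU factors.
        // On the CUDA backend this dispatches down to JcublasLapack.sgetrf above.
        INDArray ipiv = Nd4j.getBlasWrapper().lapack().getrf(A);
        System.out.println(A);
        System.out.println(ipiv);
    }
}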

Example 42 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaAffinityManager method replicateToDevice.

/**
 * This method replicates the given INDArray and places it on the target device.
 *
 * @param deviceId target deviceId
 * @param array    INDArray to replicate
 * @return the replicated INDArray, backed by memory on the target device
 */
@Override
public synchronized INDArray replicateToDevice(Integer deviceId, INDArray array) {
    if (array == null)
        return null;
    if (array.isView())
        throw new UnsupportedOperationException("It's impossible to replicate View");
    int[] shape = array.shape();
    int[] stride = array.stride();
    int elementWiseStride = array.elementWiseStride();
    char ordering = array.ordering();
    int length = array.length();
    // we use this call to get device memory updated
    AtomicAllocator.getInstance().getPointer(array, (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext());
    int currentDeviceId = getDeviceForCurrentThread();
    NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(deviceId));
    attachThreadToDevice(Thread.currentThread().getId(), deviceId);
    DataBuffer newDataBuffer = replicateToDevice(deviceId, array.data());
    DataBuffer newShapeBuffer = Nd4j.getShapeInfoProvider().createShapeInformation(shape, stride, 0, elementWiseStride, ordering).getFirst();
    INDArray result = Nd4j.createArrayFromShapeBuffer(newDataBuffer, newShapeBuffer);
    attachThreadToDevice(Thread.currentThread().getId(), currentDeviceId);
    NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(currentDeviceId));
    return result;
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)
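
A short usage sketch follows, assuming a machine with at least two GPUs; the device index 1 is only illustrative.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class ReplicateArraySketch {
    public static void main(String[] args) {
        INDArray source = Nd4j.linspace(1, 9, 9).reshape(3, 3);
        // Replication is only meaningful when more than one device is visible
        if (Nd4j.getAffinityManager().getNumberOfDevices() > 1) {
            // Same shape and ordering as source, but backed by memory on device 1;
            // the calling thread is re-attached to its original device afterwards
            INDArray replica = Nd4j.getAffinityManager().replicateToDevice(1, source);
            System.out.println(replica);
        }
    }
}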

Example 43 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaAffinityManager method replicateToDevice.

/**
 * This method replicates the given DataBuffer and places it on the target device.
 *
 * @param deviceId target deviceId
 * @param buffer   DataBuffer to replicate
 * @return the replicated DataBuffer, allocated on the target device
 */
@Override
public DataBuffer replicateToDevice(Integer deviceId, DataBuffer buffer) {
    if (buffer == null)
        return null;
    int currentDeviceId = AtomicAllocator.getInstance().getDeviceId();
    if (currentDeviceId != deviceId) {
        NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(deviceId));
        Nd4j.getAffinityManager().attachThreadToDevice(Thread.currentThread().getId(), deviceId);
    }
    DataBuffer dstBuffer = Nd4j.createBuffer(buffer.length(), false);
    AtomicAllocator.getInstance().memcpy(dstBuffer, buffer);
    if (currentDeviceId != deviceId) {
        NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(currentDeviceId));
        Nd4j.getAffinityManager().attachThreadToDevice(Thread.currentThread().getId(), currentDeviceId);
    }
    return dstBuffer;
}
Also used : AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)
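
And the matching sketch for the DataBuffer overload; device 0 is used here since it is always present on a CUDA system.

import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

public class ReplicateBufferSketch {
    public static void main(String[] args) {
        DataBuffer source = Nd4j.createBuffer(new float[] {1f, 2f, 3f, 4f});
        // Allocates a fresh buffer on device 0 and memcpy's the source contents into it
        DataBuffer replica = Nd4j.getAffinityManager().replicateToDevice(0, source);
        System.out.println(replica.length());
    }
}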

Example 44 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaAffinityManager method getDeviceForThread.

/**
 * This method returns the deviceId for the given thread, identified by threadId.
 *
 * If no device was assigned to this thread before this call, it will be assigned here.
 *
 * @param threadId id of the thread to look up
 * @return deviceId assigned to that thread
 */
@Override
public Integer getDeviceForThread(long threadId) {
    if (getNumberOfDevices() == 1)
        return 0;
    Integer aff = affinityMap.get(threadId);
    if (aff == null) {
        Integer deviceId = getNextDevice(threadId);
        affinityMap.put(threadId, deviceId);
        affiliated.set(new AtomicBoolean(false));
        if (threadId == Thread.currentThread().getId()) {
            NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(deviceId));
            // logger.error("setDevice({}) called for thread {}", deviceId, Thread.currentThread().getName());
            affiliated.get().set(true);
        }
        return deviceId;
    } else {
        if (threadId == Thread.currentThread().getId()) {
            if (affiliated.get() == null)
                affiliated.set(new AtomicBoolean(false));
            if (!affiliated.get().get()) {
                NativeOpsHolder.getInstance().getDeviceNativeOps().setDevice(new CudaPointer(aff));
                // logger.error("SCARY setDevice({}) called for thread {}", aff, threadId);
                affiliated.get().set(true);
                return aff;
            }
        }
        return aff;
    }
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)
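
A small sketch of querying (and implicitly creating) a thread-to-device assignment; the call goes through the AffinityManager interface, and the round-robin assignment comes from getNextDevice in the listing above.

import org.nd4j.linalg.factory.Nd4j;

public class AffinityQuerySketch {
    public static void main(String[] args) {
        // First call for a thread picks a device (round-robin) and, for the current
        // thread, also calls setDevice; later calls return the cached assignment.
        Integer deviceId = Nd4j.getAffinityManager()
                        .getDeviceForThread(Thread.currentThread().getId());
        System.out.println("Current thread mapped to device " + deviceId);
    }
}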

Example 45 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class BasicContextPool method getDeviceBuffers.

/**
 * This method allocates the per-device helper buffers (reduction, allocation,
 * scalar and special buffers) and attaches them to the given context.
 *
 * @param context  CudaContext to attach the buffers to
 * @param deviceId device to allocate on
 */
protected void getDeviceBuffers(CudaContext context, int deviceId) {
    NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();
    // we hardcode sizeOf to sizeOf(double)
    int sizeOf = 8;
    Pointer reductionPointer = nativeOps.mallocDevice(16385 * sizeOf * 2, new CudaPointer(deviceId), 0);
    if (reductionPointer == null)
        throw new IllegalStateException("Can't allocate [DEVICE] reduction buffer memory!");
    nativeOps.memsetAsync(reductionPointer, 0, 16385 * sizeOf * 2, 0, context.getOldStream());
    context.syncOldStream();
    Pointer allocationPointer = nativeOps.mallocDevice(1024 * 1024, new CudaPointer(deviceId), 0);
    if (allocationPointer == null)
        throw new IllegalStateException("Can't allocate [DEVICE] allocation buffer memory!");
    Pointer scalarPointer = nativeOps.mallocHost(1 * sizeOf, 0);
    if (scalarPointer == null)
        throw new IllegalStateException("Can't allocate [HOST] scalar buffer memory!");
    context.setBufferScalar(scalarPointer);
    context.setBufferAllocation(allocationPointer);
    context.setBufferReduction(reductionPointer);
    Pointer specialPointer = nativeOps.mallocDevice(1024 * 1024 * sizeOf, new CudaPointer(deviceId), 0);
    if (specialPointer == null)
        throw new IllegalStateException("Can't allocate [DEVICE] special buffer memory!");
    nativeOps.memsetAsync(specialPointer, 0, 65536 * sizeOf, 0, context.getOldStream());
    context.setBufferSpecial(specialPointer);
}
Also used : NativeOps(org.nd4j.nativeblas.NativeOps) Pointer(org.bytedeco.javacpp.Pointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)
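
getDeviceBuffers is protected and is normally invoked indirectly when a context is first acquired from the pool. A rough sketch follows; the package path of BasicContextPool, its no-argument constructor, and acquireContextForDevice as the entry point are assumptions here rather than guarantees.

import org.nd4j.jita.allocator.context.impl.BasicContextPool;
import org.nd4j.linalg.jcublas.context.CudaContext;

public class ContextPoolSketch {
    public static void main(String[] args) {
        // Assumed entry point: acquiring a context for device 0 lazily creates it,
        // which in turn calls getDeviceBuffers(...) to attach the reduction,
        // allocation, scalar and special buffers shown above.
        BasicContextPool pool = new BasicContextPool();
        CudaContext ctx = pool.acquireContextForDevice(0);
        System.out.println(ctx);
    }
}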

Aggregations

CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer) 47
CudaContext (org.nd4j.linalg.jcublas.context.CudaContext) 27
AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint) 20
Pointer (org.bytedeco.javacpp.Pointer) 18
DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer) 18
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 15
cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) 12
GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner) 11
DoublePointer (org.bytedeco.javacpp.DoublePointer) 10
FloatPointer (org.bytedeco.javacpp.FloatPointer) 10
IntPointer (org.bytedeco.javacpp.IntPointer) 10
CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st) 10
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException) 10
CublasPointer (org.nd4j.linalg.jcublas.CublasPointer) 10
BlasException (org.nd4j.linalg.api.blas.BlasException) 8
BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) 7
AllocationShape (org.nd4j.jita.allocator.impl.AllocationShape) 4
AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator) 4
BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer) 4
INDArrayIndex (org.nd4j.linalg.indexing.INDArrayIndex) 4