
Example 1 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in the nd4j project by deeplearning4j.

From the class ProtectedCudaConstantHandler, method ensureMaps:

private void ensureMaps(Integer deviceId) {
    if (!buffersCache.containsKey(deviceId)) { // fast path: no lock taken if already initialized
        if (flowController == null)
            flowController = AtomicAllocator.getInstance().getFlowController();
        try {
            synchronized (this) {
                // double-checked: re-test under the lock so per-device state is built exactly once
                if (!buffersCache.containsKey(deviceId)) {
                    // TODO: this op call should be checked
                    // nativeOps.setDevice(new CudaPointer(deviceId));
                    buffersCache.put(deviceId, new ConcurrentHashMap<ArrayDescriptor, DataBuffer>());
                    constantOffsets.put(deviceId, new AtomicLong(0));
                    deviceLocks.put(deviceId, new Semaphore(1));
                    // base pointer of this device's constant memory space
                    Pointer cAddr = NativeOpsHolder.getInstance().getDeviceNativeOps().getConstantSpace();
                    deviceAddresses.put(deviceId, cAddr);
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}
Also used: AtomicLong (java.util.concurrent.atomic.AtomicLong), ArrayDescriptor (org.nd4j.linalg.cache.ArrayDescriptor), CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer), Pointer (org.bytedeco.javacpp.Pointer), Semaphore (java.util.concurrent.Semaphore), ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException), DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer), CudaIntDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer), CudaHalfDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaHalfDataBuffer), CudaFloatDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaFloatDataBuffer), CudaDoubleDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer)
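The double-checked locking in ensureMaps (a cheap unsynchronized containsKey probe, then a re-check inside the synchronized block) keeps per-device setup race-free while avoiding lock contention on the hot path. Below is a minimal, self-contained sketch of the same pattern; PerDeviceInit and its single map are illustrative stand-ins, not nd4j code:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

public class PerDeviceInit {
    private final Map<Integer, AtomicLong> constantOffsets = new ConcurrentHashMap<>();

    void ensure(Integer deviceId) {
        if (!constantOffsets.containsKey(deviceId)) { // fast path: no lock taken
            synchronized (this) {
                // re-check under the lock: another thread may have initialized meanwhile
                if (!constantOffsets.containsKey(deviceId))
                    constantOffsets.put(deviceId, new AtomicLong(0));
            }
        }
    }

    public static void main(String[] args) {
        new PerDeviceInit().ensure(0); // initializes state for device 0 exactly once
    }
}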

Example 2 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in the nd4j project by deeplearning4j.

From the class CudaZeroHandler, method memcpySpecial:

/**
 * Special memcpy version, addressing shapeInfoDataBuffer copies.
 *
 * PLEASE NOTE: Blocking H->H, Async H->D
 *
 * @param dstBuffer destination buffer
 * @param srcPointer source host pointer
 * @param length number of bytes to copy
 * @param dstOffset byte offset into the destination buffer
 */
@Override
public void memcpySpecial(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
    CudaContext context = getCudaContext();
    AllocationPoint point = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
    // stage into the destination's host copy at the requested byte offset
    Pointer dP = new CudaPointer((point.getPointers().getHostPointer().address()) + dstOffset);
    if (nativeOps.memcpyAsync(dP, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, context.getOldStream()) == 0)
        throw new ND4JIllegalStateException("memcpyAsync failed");
    if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
        // buffer is device-backed: relay the staged bytes up to the device copy
        Pointer rDP = new CudaPointer(point.getPointers().getDevicePointer().address() + dstOffset);
        if (nativeOps.memcpyAsync(rDP, dP, length, CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0)
            throw new ND4JIllegalStateException("memcpyAsync failed");
    }
    // block until both queued copies have completed
    context.syncOldStream();
    point.tickDeviceWrite();
}
Also used: CudaContext (org.nd4j.linalg.jcublas.context.CudaContext), BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer), CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer), Pointer (org.bytedeco.javacpp.Pointer), ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException), AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint)
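The method stages bytes into the buffer's pinned host copy at a byte offset (base address + dstOffset), then relays them to the device copy on the same stream. The host-only sketch below models just the staging step with stock JavaCPP, no CUDA required; all names and sizes are illustrative:

import org.bytedeco.javacpp.BytePointer;
import org.bytedeco.javacpp.Pointer;

public class StagingSketch {
    public static void main(String[] args) {
        BytePointer src = new BytePointer(64).fill(7); // stand-in for srcPointer
        BytePointer host = new BytePointer(128);       // stand-in for the pinned host copy
        long dstOffset = 32;                           // byte offset into the destination
        // like the first memcpyAsync above: copy 64 bytes to host base + dstOffset
        Pointer.memcpy(host.position(dstOffset), src, 64);
        host.position(0);                              // reset the offset view
        System.out.println(host.get(dstOffset));       // prints 7
        src.deallocate();
        host.deallocate();
    }
}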

Example 3 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in the nd4j project by deeplearning4j.

From the class CudaZeroHandler, method memcpy:

/**
 * Synchronous version of memcpy: copies one DataBuffer into another and
 * blocks until the transfer has completed.
 *
 * @param dstBuffer destination buffer
 * @param srcBuffer source buffer
 */
@Override
public void memcpy(DataBuffer dstBuffer, DataBuffer srcBuffer) {
    CudaContext context = getCudaContext();
    AllocationPoint dstPoint = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
    AllocationPoint srcPoint = ((BaseCudaDataBuffer) srcBuffer).getAllocationPoint();
    // all transfers are sized in bytes: element count times element size
    long bytes = srcBuffer.length() * srcBuffer.getElementSize();
    Pointer dP = new CudaPointer(dstPoint.getPointers().getHostPointer().address());
    Pointer sP;
    if (srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
        // source lives on the device: pull it down into the destination's host copy
        sP = new CudaPointer(srcPoint.getPointers().getDevicePointer().address());
        if (nativeOps.memcpyAsync(dP, sP, bytes, CudaConstants.cudaMemcpyDeviceToHost, context.getOldStream()) == 0) {
            throw new ND4JIllegalStateException("memcpyAsync failed");
        }
    } else {
        // source lives on the host: plain host-to-host staging copy
        sP = new CudaPointer(srcPoint.getPointers().getHostPointer().address());
        if (nativeOps.memcpyAsync(dP, sP, bytes, CudaConstants.cudaMemcpyHostToHost, context.getOldStream()) == 0) {
            throw new ND4JIllegalStateException("memcpyAsync failed");
        }
    }
    if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
        // destination is device-backed: relay the staged bytes up to the device copy
        Pointer rDP = new CudaPointer(dstPoint.getPointers().getDevicePointer().address());
        if (nativeOps.memcpyAsync(rDP, dP, bytes, CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0) {
            throw new ND4JIllegalStateException("memcpyAsync failed");
        }
    }
    dstPoint.tickDeviceWrite();
    // synchronous contract: it has to be a blocking call
    context.syncOldStream();
}
Also used: CudaContext (org.nd4j.linalg.jcublas.context.CudaContext), BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer), CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer), Pointer (org.bytedeco.javacpp.Pointer), ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException), AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint)
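Every memcpyAsync call above sizes the transfer as srcBuffer.length() * srcBuffer.getElementSize(), because a DataBuffer reports its length in elements while the native copy takes bytes. A trivial, dependency-free illustration of that arithmetic (the element widths used are the standard ones: FLOAT 4, DOUBLE 8, HALF 2):

public class TransferSize {
    // bytes to pass to a native copy: element count times element width
    static long bytes(long elements, int elementSize) {
        return elements * elementSize;
    }

    public static void main(String[] args) {
        System.out.println(bytes(1024, 4)); // 1024 FLOAT elements -> 4096 bytes
        System.out.println(bytes(1024, 8)); // 1024 DOUBLE elements -> 8192 bytes
        System.out.println(bytes(1024, 2)); // 1024 HALF elements -> 2048 bytes
    }
}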

Example 4 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in the nd4j project by deeplearning4j.

From the class CudaZeroHandler, method getDevicePointer:

/**
 * PLEASE NOTE: Implementation-specific; on systems without dedicated devices
 * this may return a host pointer instead.
 *
 * @param buffer buffer to resolve
 * @param context CUDA context used for any required relocation
 * @return device pointer for the buffer, cast to the buffer's data type
 */
@Override
public org.bytedeco.javacpp.Pointer getDevicePointer(DataBuffer buffer, CudaContext context) {
    // TODO: It would be awesome to get rid of typecasting here
    AllocationPoint dstPoint = ((BaseCudaDataBuffer) buffer).getAllocationPoint();
    // this is where promotion to device memory would happen, but only for original (non-view) buffers;
    // NOTE: the "1 < 0" clause is always false, so this branch is effectively disabled
    if (dstPoint.getAllocationStatus() == AllocationStatus.HOST && buffer.offset() == 0 && 1 < 0) {
        if (dstPoint.getDeviceTicks() > configuration.getMinimumRelocationThreshold()) {
            // at this point we know this request is made within some existing context
            long requiredMemory = AllocationUtils.getRequiredMemory(dstPoint.getShape());
            if (deviceMemoryTracker.reserveAllocationIfPossible(Thread.currentThread().getId(), getDeviceId(), requiredMemory) && pingDeviceForFreeMemory(getDeviceId(), requiredMemory)) {
                // memory is reserved, so the buffer can be promoted
                promoteObject(buffer);
            }
        }
    }
    // if the buffer is device-backed, make sure the device copy is up to date
    if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
        if (!dstPoint.isActualOnDeviceSide()) {
            relocate(AllocationStatus.HOST, AllocationStatus.DEVICE, dstPoint, dstPoint.getShape(), context);
        }
    }
    // update the use counter, to announce that the buffer is being used on the device
    dstPoint.tickDeviceRead();
    // return a pointer with offset if needed; length is passed for constructor compatibility
    CudaPointer p = new CudaPointer(dstPoint.getPointers().getDevicePointer(), buffer.length(), (buffer.offset() * buffer.getElementSize()));
    switch (buffer.dataType()) {
        case DOUBLE:
            return p.asDoublePointer();
        case FLOAT:
            return p.asFloatPointer();
        case INT:
            return p.asIntPointer();
        case HALF:
            return p.asShortPointer();
        default:
            return p;
    }
}
Also used: BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer), AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint), CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)
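The switch at the end reinterprets one raw device address as a differently-typed JavaCPP pointer. The same reinterpretation can be demonstrated with stock JavaCPP types, independent of CUDA; the sketch below is illustrative, not nd4j code:

import org.bytedeco.javacpp.FloatPointer;
import org.bytedeco.javacpp.Pointer;

public class TypedViews {
    public static void main(String[] args) {
        FloatPointer data = new FloatPointer(4.0f, 2.0f); // allocate and fill two floats
        Pointer raw = data;                               // erase the element type
        FloatPointer view = new FloatPointer(raw);        // reinterpret the raw pointer
        System.out.println(view.get(0));                  // prints 4.0
        data.deallocate();
    }
}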

Example 5 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in the nd4j project by deeplearning4j.

From the class CudaDirectProvider, method freeDevice:

protected void freeDevice(Pointer pointer, int deviceId) {
    NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();
    // the native call takes the device id as a pointer-typed argument; note that
    // the deviceId parameter is unused here and a literal 0 is wrapped instead
    nativeOps.freeDevice(pointer, new CudaPointer(0));
}
Also used: NativeOps (org.nd4j.nativeblas.NativeOps), CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)
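Note the second argument: the native freeDevice call takes a pointer-typed parameter where a plain device id is meant, so the integer is smuggled through by wrapping it in a CudaPointer whose address is the value itself. A sketch of that convention using stock JavaCPP; FakePointer is a hypothetical stand-in for CudaPointer's long-valued constructor:

import org.bytedeco.javacpp.Pointer;

public class FakePointer extends Pointer {
    public FakePointer(long value) {
        this.address = value; // Pointer.address is protected; the "address" is the raw value
    }

    public static void main(String[] args) {
        System.out.println(new FakePointer(0).address()); // prints 0
    }
}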

Aggregations

CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer): 47
CudaContext (org.nd4j.linalg.jcublas.context.CudaContext): 27
AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint): 20
Pointer (org.bytedeco.javacpp.Pointer): 18
DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer): 18
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 15
cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t): 12
GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner): 11
DoublePointer (org.bytedeco.javacpp.DoublePointer): 10
FloatPointer (org.bytedeco.javacpp.FloatPointer): 10
IntPointer (org.bytedeco.javacpp.IntPointer): 10
CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st): 10
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException): 10
CublasPointer (org.nd4j.linalg.jcublas.CublasPointer): 10
BlasException (org.nd4j.linalg.api.blas.BlasException): 8
BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer): 7
AllocationShape (org.nd4j.jita.allocator.impl.AllocationShape): 4
AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator): 4
BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer): 4
INDArrayIndex (org.nd4j.linalg.indexing.INDArrayIndex): 4