Search in sources :

Example 21 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class ProtectedCudaConstantHandler method moveToConstantSpace.

/**
 * Moves the specified dataBuffer to CUDA constant memory space.
 *
 * PLEASE NOTE: CUDA constant memory is limited to 48KB per device.
 *
 * When the buffer does not fit (the per-device offset would reach MAX_CONSTANT_LENGTH, or the
 * buffer alone exceeds MAX_BUFFER_LENGTH), the host data is copied into regular device memory
 * instead and 0 is returned. In both cases the allocation point is flagged as constant and the
 * buffer is handed to the protector.
 *
 * @param dataBuffer buffer whose host-side contents should be placed into constant memory
 * @return address of the buffer within constant memory, or 0 if the device-memory fallback was used
 * @throws ND4JIllegalStateException if the fallback copy to device memory reports failure
 */
@Override
public synchronized long moveToConstantSpace(DataBuffer dataBuffer) {
    // now, we move things to constant memory
    Integer deviceId = AtomicAllocator.getInstance().getDeviceId();
    ensureMaps(deviceId);
    AllocationPoint point = AtomicAllocator.getInstance().getAllocationPoint(dataBuffer);
    long requiredMemoryBytes = AllocationUtils.getRequiredMemory(point.getShape());
    long currentOffset = constantOffsets.get(deviceId).get();
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
    // early bail-out: the buffer is too large, or constant space would be exhausted by it
    if (currentOffset + requiredMemoryBytes >= MAX_CONSTANT_LENGTH || requiredMemoryBytes > MAX_BUFFER_LENGTH) {
        return copyToDeviceMemoryInstead(dataBuffer, point, requiredMemoryBytes, deviceId, context);
    }
    // number of bytes to reserve in constant space; may be padded below, while the copy itself
    // still transfers requiredMemoryBytes
    long bytes = requiredMemoryBytes;
    // hack for misalignment avoidance for 16bit data opType
    if (dataBuffer.dataType() == DataBuffer.Type.HALF) {
        if (bytes % 4 != 0) {
            bytes += 2;
        }
    } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE || dataBuffer.dataType() == DataBuffer.Type.LONG) {
        // for double data opType, we must be assured, that all DOUBLE pointers are starting from even addresses, to avoid banks spills
        long div = bytes / 4;
        if (div % 2 != 0) {
            bytes += 4;
        }
        // for possible changes of dtype in the same jvm, we skip few bytes in constant memory
        div = currentOffset / 4;
        while (div % 2 != 0) {
            currentOffset = constantOffsets.get(deviceId).addAndGet(4);
            div = currentOffset / 4;
            // just break out, if we're stepped beyond constant memory space
            if (currentOffset > MAX_CONSTANT_LENGTH) {
                break;
            }
        }
    }
    // reserve the (possibly padded) region; getAndAdd yields the start offset of our slot
    currentOffset = constantOffsets.get(deviceId).getAndAdd(bytes);
    if (currentOffset >= MAX_CONSTANT_LENGTH) {
        return copyToDeviceMemoryInstead(dataBuffer, point, requiredMemoryBytes, deviceId, context);
    }
    // NOTE(review): the result of memcpyConstantAsync is not checked here, unlike memcpyAsync in the fallback path
    NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyConstantAsync(currentOffset, point.getPointers().getHostPointer(), requiredMemoryBytes, 1, context.getSpecialStream());
    flowController.commitTransfer(context.getSpecialStream());
    long cAddr = deviceAddresses.get(deviceId).address() + currentOffset;
    point.setAllocationStatus(AllocationStatus.CONSTANT);
    point.getPointers().setDevicePointer(new CudaPointer(cAddr));
    point.setConstant(true);
    point.tickDeviceWrite();
    point.setDeviceId(deviceId);
    point.tickHostRead();
    protector.persistDataBuffer(dataBuffer);
    return cAddr;
}

/**
 * Fallback used when the buffer cannot be placed into constant memory: copies the host data to the
 * allocation point's device pointer, marks the point as constant and persists the buffer.
 *
 * @param dataBuffer buffer being relocated
 * @param point allocation point backing the buffer
 * @param requiredMemoryBytes number of bytes to copy from host to device
 * @param deviceId device the allocation point gets bound to
 * @param context context whose special stream carries the transfer
 * @return always 0, the value moveToConstantSpace reports for the fallback path
 * @throws ND4JIllegalStateException if memcpyAsync returns 0
 */
private long copyToDeviceMemoryInstead(DataBuffer dataBuffer, AllocationPoint point, long requiredMemoryBytes, Integer deviceId, CudaContext context) {
    // with the DELAYED memory model a HOST-only point needs a device allocation before the copy
    if (point.getAllocationStatus() == AllocationStatus.HOST && CudaEnvironment.getInstance().getConfiguration().getMemoryModel() == Configuration.MemoryModel.DELAYED) {
        AtomicAllocator.getInstance().getMemoryHandler().alloc(AllocationStatus.DEVICE, point, point.getShape(), false);
    }
    // flag value 1 is presumably the host-to-device memcpy direction -- TODO confirm against CudaConstants
    if (NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(point.getPointers().getDevicePointer(), point.getPointers().getHostPointer(), requiredMemoryBytes, 1, context.getSpecialStream()) == 0) {
        throw new ND4JIllegalStateException("memcpyAsync failed");
    }
    flowController.commitTransfer(context.getSpecialStream());
    point.setConstant(true);
    point.tickDeviceWrite();
    point.tickHostRead();
    point.setDeviceId(deviceId);
    protector.persistDataBuffer(dataBuffer);
    return 0;
}
Also used : CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 22 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaZeroHandler method memcpyAsync.

/**
 * Asynchronous version of memcpy.
 *
 * The data is always written to the host pointer of the destination buffer; when the allocation
 * point currently resides on DEVICE, the freshly written host region is additionally copied to the
 * device pointer at the same offset.
 *
 * PLEASE NOTE: This is device-dependent method, if it's not supported in your environment, blocking call will be used instead.
 *
 * @param dstBuffer destination buffer
 * @param srcPointer source pointer the data is read from
 * @param length number of bytes to copy
 * @param dstOffset offset added to the destination host/device addresses
 * @throws IllegalStateException if one of the native async copies returns 0
 */
@Override
public void memcpyAsync(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
    AllocationPoint point = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
    // host memory is updated regardless of where the allocation currently lives
    Pointer hostTarget = new CudaPointer((point.getPointers().getHostPointer().address()) + dstOffset);
    // stays null on the constant path until (and unless) the device mirror copy needs a context
    CudaContext activeContext = null;
    if (!dstBuffer.isConstant()) {
        activeContext = flowController.prepareAction(point);
        if (nativeOps.memcpyAsync(hostTarget, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, activeContext.getSpecialStream()) == 0) {
            throw new IllegalStateException("MemcpyAsync H2H failed: [" + srcPointer.address() + "] -> [" + hostTarget.address() + "]");
        }
        flowController.commitTransfer(activeContext.getSpecialStream());
        if (point.getAllocationStatus() == AllocationStatus.HOST) {
            flowController.registerAction(activeContext, point);
        }
    } else {
        // constant buffers are filled with a plain JavaCPP memcpy, no stream involved
        org.bytedeco.javacpp.Pointer constantTarget = new CudaPointer(point.getPointers().getHostPointer().address() + dstOffset, 0L);
        org.bytedeco.javacpp.Pointer constantSource = new CudaPointer(srcPointer, length);
        org.bytedeco.javacpp.Pointer.memcpy(constantTarget, constantSource, length);
        point.tickHostRead();
    }
    // if we're copying something into host memory, but we're on device - we need to provide exact copy to device as well
    if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
        // TODO: this sounds wrong, and probably memcpy should check initial direction, like relocate did before
        Pointer deviceTarget = new CudaPointer(point.getPointers().getDevicePointer().address() + dstOffset);
        if (activeContext == null) {
            activeContext = flowController.prepareAction(point);
        }
        if (nativeOps.memcpyAsync(deviceTarget, hostTarget, length, CudaConstants.cudaMemcpyHostToDevice, activeContext.getSpecialStream()) == 0) {
            throw new IllegalStateException("MemcpyAsync H2D failed: [" + hostTarget.address() + "] -> [" + deviceTarget.address() + "]");
        }
        flowController.commitTransfer(activeContext.getSpecialStream());
        flowController.registerAction(activeContext, point);
    }
    point.tickDeviceWrite();
}
Also used : ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) Pointer(org.bytedeco.javacpp.Pointer) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) BaseCudaDataBuffer(org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) Pointer(org.bytedeco.javacpp.Pointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 23 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaZeroHandler method getHostPointer.

/**
 * Returns a typed view of the buffer's host pointer, positioned at the buffer's byte offset.
 *
 * PLEASE NOTE: This method always returns pointer within OS memory space
 *
 * @param buffer buffer whose host pointer is requested
 * @return double/float/int/short pointer for DOUBLE/FLOAT/INT/HALF buffers, otherwise the untyped CudaPointer
 * @throws RuntimeException if the allocation point has no host pointer
 */
@Override
public org.bytedeco.javacpp.Pointer getHostPointer(DataBuffer buffer) {
    AllocationPoint allocation = ((BaseCudaDataBuffer) buffer).getAllocationPoint();
    // without a host pointer there is nothing to hand out: dump the allocation state, then fail
    if (allocation.getPointers().getHostPointer() == null) {
        log.info("DevicePointer: " + allocation.getPointers().getDevicePointer());
        log.info("HostPointer: " + allocation.getPointers().getHostPointer());
        log.info("AllocStatus: " + allocation.getAllocationStatus());
        throw new RuntimeException("pointer is null");
    }
    synchronizeThreadDevice(Thread.currentThread().getId(), allocation.getDeviceId(), allocation);
    // pointer with offset applied; length is specified for constructor compatibility purposes
    CudaPointer hostView = new CudaPointer(allocation.getPointers().getHostPointer(), buffer.length(), (buffer.offset() * buffer.getElementSize()));
    org.bytedeco.javacpp.Pointer typedView;
    switch (buffer.dataType()) {
        case DOUBLE:
            typedView = hostView.asDoublePointer();
            break;
        case FLOAT:
            typedView = hostView.asFloatPointer();
            break;
        case INT:
            typedView = hostView.asIntPointer();
            break;
        case HALF:
            // HALF buffers are exposed through a short pointer
            typedView = hostView.asShortPointer();
            break;
        default:
            typedView = hostView;
            break;
    }
    return typedView;
}
Also used : BaseCudaDataBuffer(org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 24 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaZeroHandler method memcpyDevice.

/**
 * Copies {@code length} bytes from srcPointer to the device pointer of dstBuffer (shifted by
 * dstOffset) with a device-to-device async memcpy on the context's old stream, then ticks the
 * device-write marker of the allocation point.
 *
 * @param dstBuffer destination buffer
 * @param srcPointer source pointer
 * @param length number of bytes to copy
 * @param dstOffset offset added to the destination device address
 * @param context context providing the stream for the copy
 * @throws ND4JIllegalStateException if the native memcpyAsync call returns 0
 */
@Override
public void memcpyDevice(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset, CudaContext context) {
    AllocationPoint allocation = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
    long targetAddress = allocation.getPointers().getDevicePointer().address() + dstOffset;
    Pointer target = new CudaPointer(targetAddress);
    boolean copyFailed = nativeOps.memcpyAsync(target, srcPointer, length, CudaConstants.cudaMemcpyDeviceToDevice, context.getOldStream()) == 0;
    if (copyFailed) {
        throw new ND4JIllegalStateException("memcpyAsync failed");
    }
    allocation.tickDeviceWrite();
}
Also used : BaseCudaDataBuffer(org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) Pointer(org.bytedeco.javacpp.Pointer) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 25 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaZeroHandler method relocate.

/**
 * Copies specific chunk of memory from one storage to another
 *
 * Possible directions:  HOST -> DEVICE, DEVICE -> HOST
 *
 * @param currentStatus storage the data currently lives in
 * @param targetStatus storage the data should be moved to
 * @param point allocation point holding the host and device pointers
 * @param shape allocation shape used to compute the number of bytes to copy
 * @param context context whose special stream carries the HOST -> DEVICE transfer
 * @throws IllegalStateException if a required buffer/pointer is null or the native copy returns 0
 * @throws UnsupportedOperationException for any direction other than the two listed above
 */
@Override
public void relocate(AllocationStatus currentStatus, AllocationStatus targetStatus, AllocationPoint point, AllocationShape shape, CudaContext context) {
    boolean deviceToHost = currentStatus == AllocationStatus.DEVICE && targetStatus == AllocationStatus.HOST;
    boolean hostToDevice = currentStatus == AllocationStatus.HOST && targetStatus == AllocationStatus.DEVICE;

    if (deviceToHost) {
        // DEVICE -> HOST
        DataBuffer hostSideBuffer = point.getBuffer();
        if (hostSideBuffer == null) {
            throw new IllegalStateException("Target buffer is NULL!");
        }
        // NOTE(review): no copy is issued in this direction; the pointer wrapper is built but never used
        Pointer unusedDevicePointer = new CudaPointer(point.getPointers().getDevicePointer().address());
        return;
    }

    if (!hostToDevice) {
        throw new UnsupportedOperationException("Can't relocate data in requested direction: [" + currentStatus + "] -> [" + targetStatus + "]");
    }

    // HOST -> DEVICE
    // TODO: this probably should be removed
    if (point.isConstant()) {
        // constant allocations are skipped
        return;
    }
    if (point.getPointers().getDevicePointer() == null) {
        throw new IllegalStateException("devicePointer is NULL!");
    }
    if (nativeOps.memcpyAsync(point.getPointers().getDevicePointer(), point.getPointers().getHostPointer(), AllocationUtils.getRequiredMemory(shape), CudaConstants.cudaMemcpyHostToDevice, context.getSpecialStream()) == 0) {
        throw new IllegalStateException("MemcpyAsync relocate H2D failed: [" + point.getHostPointer().address() + "] -> [" + point.getDevicePointer().address() + "]");
    }
    flowController.commitTransfer(context.getSpecialStream());
}
Also used : ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) Pointer(org.bytedeco.javacpp.Pointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) BaseCudaDataBuffer(org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Aggregations

CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)47 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)27 AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint)20 Pointer (org.bytedeco.javacpp.Pointer)18 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)18 INDArray (org.nd4j.linalg.api.ndarray.INDArray)15 org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t)12 GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)11 DoublePointer (org.bytedeco.javacpp.DoublePointer)10 FloatPointer (org.bytedeco.javacpp.FloatPointer)10 IntPointer (org.bytedeco.javacpp.IntPointer)10 CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st)10 ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException)10 CublasPointer (org.nd4j.linalg.jcublas.CublasPointer)10 BlasException (org.nd4j.linalg.api.blas.BlasException)8 BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer)7 AllocationShape (org.nd4j.jita.allocator.impl.AllocationShape)4 AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator)4 BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer)4 INDArrayIndex (org.nd4j.linalg.indexing.INDArrayIndex)4