Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method memcpySpecial.
/**
* Special memcpy version, addressing shapeInfoDataBuffer copies
*
* PLEASE NOTE: Blocking H->H, Async H->D
*
* @param dstBuffer destination buffer
* @param srcPointer host-side source pointer
* @param length number of bytes to copy
* @param dstOffset byte offset into the destination buffer
*/
@Override
public void memcpySpecial(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
// log.info("Memcpy special: {} bytes ", length);
CudaContext context = getCudaContext();
AllocationPoint point = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
// context.syncOldStream();
Pointer dP = new CudaPointer((point.getPointers().getHostPointer().address()) + dstOffset);
if (nativeOps.memcpyAsync(dP, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, context.getOldStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
Pointer rDP = new CudaPointer(point.getPointers().getDevicePointer().address() + dstOffset);
if (nativeOps.memcpyAsync(rDP, dP, length, CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
context.syncOldStream();
}
context.syncOldStream();
point.tickDeviceWrite();
// point.tickHostRead();
}
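For illustration, here is a minimal sketch of how a caller might invoke memcpySpecial to fill a CUDA-backed buffer from a host pointer. The handler reference, the buffer contents, and the wiring are assumptions made for the sketch; in nd4j this method is normally reached internally through the allocator (for example for shapeInfoDataBuffer copies), not from user code.

import org.bytedeco.javacpp.FloatPointer;
import org.nd4j.jita.allocator.pointers.CudaPointer;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

public class MemcpySpecialSketch {
    // 'handler' stands in for the active CudaZeroHandler; how it is obtained is allocator-internal (assumption)
    static void fillFromHostPointer(CudaZeroHandler handler) {
        // destination buffer, backed by host memory and possibly a device mirror
        DataBuffer dst = Nd4j.createBuffer(new float[8]);
        // host-side source: 8 floats allocated off-heap via JavaCPP
        FloatPointer hostData = new FloatPointer(1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f);
        CudaPointer src = new CudaPointer(hostData.address());
        long bytes = dst.length() * dst.getElementSize();
        // H->H copy first; if the buffer also lives on the device, the H->D mirror copy
        // is issued on the old stream and synchronized before the call returns
        handler.memcpySpecial(dst, src, bytes, 0);
    }
}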
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method memcpy.
/**
* Synchronous version of memcpy.
*
* @param dstBuffer destination buffer
* @param srcBuffer source buffer
*/
@Override
public void memcpy(DataBuffer dstBuffer, DataBuffer srcBuffer) {
// log.info("Buffer MemCpy called");
// log.info("Memcpy buffer: {} bytes ", dstBuffer.length() * dstBuffer.getElementSize());
CudaContext context = getCudaContext();
AllocationPoint dstPoint = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
AllocationPoint srcPoint = ((BaseCudaDataBuffer) srcBuffer).getAllocationPoint();
Pointer dP = new CudaPointer(dstPoint.getPointers().getHostPointer().address());
Pointer sP = null;
if (srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
sP = new CudaPointer(srcPoint.getPointers().getDevicePointer().address());
/*
JCuda.cudaMemcpyAsync(
dP,
sP,
srcBuffer.length(),
cudaMemcpyKind.cudaMemcpyHostToDevice,
context.getOldStream()
);*/
if (nativeOps.memcpyAsync(dP, sP, srcBuffer.length() * srcBuffer.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0) {
throw new ND4JIllegalStateException("memcpyAsync failed");
}
} else {
sP = new CudaPointer(srcPoint.getPointers().getHostPointer().address());
/*
JCuda.cudaMemcpyAsync(
dP,
sP,
srcBuffer.length(),
cudaMemcpyKind.cudaMemcpyHostToDevice,
context.getOldStream()
);*/
if (nativeOps.memcpyAsync(dP, sP, srcBuffer.length() * srcBuffer.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0) {
throw new ND4JIllegalStateException("memcpyAsync failed");
}
}
if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
Pointer rDP = new CudaPointer(dstPoint.getPointers().getDevicePointer().address());
/*
JCuda.cudaMemcpyAsync(
rDP,
dP,
srcBuffer.length(),
cudaMemcpyKind.cudaMemcpyHostToDevice,
context.getOldStream()
);*/
if (nativeOps.memcpyAsync(rDP, dP, srcBuffer.length() * srcBuffer.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0) {
throw new ND4JIllegalStateException("memcpyAsync failed");
}
}
dstPoint.tickDeviceWrite();
// it has to be blocking call
context.syncOldStream();
}
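As a usage illustration, a minimal sketch of a synchronous buffer-to-buffer copy via memcpy. The handler reference is an assumption; this method is ordinarily reached through the allocator or memory manager rather than called directly.

import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

public class MemcpySketch {
    // 'handler' stands in for the active CudaZeroHandler (assumption)
    static void copyBuffer(CudaZeroHandler handler) {
        DataBuffer src = Nd4j.createBuffer(new float[] {1f, 2f, 3f, 4f});
        DataBuffer dst = Nd4j.createBuffer(new float[4]);
        // blocking call: the old stream is synchronized before this returns,
        // and the destination's device copy is marked as the latest write
        handler.memcpy(dst, src);
    }
}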
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method getDevicePointer.
/**
* PLEASE NOTE: Implementation-specific method; on systems without dedicated devices a host pointer can be returned here.
*
* @param buffer buffer to obtain the device pointer for
* @param context CUDA context used for synchronization, if required
* @return device-side pointer, typed according to the buffer's data type
*/
@Override
public org.bytedeco.javacpp.Pointer getDevicePointer(DataBuffer buffer, CudaContext context) {
// TODO: It would be awesome to get rid of typecasting here
// getCudaContext().syncOldStream();
AllocationPoint dstPoint = ((BaseCudaDataBuffer) buffer).getAllocationPoint();
// this is the place where we care about promotion, but only for original (offset 0) buffers; note that the '1 < 0' term below keeps this branch permanently disabled
if (dstPoint.getAllocationStatus() == AllocationStatus.HOST && buffer.offset() == 0 && 1 < 0) {
if (dstPoint.getDeviceTicks() > configuration.getMinimumRelocationThreshold()) {
// at this point we know that this request is done within some existing context
long requiredMemory = AllocationUtils.getRequiredMemory(dstPoint.getShape());
if (deviceMemoryTracker.reserveAllocationIfPossible(Thread.currentThread().getId(), getDeviceId(), requiredMemory) && pingDeviceForFreeMemory(getDeviceId(), requiredMemory)) {
// so, memory is reserved
promoteObject(buffer);
}
}
}
// if the buffer is in DEVICE state, we might need to update the device-side memory
if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
if (!dstPoint.isActualOnDeviceSide()) {
// log.info("Relocating to GPU");
relocate(AllocationStatus.HOST, AllocationStatus.DEVICE, dstPoint, dstPoint.getShape(), context);
} else {
// log.info("Buffer is actual on device side: " + dstPoint.getShape());
}
}
// else log.info("Not on [DEVICE]");
// we update the memory use counter to announce that the buffer is being used on the device
dstPoint.tickDeviceRead();
// return pointer with offset if needed. length is specified for constructor compatibility purposes
CudaPointer p = new CudaPointer(dstPoint.getPointers().getDevicePointer(), buffer.length(), (buffer.offset() * buffer.getElementSize()));
switch(buffer.dataType()) {
case DOUBLE:
return p.asDoublePointer();
case FLOAT:
return p.asFloatPointer();
case INT:
return p.asIntPointer();
case HALF:
return p.asShortPointer();
default:
return p;
}
}
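For illustration, a sketch of how getDevicePointer might be used to obtain a launch-ready, typed device pointer for a buffer. The handler and context parameters are assumptions standing in for the allocator's internal wiring.

import org.bytedeco.javacpp.FloatPointer;
import org.bytedeco.javacpp.Pointer;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.jcublas.context.CudaContext;

public class DevicePointerSketch {
    // handler and context are assumed to come from the active allocator (hypothetical wiring)
    static FloatPointer devicePointerFor(CudaZeroHandler handler, CudaContext context) {
        DataBuffer buffer = Nd4j.createBuffer(new float[] {1f, 2f, 3f});
        // relocates the host copy to the device if it is stale there, ticks the
        // device-read counter and returns a pointer typed to the buffer's data type
        Pointer devicePtr = handler.getDevicePointer(buffer, context);
        // for a FLOAT buffer the concrete type returned is FloatPointer
        return (FloatPointer) devicePtr;
    }
}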
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method relocateObject.
@Override
public synchronized void relocateObject(DataBuffer buffer) {
AllocationPoint dstPoint = AtomicAllocator.getInstance().getAllocationPoint(buffer);
// we don't relocate non-DEVICE buffers (i.e. HOST or CONSTANT)
if (dstPoint.getAllocationStatus() != AllocationStatus.DEVICE)
return;
int deviceId = getDeviceId();
if (dstPoint.getDeviceId() >= 0 && dstPoint.getDeviceId() == deviceId) {
return;
}
// FIXME: cross-thread access, might cause problems
if (!dstPoint.isActualOnHostSide())
AtomicAllocator.getInstance().synchronizeHostData(buffer);
if (!dstPoint.isActualOnHostSide())
throw new RuntimeException("Buffer synchronization failed");
if (buffer.isAttached() || dstPoint.isAttached()) {
// if this buffer is attached, we just relocate it to the new workspace
MemoryWorkspace workspace = Nd4j.getMemoryManager().getCurrentWorkspace();
if (workspace == null) {
// if we're out of workspace, we should mark our buffer as detached, so gc will pick it up eventually
alloc(AllocationStatus.DEVICE, dstPoint, dstPoint.getShape(), false);
CudaContext context = getCudaContext();
// copy direction 1 == cudaMemcpyHostToDevice
if (nativeOps.memcpyAsync(dstPoint.getDevicePointer(), dstPoint.getHostPointer(), buffer.length() * buffer.getElementSize(), 1, context.getSpecialStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
context.syncSpecialStream();
// updating host pointer now
alloc(AllocationStatus.HOST, dstPoint, dstPoint.getShape(), false);
// marking it as detached
dstPoint.setAttached(false);
// marking it as proper on device
dstPoint.tickHostRead();
dstPoint.tickDeviceWrite();
} else {
// this call will automatically take care of workspaces, so the replacement buffer is allocated within the current workspace
// log.info("Relocating to deviceId [{}], workspace [{}]...", deviceId, workspace.getId());
BaseCudaDataBuffer nBuffer = (BaseCudaDataBuffer) Nd4j.createBuffer(buffer.length());
Nd4j.getMemoryManager().memcpy(nBuffer, buffer);
dstPoint.getPointers().setDevicePointer(nBuffer.getAllocationPoint().getDevicePointer());
dstPoint.getPointers().setHostPointer(nBuffer.getAllocationPoint().getHostPointer());
dstPoint.setDeviceId(deviceId);
dstPoint.tickDeviceRead();
dstPoint.tickHostRead();
}
return;
}
if (buffer.isConstant()) {
// constant buffers can't be relocated or modified
throw new RuntimeException("Can't relocateObject() for constant buffer");
} else {
// log.info("Free relocateObject: deviceId: {}, pointer: {}", deviceId, dstPoint.getPointers().getDevicePointer().address());
memoryProvider.free(dstPoint);
deviceMemoryTracker.subFromAllocation(Thread.currentThread().getId(), dstPoint.getDeviceId(), AllocationUtils.getRequiredMemory(dstPoint.getShape()));
// we replace original device pointer with new one
alloc(AllocationStatus.DEVICE, dstPoint, dstPoint.getShape(), false);
// log.info("Pointer after alloc: {}", dstPoint.getPointers().getDevicePointer().address());
CudaContext context = getCudaContext();
// copy direction 1 == cudaMemcpyHostToDevice
if (nativeOps.memcpyAsync(dstPoint.getDevicePointer(), dstPoint.getHostPointer(), buffer.length() * buffer.getElementSize(), 1, context.getSpecialStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
context.syncSpecialStream();
dstPoint.tickDeviceRead();
dstPoint.tickHostRead();
}
}
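A short sketch of relocateObject from a caller's perspective: it is a no-op unless the buffer's device memory lives on a different device than the calling thread's, and constant buffers are rejected. The handler reference is an assumption; AtomicAllocator is used only to inspect the result.

import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class RelocateSketch {
    // 'handler' stands in for the active CudaZeroHandler (assumption)
    static int moveToCurrentDevice(CudaZeroHandler handler) {
        INDArray array = Nd4j.create(16);
        DataBuffer buffer = array.data();
        // moves the device allocation (and workspace attachment, if any) to the
        // calling thread's device; throws for constant buffers
        handler.relocateObject(buffer);
        // afterwards the allocation point reports the calling thread's device
        return AtomicAllocator.getInstance().getAllocationPoint(buffer).getDeviceId();
    }
}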
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method memcpyAsync.
/**
* Asynchronous version of memcpy.
*
* PLEASE NOTE: This is a device-dependent method; if it's not supported in your environment, a blocking call will be used instead.
*
* @param dstBuffer destination buffer
* @param srcPointer host-side source pointer
* @param length number of bytes to copy
* @param dstOffset byte offset into the destination buffer
*/
@Override
public void memcpyAsync(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
AllocationPoint point = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
// we update host memory regardless.
// Pointer dP = new Pointer((point.getAllocationStatus() == AllocationStatus.DEVICE ? point.getPointers().getDevicePointer().address() : point.getPointers().getHostPointer().address()) + dstOffset);
Pointer dP = new CudaPointer((point.getPointers().getHostPointer().address()) + dstOffset);
// Pointer sP = new Pointer(srcPointer.getNativePointer());
// log.info("Location: " + point.getAllocationStatus());
// if (length > 4)
// log.info("memcpyAsync: ["+ srcPointer.getNativePointer()+"] -> ["+ dP.getNativePointer()+"], length: [" + length+ "], offset: ["+ dstOffset+"], dstBufferOffset: ["+(dstBuffer.getElementSize() * dstBuffer.offset()) + "/" + dstBuffer.offset() +"]");
CudaContext tContext = null;
if (dstBuffer.isConstant()) {
org.bytedeco.javacpp.Pointer dstPointer = new CudaPointer(point.getPointers().getHostPointer().address() + dstOffset, 0L);
org.bytedeco.javacpp.Pointer srcPointerJ = new CudaPointer(srcPointer, length);
// log.info("JCPP Memcpy: [{}] -> [{}], length: [{}]", srcPointerJ.address(), dstPointer.address(), length);
org.bytedeco.javacpp.Pointer.memcpy(dstPointer, srcPointerJ, length);
point.tickHostRead();
} else {
// log.info("Memcpy pointers: [{}] -> [{}]", srcPointer.address(), dP.address());
CudaContext context = flowController.prepareAction(point);
tContext = context;
if (nativeOps.memcpyAsync(dP, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, context.getSpecialStream()) == 0)
throw new IllegalStateException("MemcpyAsync H2H failed: [" + srcPointer.address() + "] -> [" + dP.address() + "]");
flowController.commitTransfer(tContext.getSpecialStream());
if (point.getAllocationStatus() == AllocationStatus.HOST)
flowController.registerAction(context, point);
}
// if we're copying something into host memory but the buffer lives on the device, we need to mirror the copy to the device as well
if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
// TODO: this sounds wrong, and memcpy should probably check the initial direction, like relocate did before
Pointer rDP = new CudaPointer(point.getPointers().getDevicePointer().address() + dstOffset);
if (tContext == null)
tContext = flowController.prepareAction(point);
if (nativeOps.memcpyAsync(rDP, dP, length, CudaConstants.cudaMemcpyHostToDevice, tContext.getSpecialStream()) == 0)
throw new IllegalStateException("MemcpyAsync H2D failed: [" + dP.address() + "] -> [" + rDP.address() + "]");
flowController.commitTransfer(tContext.getSpecialStream());
flowController.registerAction(tContext, point);
}
point.tickDeviceWrite();
}
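To round this off, a minimal sketch of an asynchronous host-to-buffer copy through memcpyAsync, writing four floats at a byte offset into an eight-element buffer. The handler reference and the offset arithmetic are assumptions made for the sketch.

import org.bytedeco.javacpp.FloatPointer;
import org.nd4j.jita.allocator.pointers.CudaPointer;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

public class MemcpyAsyncSketch {
    // 'handler' stands in for the active CudaZeroHandler (assumption)
    static void copyIntoTail(CudaZeroHandler handler) {
        DataBuffer dst = Nd4j.createBuffer(new float[8]);
        // host-side source: 4 floats, to land at element offset 4 of dst
        FloatPointer hostData = new FloatPointer(1f, 2f, 3f, 4f);
        CudaPointer src = new CudaPointer(hostData.address());
        long bytes = 4L * dst.getElementSize();
        long dstOffsetInBytes = 4L * dst.getElementSize();
        // transfers are issued on the special stream; commitTransfer() inside the
        // method makes the host update visible before the optional device mirror copy
        handler.memcpyAsync(dst, src, bytes, dstOffsetInBytes);
    }
}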