Search in sources:

Example 31 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class ProtectedCudaConstantHandler method moveToConstantSpace.

/**
 * Moves the specified dataBuffer to CUDA constant memory space, falling back to
 * regular device memory when the buffer does not fit into constant space.
 *
 * PLEASE NOTE: CUDA constant memory is limited to 48KB per device (original note;
 * the bound actually enforced by this method is MAX_CONSTANT_LENGTH, and a single
 * buffer may not exceed MAX_BUFFER_LENGTH).
 *
 * @param dataBuffer buffer whose host-side content is copied to the device
 * @return the constant-space address of the buffer (device base address + offset),
 *         or 0 when the buffer was copied to regular device memory instead
 * @throws ND4JIllegalStateException if memcpyAsync reports failure (returns 0) on the fallback path
 */
@Override
public synchronized long moveToConstantSpace(DataBuffer dataBuffer) {
    Integer deviceId = AtomicAllocator.getInstance().getDeviceId();
    ensureMaps(deviceId);
    AllocationPoint point = AtomicAllocator.getInstance().getAllocationPoint(dataBuffer);
    long requiredMemoryBytes = AllocationUtils.getRequiredMemory(point.getShape());
    long currentOffset = constantOffsets.get(deviceId).get();
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

    // Not enough room left in constant space, or the buffer is too large on its own:
    // keep the data in regular device memory.
    if (currentOffset + requiredMemoryBytes >= MAX_CONSTANT_LENGTH || requiredMemoryBytes > MAX_BUFFER_LENGTH) {
        return copyToDeviceMemoryInstead(dataBuffer, point, requiredMemoryBytes, context, deviceId);
    }

    // Number of bytes reserved in constant space; may exceed requiredMemoryBytes due to padding.
    long bytes = requiredMemoryBytes;
    if (dataBuffer.dataType() == DataBuffer.Type.HALF) {
        // hack for misalignment avoidance for 16bit data type: pad up to a multiple of 4 bytes
        if (bytes % 4 != 0) {
            bytes += 2;
        }
    } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE || dataBuffer.dataType() == DataBuffer.Type.LONG) {
        // NOTE(review): the DOUBLE check reads the global Nd4j data type while the LONG check
        // reads the buffer's own type - confirm this asymmetry is intentional.
        // 8-byte types: reserve an even number of 4-byte words
        long div = bytes / 4;
        if (div % 2 != 0) {
            bytes += 4;
        }
        // for possible changes of dtype in the same jvm, skip 4-byte words in constant memory
        // until the starting offset is an even number of words
        div = currentOffset / 4;
        while (div % 2 != 0) {
            currentOffset = constantOffsets.get(deviceId).addAndGet(4);
            div = currentOffset / 4;
            // just break out, if we've stepped beyond constant memory space
            if (currentOffset > MAX_CONSTANT_LENGTH) {
                break;
            }
        }
    }

    // Reserve the region; the value returned is the offset at which this buffer starts.
    currentOffset = constantOffsets.get(deviceId).getAndAdd(bytes);
    if (currentOffset >= MAX_CONSTANT_LENGTH) {
        // alignment skipping pushed us past the end of constant space
        return copyToDeviceMemoryInstead(dataBuffer, point, requiredMemoryBytes, context, deviceId);
    }

    // flag 1 is presumably cudaMemcpyHostToDevice - TODO confirm against CudaConstants
    NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyConstantAsync(currentOffset, point.getPointers().getHostPointer(), requiredMemoryBytes, 1, context.getSpecialStream());
    flowController.commitTransfer(context.getSpecialStream());
    long cAddr = deviceAddresses.get(deviceId).address() + currentOffset;

    point.setAllocationStatus(AllocationStatus.CONSTANT);
    point.getPointers().setDevicePointer(new CudaPointer(cAddr));
    point.setConstant(true);
    point.tickDeviceWrite();
    point.setDeviceId(deviceId);
    point.tickHostRead();
    protector.persistDataBuffer(dataBuffer);
    return cAddr;
}

/**
 * Fallback used when the buffer cannot be placed in constant space: copies the host
 * content into the allocation point's device pointer, marks the point as constant and
 * registers the buffer with the protector.
 *
 * @return always 0, the value moveToConstantSpace reports when constant space was not used
 * @throws ND4JIllegalStateException if memcpyAsync returns 0
 */
private long copyToDeviceMemoryInstead(DataBuffer dataBuffer, AllocationPoint point, long requiredMemoryBytes, CudaContext context, Integer deviceId) {
    // With the DELAYED memory model a HOST-only point has to get device memory allocated first.
    if (point.getAllocationStatus() == AllocationStatus.HOST && CudaEnvironment.getInstance().getConfiguration().getMemoryModel() == Configuration.MemoryModel.DELAYED) {
        AtomicAllocator.getInstance().getMemoryHandler().alloc(AllocationStatus.DEVICE, point, point.getShape(), false);
    }
    if (NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(point.getPointers().getDevicePointer(), point.getPointers().getHostPointer(), requiredMemoryBytes, 1, context.getSpecialStream()) == 0) {
        throw new ND4JIllegalStateException("memcpyAsync failed");
    }
    flowController.commitTransfer(context.getSpecialStream());
    point.setConstant(true);
    point.tickDeviceWrite();
    point.tickHostRead();
    point.setDeviceId(deviceId);
    protector.persistDataBuffer(dataBuffer);
    return 0;
}
Also used : CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 32 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class SynchronousFlowController method prepareAction.

/**
 * Prepares the result array and all operands for an operation on the current device.
 *
 * For every non-null array: it is auto-decompressed, its delayed memory is prepared, its data
 * allocation point is locked, its data buffer is relocated when it lives on another device and
 * cross-device access is either disallowed or P2P is unavailable, its shape-info buffer is
 * relocated when it lives on another device, and the current context is stored on its
 * allocation point.
 *
 * @param result   array the operation writes to; may be null
 * @param operands arrays the operation reads from; null entries are skipped
 * @return the CUDA context obtained from the allocator's device context
 */
@Override
public CudaContext prepareAction(INDArray result, INDArray... operands) {
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    int currentDevice = allocator.getDeviceId();

    if (result != null) {
        Nd4j.getCompressor().autoDecompress(result);
        // for the result, delayed memory is prepared before allocation points are looked up
        prepareDelayedMemory(result);

        AllocationPoint dataPoint = allocator.getAllocationPoint(result);
        AllocationPoint shapePoint = allocator.getAllocationPoint(result.shapeInfoDataBuffer());
        dataPoint.acquireLock();

        if (dataPoint.getDeviceId() != currentDevice && dataPoint.getDeviceId() >= 0) {
            // relocate unless peer access is both allowed and available
            if (!(CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && NativeOpsHolder.getInstance().getDeviceNativeOps().isP2PAvailable())) {
                DataBuffer target;
                if (result.data().originalDataBuffer() == null) {
                    target = result.data();
                } else {
                    target = result.data().originalDataBuffer();
                }
                allocator.getMemoryHandler().relocateObject(target);
            }
        }

        if (shapePoint.getDeviceId() != currentDevice && shapePoint.getDeviceId() >= 0) {
            ((JCublasNDArray) result).setShapeInfoDataBuffer(Nd4j.getConstantHandler().relocateConstantSpace(result.shapeInfoDataBuffer()));
        }

        allocator.getAllocationPoint(result).setCurrentContext(ctx);
    }

    for (int i = 0; i < operands.length; i++) {
        INDArray operand = operands[i];
        if (operand != null) {
            Nd4j.getCompressor().autoDecompress(operand);

            AllocationPoint dataPoint = allocator.getAllocationPoint(operand);
            AllocationPoint shapePoint = allocator.getAllocationPoint(operand.shapeInfoDataBuffer());
            dataPoint.acquireLock();

            if (dataPoint.getDeviceId() != currentDevice && dataPoint.getDeviceId() >= 0) {
                // relocate unless peer access is both allowed and available
                if (!(CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && NativeOpsHolder.getInstance().getDeviceNativeOps().isP2PAvailable())) {
                    DataBuffer target;
                    if (operand.data().originalDataBuffer() == null) {
                        target = operand.data();
                    } else {
                        target = operand.data().originalDataBuffer();
                    }
                    allocator.getMemoryHandler().relocateObject(target);
                }
            }

            if (shapePoint.getDeviceId() != currentDevice && shapePoint.getDeviceId() >= 0) {
                ((JCublasNDArray) operand).setShapeInfoDataBuffer(Nd4j.getConstantHandler().relocateConstantSpace(operand.shapeInfoDataBuffer()));
            }

            // for operands, delayed memory is prepared after any relocation
            prepareDelayedMemory(operand);
            allocator.getAllocationPoint(operand).setCurrentContext(ctx);
        }
    }

    return ctx;
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) JCublasNDArray(org.nd4j.linalg.jcublas.JCublasNDArray) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Example 33 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class CudaZeroHandler method promoteObject.

/**
 * This method moves specific object from zero-copy memory to device memory
 *
 * PLEASE NOTE:  DO NOT EVER USE THIS METHOD MANUALLY, UNLESS YOU 100% HAVE TO
 *
 * @param buffer buffer whose allocation point should be promoted
 * @return false if the buffer's allocation status is not HOST; true otherwise
 *         (including the case where the memory model is not DELAYED and nothing is moved)
 * @throws RuntimeException if the memory provider returns no pointers for the device allocation
 */
@Override
public boolean promoteObject(DataBuffer buffer) {
    AllocationPoint dstPoint = AtomicAllocator.getInstance().getAllocationPoint(buffer);
    if (dstPoint.getAllocationStatus() != AllocationStatus.HOST) {
        return false;
    }

    // Only the DELAYED memory model promotes here. The status is re-read, exactly as the
    // original condition did, before any allocation is attempted.
    if (configuration.getMemoryModel() != Configuration.MemoryModel.DELAYED || dstPoint.getAllocationStatus() != AllocationStatus.HOST) {
        return true;
    }

    // if we have constant buffer (aka shapeInfo or other constant stuff)
    if (buffer.isConstant()) {
        Nd4j.getConstantHandler().moveToConstantSpace(buffer);
        return true;
    }

    PointersPair pair = memoryProvider.malloc(dstPoint.getShape(), dstPoint, AllocationStatus.DEVICE);
    if (pair == null) {
        throw new RuntimeException("Failed to allocate device memory while promoting object [" + dstPoint.getObjectId() + "] from host to device");
    }

    Integer deviceId = getDeviceId();
    dstPoint.getPointers().setDevicePointer(pair.getDevicePointer());
    dstPoint.setAllocationStatus(AllocationStatus.DEVICE);
    // move the object from zero-copy bookkeeping to per-device bookkeeping
    deviceAllocations.get(deviceId).put(dstPoint.getObjectId(), dstPoint.getObjectId());
    zeroAllocations.get(dstPoint.getBucketId()).remove(dstPoint.getObjectId());
    deviceMemoryTracker.addToAllocation(Thread.currentThread().getId(), deviceId, AllocationUtils.getRequiredMemory(dstPoint.getShape()));
    dstPoint.tickHostWrite();
    return true;
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) PointersPair(org.nd4j.jita.allocator.pointers.PointersPair) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)

Example 34 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class CudaZeroHandler method memcpyAsync.

/**
 * Asynchronous version of memcpy: copies {@code length} bytes from {@code srcPointer} into the
 * host memory of {@code dstBuffer}, and mirrors the copy to device memory when the buffer's
 * allocation status is DEVICE.
 *
 * PLEASE NOTE: This is device-dependent method, if it's not supported in your environment, blocking call will be used instead.
 *
 * @param dstBuffer  destination buffer; it is cast to BaseCudaDataBuffer to obtain its allocation point
 * @param srcPointer source of the data
 * @param length     amount of data to copy, passed unchanged to the memcpy calls (presumably bytes - confirm)
 * @param dstOffset  offset added directly to the raw destination addresses (so presumably bytes - confirm)
 * @throws IllegalStateException if a nativeOps.memcpyAsync call returns 0
 */
@Override
public void memcpyAsync(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
    AllocationPoint point = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
    // we update host memory regardless.
    // Pointer dP = new Pointer((point.getAllocationStatus() == AllocationStatus.DEVICE ? point.getPointers().getDevicePointer().address() : point.getPointers().getHostPointer().address()) + dstOffset);
    // dP is the host-side destination: host pointer address plus dstOffset
    Pointer dP = new CudaPointer((point.getPointers().getHostPointer().address()) + dstOffset);
    // Pointer sP = new Pointer(srcPointer.getNativePointer());
    // log.info("Location: " + point.getAllocationStatus());
    // if (length > 4)
    // log.info("memcpyAsync:  ["+ srcPointer.getNativePointer()+"] -> ["+ dP.getNativePointer()+"], length: [" + length+ "], offset: ["+ dstOffset+"], dstBufferOffset: ["+(dstBuffer.getElementSize() * dstBuffer.offset()) + "/" + dstBuffer.offset() +"]");
    // context is only obtained on the paths that issue a stream copy; stays null for the constant-buffer host copy
    CudaContext tContext = null;
    if (dstBuffer.isConstant()) {
        // constant buffers: host copy goes through javacpp's Pointer.memcpy, no CUDA stream is used in this branch
        org.bytedeco.javacpp.Pointer dstPointer = new CudaPointer(point.getPointers().getHostPointer().address() + dstOffset, 0L);
        org.bytedeco.javacpp.Pointer srcPointerJ = new CudaPointer(srcPointer, length);
        // log.info("JCPP Memcpy: [{}] -> [{}], length: [{}]", srcPointerJ.address(), dstPointer.address(), length);
        org.bytedeco.javacpp.Pointer.memcpy(dstPointer, srcPointerJ, length);
        point.tickHostRead();
    } else {
        // non-constant buffers: host-to-host copy issued on the context's special stream
        // log.info("Memcpy pointers: [{}] -> [{}]", srcPointer.address(),  dP.address());
        CudaContext context = flowController.prepareAction(point);
        tContext = context;
        // a return value of 0 is treated as failure
        if (nativeOps.memcpyAsync(dP, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, context.getSpecialStream()) == 0)
            throw new IllegalStateException("MemcpyAsync H2H failed: [" + srcPointer.address() + "] -> [" + dP.address() + "]");
        flowController.commitTransfer(tContext.getSpecialStream());
        // for HOST-status points the action is registered here; DEVICE-status points register below
        if (point.getAllocationStatus() == AllocationStatus.HOST)
            flowController.registerAction(context, point);
    }
    // if we're copying something into host memory, but we're on device - we need to provide exact copy to device as well
    if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
        // TODO: this sounds wrong, and probably memcpy whould check initial direction, like relocate did before
        // rDP is the device-side destination: device pointer address plus the same dstOffset
        Pointer rDP = new CudaPointer(point.getPointers().getDevicePointer().address() + dstOffset);
        // the constant-buffer branch above did not obtain a context, so obtain one now
        if (tContext == null)
            tContext = flowController.prepareAction(point);
        // the just-written host region (dP) is the source of the host-to-device copy
        if (nativeOps.memcpyAsync(rDP, dP, length, CudaConstants.cudaMemcpyHostToDevice, tContext.getSpecialStream()) == 0)
            throw new IllegalStateException("MemcpyAsync H2D failed: [" + dP.address() + "] -> [" + rDP.address() + "]");
        flowController.commitTransfer(tContext.getSpecialStream());
        flowController.registerAction(tContext, point);
    }
    // executed on every path, regardless of allocation status
    point.tickDeviceWrite();
}
Also used : ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) Pointer(org.bytedeco.javacpp.Pointer) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) BaseCudaDataBuffer(org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) Pointer(org.bytedeco.javacpp.Pointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 35 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class CudaZeroHandler method getHostPointer.

/**
 * PLEASE NOTE: This method always returns pointer within OS memory space
 *
 * @param buffer
 * @return
 */
@Override
public org.bytedeco.javacpp.Pointer getHostPointer(DataBuffer buffer) {
    AllocationPoint dstPoint = ((BaseCudaDataBuffer) buffer).getAllocationPoint();
    // return pointer with offset if needed. length is specified for constructor compatibility purposes
    if (dstPoint.getPointers().getHostPointer() == null) {
        log.info("DevicePointer: " + dstPoint.getPointers().getDevicePointer());
        log.info("HostPointer: " + dstPoint.getPointers().getHostPointer());
        log.info("AllocStatus: " + dstPoint.getAllocationStatus());
        throw new RuntimeException("pointer is null");
    }
    // dstPoint.tickHostWrite();
    // dstPoint.tickHostRead();
    // log.info("Requesting host pointer for {}", buffer);
    // getCudaContext().syncOldStream();
    synchronizeThreadDevice(Thread.currentThread().getId(), dstPoint.getDeviceId(), dstPoint);
    CudaPointer p = new CudaPointer(dstPoint.getPointers().getHostPointer(), buffer.length(), (buffer.offset() * buffer.getElementSize()));
    switch(buffer.dataType()) {
        case DOUBLE:
            return p.asDoublePointer();
        case FLOAT:
            return p.asFloatPointer();
        case INT:
            return p.asIntPointer();
        case HALF:
            return p.asShortPointer();
        default:
            return p;
    }
}
Also used : BaseCudaDataBuffer(org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Aggregations

AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint): 67; INDArray (org.nd4j.linalg.api.ndarray.INDArray): 33; Test (org.junit.Test): 31; CudaContext (org.nd4j.linalg.jcublas.context.CudaContext): 24; CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer): 15; DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer): 11; ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException): 11; AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator): 7; BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer): 7; Pointer (org.bytedeco.javacpp.Pointer): 6; AllocationShape (org.nd4j.jita.allocator.impl.AllocationShape): 5; PointersPair (org.nd4j.jita.allocator.pointers.PointersPair): 5; MemoryWorkspace (org.nd4j.linalg.api.memory.MemoryWorkspace): 4; JCublasNDArray (org.nd4j.linalg.jcublas.JCublasNDArray): 3; CudaDoubleDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer): 3; CompressedDataBuffer (org.nd4j.linalg.compression.CompressedDataBuffer): 2; DeviceLocalNDArray (org.nd4j.linalg.util.DeviceLocalNDArray): 2; DataInputStream (java.io.DataInputStream): 1; DataOutputStream (java.io.DataOutputStream): 1; FileInputStream (java.io.FileInputStream): 1