use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class ProtectedCudaConstantHandler method moveToConstantSpace.
/**
 * Moves the specified dataBuffer to CUDA constant memory space of the current device.
 *
 * PLEASE NOTE: CUDA constant memory is limited to 48KB per device. When the buffer does not fit
 * into the remaining constant space (or exceeds MAX_BUFFER_LENGTH), its content is copied to
 * regular device memory instead, and the buffer is only flagged as constant.
 *
 * @param dataBuffer buffer that should be placed into constant memory
 * @return address of the buffer inside constant memory, or 0 if the buffer was stored in regular
 *         device memory instead
 * @throws ND4JIllegalStateException if the fallback copy to device memory fails
 */
@Override
public synchronized long moveToConstantSpace(DataBuffer dataBuffer) {
    // now, we move things to constant memory
    Integer deviceId = AtomicAllocator.getInstance().getDeviceId();
    ensureMaps(deviceId);

    AllocationPoint point = AtomicAllocator.getInstance().getAllocationPoint(dataBuffer);
    long requiredMemoryBytes = AllocationUtils.getRequiredMemory(point.getShape());

    long currentOffset = constantOffsets.get(deviceId).get();
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

    // early bail-out: buffer is too large, or constant space is (almost) exhausted
    if (currentOffset + requiredMemoryBytes >= MAX_CONSTANT_LENGTH || requiredMemoryBytes > MAX_BUFFER_LENGTH) {
        return storeInDeviceMemory(dataBuffer, point, requiredMemoryBytes, deviceId, context);
    }

    long bytes = requiredMemoryBytes;
    // hack for misalignment avoidance for 16bit data opType
    if (dataBuffer.dataType() == DataBuffer.Type.HALF) {
        if (bytes % 4 != 0) {
            bytes += 2;
        }
    } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE || dataBuffer.dataType() == DataBuffer.Type.LONG) {
        // for double data opType, we must be assured, that all DOUBLE pointers are starting from even addresses, to avoid banks spills
        long div = bytes / 4;
        if (div % 2 != 0)
            bytes += 4;

        // for possible changes of dtype in the same jvm, we skip few bytes in constant memory
        div = currentOffset / 4;
        while (div % 2 != 0) {
            currentOffset = constantOffsets.get(deviceId).addAndGet(4);
            div = currentOffset / 4;

            // just break out, if we're stepped beyond constant memory space
            if (currentOffset > MAX_CONSTANT_LENGTH)
                break;
        }
    }

    // reserve the (padded) region; the returned value is the start offset of this buffer
    currentOffset = constantOffsets.get(deviceId).getAndAdd(bytes);

    // The alignment padding above may have moved the start offset past the value validated by the
    // early check, so the END of the region is validated here (not only its start). Otherwise the
    // constant-memory copy below could write beyond MAX_CONSTANT_LENGTH.
    if (currentOffset + requiredMemoryBytes >= MAX_CONSTANT_LENGTH) {
        return storeInDeviceMemory(dataBuffer, point, requiredMemoryBytes, deviceId, context);
    }

    NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyConstantAsync(currentOffset, point.getPointers().getHostPointer(), requiredMemoryBytes, 1, context.getSpecialStream());
    flowController.commitTransfer(context.getSpecialStream());

    long cAddr = deviceAddresses.get(deviceId).address() + currentOffset;

    point.setAllocationStatus(AllocationStatus.CONSTANT);
    point.getPointers().setDevicePointer(new CudaPointer(cAddr));
    point.setConstant(true);
    point.tickDeviceWrite();
    point.setDeviceId(deviceId);
    point.tickHostRead();

    protector.persistDataBuffer(dataBuffer);

    return cAddr;
}

/**
 * Fallback used when a buffer cannot be placed into constant memory: copies the host content of
 * the buffer to its device pointer, flags the allocation point as constant and persists the buffer.
 *
 * @param dataBuffer buffer being stored
 * @param point allocation point of the buffer
 * @param requiredMemoryBytes number of bytes to copy
 * @param deviceId id of the device the buffer gets bound to
 * @param context context whose special stream is used for the transfer
 * @return always 0, meaning "not stored in constant memory"
 * @throws ND4JIllegalStateException if the copy to device memory fails
 */
private long storeInDeviceMemory(DataBuffer dataBuffer, AllocationPoint point, long requiredMemoryBytes, Integer deviceId, CudaContext context) {
    // under DELAYED memory model the buffer may still be in HOST state, so device memory is requested first
    if (point.getAllocationStatus() == AllocationStatus.HOST && CudaEnvironment.getInstance().getConfiguration().getMemoryModel() == Configuration.MemoryModel.DELAYED) {
        AtomicAllocator.getInstance().getMemoryHandler().alloc(AllocationStatus.DEVICE, point, point.getShape(), false);
    }

    if (NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(point.getPointers().getDevicePointer(), point.getPointers().getHostPointer(), requiredMemoryBytes, 1, context.getSpecialStream()) == 0) {
        throw new ND4JIllegalStateException("memcpyAsync failed");
    }
    flowController.commitTransfer(context.getSpecialStream());

    point.setConstant(true);
    point.tickDeviceWrite();
    point.tickHostRead();
    point.setDeviceId(deviceId);

    protector.persistDataBuffer(dataBuffer);

    return 0;
}
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class SynchronousFlowController method prepareAction.
/**
 * Prepares the result array and every non-null operand for an upcoming operation
 * on the device reported by the allocator.
 *
 * For each array: it is auto-decompressed, the lock of its data allocation point is acquired,
 * its data buffer is handed to the memory handler for relocation when it is bound to another
 * device and direct cross-device access is not usable, its shapeInfo buffer is relocated through
 * the constant handler when bound to another device, and the current context is stored in its
 * allocation point.
 *
 * @param result array that receives the outcome of the operation, may be null
 * @param operands arrays taking part in the operation; null entries are skipped
 * @return context obtained from the allocator for the current device
 */
@Override
public CudaContext prepareAction(INDArray result, INDArray... operands) {
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    int currentDevice = allocator.getDeviceId();

    if (result != null) {
        Nd4j.getCompressor().autoDecompress(result);
        // for the result array, delayed memory is prepared BEFORE any relocation happens
        prepareDelayedMemory(result);

        AllocationPoint dataPoint = allocator.getAllocationPoint(result);
        AllocationPoint shapePoint = allocator.getAllocationPoint(result.shapeInfoDataBuffer());

        dataPoint.acquireLock();

        if (dataPoint.getDeviceId() != currentDevice && dataPoint.getDeviceId() >= 0) {
            // direct access requires both: being allowed by configuration AND P2P being available
            boolean directAccess = CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()
                            && NativeOpsHolder.getInstance().getDeviceNativeOps().isP2PAvailable();
            if (!directAccess) {
                DataBuffer data = result.data();
                DataBuffer original = data.originalDataBuffer();
                allocator.getMemoryHandler().relocateObject(original == null ? data : original);
            }
        }

        if (shapePoint.getDeviceId() != currentDevice && shapePoint.getDeviceId() >= 0) {
            DataBuffer relocatedShape = Nd4j.getConstantHandler().relocateConstantSpace(result.shapeInfoDataBuffer());
            ((JCublasNDArray) result).setShapeInfoDataBuffer(relocatedShape);
        }

        allocator.getAllocationPoint(result).setCurrentContext(ctx);
    }

    for (int i = 0; i < operands.length; i++) {
        INDArray operand = operands[i];
        if (operand != null) {
            Nd4j.getCompressor().autoDecompress(operand);

            AllocationPoint dataPoint = allocator.getAllocationPoint(operand);
            AllocationPoint shapePoint = allocator.getAllocationPoint(operand.shapeInfoDataBuffer());

            dataPoint.acquireLock();

            if (dataPoint.getDeviceId() != currentDevice && dataPoint.getDeviceId() >= 0) {
                boolean directAccess = CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()
                                && NativeOpsHolder.getInstance().getDeviceNativeOps().isP2PAvailable();
                if (!directAccess) {
                    DataBuffer data = operand.data();
                    DataBuffer original = data.originalDataBuffer();
                    allocator.getMemoryHandler().relocateObject(original == null ? data : original);
                }
            }

            if (shapePoint.getDeviceId() != currentDevice && shapePoint.getDeviceId() >= 0) {
                DataBuffer relocatedShape = Nd4j.getConstantHandler().relocateConstantSpace(operand.shapeInfoDataBuffer());
                ((JCublasNDArray) operand).setShapeInfoDataBuffer(relocatedShape);
            }

            // for operands, delayed memory is prepared AFTER relocation
            prepareDelayedMemory(operand);
            allocator.getAllocationPoint(operand).setCurrentContext(ctx);
        }
    }

    return ctx;
}
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class CudaZeroHandler method promoteObject.
/**
 * This method moves specific object from zero-copy memory to device memory.
 *
 * Nothing is moved unless the configured memory model is DELAYED. Constant buffers
 * (aka shapeInfo or other constant stuff) are handed over to the constant handler,
 * all other buffers get freshly allocated device memory.
 *
 * PLEASE NOTE: DO NOT EVER USE THIS METHOD MANUALLY, UNLESS YOU 100% HAVE TO
 *
 * @param buffer buffer whose allocation should be promoted
 * @return false if the buffer is not in HOST state; true otherwise, including the case where the
 *         memory model is not DELAYED and therefore nothing was moved
 * @throws IllegalStateException if device memory for the buffer could not be allocated
 */
@Override
public boolean promoteObject(DataBuffer buffer) {
    AllocationPoint dstPoint = AtomicAllocator.getInstance().getAllocationPoint(buffer);

    // only objects residing in host memory can be promoted
    if (dstPoint.getAllocationStatus() != AllocationStatus.HOST)
        return false;

    if (configuration.getMemoryModel() != Configuration.MemoryModel.DELAYED)
        return true;

    // if we have constant buffer (aka shapeInfo or other constant stuff)
    if (buffer.isConstant()) {
        Nd4j.getConstantHandler().moveToConstantSpace(buffer);
        return true;
    }

    PointersPair pair = memoryProvider.malloc(dstPoint.getShape(), dstPoint, AllocationStatus.DEVICE);
    if (pair == null) {
        throw new IllegalStateException("Failed to allocate device memory while promoting object [" + dstPoint.getObjectId() + "]");
    }

    Integer deviceId = getDeviceId();

    dstPoint.getPointers().setDevicePointer(pair.getDevicePointer());
    dstPoint.setAllocationStatus(AllocationStatus.DEVICE);

    // move the object from zero-copy bookkeeping to device bookkeeping
    deviceAllocations.get(deviceId).put(dstPoint.getObjectId(), dstPoint.getObjectId());
    zeroAllocations.get(dstPoint.getBucketId()).remove(dstPoint.getObjectId());
    deviceMemoryTracker.addToAllocation(Thread.currentThread().getId(), deviceId, AllocationUtils.getRequiredMemory(dstPoint.getShape()));

    dstPoint.tickHostWrite();

    return true;
}
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class CudaZeroHandler method memcpyAsync.
/**
 * Asynchronous version of memcpy: copies {@code length} bytes from srcPointer into the host
 * memory of dstBuffer, starting {@code dstOffset} bytes from its beginning. If the buffer is
 * in DEVICE state, the same region is then copied from host memory to its device memory too.
 *
 * PLEASE NOTE: per the original contract this is a device-dependent method; if it's not
 * supported in your environment, a blocking call will be used instead.
 *
 * @param dstBuffer buffer that receives the data
 * @param srcPointer pointer to the source data
 * @param length number of bytes to copy
 * @param dstOffset offset, in bytes, applied to the destination host and device addresses
 * @throws IllegalStateException if either the host-to-host or the host-to-device copy fails
 */
@Override
public void memcpyAsync(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
    AllocationPoint allocation = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();

    // we update host memory regardless.
    Pointer hostTarget = new CudaPointer((allocation.getPointers().getHostPointer().address()) + dstOffset);

    // stays null for constant buffers until (and unless) the device copy below needs it
    CudaContext activeContext = null;

    if (!dstBuffer.isConstant()) {
        activeContext = flowController.prepareAction(allocation);

        if (nativeOps.memcpyAsync(hostTarget, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, activeContext.getSpecialStream()) == 0)
            throw new IllegalStateException("MemcpyAsync H2H failed: [" + srcPointer.address() + "] -> [" + hostTarget.address() + "]");

        flowController.commitTransfer(activeContext.getSpecialStream());

        if (allocation.getAllocationStatus() == AllocationStatus.HOST)
            flowController.registerAction(activeContext, allocation);
    } else {
        // constant buffers are copied with a plain JavaCPP memcpy, no flow controller involved
        org.bytedeco.javacpp.Pointer constantTarget = new CudaPointer(allocation.getPointers().getHostPointer().address() + dstOffset, 0L);
        org.bytedeco.javacpp.Pointer constantSource = new CudaPointer(srcPointer, length);

        org.bytedeco.javacpp.Pointer.memcpy(constantTarget, constantSource, length);
        allocation.tickHostRead();
    }

    // if we're copying something into host memory, but we're on device - we need to provide exact copy to device as well
    if (allocation.getAllocationStatus() == AllocationStatus.DEVICE) {
        // TODO: this sounds wrong, and probably memcpy whould check initial direction, like relocate did before
        Pointer deviceTarget = new CudaPointer(allocation.getPointers().getDevicePointer().address() + dstOffset);

        if (activeContext == null)
            activeContext = flowController.prepareAction(allocation);

        if (nativeOps.memcpyAsync(deviceTarget, hostTarget, length, CudaConstants.cudaMemcpyHostToDevice, activeContext.getSpecialStream()) == 0)
            throw new IllegalStateException("MemcpyAsync H2D failed: [" + hostTarget.address() + "] -> [" + deviceTarget.address() + "]");

        flowController.commitTransfer(activeContext.getSpecialStream());
        flowController.registerAction(activeContext, allocation);
    }

    allocation.tickDeviceWrite();
}
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class CudaZeroHandler method getHostPointer.
/**
 * Returns a pointer to the host side of the given buffer, positioned at the buffer's offset
 * and typed according to the buffer's data type (DOUBLE, FLOAT, INT and HALF get a typed view,
 * anything else gets the plain pointer).
 *
 * PLEASE NOTE: This method always returns pointer within OS memory space
 *
 * @param buffer buffer whose host pointer is requested
 * @return host pointer for the buffer
 * @throws RuntimeException if the buffer has no host pointer
 */
@Override
public org.bytedeco.javacpp.Pointer getHostPointer(DataBuffer buffer) {
    AllocationPoint point = ((BaseCudaDataBuffer) buffer).getAllocationPoint();

    // a missing host pointer is fatal; dump the allocation state before failing
    if (point.getPointers().getHostPointer() == null) {
        log.info("DevicePointer: " + point.getPointers().getDevicePointer());
        log.info("HostPointer: " + point.getPointers().getHostPointer());
        log.info("AllocStatus: " + point.getAllocationStatus());
        throw new RuntimeException("pointer is null");
    }

    long callerThread = Thread.currentThread().getId();
    synchronizeThreadDevice(callerThread, point.getDeviceId(), point);

    // return pointer with offset if needed. length is specified for constructor compatibility purposes
    CudaPointer hostView = new CudaPointer(point.getPointers().getHostPointer(), buffer.length(), (buffer.offset() * buffer.getElementSize()));

    org.bytedeco.javacpp.Pointer typedView;
    switch (buffer.dataType()) {
        case DOUBLE:
            typedView = hostView.asDoublePointer();
            break;
        case FLOAT:
            typedView = hostView.asFloatPointer();
            break;
        case INT:
            typedView = hostView.asIntPointer();
            break;
        case HALF:
            // half precision values are exposed as 16-bit shorts
            typedView = hostView.asShortPointer();
            break;
        default:
            typedView = hostView;
            break;
    }
    return typedView;
}
Aggregations