Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method memcpySpecial.
/**
* Special memcpy version, addressing shapeInfoDataBuffer copies
*
* PLEASE NOTE: Blocking H->H, Async H->D
*
* @param dstBuffer destination buffer
* @param srcPointer host-side source pointer
* @param length number of bytes to copy
* @param dstOffset byte offset into the destination buffer
*/
@Override
public void memcpySpecial(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
// log.info("Memcpy special: {} bytes ", length);
CudaContext context = getCudaContext();
AllocationPoint point = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
// context.syncOldStream();
Pointer dP = new CudaPointer((point.getPointers().getHostPointer().address()) + dstOffset);
if (nativeOps.memcpyAsync(dP, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, context.getOldStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
Pointer rDP = new CudaPointer(point.getPointers().getDevicePointer().address() + dstOffset);
if (nativeOps.memcpyAsync(rDP, dP, length, CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
context.syncOldStream();
}
context.syncOldStream();
point.tickDeviceWrite();
// point.tickHostRead();
}
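For illustration, here is a minimal sketch of how a caller might invoke memcpySpecial to fill a CUDA-backed buffer from a host pointer. The handler reference, the buffer contents, and the wiring are assumptions made for the sketch; in nd4j this method is normally reached internally through the allocator (for example for shapeInfoDataBuffer copies), not from user code.

import org.bytedeco.javacpp.FloatPointer;
import org.nd4j.jita.allocator.pointers.CudaPointer;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

public class MemcpySpecialSketch {
    // 'handler' stands in for the active CudaZeroHandler; how it is obtained is allocator-internal (assumption)
    static void fillFromHostPointer(CudaZeroHandler handler) {
        // destination buffer, backed by host memory and possibly a device mirror
        DataBuffer dst = Nd4j.createBuffer(new float[8]);
        // host-side source: 8 floats allocated off-heap via JavaCPP
        FloatPointer hostData = new FloatPointer(1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f);
        CudaPointer src = new CudaPointer(hostData.address());
        long bytes = dst.length() * dst.getElementSize();
        // H->H copy first; if the buffer also lives on the device, the H->D mirror copy
        // is issued on the old stream and synchronized before the call returns
        handler.memcpySpecial(dst, src, bytes, 0);
    }
}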
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method memcpy.
/**
* Synchronous version of memcpy.
*
* @param dstBuffer destination buffer
* @param srcBuffer source buffer
*/
@Override
public void memcpy(DataBuffer dstBuffer, DataBuffer srcBuffer) {
// log.info("Buffer MemCpy called");
// log.info("Memcpy buffer: {} bytes ", dstBuffer.length() * dstBuffer.getElementSize());
CudaContext context = getCudaContext();
AllocationPoint dstPoint = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
AllocationPoint srcPoint = ((BaseCudaDataBuffer) srcBuffer).getAllocationPoint();
Pointer dP = new CudaPointer(dstPoint.getPointers().getHostPointer().address());
Pointer sP = null;
if (srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
sP = new CudaPointer(srcPoint.getPointers().getDevicePointer().address());
/*
JCuda.cudaMemcpyAsync(
dP,
sP,
srcBuffer.length(),
cudaMemcpyKind.cudaMemcpyHostToDevice,
context.getOldStream()
);*/
if (nativeOps.memcpyAsync(dP, sP, srcBuffer.length() * srcBuffer.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0) {
throw new ND4JIllegalStateException("memcpyAsync failed");
}
} else {
sP = new CudaPointer(srcPoint.getPointers().getHostPointer().address());
/*
JCuda.cudaMemcpyAsync(
dP,
sP,
srcBuffer.length(),
cudaMemcpyKind.cudaMemcpyHostToDevice,
context.getOldStream()
);*/
if (nativeOps.memcpyAsync(dP, sP, srcBuffer.length() * srcBuffer.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0) {
throw new ND4JIllegalStateException("memcpyAsync failed");
}
}
if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
Pointer rDP = new CudaPointer(dstPoint.getPointers().getDevicePointer().address());
/*
JCuda.cudaMemcpyAsync(
rDP,
dP,
srcBuffer.length(),
cudaMemcpyKind.cudaMemcpyHostToDevice,
context.getOldStream()
);*/
if (nativeOps.memcpyAsync(rDP, dP, srcBuffer.length() * srcBuffer.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, context.getOldStream()) == 0) {
throw new ND4JIllegalStateException("memcpyAsync failed");
}
}
dstPoint.tickDeviceWrite();
// it has to be blocking call
context.syncOldStream();
}
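As a usage illustration, a minimal sketch of a synchronous buffer-to-buffer copy via memcpy. The handler reference is an assumption; this method is ordinarily reached through the allocator or memory manager rather than called directly.

import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

public class MemcpySketch {
    // 'handler' stands in for the active CudaZeroHandler (assumption)
    static void copyBuffer(CudaZeroHandler handler) {
        DataBuffer src = Nd4j.createBuffer(new float[] {1f, 2f, 3f, 4f});
        DataBuffer dst = Nd4j.createBuffer(new float[4]);
        // blocking call: the old stream is synchronized before this returns,
        // and the destination's device copy is marked as the latest write
        handler.memcpy(dst, src);
    }
}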
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method getDevicePointer.
/**
* PLEASE NOTE: Implementation-specific method; on systems without dedicated devices a host pointer can be returned here.
*
* @param buffer buffer to obtain the device pointer for
* @param context CUDA context used for synchronization, if required
* @return device-side pointer, typed according to the buffer's data type
*/
@Override
public org.bytedeco.javacpp.Pointer getDevicePointer(DataBuffer buffer, CudaContext context) {
// TODO: It would be awesome to get rid of typecasting here
// getCudaContext().syncOldStream();
AllocationPoint dstPoint = ((BaseCudaDataBuffer) buffer).getAllocationPoint();
// this is the place where we care about promotion, but only for original (offset 0) buffers; note that the '1 < 0' term below keeps this branch permanently disabled
if (dstPoint.getAllocationStatus() == AllocationStatus.HOST && buffer.offset() == 0 && 1 < 0) {
if (dstPoint.getDeviceTicks() > configuration.getMinimumRelocationThreshold()) {
// at this point we know that this request is done within some existing context
long requiredMemory = AllocationUtils.getRequiredMemory(dstPoint.getShape());
if (deviceMemoryTracker.reserveAllocationIfPossible(Thread.currentThread().getId(), getDeviceId(), requiredMemory) && pingDeviceForFreeMemory(getDeviceId(), requiredMemory)) {
// so, memory is reserved
promoteObject(buffer);
}
}
}
// if the buffer is in DEVICE state, we might need to update the device-side memory
if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
if (!dstPoint.isActualOnDeviceSide()) {
// log.info("Relocating to GPU");
relocate(AllocationStatus.HOST, AllocationStatus.DEVICE, dstPoint, dstPoint.getShape(), context);
} else {
// log.info("Buffer is actual on device side: " + dstPoint.getShape());
}
}
// else log.info("Not on [DEVICE]");
// we update the memory use counter to announce that the buffer is being used on the device
dstPoint.tickDeviceRead();
// return pointer with offset if needed. length is specified for constructor compatibility purposes
CudaPointer p = new CudaPointer(dstPoint.getPointers().getDevicePointer(), buffer.length(), (buffer.offset() * buffer.getElementSize()));
switch(buffer.dataType()) {
case DOUBLE:
return p.asDoublePointer();
case FLOAT:
return p.asFloatPointer();
case INT:
return p.asIntPointer();
case HALF:
return p.asShortPointer();
default:
return p;
}
}
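For illustration, a sketch of how getDevicePointer might be used to obtain a launch-ready, typed device pointer for a buffer. The handler and context parameters are assumptions standing in for the allocator's internal wiring.

import org.bytedeco.javacpp.FloatPointer;
import org.bytedeco.javacpp.Pointer;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.jcublas.context.CudaContext;

public class DevicePointerSketch {
    // handler and context are assumed to come from the active allocator (hypothetical wiring)
    static FloatPointer devicePointerFor(CudaZeroHandler handler, CudaContext context) {
        DataBuffer buffer = Nd4j.createBuffer(new float[] {1f, 2f, 3f});
        // relocates the host copy to the device if it is stale there, ticks the
        // device-read counter and returns a pointer typed to the buffer's data type
        Pointer devicePtr = handler.getDevicePointer(buffer, context);
        // for a FLOAT buffer the concrete type returned is FloatPointer
        return (FloatPointer) devicePtr;
    }
}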
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method relocateObject.
@Override
public synchronized void relocateObject(DataBuffer buffer) {
AllocationPoint dstPoint = AtomicAllocator.getInstance().getAllocationPoint(buffer);
// we don't relocate non-DEVICE buffers (i.e. HOST or CONSTANT)
if (dstPoint.getAllocationStatus() != AllocationStatus.DEVICE)
return;
int deviceId = getDeviceId();
if (dstPoint.getDeviceId() >= 0 && dstPoint.getDeviceId() == deviceId) {
return;
}
// FIXME: cross-thread access, might cause problems
if (!dstPoint.isActualOnHostSide())
AtomicAllocator.getInstance().synchronizeHostData(buffer);
if (!dstPoint.isActualOnHostSide())
throw new RuntimeException("Buffer synchronization failed");
if (buffer.isAttached() || dstPoint.isAttached()) {
// if this buffer is attached, we just relocate it to the new workspace
MemoryWorkspace workspace = Nd4j.getMemoryManager().getCurrentWorkspace();
if (workspace == null) {
// if we're out of workspace, we should mark our buffer as detached, so gc will pick it up eventually
alloc(AllocationStatus.DEVICE, dstPoint, dstPoint.getShape(), false);
CudaContext context = getCudaContext();
// copy direction 1 == cudaMemcpyHostToDevice
if (nativeOps.memcpyAsync(dstPoint.getDevicePointer(), dstPoint.getHostPointer(), buffer.length() * buffer.getElementSize(), 1, context.getSpecialStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
context.syncSpecialStream();
// updating host pointer now
alloc(AllocationStatus.HOST, dstPoint, dstPoint.getShape(), false);
// marking it as detached
dstPoint.setAttached(false);
// marking it as proper on device
dstPoint.tickHostRead();
dstPoint.tickDeviceWrite();
} else {
// this call will automatically take care of workspaces, so the replacement buffer is allocated within the current workspace
// log.info("Relocating to deviceId [{}], workspace [{}]...", deviceId, workspace.getId());
BaseCudaDataBuffer nBuffer = (BaseCudaDataBuffer) Nd4j.createBuffer(buffer.length());
Nd4j.getMemoryManager().memcpy(nBuffer, buffer);
dstPoint.getPointers().setDevicePointer(nBuffer.getAllocationPoint().getDevicePointer());
dstPoint.getPointers().setHostPointer(nBuffer.getAllocationPoint().getHostPointer());
dstPoint.setDeviceId(deviceId);
dstPoint.tickDeviceRead();
dstPoint.tickHostRead();
}
return;
}
if (buffer.isConstant()) {
// constant buffers can't be relocated or modified
throw new RuntimeException("Can't relocateObject() for constant buffer");
} else {
// log.info("Free relocateObject: deviceId: {}, pointer: {}", deviceId, dstPoint.getPointers().getDevicePointer().address());
memoryProvider.free(dstPoint);
deviceMemoryTracker.subFromAllocation(Thread.currentThread().getId(), dstPoint.getDeviceId(), AllocationUtils.getRequiredMemory(dstPoint.getShape()));
// we replace original device pointer with new one
alloc(AllocationStatus.DEVICE, dstPoint, dstPoint.getShape(), false);
// log.info("Pointer after alloc: {}", dstPoint.getPointers().getDevicePointer().address());
CudaContext context = getCudaContext();
// copy direction 1 == cudaMemcpyHostToDevice
if (nativeOps.memcpyAsync(dstPoint.getDevicePointer(), dstPoint.getHostPointer(), buffer.length() * buffer.getElementSize(), 1, context.getSpecialStream()) == 0)
throw new ND4JIllegalStateException("memcpyAsync failed");
context.syncSpecialStream();
dstPoint.tickDeviceRead();
dstPoint.tickHostRead();
}
}
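A short sketch of relocateObject from a caller's perspective: it is a no-op unless the buffer's device memory lives on a different device than the calling thread's, and constant buffers are rejected. The handler reference is an assumption; AtomicAllocator is used only to inspect the result.

import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class RelocateSketch {
    // 'handler' stands in for the active CudaZeroHandler (assumption)
    static int moveToCurrentDevice(CudaZeroHandler handler) {
        INDArray array = Nd4j.create(16);
        DataBuffer buffer = array.data();
        // moves the device allocation (and workspace attachment, if any) to the
        // calling thread's device; throws for constant buffers
        handler.relocateObject(buffer);
        // afterwards the allocation point reports the calling thread's device
        return AtomicAllocator.getInstance().getAllocationPoint(buffer).getDeviceId();
    }
}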
Use of org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer in project nd4j by deeplearning4j.
Class CudaZeroHandler, method memcpyAsync.
/**
* Asynchronous version of memcpy.
*
* PLEASE NOTE: This is a device-dependent method; if it's not supported in your environment, a blocking call will be used instead.
*
* @param dstBuffer destination buffer
* @param srcPointer host-side source pointer
* @param length number of bytes to copy
* @param dstOffset byte offset into the destination buffer
*/
@Override
public void memcpyAsync(DataBuffer dstBuffer, Pointer srcPointer, long length, long dstOffset) {
AllocationPoint point = ((BaseCudaDataBuffer) dstBuffer).getAllocationPoint();
// we update host memory regardless.
// Pointer dP = new Pointer((point.getAllocationStatus() == AllocationStatus.DEVICE ? point.getPointers().getDevicePointer().address() : point.getPointers().getHostPointer().address()) + dstOffset);
Pointer dP = new CudaPointer((point.getPointers().getHostPointer().address()) + dstOffset);
// Pointer sP = new Pointer(srcPointer.getNativePointer());
// log.info("Location: " + point.getAllocationStatus());
// if (length > 4)
// log.info("memcpyAsync: ["+ srcPointer.getNativePointer()+"] -> ["+ dP.getNativePointer()+"], length: [" + length+ "], offset: ["+ dstOffset+"], dstBufferOffset: ["+(dstBuffer.getElementSize() * dstBuffer.offset()) + "/" + dstBuffer.offset() +"]");
CudaContext tContext = null;
if (dstBuffer.isConstant()) {
org.bytedeco.javacpp.Pointer dstPointer = new CudaPointer(point.getPointers().getHostPointer().address() + dstOffset, 0L);
org.bytedeco.javacpp.Pointer srcPointerJ = new CudaPointer(srcPointer, length);
// log.info("JCPP Memcpy: [{}] -> [{}], length: [{}]", srcPointerJ.address(), dstPointer.address(), length);
org.bytedeco.javacpp.Pointer.memcpy(dstPointer, srcPointerJ, length);
point.tickHostRead();
} else {
// log.info("Memcpy pointers: [{}] -> [{}]", srcPointer.address(), dP.address());
CudaContext context = flowController.prepareAction(point);
tContext = context;
if (nativeOps.memcpyAsync(dP, srcPointer, length, CudaConstants.cudaMemcpyHostToHost, context.getSpecialStream()) == 0)
throw new IllegalStateException("MemcpyAsync H2H failed: [" + srcPointer.address() + "] -> [" + dP.address() + "]");
flowController.commitTransfer(tContext.getSpecialStream());
if (point.getAllocationStatus() == AllocationStatus.HOST)
flowController.registerAction(context, point);
}
// if we're copying something into host memory but the buffer lives on the device, we need to mirror the copy to the device as well
if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
// TODO: this sounds wrong, and memcpy should probably check the initial direction, like relocate did before
Pointer rDP = new CudaPointer(point.getPointers().getDevicePointer().address() + dstOffset);
if (tContext == null)
tContext = flowController.prepareAction(point);
if (nativeOps.memcpyAsync(rDP, dP, length, CudaConstants.cudaMemcpyHostToDevice, tContext.getSpecialStream()) == 0)
throw new IllegalStateException("MemcpyAsync H2D failed: [" + dP.address() + "] -> [" + rDP.address() + "]");
flowController.commitTransfer(tContext.getSpecialStream());
flowController.registerAction(tContext, point);
}
point.tickDeviceWrite();
}
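To round this off, a minimal sketch of an asynchronous host-to-buffer copy through memcpyAsync, writing four floats at a byte offset into an eight-element buffer. The handler reference and the offset arithmetic are assumptions made for the sketch.

import org.bytedeco.javacpp.FloatPointer;
import org.nd4j.jita.allocator.pointers.CudaPointer;
import org.nd4j.jita.handler.impl.CudaZeroHandler;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.factory.Nd4j;

public class MemcpyAsyncSketch {
    // 'handler' stands in for the active CudaZeroHandler (assumption)
    static void copyIntoTail(CudaZeroHandler handler) {
        DataBuffer dst = Nd4j.createBuffer(new float[8]);
        // host-side source: 4 floats, to land at element offset 4 of dst
        FloatPointer hostData = new FloatPointer(1f, 2f, 3f, 4f);
        CudaPointer src = new CudaPointer(hostData.address());
        long bytes = 4L * dst.getElementSize();
        long dstOffsetInBytes = 4L * dst.getElementSize();
        // transfers are issued on the special stream; commitTransfer() inside the
        // method makes the host update visible before the optional device mirror copy
        handler.memcpyAsync(dst, src, bytes, dstOffsetInBytes);
    }
}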