use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class JCublasNDArrayFactory method convertDataEx.
/**
 * Converts {@code source} from the {@code typeSrc} representation into {@code typeDst} and returns the
 * buffer that receives the converted data.
 *
 * <p>A destination buffer is prepared here (a {@code CompressedDataBuffer} on the compression path, a
 * buffer from {@code Nd4j.createBuffer} on the decompression path) and the conversion itself is handed
 * to the four-argument {@code convertDataEx} overload.
 *
 * <p>NOTE(review): this listing looks truncated - closing braces are absent and two statements end in a
 * dangling "+;" / ",;" - so the nesting described in the comments below is a best reading and should be
 * confirmed against the upstream nd4j sources.
 *
 * @param typeSrc extended type tag describing the current contents of {@code source}
 * @param source buffer to convert
 * @param typeDst extended type tag to convert to
 * @return the newly created destination buffer
 * @throws UnsupportedOperationException for an unrecognised target type (presumably - the guarding
 *         {@code else} is not visible in this listing)
 */
public DataBuffer convertDataEx(DataBuffer.TypeEx typeSrc, DataBuffer source, DataBuffer.TypeEx typeDst) {
// Byte width of one target element, chosen from the position of typeDst inside the TypeEx enum:
// ordinals 0-2 -> 1 byte, 3-5 -> 2 bytes, 6 -> 4 bytes, 7 -> 8 bytes.
// NOTE(review): this depends on the declaration order of TypeEx; reordering the enum breaks it.
int elementSize = 0;
if (typeDst.ordinal() <= 2)
elementSize = 1;
else if (typeDst.ordinal() <= 5)
elementSize = 2;
else if (typeDst.ordinal() == 6)
elementSize = 4;
else if (typeDst.ordinal() == 7)
elementSize = 8;
// NOTE(review): presumably the final "else" of the chain above; as listed the throw is unconditional
// and its message expression is cut off after the "+".
throw new UnsupportedOperationException("Unknown target TypeEx: " +;
// flushQueue should be blocking here, because typeConversion happens on cpu side
DataBuffer buffer = null;
// NOTE(review): the statement guarded by this instanceof check appears to be missing from the listing.
if (!(source instanceof CompressedDataBuffer))
if (CompressionUtils.goingToCompress(typeSrc, typeDst)) {
// all types below 8 are compression modes
// Compression path: reserve source.length() * elementSize bytes for the compressed payload.
BytePointer pointer = new BytePointer(source.length() * elementSize);
// NOTE(review): the constructor argument list is cut off after "source,".
CompressionDescriptor descriptor = new CompressionDescriptor(source,;
descriptor.setCompressedLength(source.length() * elementSize);
buffer = new CompressedDataBuffer(pointer, descriptor);
} else {
// This branch casts source unconditionally, so it expects a CompressedDataBuffer.
CompressedDataBuffer compressed = (CompressedDataBuffer) source;
CompressionDescriptor descriptor = compressed.getCompressionDescriptor();
// decompression mode
// The destination is sized from the element count recorded in the compression descriptor.
buffer = Nd4j.createBuffer(descriptor.getNumberOfElements(), false);
// NOTE(review): "point" is not read again in the visible lines; its use may have been dropped.
AllocationPoint point = AtomicAllocator.getInstance().getAllocationPoint(buffer);
// Hand the prepared destination to the four-argument overload, which fills it.
convertDataEx(typeSrc, source, typeDst, buffer);
return buffer;
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class CudaExecutioner method exec.
/**
 * Executes a batch of aggregates with a single native call.
 *
 * <p>All per-aggregate inputs (argument counts, indexing arguments, int-array arguments, real
 * arguments, argument pointers and shape pointers) are written into one flat "surface" buffer obtained
 * from {@code getBuffer(batch)}; the buffer is then passed to the native
 * {@code execAggregateBatchFloat/Double/Half} entry point that matches {@code Nd4j.dataType()}.
 *
 * <p>NOTE(review): this listing has lost its closing braces (and possibly some statements), so loop
 * and branch boundaries are inferred from the statements that follow; confirm against upstream.
 *
 * @param batch the aggregates to run; its sample aggregate supplies the per-section capacities
 * @param <T> concrete aggregate type held by the batch
 */
public <T extends Aggregate> void exec(Batch<T> batch) {
DataBuffer surfaceBuffer = getBuffer(batch);
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
// Int-typed view over the host pointer of the surface buffer; every section below is written through
// this pointer or through a differently typed pointer constructed from it.
IntPointer pointer = (IntPointer) new CudaPointer(AtomicAllocator.getInstance().getHostPointer(surfaceBuffer)).asIntPointer();
// NOTE(review): surfacePoint is not referenced again in the visible lines.
AllocationPoint surfacePoint = AtomicAllocator.getInstance().getAllocationPoint(surfaceBuffer);
// Number of counters stored per aggregate in the header section (see the five puts in the loop).
int maxTypes = 5;
int maxIntArrays = batch.getSample().maxIntArrays();
int maxArraySize = batch.getSample().maxIntArraySize();
// Section start offsets. Each section is sized as (per-aggregate capacity) * (Batch.getBatchLimit() * 16).
// indexPos and intArraysPos are used as indices into the int pointer.
int indexPos = maxTypes * (Batch.getBatchLimit() * 16);
int intArraysPos = indexPos + (batch.getSample().maxIndexArguments() * (Batch.getBatchLimit() * 16));
// realPos is halved for DOUBLE and doubled for HALF - presumably to turn an offset counted in ints
// into an offset counted in elements of the pointer type used to write real arguments.
int realPos = (intArraysPos + (maxIntArrays * maxArraySize * (Batch.getBatchLimit() * 16))) / (Nd4j.dataType() == DataBuffer.Type.DOUBLE ? 2 : 1);
if (Nd4j.dataType() == DataBuffer.Type.HALF)
realPos *= 2;
// argsPos is halved for FLOAT and quartered for HALF - presumably rescaling to PointerPointer slots;
// TODO confirm.
int argsPos = (realPos + (batch.getSample().maxRealArguments() * (Batch.getBatchLimit() * 16))) / (Nd4j.dataType() == DataBuffer.Type.FLOAT ? 2 : 1);
if (Nd4j.dataType() == DataBuffer.Type.HALF)
argsPos /= 4;
int shapesPos = argsPos + (batch.getSample().maxArguments() * (Batch.getBatchLimit() * 16));
// Serialize every aggregate of the batch into the surface buffer.
for (int i = 0; i < batch.getNumAggregates(); i++) {
T op = batch.getAggregates().get(i);
// put num arguments
// Header record for aggregate i: five consecutive ints holding the number of arguments, shapes,
// indexing arguments, real arguments and int-array arguments, in that order.
int idx = i * maxTypes;
pointer.put(idx, op.getArguments().size());
pointer.put(idx + 1, op.getShapes().size());
pointer.put(idx + 2, op.getIndexingArguments().size());
pointer.put(idx + 3, op.getRealArguments().size());
pointer.put(idx + 4, op.getIntArrayArguments().size());
// putting indexing arguments
// Aggregate i owns maxIndexArguments() consecutive slots starting at indexPos.
for (int e = 0; e < op.getIndexingArguments().size(); e++) {
idx = indexPos + i * batch.getSample().maxIndexArguments();
pointer.put(idx + e, op.getIndexingArguments().get(e));
// putting intArray values
// Aggregate i owns bsize ints; int array e starts maxArraySize * e ints into that region.
int bsize = maxIntArrays * maxArraySize;
for (int e = 0; e < op.getIntArrayArguments().size(); e++) {
int step = (i * bsize) + (e * maxArraySize);
// Null int arrays are skipped; their slots are not written here.
if (op.getIntArrayArguments().get(e) != null)
for (int x = 0; x < op.getIntArrayArguments().get(e).length; x++) {
idx = intArraysPos + step + x;
pointer.put(idx, op.getIntArrayArguments().get(e)[x]);
// putting real arguments
// The same memory is re-wrapped with a pointer of the global data type, and each real argument is
// converted to that type before being stored at realPos + i * maxRealArguments() + e.
if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
FloatPointer realPtr = new FloatPointer(pointer);
for (int e = 0; e < op.getRealArguments().size(); e++) {
idx = realPos + i * op.maxRealArguments();
realPtr.put(idx + e, op.getRealArguments().get(e).floatValue());
} else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
DoublePointer dPtr = new DoublePointer(pointer);
for (int e = 0; e < op.getRealArguments().size(); e++) {
idx = realPos + (i * op.maxRealArguments());
dPtr.put(idx + e, op.getRealArguments().get(e).doubleValue());
} else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
ShortPointer sPtr = new ShortPointer(pointer);
for (int e = 0; e < op.getRealArguments().size(); e++) {
idx = realPos + (i * op.maxRealArguments());
// Half values are stored as shorts produced by BaseDataBuffer.fromFloat.
sPtr.put(idx + e, BaseDataBuffer.fromFloat(op.getRealArguments().get(e).floatValue()));
// putting arguments pointers
// Pointer-typed view over the same memory; used for both the argument and the shape sections.
PointerPointer ptrPtr = new PointerPointer(pointer);
for (int e = 0; e < op.getArguments().size(); e++) {
idx = argsPos + i * batch.getSample().maxArguments();
// Null arguments are skipped; their slots are not written here.
if (op.getArguments().get(e) != null) {
ptrPtr.put(idx + e, AtomicAllocator.getInstance().getPointer(op.getArguments().get(e), context));
// putting shape pointers
for (int e = 0; e < op.getShapes().size(); e++) {
idx = shapesPos + i * batch.getSample().maxShapes();
if (op.getShapes().get(e) != null) {
ptrPtr.put(idx + e, AtomicAllocator.getInstance().getPointer(op.getShapes().get(e), context));
// trigger write, so getPointer request will force relocation to GPU
// NOTE(review): the statement this comment describes is not visible in the listing (possibly a call on
// surfacePoint, which is otherwise unused) - confirm against upstream.
// Launch parameters handed to the native call: slot 0 is null, slot 1 the context's "old" stream,
// slot 2 min(number of aggregates, configured maximum grid size), slot 3 the sample's threads per
// instance, slot 4 the sample's shared memory size.
PointerPointer extraArgs = new PointerPointer(32);
extraArgs.put(0, null);
extraArgs.put(1, context.getOldStream());
extraArgs.put(2, new CudaPointer(Math.min(batch.getNumAggregates(), CudaEnvironment.getInstance().getConfiguration().getMaximumGridSize())));
extraArgs.put(3, new CudaPointer(batch.getSample().getThreadsPerInstance()));
extraArgs.put(4, new CudaPointer(batch.getSample().getSharedMemorySize()));
// Dispatch to the native batch entry point for the global data type; the three calls take identical
// arguments and differ only in the function invoked.
if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
nativeOps.execAggregateBatchFloat(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
} else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.execAggregateBatchDouble(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
} else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
nativeOps.execAggregateBatchHalf(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class BaseCudaDataBuffer method reallocate.
/**
 * Re-allocates the memory behind this buffer for {@code length} elements and copies the previous
 * contents into the new allocation.
 *
 * <p>The new allocation's device memory is zeroed first, then {@code this.length * elementSize} bytes
 * are copied from the old allocation, taken from whichever side (device or host) the old allocation
 * reports as actual. Both operations are issued asynchronously on the context's special stream.
 *
 * <p>NOTE(review): this listing has lost closing braces and possibly statements; in particular no
 * {@code break} is visible between the switch cases and the {@code throw} has no visible
 * {@code default:} label - confirm against upstream before relying on the control flow shown.
 *
 * @param length element count requested for the new allocation
 * @return this buffer
 * @throws UnsupportedOperationException presumably for data types other than DOUBLE, FLOAT, HALF and INT
 */
public DataBuffer reallocate(long length) {
// we want to be sure this array isn't used anywhere RIGHT AT THIS MOMENT
// Keep the previous allocation so its contents can be copied after the new one is created.
AllocationPoint old = allocationPoint;
allocationPoint = AtomicAllocator.getInstance().allocateMemory(this, new AllocationShape(length, elementSize, dataType()), false);
trackingPoint = allocationPoint.getObjectId();
// Rebuild the typed host pointer and its indexer on top of the new allocation's host pointer.
switch(dataType()) {
case DOUBLE:
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length, 0).asDoublePointer();
indexer = DoubleIndexer.create((DoublePointer) pointer);
case FLOAT:
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length, 0).asFloatPointer();
indexer = FloatIndexer.create((FloatPointer) pointer);
case HALF:
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length, 0).asShortPointer();
indexer = ShortIndexer.create((ShortPointer) pointer);
case INT:
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length, 0).asIntPointer();
indexer = IntIndexer.create((IntPointer) pointer);
throw new UnsupportedOperationException();
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
// Zero the whole new device allocation (length * elementSize bytes) before copying.
NativeOpsHolder.getInstance().getDeviceNativeOps().memsetAsync(allocationPoint.getDevicePointer(), 0, length * elementSize, 0, context.getSpecialStream());
// Copy the old contents. The byte count uses this.length (the field), not the "length" parameter.
// NOTE(review): if the requested length is smaller than this.length this copies more bytes than were
// allocated - confirm that callers only ever grow the buffer.
if (old.isActualOnDeviceSide()) {
NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(allocationPoint.getDevicePointer(), old.getDevicePointer(), this.length * elementSize, CudaConstants.cudaMemcpyDeviceToDevice, context.getSpecialStream());
} else if (old.isActualOnHostSide()) {
NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(allocationPoint.getDevicePointer(), old.getHostPointer(), this.length * elementSize, CudaConstants.cudaMemcpyHostToDevice, context.getSpecialStream());
// we're keeping pointer reference for JVM
// The length field is deliberately left unchanged here: the assignment below is commented out.
// this.length = length;
if (isAttached()) {
// do nothing here, that's workspaces
} else {
// NOTE(review): the body of this else branch is not visible; presumably it handled the old
// allocation for non-workspace buffers before the return - TODO confirm.
return this;
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class BaseCudaDataBuffer method read.
/**
 * Restores this buffer from a stream: an int length, a UTF type name, then {@code length} elements
 * encoded in the stored type.
 *
 * <p>Elements are converted while being read into the representation selected by the stored type and
 * {@code globalType} (INT, DOUBLE, FLOAT or HALF), and are written through an indexer over the
 * allocation's host pointer. Any exception raised while reading is rethrown wrapped in a
 * {@code RuntimeException}.
 *
 * <p>NOTE(review): this listing has lost closing braces and possibly statements, so branch and loop
 * boundaries are inferred; confirm against upstream.
 *
 * @param s stream positioned at the serialized length
 * @throws RuntimeException wrapping any exception thrown while reading or allocating
 * @throws IllegalStateException for an unsupported type (as listed, it would also be wrapped by the
 *         surrounding catch)
 */
public void read(DataInputStream s) {
try {
//"Restoring CUDA databuffer");
// skip allocationMode
allocationMode = AllocationMode.JAVACPP;
int locLength = s.readInt();
// New memory is needed when the stored length differs from the current one or nothing has been
// set up yet (no indexer).
boolean reallocate = locLength != length || indexer == null;
length = locLength;
Type t = Type.valueOf(s.readUTF());
//"Restoring buffer ["+t+"] of length ["+ length+"]");
// Fall back to the global ND4J data type when this buffer has no global type yet.
if (globalType == null && Nd4j.dataType() != null) {
globalType = Nd4j.dataType();
// Warn when a non-INT stored type is wider than the global type, since values get narrowed.
if (t != globalType && t != Type.INT && Nd4j.sizeOfDataType(globalType) < Nd4j.sizeOfDataType(t)) {
log.warn("Loading a data stream with opType different from what is set globally. Expect precision loss");
if (globalType == Type.INT)
log.warn("Int to float/double widening UNSUPPORTED!!!");
if (t == Type.COMPRESSED) {
// Only the type tag is recorded for compressed data in the visible lines.
type = t;
} else if (t == Type.INT || globalType == Type.INT) {
// INT path: always allocates, regardless of the "reallocate" flag.
this.elementSize = 4;
this.allocationPoint = AtomicAllocator.getInstance().allocateMemory(this, new AllocationShape(length, elementSize, t), false);
this.trackingPoint = allocationPoint.getObjectId();
// we keep int buffer's dtype after ser/de
this.type = t;
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length).asIntPointer();
indexer = IntIndexer.create((IntPointer) pointer);
IntIndexer Iindexer = (IntIndexer) indexer;
// NOTE(review): "array" is only referenced by the commented-out line below; it looks unused.
int[] array = new int[(int) length];
// Each stored element is read in its stored type and truncated to int by the casts.
for (int i = 0; i < length(); i++) {
if (t == Type.INT)
// array[i] = s.readInt();
Iindexer.put(i, s.readInt());
else if (t == Type.DOUBLE)
Iindexer.put(i, (int) s.readDouble());
else if (t == Type.FLOAT)
Iindexer.put(i, (int) s.readFloat());
else if (t == Type.HALF)
Iindexer.put(i, (int) toFloat((int) s.readShort()));
} else if (globalType == Type.DOUBLE) {
this.elementSize = 8;
if (reallocate) {
MemoryWorkspace workspace = Nd4j.getMemoryManager().getCurrentWorkspace();
// NOTE(review): as written, the buffer is marked attached only when the current workspace IS a
// DummyWorkspace; a negation may have been lost in extraction - confirm against upstream.
if (workspace != null && (workspace instanceof DummyWorkspace)) {
this.attached = true;
this.parentWorkspace = workspace;
workspaceGenerationId = workspace.getGenerationId();
this.allocationPoint = AtomicAllocator.getInstance().allocateMemory(this, new AllocationShape(length, elementSize, globalType), false);
// allocationPoint.attachBuffer(this);
this.trackingPoint = allocationPoint.getObjectId();
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length).asDoublePointer();
indexer = DoubleIndexer.create((DoublePointer) pointer);
DoubleIndexer Dindexer = (DoubleIndexer) indexer;
// Stored FLOAT and HALF values are widened to double; stored DOUBLE values are copied as-is.
for (int i = 0; i < length(); i++) {
if (t == Type.DOUBLE)
Dindexer.put(i, s.readDouble());
else if (t == Type.FLOAT)
Dindexer.put(i, (double) s.readFloat());
else if (t == Type.HALF)
Dindexer.put(i, (double) toFloat((int) s.readShort()));
} else if (globalType == Type.FLOAT) {
this.elementSize = 4;
if (reallocate) {
// NOTE(review): this branch sizes the allocation with dataType(), the DOUBLE branch with globalType.
this.allocationPoint = AtomicAllocator.getInstance().allocateMemory(this, new AllocationShape(length, elementSize, dataType()), false);
this.trackingPoint = allocationPoint.getObjectId();
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length).asFloatPointer();
indexer = FloatIndexer.create((FloatPointer) pointer);
FloatIndexer Findexer = (FloatIndexer) indexer;
// Stored DOUBLE values are narrowed to float; stored HALF values are expanded through toFloat.
for (int i = 0; i < length; i++) {
if (t == Type.DOUBLE)
Findexer.put(i, (float) s.readDouble());
else if (t == Type.FLOAT)
Findexer.put(i, s.readFloat());
else if (t == Type.HALF) {
Findexer.put(i, toFloat((int) s.readShort()));
} else if (globalType == Type.HALF) {
this.elementSize = 2;
if (reallocate) {
this.allocationPoint = AtomicAllocator.getInstance().allocateMemory(this, new AllocationShape(length, elementSize, dataType()), false);
this.trackingPoint = allocationPoint.getObjectId();
this.pointer = new CudaPointer(allocationPoint.getPointers().getHostPointer(), length).asShortPointer();
indexer = HalfIndexer.create((ShortPointer) this.pointer);
HalfIndexer Hindexer = (HalfIndexer) indexer;
// Values are handed to the HalfIndexer as floats; stored DOUBLE values are narrowed to float first.
for (int i = 0; i < length; i++) {
if (t == Type.DOUBLE)
Hindexer.put(i, (float) s.readDouble());
else if (t == Type.FLOAT)
Hindexer.put(i, s.readFloat());
else if (t == Type.HALF) {
Hindexer.put(i, toFloat((int) s.readShort()));
// for HALF & HALF2 datatype we just tag data as fresh on host
} else
throw new IllegalStateException("Unknown dataType: [" + t.toString() + "]");
// Expose the restored host memory as a ByteBuffer as well.
this.wrappedBuffer = this.pointer.asByteBuffer();
} catch (Exception e) {
// Every failure (I/O, allocation, unknown type) surfaces as an unchecked exception with its cause.
throw new RuntimeException(e);
// we call sync to copyback data to host
// allocator.synchronizeHostData(this);
use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
the class JCublasNDArray method unsafeDuplication.
/**
 * Creates a copy of this array by allocating a new buffer with the same shape information and copying
 * the raw memory with a single {@code memcpyAsync} call.
 *
 * <p>The copy direction is chosen from the allocation status (DEVICE or HOST) of the source and the
 * destination allocation points. When {@code blocking} is true the copy is issued on the context's
 * "old" stream, otherwise on its special stream.
 *
 * <p>NOTE(review): this listing has lost closing braces and several expressions - the first argument
 * of {@code createSame}, the left operand of the byte-count expression ({@code *}), the declaration of
 * {@code time1} and the statements guarded by {@code if (blocking)} are not visible. Confirm against
 * upstream.
 *
 * @param blocking selects the stream used for the copy (see above)
 * @return the duplicate array
 */
public INDArray unsafeDuplication(boolean blocking) {
// Allocate the destination buffer, inside the current workspace when one is active.
DataBuffer rb = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createSame(, false) : Nd4j.getDataBufferFactory().createSame(, false, Nd4j.getMemoryManager().getCurrentWorkspace());
// The duplicate reuses this array's shape-info buffer rather than building a new one.
INDArray ret = Nd4j.createArrayFromShapeBuffer(rb, this.shapeInfoDataBuffer());
if (blocking)
// Nd4j.getExecutioner().commit();
AtomicAllocator allocator = AtomicAllocator.getInstance();
CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
AllocationPoint srcPoint = allocator.getAllocationPoint(this);
AllocationPoint dstPoint = allocator.getAllocationPoint(ret);
// route records which copy direction was taken:
// 1 = device-to-device, 2 = device-to-host, 3 = host-to-device, 4 = host-to-host.
int route = 0;
if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE && srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
// d2d copy
route = 1;
NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getDevicePointer(), srcPoint.getDevicePointer(), *, CudaConstants.cudaMemcpyDeviceToDevice, blocking ? context.getOldStream() : context.getSpecialStream());
} else if (dstPoint.getAllocationStatus() == AllocationStatus.HOST && srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
route = 2;
NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getHostPointer(), srcPoint.getDevicePointer(), *, CudaConstants.cudaMemcpyDeviceToHost, blocking ? context.getOldStream() : context.getSpecialStream());
} else if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE && srcPoint.getAllocationStatus() == AllocationStatus.HOST) {
route = 3;
NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getDevicePointer(), srcPoint.getHostPointer(), *, CudaConstants.cudaMemcpyHostToDevice, blocking ? context.getOldStream() : context.getSpecialStream());
} else {
// Every remaining status combination is copied host-to-host.
route = 4;
NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getHostPointer(), srcPoint.getHostPointer(), *, CudaConstants.cudaMemcpyHostToHost, blocking ? context.getOldStream() : context.getSpecialStream());
if (blocking)
// Timing figures, apparently left over from a log statement that now survives only as comment text.
long time2 = System.currentTimeMillis();
long bytes = *;
long spent = time2 - time1;
// NOTE(review): "1000 * bytes / spent" is integer division and throws ArithmeticException when spent
// is 0 (copy finished within the same millisecond) - guard this if the measurement is kept.
float bw = (1000 * bytes / spent) / 1024 / 1024.0f / 1024; //1000 / spent * bytes / 1024 / 1024 / 1024;"Route: [{}]; Blocking: {}; {} bytes; {} ms; Bandwidth: {} GB/s", route, blocking, bytes, spent, String.format("%.2f", bw));
return ret;