Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class CudaFullCachingProvider, method free.
/**
* This method frees the specific chunk of memory described by the AllocationPoint passed in.
*
* PLEASE NOTE: This method may skip the actual free and keep the released memory chunk for future reuse.
*
* @param point allocation point describing the chunk to release
*/
@Override
public void free(AllocationPoint point) {
if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
if (point.isConstant())
return;
AllocationShape shape = point.getShape();
int deviceId = point.getDeviceId();
long address = point.getDevicePointer().address();
long reqMemory = AllocationUtils.getRequiredMemory(shape);
// note: the second check compares the device cache fill level against the host-cache limit
if (reqMemory > CudaEnvironment.getInstance().getConfiguration().getMaximumDeviceCacheableLength()
        || deviceCachedAmount.get(deviceId).get() >= CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache()) {
// log.info("DEVICE_{} memory purging: {} bytes; MS: {}; MT: {}", deviceId, reqMemory, MAX_GPU_ALLOCATION, MAX_GPU_CACHE);
super.free(point);
return;
}
// log.info("Saving HOST memory into cache...");
ensureDeviceCacheHolder(deviceId, shape);
CacheHolder cache = deviceCache.get(deviceId).get(shape);
if (point.getDeviceId() != deviceId)
throw new RuntimeException("deviceId changed!");
// memory chunks < threshold will be cached no matter what
if (reqMemory <= FORCED_CACHE_THRESHOLD) {
cache.put(new CudaPointer(point.getDevicePointer().address()));
return;
} else {
long cacheEntries = cache.size();
long cacheHeight = deviceCache.get(deviceId).size();
// total memory allocated within this bucket
long cacheDepth = cacheEntries * reqMemory;
// if (cacheDepth < MAX_CACHED_MEMORY / cacheHeight) {
cache.put(new CudaPointer(point.getDevicePointer().address()));
return;
// } else {
// super.free(point);
// }
}
}
super.free(point);
}
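For orientation, here is a minimal, self-contained sketch of the cache-or-free decision this method implements. The class, the thresholds, and the deque of raw addresses are hypothetical stand-ins for nd4j's CudaEnvironment configuration and CacheHolder types; only the branching logic mirrors the method above.
import java.util.ArrayDeque;
import java.util.Deque;

public class CacheOrFreeSketch {

    // hypothetical thresholds standing in for the CudaEnvironment configuration values
    static final long MAX_CACHEABLE_LENGTH = 16L * 1024 * 1024;
    static final long MAX_CACHE_BYTES = 256L * 1024 * 1024;
    static final long FORCED_CACHE_THRESHOLD = 4096;

    private final Deque<Long> cache = new ArrayDeque<>(); // released device addresses kept for reuse
    private long cachedBytes = 0;

    /**
     * Returns true if the chunk was cached for later reuse,
     * false if the caller should perform a real free.
     */
    boolean cacheOrFree(long address, long reqMemory) {
        // oversized chunks, or a full cache, fall through to the real free
        if (reqMemory > MAX_CACHEABLE_LENGTH || cachedBytes >= MAX_CACHE_BYTES)
            return false;
        if (reqMemory <= FORCED_CACHE_THRESHOLD) {
            // small chunks are cached no matter what
            cache.push(address);
            cachedBytes += reqMemory;
            return true;
        }
        // in the original, larger chunks pass through a (disabled) per-bucket
        // depth check and are currently cached as well
        cache.push(address);
        cachedBytes += reqMemory;
        return true;
    }

    public static void main(String[] args) {
        CacheOrFreeSketch sketch = new CacheOrFreeSketch();
        System.out.println(sketch.cacheOrFree(0xDEAD0000L, 2048));     // true: cached
        System.out.println(sketch.cacheOrFree(0xBEEF0000L, 1L << 30)); // false: too big to cache
    }
}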
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method toFlattened.
@Override
public INDArray toFlattened(char order, Collection<INDArray> matrices) {
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
int length = 0;
for (INDArray m : matrices) length += m.length();
INDArray ret = Nd4j.create(new int[] { 1, length }, order);
int linearIndex = 0;
AtomicAllocator allocator = AtomicAllocator.getInstance();
for (INDArray m : matrices) {
CudaContext context = allocator.getFlowController().prepareAction(ret, m);
if (m.ordering() == order && ret.elementWiseStride() == m.elementWiseStride() && ret.elementWiseStride() == 1) {
// do the memcpy in the proper direction and move on
allocator.memcpyAsync(ret.data(),
        new CudaPointer(allocator.getHostPointer(m).address()),
        AllocationUtils.getRequiredMemory(AllocationUtils.buildAllocationShape(m)),
        linearIndex * (m.data().dataType() == DataBuffer.Type.DOUBLE ? 8
                : m.data().dataType() == DataBuffer.Type.FLOAT ? 4 : 2));
linearIndex += m.length();
} else {
Pointer hostYShapeInfo = AddressRetriever.retrieveHostPointer(m.shapeInfoDataBuffer());
PointerPointer extras = new PointerPointer(
        AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()),
        context.getOldStream(),
        allocator.getDeviceIdPointer(),
        context.getBufferAllocation(),
        context.getBufferReduction(),
        context.getBufferScalar(),
        context.getBufferSpecial(),
        hostYShapeInfo,
        AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()));
if (m.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.flattenDouble(extras, linearIndex, order, (DoublePointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (DoublePointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
} else if (m.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.flattenFloat(extras, linearIndex, order, (FloatPointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (FloatPointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
} else {
nativeOps.flattenHalf(extras, linearIndex, order, (ShortPointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (ShortPointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
}
// Works for all cases...
/* NdIndexIterator iter = new NdIndexIterator(order, m.shape());
while (iter.hasNext()) {
ret.putScalar(linearIndex++, m.getDouble(iter.next()));
}*/
linearIndex += m.length();
}
if (ret != null)
allocator.registerAction(context, ret, m);
}
return ret;
}
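A hedged usage sketch for the flatten path, via the Nd4j facade (which delegates to this factory on the CUDA backend); shapes and values are illustrative only.
import java.util.Arrays;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class FlattenExample {
    public static void main(String[] args) {
        INDArray a = Nd4j.create(new float[] {1, 2, 3, 4}, new int[] {2, 2});
        INDArray b = Nd4j.create(new float[] {5, 6, 7, 8}, new int[] {2, 2});
        // c-order flatten concatenates the row-major views into one 1x8 row vector
        INDArray flat = Nd4j.toFlattened('c', Arrays.asList(a, b));
        System.out.println(flat); // [1.00, 2.00, 3.00, 4.00, 5.00, 6.00, 7.00, 8.00]
    }
}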
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method sort.
@Override
public INDArray sort(INDArray x, boolean descending) {
if (x.isScalar())
return x;
Nd4j.getExecutioner().push();
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(x);
Pointer ptr = AtomicAllocator.getInstance().getHostPointer(x.shapeInfoDataBuffer());
PointerPointer extraz = new PointerPointer(ptr, // 0
        context.getOldStream(), // 1
        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
        context.getBufferAllocation(), // 3
        context.getBufferReduction(), // 4
        context.getBufferScalar(), // 5
        context.getBufferSpecial(), // 6
        ptr, // 7
        AtomicAllocator.getInstance().getHostPointer(x.shapeInfoDataBuffer()), // 8
        ptr, // 9
        ptr, // 10
        ptr, // 11
        ptr, // 12
        ptr, // 13
        ptr, // 14
        ptr, // special pointer for IsMax // 15
        ptr, // special pointer for IsMax // 16
        ptr, // special pointer for IsMax // 17
        new CudaPointer(0));
// radix sort is used only for non-view arrays with more than 10M elements
boolean isRadix = !x.isView() && (x.lengthLong() > 1024 * 1024 * 10);
INDArray tmpX = x;
// we need to guarantee all threads are finished here
if (isRadix)
Nd4j.getExecutioner().commit();
if (x.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.sortFloat(extraz, (FloatPointer) AtomicAllocator.getInstance().getPointer(tmpX, context), (IntPointer) AtomicAllocator.getInstance().getPointer(tmpX.shapeInfoDataBuffer(), context), descending);
} else if (x.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.sortDouble(extraz, (DoublePointer) AtomicAllocator.getInstance().getPointer(tmpX, context), (IntPointer) AtomicAllocator.getInstance().getPointer(tmpX.shapeInfoDataBuffer(), context), descending);
} else if (x.data().dataType() == DataBuffer.Type.HALF) {
nativeOps.sortHalf(extraz, (ShortPointer) AtomicAllocator.getInstance().getPointer(tmpX, context), (IntPointer) AtomicAllocator.getInstance().getPointer(tmpX.shapeInfoDataBuffer(), context), descending);
} else {
throw new UnsupportedOperationException("Unknown dataType " + x.data().dataType());
}
AtomicAllocator.getInstance().getFlowController().registerAction(context, x);
return x;
}
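A hedged usage sketch for the sort above. Whether the Nd4j.sort facade interprets its boolean as ascending or descending varies by version, so this calls the factory method directly with the descending flag shown in the signature above, assuming NDArrayFactory exposes sort as overridden here.
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class SortExample {
    public static void main(String[] args) {
        INDArray v = Nd4j.create(new float[] {3, 1, 2});
        // sorts in place along the flattened view and returns the same array
        Nd4j.factory().sort(v, false); // descending == false -> ascending
        System.out.println(v);         // [1.00, 2.00, 3.00]
        Nd4j.factory().sort(v, true);  // descending == true
        System.out.println(v);         // [3.00, 2.00, 1.00]
    }
}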
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method average.
@Override
public INDArray average(INDArray target, INDArray[] arrays) {
if (arrays == null || arrays.length == 0)
throw new RuntimeException("Input arrays are missing");
if (arrays.length == 1)
return target.assign(arrays[0]);
// we do averaging on GPU only if ALL devices have p2p links
if (nativeOps.isP2PAvailable() && CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()) {
Nd4j.getExecutioner().push();
long len = target != null ? target.lengthLong() : arrays[0].lengthLong();
AtomicAllocator allocator = AtomicAllocator.getInstance();
CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
PointerPointer extras = new PointerPointer(null, // not used
        context.getOldStream(),
        allocator.getDeviceIdPointer(),
        new CudaPointer(0));
Pointer z = target == null ? null : AtomicAllocator.getInstance().getPointer(target, context);
long[] xPointers = new long[arrays.length];
for (int i = 0; i < arrays.length; i++) {
if (arrays[i].elementWiseStride() != 1)
throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
if (arrays[i].lengthLong() != len)
throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
xPointers[i] = point.getPointers().getDevicePointer().address();
point.tickDeviceWrite();
}
CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.averageDouble(extras, x, target == null ? null : (DoublePointer) z, arrays.length, len, true);
} else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.averageFloat(extras, x, target == null ? null : (FloatPointer) z, arrays.length, len, true);
} else {
nativeOps.averageHalf(extras, x, target == null ? null : (ShortPointer) z, arrays.length, len, true);
}
allocator.getFlowController().registerAction(context, target, arrays);
tempX.address(); // bare call presumably keeps tempX reachable until the native op completes
return target;
} else {
// otherwise we do averaging on the CPU side
// we expect all operations to be complete at this point
long len = target == null ? arrays[0].lengthLong() : target.lengthLong();
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
PointerPointer dataPointers = new PointerPointer(arrays.length);
PointerPointer extras = new PointerPointer(null, // not used
        context.getOldStream(),
        AtomicAllocator.getInstance().getDeviceIdPointer(),
        new CudaPointer(1));
for (int i = 0; i < arrays.length; i++) {
Nd4j.getCompressor().autoDecompress(arrays[i]);
if (arrays[i].elementWiseStride() != 1)
throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
if (arrays[i].lengthLong() != len)
throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
}
if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.averageDouble(extras, dataPointers, target == null ? null : (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
} else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.averageFloat(extras, dataPointers, target == null ? null : (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
} else {
nativeOps.averageHalf(extras, dataPointers, target == null ? null : (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
}
if (target != null)
AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
// TODO: make propagation optional maybe?
if (true) {
for (int i = 0; i < arrays.length; i++) {
AtomicAllocator.getInstance().getAllocationPoint(arrays[i]).tickHostWrite();
}
}
return target;
}
}
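A hedged usage sketch for the averaging above, again via the factory (Nd4j.factory() is assumed to return this JCublas factory on the CUDA backend). The result is the element-wise mean; with propagation enabled, as in both paths above, the mean is written back into every input array as well.
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class AverageExample {
    public static void main(String[] args) {
        INDArray[] ins = {
                Nd4j.create(new float[] {1, 2}),
                Nd4j.create(new float[] {3, 4}),
                Nd4j.create(new float[] {5, 6})
        };
        INDArray target = Nd4j.create(1, 2); // preallocated result buffer
        Nd4j.factory().average(target, ins);
        System.out.println(target); // [3.00, 4.00]; each input now holds the mean too
    }
}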
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JcublasLapack, method dsyev.
public int dsyev(char _jobz, char _uplo, int N, INDArray A, INDArray R) {
int status = -1;
int jobz = _jobz == 'V' ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
int uplo = _uplo == 'L' ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("DOUBLE dsyev called in FLOAT environment");
INDArray a = A;
if (A.ordering() == 'c')
a = A.dup('f');
int M = A.rows();
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// set up the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
status = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (status == 0) {
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xRPointer = new CublasPointer(R, ctx);
// this output indicates how much workspace memory the real operation will need
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
status = cusolverDnDsyevd_bufferSize(solverDn, jobz, uplo, M, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xRPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
if (status == CUSOLVER_STATUS_SUCCESS) {
int worksize = worksizeBuffer.getInt(0);
// allocate memory for the workspace and for the device-side return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, 1 }));
// Do the actual decomp
status = cusolverDnDsyevd(solverDn, jobz, uplo, M, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xRPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
allocator.registerAction(ctx, INFO);
if (status == 0)
status = INFO.getInt(0);
}
}
}
if (status == 0) {
allocator.registerAction(ctx, R);
allocator.registerAction(ctx, a);
if (a != A)
A.assign(a);
}
return status;
}
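A hedged usage sketch for the eigendecomposition. It assumes a DOUBLE default data type (otherwise the worker above logs a warning) and that the Lapack facade from Nd4j.getBlasWrapper().lapack() dispatches its syev method to this dsyev worker; exact facade names vary by version, so treat the call as illustrative.
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class SyevExample {
    public static void main(String[] args) {
        // symmetric 2x2 matrix with eigenvalues 1 and 3
        INDArray A = Nd4j.create(new double[][] {{2, 1}, {1, 2}});
        INDArray w = Nd4j.create(2); // receives the eigenvalues
        // 'V' also computes eigenvectors (returned in A); 'L' reads the lower triangle
        int status = Nd4j.getBlasWrapper().lapack().syev('V', 'L', A, w);
        System.out.println(status); // 0 on success
        System.out.println(w);      // eigenvalues in ascending order: [1.00, 3.00]
    }
}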