use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
the class GridExecutionerTest method testOpPointerizeReduce2.
/**
 * Reduce along all dimensions
 *
 * @throws Exception
 */
@Test
public void testOpPointerizeReduce2() throws Exception {
    CudaGridExecutioner executioner = new CudaGridExecutioner();

    INDArray array = Nd4j.create(10, 10);

    Sum opA = new Sum(array);

    // we need exec here, to init Op.Z before pointerization
    executioner.exec(opA);

    GridPointers pointers = executioner.pointerizeOp(opA, null);

    assertEquals(opA.opNum(), pointers.getOpNum());
    assertEquals(Op.Type.REDUCE, pointers.getType());

    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

    Pointer x = AtomicAllocator.getInstance().getPointer(array, context);
    Pointer xShapeInfo = AtomicAllocator.getInstance().getPointer(array.shapeInfoDataBuffer(), context);
    Pointer z = AtomicAllocator.getInstance().getPointer(opA.z(), context);
    Pointer zShapeInfo = AtomicAllocator.getInstance().getPointer(opA.z().shapeInfoDataBuffer(), context);

    // constant dimension buffer (not asserted in the all-dimensions case)
    DataBuffer dimBuff = Nd4j.getConstantHandler().getConstantBuffer(new int[] {1});
    Pointer ptrBuff = AtomicAllocator.getInstance().getPointer(dimBuff, context);

    assertEquals(x, pointers.getX());
    assertNull(pointers.getY());
    assertNotNull(pointers.getZ());
    assertEquals(z, pointers.getZ());
    assertEquals(1, opA.z().length());
    assertEquals(1, pointers.getZLength());

    /*
    // We don't really care about EWS here, since we're testing a TAD-based operation
    assertEquals(1, pointers.getXStride());
    assertEquals(-1, pointers.getYStride());
    assertEquals(1, pointers.getZStride());
    */

    assertEquals(xShapeInfo, pointers.getXShapeInfo());
    assertNull(pointers.getYShapeInfo());
    assertEquals(zShapeInfo, pointers.getZShapeInfo());

    // a reduce along all dimensions carries no dimension or TAD information
    assertNull(pointers.getDimensions());
    assertEquals(0, pointers.getDimensionsLength());
    assertNull(pointers.getTadShape());
    assertNull(pointers.getTadOffsets());
    assertNull(pointers.getExtraArgs());
}
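For contrast, a hedged sketch of the dimension-based variant: the dimension overloads of exec and pointerizeOp, and the exact expected values, are assumptions here rather than taken from the source. Reducing the same 10x10 array along dimension 1 should populate exactly the fields that stay null in the full reduce above.

@Test
public void testOpPointerizeReduceAlongDimension() throws Exception {
    CudaGridExecutioner executioner = new CudaGridExecutioner();

    INDArray array = Nd4j.create(10, 10);
    Sum opA = new Sum(array);

    // assumed dimension overloads; init Op.Z for the specific dimension
    executioner.exec(opA, new int[] {1});
    GridPointers pointers = executioner.pointerizeOp(opA, new int[] {1});

    // one result per TAD: 10 rows reduced along dimension 1 (illustrative values)
    assertEquals(10, pointers.getZLength());

    // unlike the full reduce, dimension and TAD pointers should now be set
    assertNotNull(pointers.getDimensions());
    assertEquals(1, pointers.getDimensionsLength());
    assertNotNull(pointers.getTadShape());
    assertNotNull(pointers.getTadOffsets());
}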
use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
the class CudaCachingZeroProvider method malloc.
/**
 * This method provides a PointersPair to a memory chunk specified by AllocationShape.
 *
 * PLEASE NOTE: This method can actually ignore the malloc request, and give out a previously cached free memory chunk with an equal shape.
 *
 * @param shape shape of the desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return PointersPair for the new (or cached) memory chunk
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.HOST && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCacheableLength()) {
        CacheHolder cache = zeroCache.get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheZeroHit.incrementAndGet();

                // since this memory chunk is going to be used now, remove its size from the cached total
                zeroCachedAmount.addAndGet(-1 * reqMemory);

                PointersPair pair = new PointersPair();
                pair.setDevicePointer(new CudaPointer(pointer.address()));
                pair.setHostPointer(new CudaPointer(pointer.address()));

                point.setAllocationStatus(AllocationStatus.HOST);
                return pair;
            }
        }
        cacheZeroMiss.incrementAndGet();

        if (CudaEnvironment.getInstance().getConfiguration().isUsePreallocation()
                        && zeroCachedAmount.get() < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache() / 10
                        && reqMemory < 16 * 1024 * 1024L) {
            CachePreallocator preallocator = new CachePreallocator(shape, location,
                            CudaEnvironment.getInstance().getConfiguration().getPreallocationCalls());
            preallocator.start();
        }

        return super.malloc(shape, point, location);
    }
    return super.malloc(shape, point, location);
}
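The hit/miss bookkeeping above follows a standard shape-keyed free-list pattern. A minimal, self-contained sketch of the same idea, using hypothetical names (PointerCache, acquire, release) that are not part of nd4j:

import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicLong;

// simplified analogue of the shape-keyed cache used in CudaCachingZeroProvider
class PointerCache<K, P> {
    private final ConcurrentHashMap<K, Queue<P>> cache = new ConcurrentHashMap<>();
    private final AtomicLong hits = new AtomicLong();
    private final AtomicLong misses = new AtomicLong();
    private final AtomicLong cachedBytes = new AtomicLong();

    /** Returns a cached chunk for this key, or null on a miss. */
    P acquire(K key, long sizeBytes) {
        Queue<P> queue = cache.get(key);
        P pointer = queue == null ? null : queue.poll();
        if (pointer != null) {
            hits.incrementAndGet();
            // the chunk is in use again, so it no longer counts as cached
            cachedBytes.addAndGet(-sizeBytes);
            return pointer;
        }
        misses.incrementAndGet();
        return null; // caller falls back to a real malloc
    }

    /** Instead of freeing, park the chunk for reuse by an equal-shaped request. */
    void release(K key, P pointer, long sizeBytes) {
        cache.computeIfAbsent(key, k -> new ConcurrentLinkedQueue<>()).offer(pointer);
        cachedBytes.addAndGet(sizeBytes);
    }
}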
use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
the class JcublasLapack method dsyev.
public int dsyev(char _jobz, char _uplo, int N, INDArray A, INDArray R) {
    int status = -1;

    int jobz = _jobz == 'V' ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
    int uplo = _uplo == 'L' ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;

    if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
        log.warn("DOUBLE dsyev called in FLOAT environment");

    INDArray a = A;
    if (A.ordering() == 'c')
        a = A.dup('f');

    int M = A.rows();

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();

    // set up the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);

    // synchronized on the solver
    synchronized (handle) {
        status = cusolverDnSetStream(solverDn, new CUstream_st(ctx.getOldStream()));
        if (status == 0) {
            // transfer the INDArray into GPU memory
            CublasPointer xAPointer = new CublasPointer(a, ctx);
            CublasPointer xRPointer = new CublasPointer(R, ctx);

            // this output indicates how much memory we'll need for the real operation
            DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
            status = cusolverDnDsyevd_bufferSize(solverDn, jobz, uplo, M,
                            (DoublePointer) xAPointer.getDevicePointer(), M,
                            (DoublePointer) xRPointer.getDevicePointer(),
                            (IntPointer) worksizeBuffer.addressPointer());

            if (status == CUSOLVER_STATUS_SUCCESS) {
                int worksize = worksizeBuffer.getInt(0);

                // allocate memory for the workspace, the non-converging row buffer and a return code
                Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
                INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1),
                                Nd4j.getShapeInfoProvider().createShapeInformation(new int[] {1, 1}));

                // Do the actual decomp
                status = cusolverDnDsyevd(solverDn, jobz, uplo, M,
                                (DoublePointer) xAPointer.getDevicePointer(), M,
                                (DoublePointer) xRPointer.getDevicePointer(),
                                new CudaPointer(workspace).asDoublePointer(), worksize,
                                new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());

                allocator.registerAction(ctx, INFO);
                if (status == 0)
                    status = INFO.getInt(0);
            }
        }
    }
    if (status == 0) {
        allocator.registerAction(ctx, R);
        allocator.registerAction(ctx, a);

        if (a != A)
            A.assign(a);
    }
    return status;
}
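A hedged caller sketch for the signature above, assuming a JcublasLapack instance is available as lapack; the symmetrization step is just one way to build a valid input, and the eigenvalue ordering follows cuSOLVER's syevd convention:

// build a symmetric matrix: A = 0.5 * (B + B')
INDArray B = Nd4j.rand(5, 5);
INDArray A = B.add(B.transpose()).mul(0.5);

// R receives the 5 eigenvalues; with jobz = 'V', A is overwritten with the eigenvectors
INDArray R = Nd4j.create(1, 5);
int status = lapack.dsyev('V', 'L', 5, A, R);
if (status != 0)
    throw new IllegalStateException("dsyev failed with status " + status);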
use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
the class JcublasLapack method spotrf.
// =========================
// CHOLESKY DECOMP
@Override
public void spotrf(byte uplo, int N, INDArray A, INDArray INFO) {
    INDArray a = A;

    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT potrf called in DOUBLE environment");

    if (A.ordering() == 'c')
        a = A.dup('f');

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();

    // set up the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);

    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(solverDn, new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new BlasException("solverSetStream failed");

        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);

        // this output indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSpotrf_bufferSize(solverDn, uplo, N,
                        (FloatPointer) xAPointer.getDevicePointer(), N,
                        // we intentionally use a host pointer here
                        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSpotrf_bufferSize failed", stat);
        }

        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace, the permutation matrix and a return code
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());

        // Do the actual decomp
        stat = cusolverDnSpotrf(solverDn, uplo, N,
                        (FloatPointer) xAPointer.getDevicePointer(), N,
                        new CudaPointer(workspace).asFloatPointer(), worksize,
                        new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSpotrf failed", stat);
        }
    }
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);

    if (a != A)
        A.assign(a);

    // blank out the opposite triangle so A holds only the triangular factor
    if (uplo == 'U') {
        A.assign(A.transpose());
        INDArrayIndex[] ix = new INDArrayIndex[2];
        for (int i = 1; i < Math.min(A.rows(), A.columns()); i++) {
            ix[0] = NDArrayIndex.point(i);
            ix[1] = NDArrayIndex.interval(0, i);
            A.put(ix, 0);
        }
    } else {
        INDArrayIndex[] ix = new INDArrayIndex[2];
        for (int i = 0; i < Math.min(A.rows(), A.columns() - 1); i++) {
            ix[0] = NDArrayIndex.point(i);
            ix[1] = NDArrayIndex.interval(i + 1, A.columns());
            A.put(ix, 0);
        }
    }

    log.info("A: {}", A);
}
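A hedged caller sketch for spotrf, again assuming a JcublasLapack instance named lapack; the Gram-matrix construction simply guarantees a symmetric positive-definite input, and the INFO buffer mirrors the int-buffer pattern from dsyev above:

int N = 4;
// B * B' + N*I is symmetric positive-definite, so the factorization cannot break down
INDArray B = Nd4j.rand(N, N);
INDArray A = B.mmul(B.transpose()).add(Nd4j.eye(N).mul(N));

INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1),
                Nd4j.getShapeInfoProvider().createShapeInformation(new int[] {1, 1}));

// 'L' requests the lower-triangular factor; A is overwritten with L
lapack.spotrf((byte) 'L', N, A, INFO);
if (INFO.getInt(0) != 0)
    throw new IllegalStateException("spotrf failed: INFO = " + INFO.getInt(0));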
use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.
the class JcublasLapack method sgesvd.
@Override
public void sgesvd(byte jobu, byte jobvt, int M, int N, INDArray A, INDArray S, INDArray U, INDArray VT, INDArray INFO) {
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT gesvd called in DOUBLE environment");

    INDArray a = A;
    INDArray u = U;
    INDArray vt = VT;

    // we should transpose & adjust outputs if M < N
    // cuda has a limitation here, but that's OK since we know that
    // A = U S V'
    // and the transpose-multiply rules give us
    // A' = V S' U'
    boolean hadToTransposeA = false;
    if (M < N) {
        hadToTransposeA = true;

        int tmp1 = N;
        N = M;
        M = tmp1;

        a = A.transpose().dup('f');
        u = VT.dup('f');
        vt = U.dup('f');
    } else {
        // cuda requires column ordering - we'll register a warning in case
        if (A.ordering() == 'c')
            a = A.dup('f');
        if (U != null && U.ordering() == 'c')
            u = U.dup('f');
        if (VT != null && VT.ordering() == 'c')
            vt = VT.dup('f');
    }

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();

    // set up the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);

    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(solverDn, new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new BlasException("solverSetStream failed");

        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);

        // this output indicates how much memory we'll need for the real operation;
        // we intentionally use a host pointer for the worksize output
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSgesvd_bufferSize(solverDn, M, N, (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgesvd_bufferSize failed", stat);
        }

        int worksize = worksizeBuffer.getInt(0);
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        DataBuffer rwork = Nd4j.getDataBufferFactory().createFloat((M < N ? M : N) - 1);

        // Do the actual decomp
        stat = cusolverDnSgesvd(solverDn, jobu, jobvt, M, N,
                        (FloatPointer) xAPointer.getDevicePointer(), M,
                        new CudaPointer(allocator.getPointer(S, ctx)).asFloatPointer(),
                        U == null ? null : new CudaPointer(allocator.getPointer(u, ctx)).asFloatPointer(), M,
                        VT == null ? null : new CudaPointer(allocator.getPointer(vt, ctx)).asFloatPointer(), N,
                        new CudaPointer(workspace).asFloatPointer(), worksize,
                        new CudaPointer(allocator.getPointer(rwork, ctx)).asFloatPointer(),
                        new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgesvd failed", stat);
        }
    }
    allocator.registerAction(ctx, INFO);
    allocator.registerAction(ctx, S);
    if (U != null)
        allocator.registerAction(ctx, u);
    if (VT != null)
        allocator.registerAction(ctx, vt);

    // if we transposed A then swap & transpose U & V'
    if (hadToTransposeA) {
        U.assign(vt.transpose());
        VT.assign(u.transpose());
    } else {
        if (u != U)
            U.assign(u);
        if (vt != VT)
            VT.assign(vt);
    }
}
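A hedged usage sketch for the signature above, with a hypothetical lapack reference to a JcublasLapack instance; jobu/jobvt follow the LAPACK convention ('A' for all columns/rows), and the shapes assume M >= N so the transpose path is not taken:

int M = 6, N = 4;
INDArray A = Nd4j.rand(M, N).dup('f');

INDArray S = Nd4j.create(1, N);                      // singular values
INDArray U = Nd4j.create(new int[] {M, M}, 'f');     // left singular vectors
INDArray VT = Nd4j.create(new int[] {N, N}, 'f');    // right singular vectors, transposed
INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1),
                Nd4j.getShapeInfoProvider().createShapeInformation(new int[] {1, 1}));

lapack.sgesvd((byte) 'A', (byte) 'A', M, N, A, S, U, VT, INFO);
if (INFO.getInt(0) != 0)
    throw new IllegalStateException("sgesvd failed: INFO = " + INFO.getInt(0));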