use of org.bytedeco.javacpp.IntPointer in project nd4j by deeplearning4j.
the class JcublasLapack method dgeqrf.
public void dgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
INDArray a = A;
INDArray r = R;
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("DOUBLE getrf called in FLOAT environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (R != null && R.ordering() == 'c')
r = R.dup('f');
INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createDouble(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }));
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xTauPointer = new CublasPointer(tau, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnDgeqrf_bufferSize(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
throw new BlasException("cusolverDnDgeqrf_bufferSize failed", stat);
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the permutation matrix and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual QR decomp
stat = cusolverDnDgeqrf(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
throw new BlasException("cusolverDnDgeqrf failed", stat);
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, tau);
allocator.registerAction(ctx, INFO);
if (INFO.getInt(0) != 0) {
throw new BlasException("cusolverDnDgeqrf failed with info", INFO.getInt(0));
// Copy R ( upper part of Q ) into result
if (r != null) {
r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
INDArrayIndex[] ix = new INDArrayIndex[2];
for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
ix[0] = NDArrayIndex.point(i);
ix[1] = NDArrayIndex.interval(0, i);
r.put(ix, 0);
stat = cusolverDnDorgqr_bufferSize(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
worksize = worksizeBuffer.getInt(0);
workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
stat = cusolverDnDorgqr(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
throw new BlasException("cusolverDnDorgqr failed", stat);
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
if (a != A)
if (r != null && r != R)
R.assign(r);"A: {}", A);
if (R != null)"R: {}", R);
use of org.bytedeco.javacpp.IntPointer in project nd4j by deeplearning4j.
the class JcublasLapack method dgesvd.
public void dgesvd(byte jobu, byte jobvt, int M, int N, INDArray A, INDArray S, INDArray U, INDArray VT, INDArray INFO) {
INDArray a = A;
INDArray u = U;
INDArray vt = VT;
// we should transpose & adjust outputs if M<N
// cuda has a limitation, but it's OK we know
// A = U S V'
// transpose multiply rules give us ...
// A' = V S' U'
boolean hadToTransposeA = false;
if (M < N) {
hadToTransposeA = true;
int tmp1 = N;
N = M;
M = tmp1;
a = A.transpose().dup('f');
u = VT.dup('f');
vt = U.dup('f');
} else {
// cuda requires column ordering - we'll register a warning in case
if (A.ordering() == 'c')
a = A.dup('f');
if (U != null && U.ordering() == 'c')
u = U.dup('f');
if (VT != null && VT.ordering() == 'c')
vt = VT.dup('f');
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("DOUBLE gesvd called in FLOAT environment");
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnSgesvd_bufferSize(// we intentionally use host pointer here
solverDn, // we intentionally use host pointer here
M, // we intentionally use host pointer here
N, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
throw new BlasException("cusolverDnSgesvd_bufferSize failed", stat);
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the non-converging row buffer and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
DataBuffer rwork = Nd4j.getDataBufferFactory().createDouble((M < N ? M : N) - 1);
// Do the actual decomp
stat = cusolverDnDgesvd(solverDn, jobu, jobvt, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, new CudaPointer(allocator.getPointer(S, ctx)).asDoublePointer(), U == null ? null : new CudaPointer(allocator.getPointer(u, ctx)).asDoublePointer(), M, VT == null ? null : new CudaPointer(allocator.getPointer(vt, ctx)).asDoublePointer(), N, new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(rwork, ctx)).asDoublePointer(), new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
throw new BlasException("cusolverDnDgesvd failed" + stat);
allocator.registerAction(ctx, INFO);
allocator.registerAction(ctx, S);
allocator.registerAction(ctx, a);
if (U != null)
allocator.registerAction(ctx, u);
if (VT != null)
allocator.registerAction(ctx, vt);
// if we transposed A then swap & transpose U & V'
if (hadToTransposeA) {
} else {
if (u != U)
if (vt != VT)
use of org.bytedeco.javacpp.IntPointer in project nd4j by deeplearning4j.
the class JcublasLapack method dgetrf.
public void dgetrf(int M, int N, INDArray A, INDArray IPIV, INDArray INFO) {
INDArray a = A;
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("FLOAT getrf called in FLOAT environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnDgetrf_bufferSize(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
throw new BlasException("cusolverDnDgetrf_bufferSize failed", stat);
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the permutation matrix and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual LU decomp
stat = cusolverDnDgetrf(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, new CudaPointer(workspace).asDoublePointer(), new CudaPointer(allocator.getPointer(IPIV, ctx)).asIntPointer(), new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
throw new BlasException("cusolverDnSgetrf failed", stat);
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
allocator.registerAction(ctx, IPIV);
if (a != A)
use of org.bytedeco.javacpp.IntPointer in project nd4j by deeplearning4j.
the class JcublasLapack method ssyev.
/**
 * Calls cuSOLVER's single-precision {@code cusolverDnSsyevd} on {@code A}, passing {@code R}
 * as the argument that follows the leading dimension.
 *
 * NOTE(review): this snippet looks damaged by extraction. {@code jobz} and {@code uplo} are
 * used below but never declared in the visible text (only {@code _jobz} / {@code _uplo} are),
 * the braces do not balance, and the final {@code if (a != A)} has no visible body. Restore
 * the missing lines from the upstream source before relying on this method.
 *
 * @param _jobz job selector; presumably mapped to {@code jobz} in lines not visible here
 * @param _uplo triangle selector; presumably mapped to {@code uplo} in lines not visible here
 * @param N     not read anywhere in the visible code; {@code A.rows()} is used instead
 * @param A     input matrix; a 'c'-ordered array is duplicated into 'f' order first
 * @param R     second array handed to the solver (presumably receives the eigenvalues; confirm)
 * @return the last status recorded: the cuSOLVER status code, or the INFO value when the
 *         solver call itself returned 0
 */
public int ssyev(char _jobz, char _uplo, int N, INDArray A, INDArray R) {
// starts as -1 and is overwritten by each cuSOLVER call below
int status = -1;
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
log.warn("FLOAT ssyev called in DOUBLE environment");
INDArray a = A;
if (A.ordering() == 'c')
a = A.dup('f');
// matrix dimension and leading dimension passed to the solver
int M = A.rows();
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
status = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (status == 0) {
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xRPointer = new CublasPointer(R, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
// NOTE(review): jobz / uplo are not declared in the visible snippet
status = cusolverDnSsyevd_bufferSize(solverDn, jobz, uplo, M, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xRPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
// NOTE(review): as visible here, the bufferSize status is not checked before worksize is read
int worksize = worksizeBuffer.getInt(0);
// allocate memory for the workspace, the non-converging row buffer and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, 1 }));
// Do the actual decomp
status = cusolverDnSsyevd(solverDn, jobz, uplo, M, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xRPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
allocator.registerAction(ctx, INFO);
// a successful call still reports problems through INFO, so surface that value
if (status == 0)
status = INFO.getInt(0);
if (status == 0) {
allocator.registerAction(ctx, R);
allocator.registerAction(ctx, a);
// NOTE(review): the body of this if is missing; presumably it copied a back into A
if (a != A)
return status;
use of org.bytedeco.javacpp.IntPointer in project nd4j by deeplearning4j.
the class JcublasLapack method sgetrf.
/**
 * Runs {@code cusolverDnSgetrf} (single precision) on {@code A}, writing pivots to
 * {@code IPIV} and the solver's return code to {@code INFO}.
 *
 * NOTE(review): this snippet looks damaged by extraction. Both
 * {@code throw new BlasException(...)} statements after the cuSOLVER calls are unconditional
 * as shown (a status guard is presumably missing), the braces do not balance, and the
 * definition ends on an {@code if (a != A)} with no body. Restore the missing lines from the
 * upstream source before relying on this method.
 *
 * @param M    row count handed to cuSOLVER (also used as the leading dimension)
 * @param N    column count handed to cuSOLVER
 * @param A    matrix to factorise; a 'c'-ordered array is duplicated into 'f' order first
 * @param IPIV array passed to the solver for pivot indices
 * @param INFO array passed to the solver for its devInfo code
 */
public void sgetrf(int M, int N, INDArray A, INDArray IPIV, INDArray INFO) {
INDArray a = A;
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
log.warn("FLOAT getrf called in DOUBLE environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnSgetrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
// NOTE(review): unconditional as shown; presumably guarded by a stat check in the real source
throw new BlasException("cusolverDnSgetrf_bufferSize failed", stat);
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the permutation matrix and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual LU decomp
stat = cusolverDnSgetrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, new CudaPointer(workspace).asFloatPointer(), new CudaPointer(allocator.getPointer(IPIV, ctx)).asIntPointer(), new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
// NOTE(review): unconditional as shown; presumably guarded by a stat check in the real source
throw new BlasException("cusolverDnSgetrf failed", stat);
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
allocator.registerAction(ctx, IPIV);
// NOTE(review): the body of this if is missing; presumably it copied a back into A
if (a != A)