use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.
the class JcublasLevel3 method sgemm.
@Override
protected void sgemm(char Order, char TransA, char TransB, int M, int N, int K, float alpha, INDArray A, int lda, INDArray B, int ldb, float beta, INDArray C, int ldc) {
// B = Shape.toOffsetZero(B);
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
logger.warn("FLOAT gemm called");
Nd4j.getExecutioner().push();
CudaContext ctx = allocator.getFlowController().prepareAction(C, A, B);
CublasPointer cAPointer = new CublasPointer(A, ctx);
CublasPointer cBPointer = new CublasPointer(B, ctx);
CublasPointer cCPointer = new CublasPointer(C, ctx);
cublasHandle_t handle = ctx.getHandle();
synchronized (handle) {
cublasSetStream_v2(new cublasContext(handle), new CUstream_st(ctx.getOldStream()));
cublasSgemm_v2(new cublasContext(handle), convertTranspose(TransA), convertTranspose(TransB), M, N, K, new FloatPointer(alpha), (FloatPointer) cAPointer.getDevicePointer(), lda, (FloatPointer) cBPointer.getDevicePointer(), ldb, new FloatPointer(beta), (FloatPointer) cCPointer.getDevicePointer(), ldc);
}
allocator.registerAction(ctx, C, A, B);
OpExecutionerUtil.checkForAny(C);
}
use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.
the class JcublasLevel3 method ssyrk.
@Override
protected void ssyrk(char Order, char Uplo, char Trans, int N, int K, float alpha, INDArray A, int lda, float beta, INDArray C, int ldc) {
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
logger.warn("FLOAT syrk called");
Nd4j.getExecutioner().push();
CudaContext ctx = allocator.getFlowController().prepareAction(C, A);
CublasPointer aPointer = new CublasPointer(A, ctx);
CublasPointer cPointer = new CublasPointer(C, ctx);
cublasHandle_t handle = ctx.getHandle();
synchronized (handle) {
cublasSetStream_v2(new cublasContext(handle), new CUstream_st(ctx.getOldStream()));
cublasSsyrk_v2(new cublasContext(handle), convertUplo(Uplo), convertTranspose(Trans), N, K, new FloatPointer(alpha), (FloatPointer) aPointer.getDevicePointer(), lda, new FloatPointer(beta), (FloatPointer) cPointer.getDevicePointer(), ldc);
}
allocator.registerAction(ctx, C, A);
OpExecutionerUtil.checkForAny(C);
}
use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.
the class JcublasLapack method sgeqrf.
// =========================
// Q R DECOMP
@Override
public void sgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
INDArray a = A;
INDArray r = R;
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
log.warn("FLOAT getrf called in DOUBLE environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (R != null && R.ordering() == 'c')
r = R.dup('f');
INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createFloat(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }).getFirst());
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new IllegalStateException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xTauPointer = new CublasPointer(tau, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnSgeqrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgeqrf_bufferSize failed", stat);
}
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the permutation matrix and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual QR decomp
stat = cusolverDnSgeqrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgeqrf failed", stat);
}
allocator.registerAction(ctx, a);
// allocator.registerAction(ctx, tau);
allocator.registerAction(ctx, INFO);
if (INFO.getInt(0) != 0) {
throw new BlasException("cusolverDnSgeqrf failed on INFO", INFO.getInt(0));
}
// Copy R ( upper part of Q ) into result
if (r != null) {
r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
INDArrayIndex[] ix = new INDArrayIndex[2];
for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
ix[0] = NDArrayIndex.point(i);
ix[1] = NDArrayIndex.interval(0, i);
r.put(ix, 0);
}
}
stat = cusolverDnSorgqr_bufferSize(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
worksize = worksizeBuffer.getInt(0);
workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
stat = cusolverDnSorgqr(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSorgqr failed", stat);
}
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
if (a != A)
A.assign(a);
if (r != null && r != R)
R.assign(r);
log.info("A: {}", A);
if (R != null)
log.info("R: {}", R);
}
use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.
the class JcublasLapack method ssyev.
public int ssyev(char _jobz, char _uplo, int N, INDArray A, INDArray R) {
int status = -1;
int jobz = _jobz == 'V' ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
int uplo = _uplo == 'L' ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
log.warn("FLOAT ssyev called in DOUBLE environment");
INDArray a = A;
if (A.ordering() == 'c')
a = A.dup('f');
int M = A.rows();
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
status = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (status == 0) {
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xRPointer = new CublasPointer(R, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
status = cusolverDnSsyevd_bufferSize(solverDn, jobz, uplo, M, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xRPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
if (status == CUSOLVER_STATUS_SUCCESS) {
int worksize = worksizeBuffer.getInt(0);
// allocate memory for the workspace, the non-converging row buffer and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, 1 }));
// Do the actual decomp
status = cusolverDnSsyevd(solverDn, jobz, uplo, M, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xRPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
allocator.registerAction(ctx, INFO);
if (status == 0)
status = INFO.getInt(0);
}
}
}
if (status == 0) {
allocator.registerAction(ctx, R);
allocator.registerAction(ctx, a);
if (a != A)
A.assign(a);
}
return status;
}
use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.
the class JcublasLapack method sgetrf.
@Override
public void sgetrf(int M, int N, INDArray A, INDArray IPIV, INDArray INFO) {
INDArray a = A;
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
log.warn("FLOAT getrf called in DOUBLE environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnSgetrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgetrf_bufferSize failed", stat);
}
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the permutation matrix and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual LU decomp
stat = cusolverDnSgetrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, new CudaPointer(workspace).asFloatPointer(), new CudaPointer(allocator.getPointer(IPIV, ctx)).asIntPointer(), new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgetrf failed", stat);
}
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
allocator.registerAction(ctx, IPIV);
if (a != A)
A.assign(a);
}
Aggregations