use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.
the class JcublasLevel1 method sscal.
@Override
protected void sscal(int N, float alpha, INDArray X, int incX) {
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
logger.warn("FLOAT scal called");
Nd4j.getExecutioner().push();
CudaContext ctx = allocator.getFlowController().prepareAction(X);
CublasPointer xCPointer = new CublasPointer(X, ctx);
cublasHandle_t handle = ctx.getHandle();
synchronized (handle) {
cublasSetStream_v2(new cublasContext(handle), new CUstream_st(ctx.getOldStream()));
cublasSscal_v2(new cublasContext(handle), N, new FloatPointer(alpha), (FloatPointer) xCPointer.getDevicePointer(), incX);
}
allocator.registerAction(ctx, X);
OpExecutionerUtil.checkForAny(X);
}
use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.
the class JcublasLevel1 method ddot.
@Override
protected double ddot(int N, INDArray X, int incX, INDArray Y, int incY) {
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
logger.warn("DOUBLE dot called");
Nd4j.getExecutioner().push();
double ret;
CudaContext ctx = allocator.getFlowController().prepareAction(null, X, Y);
CublasPointer xCPointer = new CublasPointer(X, ctx);
CublasPointer yCPointer = new CublasPointer(Y, ctx);
cublasHandle_t handle = ctx.getHandle();
synchronized (handle) {
cublasSetStream_v2(new cublasContext(handle), new CUstream_st(ctx.getOldStream()));
DoublePointer resultPointer = new DoublePointer(0.0);
cublasDdot_v2(new cublasContext(handle), N, (DoublePointer) xCPointer.getDevicePointer(), incX, (DoublePointer) yCPointer.getDevicePointer(), incY, resultPointer);
ret = resultPointer.get();
}
allocator.registerAction(ctx, null, X, Y);
return ret;
}
use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.
the class JcublasLevel1 method snrm2.
@Override
protected float snrm2(int N, INDArray X, int incX) {
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
logger.warn("FLOAT nrm2 called");
Nd4j.getExecutioner().push();
CudaContext ctx = allocator.getFlowController().prepareAction(null, X);
float ret;
CublasPointer cAPointer = new CublasPointer(X, ctx);
cublasHandle_t handle = ctx.getHandle();
synchronized (handle) {
cublasSetStream_v2(new cublasContext(handle), new CUstream_st(ctx.getOldStream()));
FloatPointer resultPointer = new FloatPointer(0.0f);
cublasSnrm2_v2(new cublasContext(handle), N, (FloatPointer) cAPointer.getDevicePointer(), incX, resultPointer);
ret = resultPointer.get();
}
allocator.registerAction(ctx, null, X);
return ret;
}
use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.
the class JcublasLapack method sgeqrf.
// =========================
// Q R DECOMP
@Override
public void sgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
INDArray a = A;
INDArray r = R;
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
log.warn("FLOAT getrf called in DOUBLE environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (R != null && R.ordering() == 'c')
r = R.dup('f');
INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createFloat(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }).getFirst());
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new IllegalStateException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xTauPointer = new CublasPointer(tau, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnSgeqrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgeqrf_bufferSize failed", stat);
}
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the permutation matrix and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual QR decomp
stat = cusolverDnSgeqrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgeqrf failed", stat);
}
allocator.registerAction(ctx, a);
// allocator.registerAction(ctx, tau);
allocator.registerAction(ctx, INFO);
if (INFO.getInt(0) != 0) {
throw new BlasException("cusolverDnSgeqrf failed on INFO", INFO.getInt(0));
}
// Copy R ( upper part of Q ) into result
if (r != null) {
r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
INDArrayIndex[] ix = new INDArrayIndex[2];
for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
ix[0] = NDArrayIndex.point(i);
ix[1] = NDArrayIndex.interval(0, i);
r.put(ix, 0);
}
}
stat = cusolverDnSorgqr_bufferSize(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
worksize = worksizeBuffer.getInt(0);
workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
stat = cusolverDnSorgqr(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSorgqr failed", stat);
}
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
if (a != A)
A.assign(a);
if (r != null && r != R)
R.assign(r);
log.info("A: {}", A);
if (R != null)
log.info("R: {}", R);
}
use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.
the class JcublasLapack method dgeqrf.
@Override
public void dgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
INDArray a = A;
INDArray r = R;
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("DOUBLE getrf called in FLOAT environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (R != null && R.ordering() == 'c')
r = R.dup('f');
INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createDouble(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }));
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xTauPointer = new CublasPointer(tau, ctx);
// this output - indicates how much memory we'll need for the real operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
int stat = cusolverDnDgeqrf_bufferSize(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
(IntPointer) worksizeBuffer.addressPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDgeqrf_bufferSize failed", stat);
}
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the permutation matrix and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual QR decomp
stat = cusolverDnDgeqrf(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDgeqrf failed", stat);
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, tau);
allocator.registerAction(ctx, INFO);
if (INFO.getInt(0) != 0) {
throw new BlasException("cusolverDnDgeqrf failed with info", INFO.getInt(0));
}
// Copy R ( upper part of Q ) into result
if (r != null) {
r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
INDArrayIndex[] ix = new INDArrayIndex[2];
for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
ix[0] = NDArrayIndex.point(i);
ix[1] = NDArrayIndex.interval(0, i);
r.put(ix, 0);
}
}
stat = cusolverDnDorgqr_bufferSize(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
worksize = worksizeBuffer.getInt(0);
workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
stat = cusolverDnDorgqr(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDorgqr failed", stat);
}
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
if (a != A)
A.assign(a);
if (r != null && r != R)
R.assign(r);
log.info("A: {}", A);
if (R != null)
log.info("R: {}", R);
}
Aggregations