Examples with CUstream_st - org.bytedeco.javacpp.cuda.CUstream_st

Example 16 with CUstream_st

use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.

The method sscal of the class JcublasLevel1.

/**
 * Single-precision SCAL: hands X to cublasSscal_v2 together with the scalar alpha.
 *
 * @param N     element count passed to cuBLAS
 * @param alpha scaling factor, wrapped in a FloatPointer for the call
 * @param X     array whose device pointer is passed to cuBLAS
 * @param incX  stride argument passed to cuBLAS for X
 */
@Override
protected void sscal(int N, float alpha, INDArray X, int incX) {
    // Only a warning is logged when the global data type is not FLOAT; the call still proceeds.
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT) {
        logger.warn("FLOAT scal called");
    }

    Nd4j.getExecutioner().push();

    final CudaContext context = allocator.getFlowController().prepareAction(X);
    final CublasPointer xPointer = new CublasPointer(X, context);
    final cublasHandle_t blasHandle = context.getHandle();

    // The handle is used as the monitor so the stream binding and the call stay together.
    synchronized (blasHandle) {
        final cublasContext streamTarget = new cublasContext(blasHandle);
        final CUstream_st stream = new CUstream_st(context.getOldStream());
        cublasSetStream_v2(streamTarget, stream);

        final cublasContext callTarget = new cublasContext(blasHandle);
        final FloatPointer alphaPointer = new FloatPointer(alpha);
        final FloatPointer xDevice = (FloatPointer) xPointer.getDevicePointer();
        cublasSscal_v2(callTarget, N, alphaPointer, xDevice, incX);
    }

    allocator.registerAction(context, X);
    OpExecutionerUtil.checkForAny(X);
}

Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t(org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t) FloatPointer(org.bytedeco.javacpp.FloatPointer) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer)

Example 17 with CUstream_st

use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.

The method ddot of the class JcublasLevel1.

/**
 * Double-precision DOT: passes X and Y to cublasDdot_v2 and returns the value
 * cuBLAS writes into the result pointer.
 *
 * @param N    element count passed to cuBLAS
 * @param X    first operand
 * @param incX stride argument passed to cuBLAS for X
 * @param Y    second operand
 * @param incY stride argument passed to cuBLAS for Y
 * @return the value read back from the result pointer after the call
 */
@Override
protected double ddot(int N, INDArray X, int incX, INDArray Y, int incY) {
    // Only a warning is logged when the global data type is not DOUBLE; the call still proceeds.
    if (Nd4j.dataType() != DataBuffer.Type.DOUBLE) {
        logger.warn("DOUBLE dot called");
    }

    Nd4j.getExecutioner().push();

    final CudaContext context = allocator.getFlowController().prepareAction(null, X, Y);
    final CublasPointer xPointer = new CublasPointer(X, context);
    final CublasPointer yPointer = new CublasPointer(Y, context);
    final cublasHandle_t blasHandle = context.getHandle();

    final double dot;
    // The handle is used as the monitor so the stream binding and the call stay together.
    synchronized (blasHandle) {
        final cublasContext streamTarget = new cublasContext(blasHandle);
        final CUstream_st stream = new CUstream_st(context.getOldStream());
        cublasSetStream_v2(streamTarget, stream);

        final DoublePointer output = new DoublePointer(0.0);
        final cublasContext callTarget = new cublasContext(blasHandle);
        final DoublePointer xDevice = (DoublePointer) xPointer.getDevicePointer();
        final DoublePointer yDevice = (DoublePointer) yPointer.getDevicePointer();
        cublasDdot_v2(callTarget, N, xDevice, incX, yDevice, incY, output);
        dot = output.get();
    }

    allocator.registerAction(context, null, X, Y);
    return dot;
}

Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t(org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer)

Example 18 with CUstream_st

use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.

The method snrm2 of the class JcublasLevel1.

/**
 * Single-precision NRM2: passes X to cublasSnrm2_v2 and returns the value
 * cuBLAS writes into the result pointer.
 *
 * @param N    element count passed to cuBLAS
 * @param X    operand array
 * @param incX stride argument passed to cuBLAS for X
 * @return the value read back from the result pointer after the call
 */
@Override
protected float snrm2(int N, INDArray X, int incX) {
    // Only a warning is logged when the global data type is not FLOAT; the call still proceeds.
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT) {
        logger.warn("FLOAT nrm2 called");
    }

    Nd4j.getExecutioner().push();

    final CudaContext context = allocator.getFlowController().prepareAction(null, X);
    final CublasPointer xPointer = new CublasPointer(X, context);
    final cublasHandle_t blasHandle = context.getHandle();

    final float norm;
    // The handle is used as the monitor so the stream binding and the call stay together.
    synchronized (blasHandle) {
        final cublasContext streamTarget = new cublasContext(blasHandle);
        final CUstream_st stream = new CUstream_st(context.getOldStream());
        cublasSetStream_v2(streamTarget, stream);

        final FloatPointer output = new FloatPointer(0.0f);
        final cublasContext callTarget = new cublasContext(blasHandle);
        final FloatPointer xDevice = (FloatPointer) xPointer.getDevicePointer();
        cublasSnrm2_v2(callTarget, N, xDevice, incX, output);
        norm = output.get();
    }

    allocator.registerAction(context, null, X);
    return norm;
}

Example 19 with CUstream_st

use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.

The method sgeqrf of the class JcublasLapack.

// =========================
// Q R DECOMP
/**
 * Single-precision QR decomposition via cuSOLVER (geqrf followed by orgqr).
 *
 * A (and R, when supplied) are duplicated into 'f' ordering if they arrive in 'c'
 * ordering; the results are assigned back into the caller's arrays at the end.
 * After geqrf, the first {@code a.columns()} rows of the factored matrix are copied
 * into R and the entries below the diagonal are zeroed. orgqr then overwrites the
 * working copy of A in place.
 *
 * @param M    row count, also used as the leading dimension passed to cuSOLVER
 * @param N    column count; also the length of the temporary tau array
 * @param A    input matrix, overwritten with the orgqr output
 * @param R    optional output for the upper-triangular factor; may be null
 * @param INFO array whose first element receives the cuSOLVER info code
 * @throws IllegalStateException if the solver stream cannot be set
 * @throws BlasException         if a cuSOLVER call returns a non-success status
 *                               or geqrf reports a non-zero info code
 */
@Override
public void sgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
    INDArray a = A;
    INDArray r = R;
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT geqrf called in DOUBLE environment");
    if (A.ordering() == 'c')
        a = A.dup('f');
    if (R != null && R.ordering() == 'c')
        r = R.dup('f');
    INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createFloat(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }).getFirst());
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // The workspace is always handed to cuSOLVER as float*, so it is sized with the
    // float element width rather than the global data type (which may be narrower).
    final int elementSize = Float.SIZE / Byte.SIZE;
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new IllegalStateException("solverSetStream failed");
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        CublasPointer xTauPointer = new CublasPointer(tau, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSgeqrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace (worksize is a count of float elements)
        Pointer workspace = new Workspace(worksize * elementSize);
        // Do the actual QR decomp
        stat = cusolverDnSgeqrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf failed", stat);
        }
        allocator.registerAction(ctx, a);
        // allocator.registerAction(ctx, tau);
        allocator.registerAction(ctx, INFO);
        if (INFO.getInt(0) != 0) {
            throw new BlasException("cusolverDnSgeqrf failed on INFO", INFO.getInt(0));
        }
        // Copy R ( upper part of Q ) into result
        if (r != null) {
            r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
            // Zero everything below the diagonal, one row at a time
            INDArrayIndex[] ix = new INDArrayIndex[2];
            for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
                ix[0] = NDArrayIndex.point(i);
                ix[1] = NDArrayIndex.interval(0, i);
                r.put(ix, 0);
            }
        }
        stat = cusolverDnSorgqr_bufferSize(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
        // The size query can fail too; do not allocate from an undefined worksize
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSorgqr_bufferSize failed", stat);
        }
        worksize = worksizeBuffer.getInt(0);
        workspace = new Workspace(worksize * elementSize);
        stat = cusolverDnSorgqr(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSorgqr failed", stat);
        }
    }
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);
    // Propagate results back when we worked on 'f'-ordered duplicates
    if (a != A)
        A.assign(a);
    if (r != null && r != R)
        R.assign(r);
    log.info("A: {}", A);
    if (R != null)
        log.info("R: {}", R);
}

Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) IntPointer(org.bytedeco.javacpp.IntPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Example 20 with CUstream_st

use of org.bytedeco.javacpp.cuda.CUstream_st in project nd4j by deeplearning4j.

The method dgeqrf of the class JcublasLapack.

/**
 * Double-precision QR decomposition via cuSOLVER (geqrf followed by orgqr).
 *
 * A (and R, when supplied) are duplicated into 'f' ordering if they arrive in 'c'
 * ordering; the results are assigned back into the caller's arrays at the end.
 * After geqrf, the first {@code a.columns()} rows of the factored matrix are copied
 * into R and the entries below the diagonal are zeroed. orgqr then overwrites the
 * working copy of A in place.
 *
 * @param M    row count, also used as the leading dimension passed to cuSOLVER
 * @param N    column count; also the length of the temporary tau array
 * @param A    input matrix, overwritten with the orgqr output
 * @param R    optional output for the upper-triangular factor; may be null
 * @param INFO array whose first element receives the cuSOLVER info code
 * @throws BlasException if the solver stream cannot be set, a cuSOLVER call returns
 *                       a non-success status, or geqrf reports a non-zero info code
 */
@Override
public void dgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
    INDArray a = A;
    INDArray r = R;
    if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
        log.warn("DOUBLE geqrf called in FLOAT environment");
    if (A.ordering() == 'c')
        a = A.dup('f');
    if (R != null && R.ordering() == 'c')
        r = R.dup('f');
    INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createDouble(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }));
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // The workspace is always handed to cuSOLVER as double*, so it is sized with the
    // double element width; using the global data type would under-allocate the
    // buffer whenever this method runs in a FLOAT environment.
    final int elementSize = Double.SIZE / Byte.SIZE;
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new BlasException("solverSetStream failed");
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        CublasPointer xTauPointer = new CublasPointer(tau, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnDgeqrf_bufferSize(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnDgeqrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace (worksize is a count of double elements)
        Pointer workspace = new Workspace(worksize * elementSize);
        // Do the actual QR decomp
        stat = cusolverDnDgeqrf(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnDgeqrf failed", stat);
        }
        allocator.registerAction(ctx, a);
        allocator.registerAction(ctx, tau);
        allocator.registerAction(ctx, INFO);
        if (INFO.getInt(0) != 0) {
            throw new BlasException("cusolverDnDgeqrf failed with info", INFO.getInt(0));
        }
        // Copy R ( upper part of Q ) into result
        if (r != null) {
            r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
            // Zero everything below the diagonal, one row at a time
            INDArrayIndex[] ix = new INDArrayIndex[2];
            for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
                ix[0] = NDArrayIndex.point(i);
                ix[1] = NDArrayIndex.interval(0, i);
                r.put(ix, 0);
            }
        }
        stat = cusolverDnDorgqr_bufferSize(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
        // The size query can fail too; do not allocate from an undefined worksize
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnDorgqr_bufferSize failed", stat);
        }
        worksize = worksizeBuffer.getInt(0);
        workspace = new Workspace(worksize * elementSize);
        stat = cusolverDnDorgqr(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnDorgqr failed", stat);
        }
    }
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);
    // Propagate results back when we worked on 'f'-ordered duplicates
    if (a != A)
        A.assign(a);
    if (r != null && r != R)
        R.assign(r);
    log.info("A: {}", A);
    if (R != null)
        log.info("R: {}", R);
}

Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) IntPointer(org.bytedeco.javacpp.IntPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Aggregations

CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st)24 CublasPointer (org.nd4j.linalg.jcublas.CublasPointer)24 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)24 DoublePointer (org.bytedeco.javacpp.DoublePointer)14 FloatPointer (org.bytedeco.javacpp.FloatPointer)14 org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t (org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t)14 IntPointer (org.bytedeco.javacpp.IntPointer)12 Pointer (org.bytedeco.javacpp.Pointer)10 CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)10 org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t)10 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)10 INDArray (org.nd4j.linalg.api.ndarray.INDArray)10 GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)10 BlasException (org.nd4j.linalg.api.blas.BlasException)8 INDArrayIndex (org.nd4j.linalg.indexing.INDArrayIndex)4