Search in sources :

Example 21 with FloatPointer

use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.

From the class JcublasLevel3, the method sgemm:

@Override
protected void sgemm(char Order, char TransA, char TransB, int M, int N, int K, float alpha, INDArray A, int lda, INDArray B, int ldb, float beta, INDArray C, int ldc) {
    // Single-precision GEMM dispatched to cuBLAS: C = alpha * op(A) * op(B) + beta * C.
    // Warn when the global data type is not FLOAT, since this path still runs in single precision.
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        logger.warn("FLOAT gemm called");
    // Flush any queued operations before touching device memory directly.
    Nd4j.getExecutioner().push();
    // Prepare a CUDA context; C is the write target, A and B are read-only inputs.
    CudaContext ctx = allocator.getFlowController().prepareAction(C, A, B);
    CublasPointer devA = new CublasPointer(A, ctx);
    CublasPointer devB = new CublasPointer(B, ctx);
    CublasPointer devC = new CublasPointer(C, ctx);
    cublasHandle_t handle = ctx.getHandle();
    // cuBLAS handles are not thread-safe; serialize stream binding and the kernel launch.
    synchronized (handle) {
        cublasSetStream_v2(new cublasContext(handle), new CUstream_st(ctx.getOldStream()));
        cublasSgemm_v2(new cublasContext(handle),
                        convertTranspose(TransA), convertTranspose(TransB),
                        M, N, K,
                        new FloatPointer(alpha),
                        (FloatPointer) devA.getDevicePointer(), lda,
                        (FloatPointer) devB.getDevicePointer(), ldb,
                        new FloatPointer(beta),
                        (FloatPointer) devC.getDevicePointer(), ldc);
    }
    // Let the allocator track the completed action so dependent ops synchronize correctly.
    allocator.registerAction(ctx, C, A, B);
    OpExecutionerUtil.checkForAny(C);
}
Also used : org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t(org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t) FloatPointer(org.bytedeco.javacpp.FloatPointer) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer)

Example 22 with FloatPointer

use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.

From the class JcublasLevel3, the method ssyrk:

@Override
protected void ssyrk(char Order, char Uplo, char Trans, int N, int K, float alpha, INDArray A, int lda, float beta, INDArray C, int ldc) {
    // Single-precision symmetric rank-k update via cuBLAS: C = alpha * op(A) * op(A)^T + beta * C.
    // Warn when the global data type is not FLOAT, since this path still runs in single precision.
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        logger.warn("FLOAT syrk called");
    // Flush any queued operations before direct device-memory work.
    Nd4j.getExecutioner().push();
    // Prepare a CUDA context; C is written, A is read.
    CudaContext ctx = allocator.getFlowController().prepareAction(C, A);
    CublasPointer devA = new CublasPointer(A, ctx);
    CublasPointer devC = new CublasPointer(C, ctx);
    cublasHandle_t handle = ctx.getHandle();
    // cuBLAS handles are not thread-safe; serialize stream binding and the kernel launch.
    synchronized (handle) {
        cublasSetStream_v2(new cublasContext(handle), new CUstream_st(ctx.getOldStream()));
        cublasSsyrk_v2(new cublasContext(handle),
                        convertUplo(Uplo), convertTranspose(Trans),
                        N, K,
                        new FloatPointer(alpha),
                        (FloatPointer) devA.getDevicePointer(), lda,
                        new FloatPointer(beta),
                        (FloatPointer) devC.getDevicePointer(), ldc);
    }
    // Let the allocator track the completed action for dependency ordering.
    allocator.registerAction(ctx, C, A);
    OpExecutionerUtil.checkForAny(C);
}
Also used : org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t(org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t) FloatPointer(org.bytedeco.javacpp.FloatPointer) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer)

Example 23 with FloatPointer

use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.

From the class JcublasLapack, the method sgeqrf:

// =========================
// Q R DECOMP
/**
 * Single-precision QR decomposition via cuSOLVER.
 * On return, A holds Q (computed in-place by geqrf + orgqr) and, when R is non-null,
 * R holds the upper-triangular factor.
 *
 * Fixes relative to the previous revision:
 *  - warn message said "getrf" in a geqrf method (copy-paste error);
 *  - the status of cusolverDnSorgqr_bufferSize was assigned but never checked,
 *    unlike every other cuSOLVER call here.
 *
 * @param M    number of rows of A
 * @param N    number of columns of A
 * @param A    input matrix; receives Q on output
 * @param R    optional output for the R factor (may be null)
 * @param INFO single-int status array written by cuSOLVER on device
 */
@Override
public void sgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
    INDArray a = A;
    INDArray r = R;
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT geqrf called in DOUBLE environment");
    // cuSOLVER expects column-major ('f') layout; copy if needed and write back at the end.
    if (A.ordering() == 'c')
        a = A.dup('f');
    if (R != null && R.ordering() == 'c')
        r = R.dup('f');
    // Scratch vector for the Householder scalar factors (tau), length N.
    INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createFloat(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }).getFirst());
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // cuSOLVER handles are not thread-safe; serialize all calls on this handle.
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new IllegalStateException("solverSetStream failed");
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        CublasPointer xTauPointer = new CublasPointer(tau, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSgeqrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace, the permutation matrix and a return code
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        // Do the actual QR decomp
        stat = cusolverDnSgeqrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf failed", stat);
        }
        allocator.registerAction(ctx, a);
        // allocator.registerAction(ctx, tau);
        allocator.registerAction(ctx, INFO);
        if (INFO.getInt(0) != 0) {
            throw new BlasException("cusolverDnSgeqrf failed on INFO", INFO.getInt(0));
        }
        // Copy R ( upper part of Q ) into result
        if (r != null) {
            r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
            // Zero out the strict lower triangle so r is upper-triangular.
            INDArrayIndex[] ix = new INDArrayIndex[2];
            for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
                ix[0] = NDArrayIndex.point(i);
                ix[1] = NDArrayIndex.interval(0, i);
                r.put(ix, 0);
            }
        }
        // Second stage: expand the Householder reflectors in 'a' into the explicit Q matrix.
        stat = cusolverDnSorgqr_bufferSize(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            // Previously this status was silently ignored; check it before trusting worksize.
            throw new BlasException("cusolverDnSorgqr_bufferSize failed", stat);
        }
        worksize = worksizeBuffer.getInt(0);
        workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        stat = cusolverDnSorgqr(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSorgqr failed", stat);
        }
    }
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);
    // Copy results back into the caller's arrays when we worked on 'f'-ordered duplicates.
    if (a != A)
        A.assign(a);
    if (r != null && r != R)
        R.assign(r);
    // NOTE(review): info-level dumps of full matrices are noisy for large inputs; consider debug.
    log.info("A: {}", A);
    if (R != null)
        log.info("R: {}", R);
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) IntPointer(org.bytedeco.javacpp.IntPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Example 24 with FloatPointer

use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.

From the class JcublasLapack, the method ssyev:

/**
 * Single-precision symmetric eigendecomposition via cuSOLVER's cusolverDnSsyevd.
 *
 * @param _jobz 'V' to compute eigenvectors as well, anything else for eigenvalues only
 * @param _uplo 'L' if the lower triangle of A is stored, anything else for upper
 * @param N     order of the matrix (note: the leading dimension passed to cuSOLVER is A.rows())
 * @param A     input matrix; overwritten in-place by cuSOLVER (eigenvectors when jobz == 'V')
 * @param R     output vector receiving the eigenvalues
 * @return 0 on success; otherwise the first failing cuSOLVER status, or the device-side
 *         INFO value if the decomposition itself reported a problem
 */
public int ssyev(char _jobz, char _uplo, int N, INDArray A, INDArray R) {
    int status = -1;
    int jobz = _jobz == 'V' ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
    int uplo = _uplo == 'L' ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT ssyev called in DOUBLE environment");
    INDArray a = A;
    // cuSOLVER expects column-major ('f') layout; work on a duplicate if A is 'c'-ordered.
    if (A.ordering() == 'c')
        a = A.dup('f');
    int M = A.rows();
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // synchronized on the solver; each step only runs if the previous one returned success,
    // so 'status' always holds the first failure (or the final INFO value).
    synchronized (handle) {
        status = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (status == 0) {
            // transfer the INDArray into GPU memory
            CublasPointer xAPointer = new CublasPointer(a, ctx);
            CublasPointer xRPointer = new CublasPointer(R, ctx);
            // this output - indicates how much memory we'll need for the real operation
            DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
            status = cusolverDnSsyevd_bufferSize(solverDn, jobz, uplo, M, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xRPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
            if (status == CUSOLVER_STATUS_SUCCESS) {
                int worksize = worksizeBuffer.getInt(0);
                // allocate memory for the workspace, the non-converging row buffer and a return code
                Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
                // NOTE(review): other call sites in this class append .getFirst() to
                // createShapeInformation(...) — verify this overload is correct here.
                INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, 1 }));
                // Do the actual decomp
                status = cusolverDnSsyevd(solverDn, jobz, uplo, M, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xRPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
                allocator.registerAction(ctx, INFO);
                // Surface the device-side INFO value as the result when the call itself succeeded.
                if (status == 0)
                    status = INFO.getInt(0);
            }
        }
    }
    if (status == 0) {
        allocator.registerAction(ctx, R);
        allocator.registerAction(ctx, a);
        // Copy results back when we worked on an 'f'-ordered duplicate.
        if (a != A)
            A.assign(a);
    }
    return status;
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) IntPointer(org.bytedeco.javacpp.IntPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Example 25 with FloatPointer

use of org.bytedeco.javacpp.FloatPointer in project nd4j by deeplearning4j.

From the class JcublasLapack, the method sgetrf:

/**
 * Single-precision LU decomposition (with partial pivoting) via cuSOLVER's cusolverDnSgetrf.
 * A is overwritten in-place with the L and U factors; IPIV receives the pivot indices
 * and INFO the device-side status code.
 *
 * @param M    number of rows of A
 * @param N    number of columns of A
 * @param A    input/output matrix, factored in place
 * @param IPIV pivot index array written on device
 * @param INFO single-int status array written on device
 * @throws BlasException if stream setup, workspace sizing, or the factorization fails
 */
@Override
public void sgetrf(int M, int N, INDArray A, INDArray IPIV, INDArray INFO) {
    INDArray a = A;
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT getrf called in DOUBLE environment");
    // cuSOLVER expects column-major ('f') layout; work on a duplicate if A is 'c'-ordered.
    if (A.ordering() == 'c')
        a = A.dup('f');
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // cuSOLVER handles are not thread-safe; serialize all calls on this handle.
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new BlasException("solverSetStream failed");
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSgetrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgetrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace, the permutation matrix and a return code
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        // Do the actual LU decomp
        stat = cusolverDnSgetrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, new CudaPointer(workspace).asFloatPointer(), new CudaPointer(allocator.getPointer(IPIV, ctx)).asIntPointer(), new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgetrf failed", stat);
        }
    }
    // Register completed device work so the allocator can order dependent operations.
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);
    allocator.registerAction(ctx, IPIV);
    // Copy results back when we worked on an 'f'-ordered duplicate.
    if (a != A)
        A.assign(a);
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Aggregations

FloatPointer (org.bytedeco.javacpp.FloatPointer)30 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)15 CublasPointer (org.nd4j.linalg.jcublas.CublasPointer)14 IntPointer (org.bytedeco.javacpp.IntPointer)11 INDArray (org.nd4j.linalg.api.ndarray.INDArray)11 DoublePointer (org.bytedeco.javacpp.DoublePointer)9 CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st)9 org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t (org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t)9 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)7 Pointer (org.bytedeco.javacpp.Pointer)6 BlasException (org.nd4j.linalg.api.blas.BlasException)6 BytePointer (org.bytedeco.javacpp.BytePointer)5 ShortPointer (org.bytedeco.javacpp.ShortPointer)5 CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)5 org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t)5 GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)5 ByteBuffer (java.nio.ByteBuffer)4 DoubleBuffer (java.nio.DoubleBuffer)4 FloatBuffer (java.nio.FloatBuffer)4 IntBuffer (java.nio.IntBuffer)4