Search in sources :

Example 21 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class GridExecutionerTest method testOpPointerizeReduce2.

/**
 * Verifies the pointer set built by {@code pointerizeOp} for a {@code Sum}
 * over a 10x10 array, i.e. a reduce along all dimensions.
 *
 * @throws Exception
 */
@Test
public void testOpPointerizeReduce2() throws Exception {
    CudaGridExecutioner gridExecutioner = new CudaGridExecutioner();
    INDArray input = Nd4j.create(10, 10);
    Sum sumOp = new Sum(input);

    // exec has to run first, so that Op.Z gets initialised for the specific dimension
    gridExecutioner.exec(sumOp);

    GridPointers gridPointers = gridExecutioner.pointerizeOp(sumOp, null);
    assertEquals(sumOp.opNum(), gridPointers.getOpNum());
    assertEquals(Op.Type.REDUCE, gridPointers.getType());

    // Resolve the expected pointers through the allocator for the current device context
    CudaContext cudaContext = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
    Pointer expectedX = AtomicAllocator.getInstance().getPointer(input, cudaContext);
    Pointer expectedXShapeInfo = AtomicAllocator.getInstance().getPointer(input.shapeInfoDataBuffer(), cudaContext);
    Pointer expectedZ = AtomicAllocator.getInstance().getPointer(sumOp.z(), cudaContext);
    Pointer expectedZShapeInfo = AtomicAllocator.getInstance().getPointer(sumOp.z().shapeInfoDataBuffer(), cudaContext);

    // These two values are not asserted on below; the lookups are retained as-is
    DataBuffer dimensionBuffer = Nd4j.getConstantHandler().getConstantBuffer(new int[] { 1 });
    Pointer dimensionPointer = AtomicAllocator.getInstance().getPointer(dimensionBuffer, cudaContext);

    // X / Y / Z data pointers
    assertEquals(expectedX, gridPointers.getX());
    assertEquals(null, gridPointers.getY());
    assertNotEquals(null, gridPointers.getZ());
    assertEquals(expectedZ, gridPointers.getZ());

    // result length
    assertEquals(1, sumOp.z().length());
    assertEquals(1, gridPointers.getZLength());

    // Element-wise strides (X=1, Y=-1, Z=1) are deliberately not checked:
    // EWS does not matter here, since this is a TAD-based operation.

    // shape info pointers
    assertEquals(expectedXShapeInfo, gridPointers.getXShapeInfo());
    assertEquals(null, gridPointers.getYShapeInfo());
    assertEquals(expectedZShapeInfo, gridPointers.getZShapeInfo());

    // no dimensions, TAD data or extra arguments are expected
    assertEquals(null, gridPointers.getDimensions());
    assertEquals(0, gridPointers.getDimensionsLength());
    assertEquals(null, gridPointers.getTadShape());
    assertEquals(null, gridPointers.getTadOffsets());
    assertEquals(null, gridPointers.getExtraArgs());
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) GridPointers(org.nd4j.linalg.api.ops.grid.GridPointers) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) Sum(org.nd4j.linalg.api.ops.impl.accum.Sum) Pointer(org.bytedeco.javacpp.Pointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) Test(org.junit.Test)

Example 22 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class CudaCachingZeroProvider method malloc.

/**
 * This method provides PointersPair to memory chunk specified by AllocationShape.
 *
 * PLEASE NOTE: This method can actually ignore malloc request, and give out previously cached
 * free memory chunk with equal shape.
 *
 * Only HOST requests smaller than the configured maximum host cacheable length are looked up
 * in the cache; everything else is delegated straight to {@code super.malloc}.
 *
 * @param shape shape of desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return a PointersPair whose device and host pointers both reference the address of a cached
 *         chunk on a cache hit; otherwise the result of {@code super.malloc}
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.HOST && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCacheableLength()) {
        CacheHolder cache = zeroCache.get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheZeroHit.incrementAndGet();
                // since this memory chunk is going to be used now, remove its amount from
                // the running total of cached host memory
                zeroCachedAmount.addAndGet(-1 * reqMemory);
                PointersPair pair = new PointersPair();
                pair.setDevicePointer(new CudaPointer(pointer.address()));
                pair.setHostPointer(new CudaPointer(pointer.address()));
                point.setAllocationStatus(AllocationStatus.HOST);
                return pair;
            }
        }
        // Count the miss exactly once per request; the counter used to be incremented a second
        // time right before the fallback allocation, which doubled the reported miss count.
        cacheZeroMiss.incrementAndGet();
        if (CudaEnvironment.getInstance().getConfiguration().isUsePreallocation() && zeroCachedAmount.get() < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache() / 10 && reqMemory < 16 * 1024 * 1024L) {
            // Start a preallocator for this shape; it is only launched while the cached amount
            // is below a tenth of the maximum host cache and the request is under 16 MiB
            CachePreallocator preallocator = new CachePreallocator(shape, location, CudaEnvironment.getInstance().getConfiguration().getPreallocationCalls());
            preallocator.start();
        }
    }
    // Nothing could be served from the cache (or the request is not cacheable)
    return super.malloc(shape, point, location);
}
Also used : PointersPair(org.nd4j.jita.allocator.pointers.PointersPair) Pointer(org.bytedeco.javacpp.Pointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 23 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class JcublasLapack method dsyev.

/**
 * Runs cuSOLVER's double-precision {@code cusolverDnDsyevd} on {@code A}, using {@code R}
 * as the second output buffer.
 *
 * @param _jobz 'V' selects CUSOLVER_EIG_MODE_VECTOR, anything else CUSOLVER_EIG_MODE_NOVECTOR
 * @param _uplo 'L' selects CUBLAS_FILL_MODE_LOWER, anything else CUBLAS_FILL_MODE_UPPER
 * @param N not read by this implementation; the dimension is taken from {@code A.rows()}
 * @param A input matrix; overwritten with the solver's result on success
 * @param R array handed to the solver as its second device buffer
 * @return 0 on success; otherwise the failing cuSOLVER status, or the value the solver wrote
 *         into its INFO output
 */
public int dsyev(char _jobz, char _uplo, int N, INDArray A, INDArray R) {
    int status = -1;

    // Translate the LAPACK-style character flags into cuSOLVER / cuBLAS constants
    final int eigMode;
    if (_jobz == 'V') {
        eigMode = CUSOLVER_EIG_MODE_VECTOR;
    } else {
        eigMode = CUSOLVER_EIG_MODE_NOVECTOR;
    }
    final int fillMode;
    if (_uplo == 'L') {
        fillMode = CUBLAS_FILL_MODE_LOWER;
    } else {
        fillMode = CUBLAS_FILL_MODE_UPPER;
    }

    if (Nd4j.dataType() != DataBuffer.Type.DOUBLE) {
        log.warn("DOUBLE dsyev called in FLOAT environment");
    }

    // Work on an 'f'-ordered copy whenever the input is 'c'-ordered
    INDArray colMajorA = A;
    if (A.ordering() == 'c') {
        colMajorA = A.dup('f');
    }
    int rows = A.rows();

    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }

    // Context of the current thread, plus the solver handle it carries
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);

    // All solver calls are made while holding the handle's monitor
    synchronized (handle) {
        status = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (status == 0) {
            // Move both arrays to GPU memory
            CublasPointer devA = new CublasPointer(colMajorA, ctx);
            CublasPointer devR = new CublasPointer(R, ctx);

            // Single-int buffer that receives the required workspace size
            DataBuffer lworkHolder = Nd4j.getDataBufferFactory().createInt(1);
            status = cusolverDnDsyevd_bufferSize(solverDn, eigMode, fillMode, rows,
                            (DoublePointer) devA.getDevicePointer(), rows,
                            (DoublePointer) devR.getDevicePointer(),
                            (IntPointer) lworkHolder.addressPointer());

            if (status == CUSOLVER_STATUS_SUCCESS) {
                int lwork = lworkHolder.getInt(0);

                // Scratch space for the solver and a 1x1 int array for its return code
                Pointer scratch = new Workspace(lwork * Nd4j.sizeOfDataType());
                INDArray devInfo = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, 1 }));

                // The decomposition itself
                status = cusolverDnDsyevd(solverDn, eigMode, fillMode, rows,
                                (DoublePointer) devA.getDevicePointer(), rows,
                                (DoublePointer) devR.getDevicePointer(),
                                new CudaPointer(scratch).asDoublePointer(), lwork,
                                new CudaPointer(allocator.getPointer(devInfo, ctx)).asIntPointer());
                allocator.registerAction(ctx, devInfo);
                if (status == 0) {
                    status = devInfo.getInt(0);
                }
            }
        }
    }

    if (status != 0) {
        return status;
    }

    allocator.registerAction(ctx, R);
    allocator.registerAction(ctx, colMajorA);
    // Copy the result back when the solver ran on a duplicate of A
    if (colMajorA != A) {
        A.assign(colMajorA);
    }
    return status;
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DoublePointer(org.bytedeco.javacpp.DoublePointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) IntPointer(org.bytedeco.javacpp.IntPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Example 24 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class JcublasLapack method spotrf.

// =========================
// CHOLESKY DECOMP
/**
 * Single-precision Cholesky decomposition of {@code A} through cuSOLVER's
 * {@code cusolverDnSpotrf}.
 *
 * The result is written back into {@code A}. Afterwards one triangle of {@code A} is zeroed:
 * for {@code uplo == 'U'} the matrix is first transposed in place and the entries left of the
 * diagonal are cleared; otherwise the entries right of the diagonal are cleared.
 *
 * @param uplo triangle selector; forwarded to cuSOLVER and compared against 'U' for the clean-up
 * @param N passed to cuSOLVER as both the matrix dimension and the leading dimension
 * @param A matrix to decompose; overwritten with the result
 * @param INFO array whose device pointer receives the solver's return code
 * @throws BlasException if setting the stream, querying the workspace size, or the
 *         decomposition call does not report success
 */
@Override
public void spotrf(byte uplo, int N, INDArray A, INDArray INFO) {
    INDArray a = A;
    // This is the FLOAT routine, so the mismatch to warn about is a non-FLOAT environment.
    // The message previously had the two data types swapped.
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT) {
        log.warn("FLOAT potrf called in DOUBLE environment");
    }
    // Work on an 'f'-ordered copy whenever the input is 'c'-ordered
    if (A.ordering() == 'c') {
        a = A.dup('f');
    }
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0) {
            throw new BlasException("solverSetStream failed");
        }
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        // we intentionally use host pointer here (for the worksize output)
        int stat = cusolverDnSpotrf_bufferSize(solverDn, uplo, N, (FloatPointer) xAPointer.getDevicePointer(), N,
                        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSpotrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        // Do the actual decomp
        stat = cusolverDnSpotrf(solverDn, uplo, N, (FloatPointer) xAPointer.getDevicePointer(), N,
                        new CudaPointer(workspace).asFloatPointer(), worksize,
                        new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSpotrf failed", stat);
        }
    }
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);
    // Copy the result back when the solver ran on a duplicate of A
    if (a != A) {
        A.assign(a);
    }
    if (uplo == 'U') {
        A.assign(A.transpose());
        // Clear columns [0, i) of every row i >= 1, i.e. everything left of the diagonal
        INDArrayIndex[] ix = new INDArrayIndex[2];
        for (int i = 1; i < Math.min(A.rows(), A.columns()); i++) {
            ix[0] = NDArrayIndex.point(i);
            ix[1] = NDArrayIndex.interval(0, i);
            A.put(ix, 0);
        }
    } else {
        // Clear columns [i + 1, columns) of every row i, i.e. everything right of the diagonal
        INDArrayIndex[] ix = new INDArrayIndex[2];
        for (int i = 0; i < Math.min(A.rows(), A.columns() - 1); i++) {
            ix[0] = NDArrayIndex.point(i);
            ix[1] = NDArrayIndex.interval(i + 1, A.columns());
            A.put(ix, 0);
        }
    }
    // NOTE(review): dumps the whole matrix at INFO level on every call; looks like leftover
    // debugging output - confirm whether it can be lowered to debug or removed.
    log.info("A: {}", A);
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Example 25 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class JcublasLapack method sgesvd.

/**
 * Single-precision singular value decomposition of {@code A} through cuSOLVER's
 * {@code cusolverDnSgesvd}.
 *
 * When {@code M < N} the routine works on the transpose of {@code A}: M and N are swapped,
 * the roles of U and VT are exchanged for the solver call, and the results are transposed
 * back into {@code U} / {@code VT} afterwards. {@code U} and {@code VT} may be null, in which
 * case a null pointer is handed to the solver for the corresponding output.
 *
 * @param jobu forwarded unchanged to cuSOLVER
 * @param jobvt forwarded unchanged to cuSOLVER
 * @param M number of rows of A
 * @param N number of columns of A
 * @param A input matrix
 * @param S array whose device pointer receives the singular values
 * @param U optional output array, may be null
 * @param VT optional output array, may be null
 * @param INFO array whose device pointer receives the solver's return code
 * @throws BlasException if setting the stream, querying the workspace size, or the
 *         decomposition call does not report success
 */
@Override
public void sgesvd(byte jobu, byte jobvt, int M, int N, INDArray A, INDArray S, INDArray U, INDArray VT, INDArray INFO) {
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT) {
        log.warn("FLOAT gesvd called in DOUBLE environment");
    }
    INDArray a = A;
    INDArray u = U;
    INDArray vt = VT;
    // we should transpose & adjust outputs if M<N
    // cuda has a limitation, but it's OK we know
    // A = U S V'
    // transpose multiply rules give us ...
    // A' = V S' U'
    boolean hadToTransposeA = false;
    if (M < N) {
        hadToTransposeA = true;
        int tmp1 = N;
        N = M;
        M = tmp1;
        a = A.transpose().dup('f');
        // U and VT are optional here just like in the non-transposed branch; dereferencing
        // them unconditionally used to throw a NullPointerException when either was null.
        u = VT == null ? null : VT.dup('f');
        vt = U == null ? null : U.dup('f');
    } else {
        // cuda requires column ordering, so work on 'f'-ordered copies where needed
        if (A.ordering() == 'c') {
            a = A.dup('f');
        }
        if (U != null && U.ordering() == 'c') {
            u = U.dup('f');
        }
        if (VT != null && VT.ordering() == 'c') {
            vt = VT.dup('f');
        }
    }
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0) {
            throw new BlasException("solverSetStream failed");
        }
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        // we intentionally use host pointer here (for the worksize output)
        int stat = cusolverDnSgesvd_bufferSize(solverDn, M, N, (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgesvd_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        DataBuffer rwork = Nd4j.getDataBufferFactory().createFloat((M < N ? M : N) - 1);
        // Do the actual decomp. The null checks are made on the working copies (u / vt), which
        // are the arrays actually handed to the solver; in the transposed case they are swapped
        // relative to U / VT.
        stat = cusolverDnSgesvd(solverDn, jobu, jobvt, M, N,
                        (FloatPointer) xAPointer.getDevicePointer(), M,
                        new CudaPointer(allocator.getPointer(S, ctx)).asFloatPointer(),
                        u == null ? null : new CudaPointer(allocator.getPointer(u, ctx)).asFloatPointer(), M,
                        vt == null ? null : new CudaPointer(allocator.getPointer(vt, ctx)).asFloatPointer(), N,
                        new CudaPointer(workspace).asFloatPointer(), worksize,
                        new CudaPointer(allocator.getPointer(rwork, ctx)).asFloatPointer(),
                        new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgesvd failed", stat);
        }
    }
    allocator.registerAction(ctx, INFO);
    allocator.registerAction(ctx, S);
    if (u != null) {
        allocator.registerAction(ctx, u);
    }
    if (vt != null) {
        allocator.registerAction(ctx, vt);
    }
    // if we transposed A then swap & transpose U & V'
    if (hadToTransposeA) {
        // vt was duplicated from U and u from VT, so each is non-null whenever its target is
        if (U != null) {
            U.assign(vt.transpose());
        }
        if (VT != null) {
            VT.assign(u.transpose());
        }
    } else {
        // Copy results back only where the solver ran on a duplicate
        if (u != U) {
            U.assign(u);
        }
        if (vt != VT) {
            VT.assign(vt);
        }
    }
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Aggregations

Pointer (org.bytedeco.javacpp.Pointer)61 FloatPointer (org.bytedeco.javacpp.FloatPointer)29 DoublePointer (org.bytedeco.javacpp.DoublePointer)27 IntPointer (org.bytedeco.javacpp.IntPointer)23 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)23 INDArray (org.nd4j.linalg.api.ndarray.INDArray)21 CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)19 BytePointer (org.bytedeco.javacpp.BytePointer)18 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)18 ShortPointer (org.bytedeco.javacpp.ShortPointer)16 GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)16 PointerPointer (org.bytedeco.javacpp.PointerPointer)11 ByteBuffer (java.nio.ByteBuffer)10 CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st)10 org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t)10 CublasPointer (org.nd4j.linalg.jcublas.CublasPointer)10 FunctionPointer (org.bytedeco.javacpp.FunctionPointer)9 BoolPointer (org.bytedeco.javacpp.BoolPointer)8 CLongPointer (org.bytedeco.javacpp.CLongPointer)8 CharPointer (org.bytedeco.javacpp.CharPointer)8