Search in sources :

Example 36 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class CudaZeroHandler method relocate.

/**
 * Relocates the memory chunk tracked by an allocation point between host and device storage.
 *
 * Supported directions: DEVICE -> HOST and HOST -> DEVICE. Any other combination is rejected.
 *
 * @param currentStatus location the data is being moved from
 * @param targetStatus location the data is being moved to
 * @param point allocation point providing the buffer and the host/device pointers
 * @param shape shape used to compute the number of bytes copied in the HOST -> DEVICE direction
 * @param context CUDA context whose special stream is used for the HOST -> DEVICE copy
 * @throws IllegalStateException if the target buffer or the device pointer is null, or if the async copy reports failure
 * @throws UnsupportedOperationException if the requested direction is neither DEVICE -> HOST nor HOST -> DEVICE
 */
@Override
public void relocate(AllocationStatus currentStatus, AllocationStatus targetStatus, AllocationPoint point, AllocationShape shape, CudaContext context) {
    final boolean deviceToHost = currentStatus == AllocationStatus.DEVICE && targetStatus == AllocationStatus.HOST;
    final boolean hostToDevice = currentStatus == AllocationStatus.HOST && targetStatus == AllocationStatus.DEVICE;

    // Reject unsupported directions up front
    if (!deviceToHost && !hostToDevice) {
        throw new UnsupportedOperationException("Can't relocate data in requested direction: [" + currentStatus + "] -> [" + targetStatus + "]");
    }

    if (deviceToHost) {
        // DEVICE -> HOST: validates the buffer and wraps the device address; no copy is issued in this branch
        DataBuffer hostSideBuffer = point.getBuffer();
        if (hostSideBuffer == null) {
            throw new IllegalStateException("Target buffer is NULL!");
        }
        Pointer wrappedDeviceAddress = new CudaPointer(point.getPointers().getDevicePointer().address());
        return;
    }

    // HOST -> DEVICE
    // TODO: this probably should be removed
    if (point.isConstant()) {
        // constants are skipped, nothing to relocate
        return;
    }

    if (point.getPointers().getDevicePointer() == null) {
        throw new IllegalStateException("devicePointer is NULL!");
    }

    // a zero return value from memcpyAsync is treated as failure
    final boolean copyFailed = nativeOps.memcpyAsync(
                    point.getPointers().getDevicePointer(),
                    point.getPointers().getHostPointer(),
                    AllocationUtils.getRequiredMemory(shape),
                    CudaConstants.cudaMemcpyHostToDevice,
                    context.getSpecialStream()) == 0;
    if (copyFailed) {
        throw new IllegalStateException("MemcpyAsync relocate H2D failed: [" + point.getHostPointer().address() + "] -> [" + point.getDevicePointer().address() + "]");
    }

    flowController.commitTransfer(context.getSpecialStream());
}
Also used : ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) Pointer(org.bytedeco.javacpp.Pointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) BaseCudaDataBuffer(org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Example 37 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class CudaWorkspace method init.

/**
 * Initializes this workspace and, when its current size is positive, allocates the backing memory.
 *
 * A host chunk of {@code currentSize + SAFETY_OFFSET} bytes is always allocated. A device chunk of
 * the same size is allocated as well unless the mirroring policy is {@code HOST_ONLY}.
 *
 * @throws ND4JIllegalStateException if the location policy is MMAP, or if either the host or the
 *         device allocation returns null
 */
@Override
protected void init() {
    if (workspaceConfiguration.getPolicyLocation() == LocationPolicy.MMAP) {
        throw new ND4JIllegalStateException("CUDA do not support MMAP workspaces yet");
    }
    super.init();
    if (currentSize.get() > 0) {
        isInit.set(true);
        long bytes = currentSize.get();
        if (isDebug.get()) {
            log.info("Allocating [{}] workspace on device_{}, {} bytes...", id, Nd4j.getAffinityManager().getDeviceForCurrentThread(), bytes);
            Nd4j.getWorkspaceManager().printAllocationStatisticsForCurrentThread();
        }

        Pointer hostPtr = memoryManager.allocate((bytes + SAFETY_OFFSET), MemoryKind.HOST, false);
        if (hostPtr == null) {
            throw new ND4JIllegalStateException("Can't allocate memory for workspace");
        }
        workspace.setHostPointer(new PagedPointer(hostPtr));

        if (workspaceConfiguration.getPolicyMirroring() != MirroringPolicy.HOST_ONLY) {
            Pointer devicePtr = memoryManager.allocate((bytes + SAFETY_OFFSET), MemoryKind.DEVICE, false);
            // Mirror the host-side check so a failed device allocation is reported explicitly
            // instead of being wrapped into a PagedPointer.
            if (devicePtr == null) {
                throw new ND4JIllegalStateException("Can't allocate device memory for workspace");
            }
            workspace.setDevicePointer(new PagedPointer(devicePtr));
        }
    }
}
Also used : ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) PagedPointer(org.nd4j.linalg.api.memory.pointers.PagedPointer) Pointer(org.bytedeco.javacpp.Pointer) PagedPointer(org.nd4j.linalg.api.memory.pointers.PagedPointer)

Example 38 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class CudaMemoryManager method memcpy.

/**
 * Copies the contents of {@code srcBuffer} into {@code dstBuffer}, choosing the copy path by
 * whether each side is a {@link CompressedDataBuffer}.
 *
 * <ul>
 *   <li>neither compressed: delegated to {@code AtomicAllocator.memcpy}</li>
 *   <li>both compressed: raw {@code Pointer.memcpy} between the address pointers</li>
 *   <li>only destination compressed: host data of the source is synchronized if it is not
 *       actual on the host side, then copied from the source host pointer</li>
 *   <li>only source compressed: raw copy into the destination, followed by a host-write tick
 *       on the destination allocation point</li>
 * </ul>
 *
 * @param dstBuffer buffer receiving the data
 * @param srcBuffer buffer providing the data; its length and element size define the byte count
 */
@Override
public void memcpy(DataBuffer dstBuffer, DataBuffer srcBuffer) {
    // Fetched up front exactly as before, even though no active code path reads it.
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

    final boolean dstCompressed = dstBuffer instanceof CompressedDataBuffer;
    final boolean srcCompressed = srcBuffer instanceof CompressedDataBuffer;

    if (!dstCompressed && !srcCompressed) {
        // both buffers are NOT compressed
        AtomicAllocator.getInstance().memcpy(dstBuffer, srcBuffer);
        return;
    }

    if (dstCompressed && srcCompressed) {
        // both buffers are compressed, just fire memcpy
        Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), srcBuffer.length() * srcBuffer.getElementSize());
        return;
    }

    if (dstCompressed) {
        // destination is compressed, source isn't
        AllocationPoint sourcePoint = AtomicAllocator.getInstance().getAllocationPoint(srcBuffer);
        long sourceBytes = srcBuffer.getElementSize() * srcBuffer.length();
        if (!sourcePoint.isActualOnHostSide()) {
            // bring the host copy of the source up to date first
            AtomicAllocator.getInstance().synchronizeHostData(srcBuffer);
        }
        // host -> host copy
        Pointer hostSource = AtomicAllocator.getInstance().getHostPointer(srcBuffer);
        Pointer.memcpy(dstBuffer.addressPointer(), hostSource, sourceBytes);
        return;
    }

    // destination is NOT compressed, source is compressed
    AllocationPoint destinationPoint = AtomicAllocator.getInstance().getAllocationPoint(dstBuffer);
    long compressedBytes = srcBuffer.getElementSize() * srcBuffer.length();
    Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), compressedBytes);
    destinationPoint.tickHostWrite();
}
Also used : CompressedDataBuffer(org.nd4j.linalg.compression.CompressedDataBuffer) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) Pointer(org.bytedeco.javacpp.Pointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)

Example 39 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class CudaFullCachingProvider method malloc.

/**
 * Provides a PointersPair for a memory chunk of the requested shape.
 *
 * For DEVICE requests below the configured maximum device allocation, a previously cached chunk
 * of equal shape is handed out when one is available; in that case only the device pointer of the
 * returned pair is set, and the allocation point is marked as DEVICE with the current device id.
 * Every other request, including a cache miss, is forwarded to the parent implementation.
 *
 * @param shape shape of desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return pair holding the cached device pointer, or whatever the parent implementation returns
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);

    boolean cacheEligible = location == AllocationStatus.DEVICE
                    && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumDeviceAllocation();

    if (cacheEligible) {
        int deviceId = AtomicAllocator.getInstance().getDeviceId();
        ensureDeviceCacheHolder(deviceId, shape);

        CacheHolder holder = deviceCache.get(deviceId).get(shape);
        Pointer cached = (holder == null) ? null : holder.poll();

        if (cached != null) {
            // cache hit: account for the chunk leaving the cache and hand it out
            cacheDeviceHit.incrementAndGet();
            deviceCachedAmount.get(deviceId).addAndGet(-reqMemory);

            PointersPair result = new PointersPair();
            result.setDevicePointer(cached);

            point.setAllocationStatus(AllocationStatus.DEVICE);
            point.setDeviceId(deviceId);
            return result;
        }

        // cache miss: counted, then served by the parent below
        cacheDeviceMiss.incrementAndGet();
    }

    return super.malloc(shape, point, location);
}
Also used : PointersPair(org.nd4j.jita.allocator.pointers.PointersPair) Pointer(org.bytedeco.javacpp.Pointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)

Example 40 with Pointer

use of org.bytedeco.javacpp.Pointer in project nd4j by deeplearning4j.

the class JcublasLapack method sgeqrf.

// =========================
// Q R DECOMP
/**
 * Single-precision QR decomposition of {@code A} through cuSOLVER.
 *
 * The method runs {@code cusolverDnSgeqrf} on a column-major copy of {@code A}, copies the upper
 * triangle of the factored matrix into {@code R} (when {@code R} is non-null), then runs
 * {@code cusolverDnSorgqr} on the same device data and finally assigns the result back to
 * {@code A}. NOTE(review): per the cuSOLVER documentation orgqr overwrites the matrix with Q;
 * confirm against callers that A is expected to hold Q on return.
 *
 * @param M leading dimension / row count passed to cuSOLVER
 * @param N column count passed to cuSOLVER; also the length of the internal tau vector
 * @param A input matrix; overwritten with the result (via a 'f'-ordered duplicate if A is 'c'-ordered)
 * @param R optional output for the upper triangular factor; may be null
 * @param INFO status array handed to cuSOLVER; a non-zero first element after geqrf raises an exception
 * @throws IllegalStateException if the solver stream cannot be set
 * @throws BlasException if any cuSOLVER call returns a non-success status, or INFO is non-zero
 */
@Override
public void sgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
    INDArray a = A;
    INDArray r = R;
    if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
        log.warn("FLOAT geqrf called in DOUBLE environment");
    // cuSOLVER works on column-major data, so 'c'-ordered inputs are duplicated in 'f' order
    if (A.ordering() == 'c')
        a = A.dup('f');
    if (R != null && R.ordering() == 'c')
        r = R.dup('f');
    INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createFloat(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }).getFirst());
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // Get context for current thread
    CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
    // setup the solver handles for cuSolver calls
    cusolverDnHandle_t handle = ctx.getSolverHandle();
    cusolverDnContext solverDn = new cusolverDnContext(handle);
    // synchronized on the solver
    synchronized (handle) {
        int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
        if (result != 0)
            throw new IllegalStateException("solverSetStream failed");
        // transfer the INDArray into GPU memory
        CublasPointer xAPointer = new CublasPointer(a, ctx);
        CublasPointer xTauPointer = new CublasPointer(tau, ctx);
        // this output - indicates how much memory we'll need for the real operation
        DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
        int stat = cusolverDnSgeqrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, // we intentionally use host pointer here
        (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf_bufferSize failed", stat);
        }
        int worksize = worksizeBuffer.getInt(0);
        // Now allocate memory for the workspace, the permutation matrix and a return code
        Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        // Do the actual QR decomp
        stat = cusolverDnSgeqrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSgeqrf failed", stat);
        }
        allocator.registerAction(ctx, a);
        // allocator.registerAction(ctx, tau);
        allocator.registerAction(ctx, INFO);
        if (INFO.getInt(0) != 0) {
            throw new BlasException("cusolverDnSgeqrf failed on INFO", INFO.getInt(0));
        }
        // Copy R (upper triangle of the factored matrix) into result
        if (r != null) {
            r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
            // zero out everything below the diagonal, row by row
            INDArrayIndex[] ix = new INDArrayIndex[2];
            for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
                ix[0] = NDArrayIndex.point(i);
                ix[1] = NDArrayIndex.interval(0, i);
                r.put(ix, 0);
            }
        }
        // Query the workspace size for orgqr; the status must be checked before the
        // reported size is trusted, otherwise a failed query yields a garbage work size.
        stat = cusolverDnSorgqr_bufferSize(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSorgqr_bufferSize failed", stat);
        }
        worksize = worksizeBuffer.getInt(0);
        workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
        stat = cusolverDnSorgqr(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
        if (stat != CUSOLVER_STATUS_SUCCESS) {
            throw new BlasException("cusolverDnSorgqr failed", stat);
        }
    }
    allocator.registerAction(ctx, a);
    allocator.registerAction(ctx, INFO);
    // propagate results back when we worked on 'f'-ordered duplicates
    if (a != A)
        A.assign(a);
    if (r != null && r != R)
        R.assign(r);
    log.info("A: {}", A);
    if (R != null)
        log.info("R: {}", R);
}
Also used : CUstream_st(org.bytedeco.javacpp.cuda.CUstream_st) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DoublePointer(org.bytedeco.javacpp.DoublePointer) IntPointer(org.bytedeco.javacpp.IntPointer) FloatPointer(org.bytedeco.javacpp.FloatPointer) Pointer(org.bytedeco.javacpp.Pointer) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) BlasException(org.nd4j.linalg.api.blas.BlasException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) FloatPointer(org.bytedeco.javacpp.FloatPointer) IntPointer(org.bytedeco.javacpp.IntPointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CublasPointer(org.nd4j.linalg.jcublas.CublasPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)

Aggregations

Pointer (org.bytedeco.javacpp.Pointer)61 FloatPointer (org.bytedeco.javacpp.FloatPointer)29 DoublePointer (org.bytedeco.javacpp.DoublePointer)27 IntPointer (org.bytedeco.javacpp.IntPointer)23 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)23 INDArray (org.nd4j.linalg.api.ndarray.INDArray)21 CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)19 BytePointer (org.bytedeco.javacpp.BytePointer)18 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)18 ShortPointer (org.bytedeco.javacpp.ShortPointer)16 GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)16 PointerPointer (org.bytedeco.javacpp.PointerPointer)11 ByteBuffer (java.nio.ByteBuffer)10 CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st)10 org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t)10 CublasPointer (org.nd4j.linalg.jcublas.CublasPointer)10 FunctionPointer (org.bytedeco.javacpp.FunctionPointer)9 BoolPointer (org.bytedeco.javacpp.BoolPointer)8 CLongPointer (org.bytedeco.javacpp.CLongPointer)8 CharPointer (org.bytedeco.javacpp.CharPointer)8