Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class CudaFullCachingProvider, method free.
/**
* This method frees the specific chunk of memory described by the AllocationPoint passed in.
*
* PLEASE NOTE: This method may skip the actual free and keep the released memory chunk for future reuse.
*
* @param point allocation point describing the chunk to release
*/
@Override
public void free(AllocationPoint point) {
if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
if (point.isConstant())
return;
AllocationShape shape = point.getShape();
int deviceId = point.getDeviceId();
long address = point.getDevicePointer().address();
long reqMemory = AllocationUtils.getRequiredMemory(shape);
// note: the second check compares the device cache fill level against the host-cache limit
if (reqMemory > CudaEnvironment.getInstance().getConfiguration().getMaximumDeviceCacheableLength()
        || deviceCachedAmount.get(deviceId).get() >= CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache()) {
// log.info("DEVICE_{} memory purging: {} bytes; MS: {}; MT: {}", deviceId, reqMemory, MAX_GPU_ALLOCATION, MAX_GPU_CACHE);
super.free(point);
return;
}
// log.info("Saving HOST memory into cache...");
ensureDeviceCacheHolder(deviceId, shape);
CacheHolder cache = deviceCache.get(deviceId).get(shape);
if (point.getDeviceId() != deviceId)
throw new RuntimeException("deviceId changed!");
// memory chunks < threshold will be cached no matter what
if (reqMemory <= FORCED_CACHE_THRESHOLD) {
cache.put(new CudaPointer(point.getDevicePointer().address()));
return;
} else {
long cacheEntries = cache.size();
long cacheHeight = deviceCache.get(deviceId).size();
// total memory allocated within this bucket
long cacheDepth = cacheEntries * reqMemory;
// if (cacheDepth < MAX_CACHED_MEMORY / cacheHeight) {
cache.put(new CudaPointer(point.getDevicePointer().address()));
return;
// } else {
// super.free(point);
// }
}
}
super.free(point);
}
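For orientation, here is a minimal, self-contained sketch of the cache-or-free decision this method implements. The class, the thresholds, and the deque of raw addresses are hypothetical stand-ins for nd4j's CudaEnvironment configuration and CacheHolder types; only the branching logic mirrors the method above.
import java.util.ArrayDeque;
import java.util.Deque;

public class CacheOrFreeSketch {

    // hypothetical thresholds standing in for the CudaEnvironment configuration values
    static final long MAX_CACHEABLE_LENGTH = 16L * 1024 * 1024;
    static final long MAX_CACHE_BYTES = 256L * 1024 * 1024;
    static final long FORCED_CACHE_THRESHOLD = 4096;

    private final Deque<Long> cache = new ArrayDeque<>(); // released device addresses kept for reuse
    private long cachedBytes = 0;

    /**
     * Returns true if the chunk was cached for later reuse,
     * false if the caller should perform a real free.
     */
    boolean cacheOrFree(long address, long reqMemory) {
        // oversized chunks, or a full cache, fall through to the real free
        if (reqMemory > MAX_CACHEABLE_LENGTH || cachedBytes >= MAX_CACHE_BYTES)
            return false;
        if (reqMemory <= FORCED_CACHE_THRESHOLD) {
            // small chunks are cached no matter what
            cache.push(address);
            cachedBytes += reqMemory;
            return true;
        }
        // in the original, larger chunks pass through a (disabled) per-bucket
        // depth check and are currently cached as well
        cache.push(address);
        cachedBytes += reqMemory;
        return true;
    }

    public static void main(String[] args) {
        CacheOrFreeSketch sketch = new CacheOrFreeSketch();
        System.out.println(sketch.cacheOrFree(0xDEAD0000L, 2048));     // true: cached
        System.out.println(sketch.cacheOrFree(0xBEEF0000L, 1L << 30)); // false: too big to cache
    }
}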
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method toFlattened.
@Override
public INDArray toFlattened(char order, Collection<INDArray> matrices) {
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
int length = 0;
for (INDArray m : matrices) length += m.length();
INDArray ret = Nd4j.create(new int[] { 1, length }, order);
int linearIndex = 0;
AtomicAllocator allocator = AtomicAllocator.getInstance();
for (INDArray m : matrices) {
CudaContext context = allocator.getFlowController().prepareAction(ret, m);
if (m.ordering() == order && ret.elementWiseStride() == m.elementWiseStride() && ret.elementWiseStride() == 1) {
// do the memcpy in the proper direction and move on
allocator.memcpyAsync(ret.data(),
        new CudaPointer(allocator.getHostPointer(m).address()),
        AllocationUtils.getRequiredMemory(AllocationUtils.buildAllocationShape(m)),
        linearIndex * (m.data().dataType() == DataBuffer.Type.DOUBLE ? 8
                : m.data().dataType() == DataBuffer.Type.FLOAT ? 4 : 2));
linearIndex += m.length();
} else {
Pointer hostYShapeInfo = AddressRetriever.retrieveHostPointer(m.shapeInfoDataBuffer());
PointerPointer extras = new PointerPointer(
        AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()),
        context.getOldStream(),
        allocator.getDeviceIdPointer(),
        context.getBufferAllocation(),
        context.getBufferReduction(),
        context.getBufferScalar(),
        context.getBufferSpecial(),
        hostYShapeInfo,
        AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()));
if (m.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.flattenDouble(extras, linearIndex, order, (DoublePointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (DoublePointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
} else if (m.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.flattenFloat(extras, linearIndex, order, (FloatPointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (FloatPointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
} else {
nativeOps.flattenHalf(extras, linearIndex, order, (ShortPointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (ShortPointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
}
// Works for all cases...
/* NdIndexIterator iter = new NdIndexIterator(order, m.shape());
while (iter.hasNext()) {
ret.putScalar(linearIndex++, m.getDouble(iter.next()));
}*/
linearIndex += m.length();
}
if (ret != null)
allocator.registerAction(context, ret, m);
}
return ret;
}
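A hedged usage sketch for the flatten path, via the Nd4j facade (which delegates to this factory on the CUDA backend); shapes and values are illustrative only.
import java.util.Arrays;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class FlattenExample {
    public static void main(String[] args) {
        INDArray a = Nd4j.create(new float[] {1, 2, 3, 4}, new int[] {2, 2});
        INDArray b = Nd4j.create(new float[] {5, 6, 7, 8}, new int[] {2, 2});
        // c-order flatten concatenates the row-major views into one 1x8 row vector
        INDArray flat = Nd4j.toFlattened('c', Arrays.asList(a, b));
        System.out.println(flat); // [1.00, 2.00, 3.00, 4.00, 5.00, 6.00, 7.00, 8.00]
    }
}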
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method sort.
@Override
public INDArray sort(INDArray x, boolean descending) {
if (x.isScalar())
return x;
Nd4j.getExecutioner().push();
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(x);
Pointer ptr = AtomicAllocator.getInstance().getHostPointer(x.shapeInfoDataBuffer());
PointerPointer extraz = new PointerPointer(ptr, // 0
        context.getOldStream(), // 1
        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
        context.getBufferAllocation(), // 3
        context.getBufferReduction(), // 4
        context.getBufferScalar(), // 5
        context.getBufferSpecial(), // 6
        ptr, // 7
        AtomicAllocator.getInstance().getHostPointer(x.shapeInfoDataBuffer()), // 8
        ptr, // 9
        ptr, // 10
        ptr, // 11
        ptr, // 12
        ptr, // 13
        ptr, // 14
        ptr, // special pointer for IsMax // 15
        ptr, // special pointer for IsMax // 16
        ptr, // special pointer for IsMax // 17
        new CudaPointer(0));
// radix sort is used only for non-view arrays with more than 10M elements
boolean isRadix = !x.isView() && (x.lengthLong() > 1024 * 1024 * 10);
INDArray tmpX = x;
// we need to guarantee all threads are finished here
if (isRadix)
Nd4j.getExecutioner().commit();
if (x.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.sortFloat(extraz, (FloatPointer) AtomicAllocator.getInstance().getPointer(tmpX, context), (IntPointer) AtomicAllocator.getInstance().getPointer(tmpX.shapeInfoDataBuffer(), context), descending);
} else if (x.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.sortDouble(extraz, (DoublePointer) AtomicAllocator.getInstance().getPointer(tmpX, context), (IntPointer) AtomicAllocator.getInstance().getPointer(tmpX.shapeInfoDataBuffer(), context), descending);
} else if (x.data().dataType() == DataBuffer.Type.HALF) {
nativeOps.sortHalf(extraz, (ShortPointer) AtomicAllocator.getInstance().getPointer(tmpX, context), (IntPointer) AtomicAllocator.getInstance().getPointer(tmpX.shapeInfoDataBuffer(), context), descending);
} else {
throw new UnsupportedOperationException("Unknown dataType " + x.data().dataType());
}
AtomicAllocator.getInstance().getFlowController().registerAction(context, x);
return x;
}
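A hedged usage sketch for the sort above. Whether the Nd4j.sort facade interprets its boolean as ascending or descending varies by version, so this calls the factory method directly with the descending flag shown in the signature above, assuming NDArrayFactory exposes sort as overridden here.
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class SortExample {
    public static void main(String[] args) {
        INDArray v = Nd4j.create(new float[] {3, 1, 2});
        // sorts in place along the flattened view and returns the same array
        Nd4j.factory().sort(v, false); // descending == false -> ascending
        System.out.println(v);         // [1.00, 2.00, 3.00]
        Nd4j.factory().sort(v, true);  // descending == true
        System.out.println(v);         // [3.00, 2.00, 1.00]
    }
}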
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method average.
@Override
public INDArray average(INDArray target, INDArray[] arrays) {
if (arrays == null || arrays.length == 0)
throw new RuntimeException("Input arrays are missing");
if (arrays.length == 1)
return target.assign(arrays[0]);
// we do averaging on GPU only if ALL devices have p2p links
if (nativeOps.isP2PAvailable() && CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()) {
Nd4j.getExecutioner().push();
long len = target != null ? target.lengthLong() : arrays[0].lengthLong();
AtomicAllocator allocator = AtomicAllocator.getInstance();
CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
PointerPointer extras = new PointerPointer(null, // not used
        context.getOldStream(),
        allocator.getDeviceIdPointer(),
        new CudaPointer(0));
Pointer z = target == null ? null : AtomicAllocator.getInstance().getPointer(target, context);
long[] xPointers = new long[arrays.length];
for (int i = 0; i < arrays.length; i++) {
if (arrays[i].elementWiseStride() != 1)
throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
if (arrays[i].lengthLong() != len)
throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
xPointers[i] = point.getPointers().getDevicePointer().address();
point.tickDeviceWrite();
}
CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.averageDouble(extras, x, target == null ? null : (DoublePointer) z, arrays.length, len, true);
} else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.averageFloat(extras, x, target == null ? null : (FloatPointer) z, arrays.length, len, true);
} else {
nativeOps.averageHalf(extras, x, target == null ? null : (ShortPointer) z, arrays.length, len, true);
}
allocator.getFlowController().registerAction(context, target, arrays);
tempX.address(); // bare call presumably keeps tempX reachable until the native op completes
return target;
} else {
// otherwise we do averaging on the CPU side
// we expect all operations to be complete at this point
long len = target == null ? arrays[0].lengthLong() : target.lengthLong();
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
PointerPointer dataPointers = new PointerPointer(arrays.length);
PointerPointer extras = new PointerPointer(null, // not used
        context.getOldStream(),
        AtomicAllocator.getInstance().getDeviceIdPointer(),
        new CudaPointer(1));
for (int i = 0; i < arrays.length; i++) {
Nd4j.getCompressor().autoDecompress(arrays[i]);
if (arrays[i].elementWiseStride() != 1)
throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
if (arrays[i].lengthLong() != len)
throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
}
if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.averageDouble(extras, dataPointers, target == null ? null : (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
} else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.averageFloat(extras, dataPointers, target == null ? null : (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
} else {
nativeOps.averageHalf(extras, dataPointers, target == null ? null : (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
}
if (target != null)
AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
// TODO: make propagation optional maybe?
if (true) {
for (int i = 0; i < arrays.length; i++) {
AtomicAllocator.getInstance().getAllocationPoint(arrays[i]).tickHostWrite();
}
}
return target;
}
}
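A hedged usage sketch for the averaging above, again via the factory (Nd4j.factory() is assumed to return this JCublas factory on the CUDA backend). The result is the element-wise mean; with propagation enabled, as in both paths above, the mean is written back into every input array as well.
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class AverageExample {
    public static void main(String[] args) {
        INDArray[] ins = {
                Nd4j.create(new float[] {1, 2}),
                Nd4j.create(new float[] {3, 4}),
                Nd4j.create(new float[] {5, 6})
        };
        INDArray target = Nd4j.create(1, 2); // preallocated result buffer
        Nd4j.factory().average(target, ins);
        System.out.println(target); // [3.00, 4.00]; each input now holds the mean too
    }
}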
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class JcublasLapack, method dsyev.
public int dsyev(char _jobz, char _uplo, int N, INDArray A, INDArray R) {
int status = -1;
int jobz = _jobz == 'V' ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
int uplo = _uplo == 'L' ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("DOUBLE dsyev called in FLOAT environment");
INDArray a = A;
if (A.ordering() == 'c')
a = A.dup('f');
int M = A.rows();
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// set up the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
status = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (status == 0) {
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xRPointer = new CublasPointer(R, ctx);
// this output indicates how much workspace memory the real operation will need
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
status = cusolverDnDsyevd_bufferSize(solverDn, jobz, uplo, M, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xRPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
if (status == CUSOLVER_STATUS_SUCCESS) {
int worksize = worksizeBuffer.getInt(0);
// allocate memory for the workspace and for the device-side return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
INDArray INFO = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createInt(1), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, 1 }));
// Do the actual decomp
status = cusolverDnDsyevd(solverDn, jobz, uplo, M, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xRPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
allocator.registerAction(ctx, INFO);
if (status == 0)
status = INFO.getInt(0);
}
}
}
if (status == 0) {
allocator.registerAction(ctx, R);
allocator.registerAction(ctx, a);
if (a != A)
A.assign(a);
}
return status;
}
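A hedged usage sketch for the eigendecomposition. It assumes a DOUBLE default data type (otherwise the worker above logs a warning) and that the Lapack facade from Nd4j.getBlasWrapper().lapack() dispatches its syev method to this dsyev worker; exact facade names vary by version, so treat the call as illustrative.
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class SyevExample {
    public static void main(String[] args) {
        // symmetric 2x2 matrix with eigenvalues 1 and 3
        INDArray A = Nd4j.create(new double[][] {{2, 1}, {1, 2}});
        INDArray w = Nd4j.create(2); // receives the eigenvalues
        // 'V' also computes eigenvectors (returned in A); 'L' reads the lower triangle
        int status = Nd4j.getBlasWrapper().lapack().syev('V', 'L', A, w);
        System.out.println(status); // 0 on success
        System.out.println(w);      // eigenvalues in ascending order: [1.00, 3.00]
    }
}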