Example 6 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaExecutioner method getEnvironmentInformation.

/**
 * This method returns a set of key/value (and nested key/key/value)
 * objects describing the current environment.
 *
 * @return Properties describing the backend, devices and memory state
 */
@Override
public synchronized Properties getEnvironmentInformation() {
    if (properties == null) {
        Properties props = super.getEnvironmentInformation();
        List<Map<String, Object>> devicesList = new ArrayList<>();
        for (int i = 0; i < nativeOps.getAvailableDevices(); i++) {
            Map<String, Object> deviceProps = new HashMap<>();
            CudaPointer devPtr = new CudaPointer(i);
            deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_NAME_KEY, nativeOps.getDeviceName(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MAJOR_VERSION_KEY, (long) nativeOps.getDeviceMajor(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MINOR_VERSION_KEY, (long) nativeOps.getDeviceMinor(devPtr));
            devicesList.add(i, deviceProps);
        }
        props.put(Nd4jEnvironment.BACKEND_KEY, "CUDA");
        props.put(Nd4jEnvironment.CUDA_NUM_GPUS_KEY, nativeOps.getAvailableDevices());
        props.put(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY, devicesList);
        props.put(Nd4jEnvironment.BLAS_VENDOR_KEY, (Nd4j.factory().blas()).getBlasVendor().toString());
        props.put(Nd4jEnvironment.HOST_FREE_MEMORY_KEY, Pointer.maxBytes() - Pointer.totalBytes());
        properties = props;
    } else {
        List<Map<String, Object>> devicesList = (List<Map<String, Object>>) properties.get(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY);
        for (int i = 0; i < nativeOps.getAvailableDevices(); i++) {
            Map<String, Object> dev = devicesList.get(i);
            CudaPointer devPtr = new CudaPointer(i);
            dev.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr));
            dev.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr));
        }
        properties.put(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY, devicesList);
        properties.put(Nd4jEnvironment.HOST_FREE_MEMORY_KEY, Pointer.maxBytes() - Pointer.totalBytes());
    }
    return properties;
}
Also used : AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)
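
As a quick usage illustration, the assembled Properties can be read back through the executioner. This is a minimal sketch, assuming an initialized CUDA backend; it only uses the Nd4jEnvironment keys populated above.

// Minimal sketch: reading back the environment info assembled above.
// Assumes an initialized CUDA backend and java.util imports.
Properties env = Nd4j.getExecutioner().getEnvironmentInformation();
System.out.println("Backend: " + env.get(Nd4jEnvironment.BACKEND_KEY));
System.out.println("GPUs: " + env.get(Nd4jEnvironment.CUDA_NUM_GPUS_KEY));
List<Map<String, Object>> devices =
        (List<Map<String, Object>>) env.get(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY);
for (Map<String, Object> device : devices) {
    System.out.println(device.get(Nd4jEnvironment.CUDA_DEVICE_NAME_KEY) + ": "
            + device.get(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY) + " of "
            + device.get(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY) + " bytes free");
}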

Example 7 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaExecutioner method exec.

@Override
public void exec(Aggregate op) {
    int numArguments = op.getArguments().size();
    int numShapeArguments = op.getShapes().size();
    int numIndexArguments = op.getIndexingArguments().size();
    int numIntArrays = op.getIntArrayArguments().size();
    int numRealArguments = op.getRealArguments().size();
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
    PointerPointer extraArgs = new PointerPointer(32);
    extraArgs.put(0, null);
    extraArgs.put(1, context.getOldStream());
    extraArgs.put(2, new CudaPointer(1));
    extraArgs.put(3, new CudaPointer(op.getThreadsPerInstance()));
    extraArgs.put(4, new CudaPointer(op.getSharedMemorySize()));
    long[] arguments = new long[numArguments];
    for (int x = 0; x < numArguments; x++) {
        arguments[x] = op.getArguments().get(x) == null ? 0 : AtomicAllocator.getInstance().getPointer(op.getArguments().get(x), context).address();
        if (op.getArguments().get(x) != null)
            AtomicAllocator.getInstance().getAllocationPoint(op.getArguments().get(x)).tickDeviceWrite();
    }
    DataBuffer tempX = AllocationUtils.getPointersBuffer(arguments);
    PointerPointer xPtr = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
    long[] shapes = new long[numShapeArguments];
    for (int x = 0; x < numShapeArguments; x++) {
        shapes[x] = op.getShapes().get(x) == null ? 0 : AtomicAllocator.getInstance().getPointer(op.getShapes().get(x), context).address();
        if (op.getShapes().get(x) != null)
            AtomicAllocator.getInstance().getAllocationPoint(op.getShapes().get(x)).tickDeviceWrite();
    }
    DataBuffer tempS = AllocationUtils.getPointersBuffer(shapes);
    PointerPointer sPtr = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempS, context));
    long[] ints = new long[numIntArrays];
    for (int x = 0; x < numIntArrays; x++) {
        if (op.getIntArrayArguments().get(x) != null) {
            DataBuffer intBuf = Nd4j.getDataBufferFactory().createInt(op.getIntArrayArguments().get(x));
            ints[x] = AtomicAllocator.getInstance().getPointer(intBuf, context).address();
        }
    }
    DataBuffer tempI = AllocationUtils.getPointersBuffer(ints);
    PointerPointer iPtr = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempI, context));
    int[] indexes = new int[numIndexArguments];
    for (int x = 0; x < numIndexArguments; x++) {
        indexes[x] = op.getIndexingArguments().get(x);
    }
    DataBuffer intBuffer = Nd4j.getDataBufferFactory().createInt(indexes);
    double[] reals = new double[numRealArguments];
    for (int x = 0; x < numRealArguments; x++) {
        reals[x] = op.getRealArguments().get(x).doubleValue();
    }
    INDArray realsBuffer = Nd4j.create(reals);
    if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.execAggregateFloat(extraArgs, op.opNum(), xPtr, numArguments, sPtr, numShapeArguments, (IntPointer) AtomicAllocator.getInstance().getPointer(intBuffer, context), numIndexArguments, iPtr, numIntArrays, (FloatPointer) AtomicAllocator.getInstance().getPointer(realsBuffer.data(), context), numRealArguments);
    } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.execAggregateDouble(extraArgs, op.opNum(), xPtr, numArguments, sPtr, numShapeArguments, (IntPointer) AtomicAllocator.getInstance().getPointer(intBuffer, context), numIndexArguments, iPtr, numIntArrays, (DoublePointer) AtomicAllocator.getInstance().getPointer(realsBuffer.data(), context), numRealArguments);
    } else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
        nativeOps.execAggregateHalf(extraArgs, op.opNum(), xPtr, numArguments, sPtr, numShapeArguments, (IntPointer) AtomicAllocator.getInstance().getPointer(intBuffer, context), numIndexArguments, iPtr, numIntArrays, (ShortPointer) AtomicAllocator.getInstance().getPointer(realsBuffer.data(), context), numRealArguments);
    }
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) BaseDataBuffer(org.nd4j.linalg.api.buffer.BaseDataBuffer)
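
Callers normally do not build these pointer tables themselves; they pass an Aggregate subclass to the executioner, which routes through this overload. A hedged sketch, assuming an AggregateAxpy (y = alpha * x + y) aggregate with an (x, y, alpha) constructor is available in this nd4j version:

// Hypothetical usage sketch: dispatching an Aggregate op through the executioner.
INDArray x = Nd4j.ones(1024);
INDArray y = Nd4j.zeros(1024);
// AggregateAxpy and its constructor signature are assumed here for illustration
Nd4j.getExecutioner().exec(new AggregateAxpy(x, y, 2.0f));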

Example 8 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class LimitedContextPool method fillPoolWithResources.

protected synchronized void fillPoolWithResources(int numResources, boolean restoreDevice) {
    List<Integer> devices = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices();
    int cDevice = 0;
    if (restoreDevice) {
        cDevice = AtomicAllocator.getInstance().getDeviceId();
    }
    NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();
    for (Integer device : devices) {
        nativeOps.setDevice(new CudaPointer(device));
        pool.put(device, new LinkedBlockingQueue<CudaContext>());
        cublasHandle_t handle = createNewCublasHandle();
        cusolverDnHandle_t solverHandle = createNewSolverHandle();
        for (int cnt = 0; cnt < numResources; cnt++) {
            CudaContext context = createNewStream(device);
            context.initOldStream();
            getDeviceBuffers(context, device);
            context.setHandle(handle);
            context.setSolverHandle(solverHandle);
            context.syncOldStream();
            pool.get(device).add(context);
        }
    }
    if (restoreDevice) {
        nativeOps.setDevice(new CudaPointer(cDevice));
    }
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) NativeOps(org.nd4j.nativeblas.NativeOps) org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t(org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t) org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t(org.nd4j.jita.allocator.pointers.cuda.cublasHandle_t) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)
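
After the pool is filled, consumers generally do not touch it directly; they borrow a per-device context through the allocator, the same pattern Example 7 uses:

// Borrowing a pooled context for the current device (as in Example 7 above).
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
// ... enqueue work, then wait for the borrowed stream to drain:
context.syncOldStream();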

Example 9 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class BasicContextPool method createNewStream.

protected CudaContext createNewStream(Integer deviceId) {
    log.debug("Creating new stream for thread: [{}], device: [{}]...", Thread.currentThread().getId(), deviceId);
    // JCuda.cudaSetDevice(deviceId);
    nativeOps.setDevice(new CudaPointer(deviceId));
    CudaContext context = new CudaContext();
    context.initOldStream();
    return context;
}
Also used : CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)
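
Note that the CudaPointer handed to setDevice simply wraps the device ordinal as a pointer-sized value for the JavaCPP NativeOps bridge. A standalone sketch of the same call, using the NativeOpsHolder lookup from Example 8:

// Select device 0 through the NativeOps bridge; CudaPointer wraps the ordinal.
NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();
nativeOps.setDevice(new CudaPointer(0));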

Example 10 with CudaPointer

use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

the class CudaCachingZeroProvider method malloc.

/**
 * This method provides a PointersPair for a memory chunk described by the given AllocationShape.
 *
 * PLEASE NOTE: This method may ignore the malloc request and instead hand out a previously
 * cached free memory chunk of equal shape.
 *
 * @param shape shape of the desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return PointersPair referencing the allocated (or cache-reused) memory chunk
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.HOST && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCacheableLength()) {
        CacheHolder cache = zeroCache.get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheZeroHit.incrementAndGet();
// since this memory chunk is going to be used now, subtract its size from the cached amount
                zeroCachedAmount.addAndGet(-1 * reqMemory);
                PointersPair pair = new PointersPair();
                pair.setDevicePointer(new CudaPointer(pointer.address()));
                pair.setHostPointer(new CudaPointer(pointer.address()));
                point.setAllocationStatus(AllocationStatus.HOST);
                return pair;
            }
        }
        cacheZeroMiss.incrementAndGet();
        if (CudaEnvironment.getInstance().getConfiguration().isUsePreallocation() && zeroCachedAmount.get() < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache() / 10 && reqMemory < 16 * 1024 * 1024L) {
            CachePreallocator preallocator = new CachePreallocator(shape, location, CudaEnvironment.getInstance().getConfiguration().getPreallocationCalls());
            preallocator.start();
        }
        return super.malloc(shape, point, location);
    }
    return super.malloc(shape, point, location);
}
Also used : PointersPair(org.nd4j.jita.allocator.pointers.PointersPair) Pointer(org.bytedeco.javacpp.Pointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)
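
To exercise the cache, a caller would issue a small HOST request; on a later request of equal shape, a previously freed chunk can be handed back. A hedged sketch; the no-arg provider constructor and the AllocationShape(length, elementSize, dataType) constructor are assumed for illustration:

// Hypothetical sketch: a small HOST allocation that may be served from the cache.
CudaCachingZeroProvider provider = new CudaCachingZeroProvider(); // assumed no-arg ctor
AllocationShape shape = new AllocationShape(1024, 4, DataBuffer.Type.FLOAT); // assumed ctor
AllocationPoint point = new AllocationPoint();
PointersPair pair = provider.malloc(shape, point, AllocationStatus.HOST);
// an equal-shape chunk released via free(...) becomes eligible for cacheZeroHit reuse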

Aggregations

CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer): 47 usages
CudaContext (org.nd4j.linalg.jcublas.context.CudaContext): 27 usages
AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint): 20 usages
Pointer (org.bytedeco.javacpp.Pointer): 18 usages
DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer): 18 usages
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 15 usages
cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t): 12 usages
GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner): 11 usages
DoublePointer (org.bytedeco.javacpp.DoublePointer): 10 usages
FloatPointer (org.bytedeco.javacpp.FloatPointer): 10 usages
IntPointer (org.bytedeco.javacpp.IntPointer): 10 usages
CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st): 10 usages
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException): 10 usages
CublasPointer (org.nd4j.linalg.jcublas.CublasPointer): 10 usages
BlasException (org.nd4j.linalg.api.blas.BlasException): 8 usages
BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer): 7 usages
AllocationShape (org.nd4j.jita.allocator.impl.AllocationShape): 4 usages
AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator): 4 usages
BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer): 4 usages
INDArrayIndex (org.nd4j.linalg.indexing.INDArrayIndex): 4 usages