Search in sources :

Example 41 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class CudaMemoryManager method collect.

/**
 * This method detaches off-heap memory from passed INDArray instances, and optionally stores them in cache for future reuse
 * PLEASE NOTE: Cache options depend on specific implementations
 *
 * @param arrays
 */
@Override
public void collect(INDArray... arrays) {
    // we basically want to free memory, without touching INDArray itself.
    // so we don't care when gc is going to release object: memory is already cached
    Nd4j.getExecutioner().commit();
    int cnt = -1;
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    for (INDArray array : arrays) {
        cnt++;
        // we don't collect views, since they don't have their own memory
        if (array == null || array.isView())
            continue;
        AllocationPoint point = allocator.getAllocationPoint(array);
        if (point.getAllocationStatus() == AllocationStatus.HOST)
            allocator.getMemoryHandler().free(point, AllocationStatus.HOST);
        else if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
            allocator.getMemoryHandler().free(point, AllocationStatus.DEVICE);
            allocator.getMemoryHandler().free(point, AllocationStatus.HOST);
        } else if (point.getAllocationStatus() == AllocationStatus.DEALLOCATED) {
        // do nothing
        } else
            throw new RuntimeException("Unknown AllocationStatus: " + point.getAllocationStatus() + " for argument: " + cnt);
        point.setAllocationStatus(AllocationStatus.DEALLOCATED);
    }
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)

Example 42 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class CudaMemoryManager method memcpy.

/**
 * This method provides basic memcpy functionality with respect to target environment
 *
 * @param dstBuffer
 * @param srcBuffer
 */
@Override
public void memcpy(DataBuffer dstBuffer, DataBuffer srcBuffer) {
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
    if (dstBuffer instanceof CompressedDataBuffer && !(srcBuffer instanceof CompressedDataBuffer)) {
        // destination is compressed, source isn't
        AllocationPoint srcPoint = AtomicAllocator.getInstance().getAllocationPoint(srcBuffer);
        long size = srcBuffer.getElementSize() * srcBuffer.length();
        if (!srcPoint.isActualOnHostSide()) {
            // copying device -> host
            AtomicAllocator.getInstance().synchronizeHostData(srcBuffer);
        // Pointer src = AtomicAllocator.getInstance().getPointer(srcBuffer, context);
        // NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstBuffer.addressPointer(), src, size, 2, context.getSpecialStream());
        // context.syncSpecialStream();
        }
        // else {
        // copying host -> host
        Pointer src = AtomicAllocator.getInstance().getHostPointer(srcBuffer);
        Pointer.memcpy(dstBuffer.addressPointer(), src, size);
    // }
    } else if (!(dstBuffer instanceof CompressedDataBuffer) && srcBuffer instanceof CompressedDataBuffer) {
        // destination is NOT compressed, source is compressed
        AllocationPoint dstPoint = AtomicAllocator.getInstance().getAllocationPoint(dstBuffer);
        long size = srcBuffer.getElementSize() * srcBuffer.length();
        Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), size);
        dstPoint.tickHostWrite();
    } else if (dstBuffer instanceof CompressedDataBuffer && srcBuffer instanceof CompressedDataBuffer) {
        // both buffers are compressed, just fire memcpy
        Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), srcBuffer.length() * srcBuffer.getElementSize());
    } else {
        // both buffers are NOT compressed
        AtomicAllocator.getInstance().memcpy(dstBuffer, srcBuffer);
    }
}
Also used : CompressedDataBuffer(org.nd4j.linalg.compression.CompressedDataBuffer) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) Pointer(org.bytedeco.javacpp.Pointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)

Example 43 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class CudaFullCachingProvider method malloc.

/**
 * This method provides PointersPair to memory chunk specified by AllocationShape
 *
 * PLEASE NOTE: This method can actually ignore malloc request, and give out previously cached free memory chunk with equal shape.
 *
 * @param shape shape of desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.DEVICE && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumDeviceAllocation()) {
        int deviceId = AtomicAllocator.getInstance().getDeviceId();
        ensureDeviceCacheHolder(deviceId, shape);
        CacheHolder cache = deviceCache.get(deviceId).get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheDeviceHit.incrementAndGet();
                deviceCachedAmount.get(deviceId).addAndGet(-1 * reqMemory);
                PointersPair pair = new PointersPair();
                pair.setDevicePointer(pointer);
                point.setAllocationStatus(AllocationStatus.DEVICE);
                point.setDeviceId(deviceId);
                return pair;
            }
        }
        cacheDeviceMiss.incrementAndGet();
        return super.malloc(shape, point, location);
    }
    return super.malloc(shape, point, location);
}
Also used : PointersPair(org.nd4j.jita.allocator.pointers.PointersPair) Pointer(org.bytedeco.javacpp.Pointer) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)

Example 44 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class JCublasNDArrayFactory method accumulate.

public INDArray accumulate(INDArray target, INDArray... arrays) {
    if (arrays == null || arrays.length == 0)
        throw new RuntimeException("Input arrays are missing");
    if (arrays.length == 1)
        return target.assign(arrays[0]);
    // we do averaging on GPU only if ALL devices have p2p links
    if (CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && nativeOps.isP2PAvailable()) {
        Nd4j.getExecutioner().push();
        long len = target.lengthLong();
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
        PointerPointer extras = new // not used
        PointerPointer(// not used
        null, context.getOldStream(), allocator.getDeviceIdPointer(), new CudaPointer(0));
        Pointer z = AtomicAllocator.getInstance().getPointer(target, context);
        long[] xPointers = new long[arrays.length];
        for (int i = 0; i < arrays.length; i++) {
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
            xPointers[i] = point.getPointers().getDevicePointer().address();
            point.tickDeviceWrite();
        }
        CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
        allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
        PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, x, (DoublePointer) z, arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, x, (FloatPointer) z, arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, x, (ShortPointer) z, arrays.length, len);
        }
        allocator.getFlowController().registerAction(context, target, arrays);
        tempX.address();
        return target;
    } else {
        long len = target.lengthLong();
        Nd4j.getExecutioner().commit();
        CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
        PointerPointer dataPointers = new PointerPointer(arrays.length);
        PointerPointer extras = new // not used
        PointerPointer(// not used
        null, context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer(), new CudaPointer(1));
        for (int i = 0; i < arrays.length; i++) {
            Nd4j.getCompressor().autoDecompress(arrays[i]);
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
        }
        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, dataPointers, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, dataPointers, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, dataPointers, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        }
        AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
        return target;
    }
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 45 with AllocationPoint

use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.

the class JCublasNDArrayFactory method specialConcat.

@Override
public INDArray specialConcat(int dimension, INDArray... toConcat) {
    if (toConcat.length == 1)
        return toConcat[0];
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    PointerPointer shapeInfoPointers = new PointerPointer(toConcat.length);
    PointerPointer dataPointers = new PointerPointer(toConcat.length);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
    int sumAlongDim = 0;
    int[] outputShape = ArrayUtil.copy(toConcat[0].shape());
    for (int i = 0; i < toConcat.length; i++) {
        if (toConcat[i].isCompressed())
            Nd4j.getCompressor().decompressi(toConcat[i]);
        allocator.synchronizeHostData(toConcat[i]);
        shapeInfoPointers.put(i, allocator.getHostPointer(toConcat[i].shapeInfoDataBuffer()));
        dataPointers.put(i, allocator.getHostPointer(toConcat[i].data()));
        sumAlongDim += toConcat[i].size(dimension);
        for (int j = 0; j < toConcat[i].rank(); j++) if (j != dimension && toConcat[i].size(j) != outputShape[j]) {
            throw new IllegalArgumentException("Illegal concatenation at array " + i + " and shape element " + j);
        }
    }
    outputShape[dimension] = sumAlongDim;
    PointerPointer dummy = new PointerPointer(new Pointer[] { null });
    INDArray ret = Nd4j.createUninitialized(outputShape, Nd4j.order());
    if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.specialConcatDouble(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (DoublePointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.specialConcatFloat(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (FloatPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else if (ret.data().dataType() == DataBuffer.Type.HALF) {
        nativeOps.specialConcatHalf(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (ShortPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else {
        throw new ND4JIllegalStateException("Unknown dataType: " + ret.data().dataType());
    }
    AllocationPoint point = allocator.getAllocationPoint(ret);
    nativeOps.memcpyAsync(point.getDevicePointer(), point.getHostPointer(), ret.lengthLong() * Nd4j.sizeOfDataType(ret.data().dataType()), CudaConstants.cudaMemcpyHostToDevice, context.getSpecialStream());
    context.getSpecialStream().synchronize();
    point.tickHostRead();
    point.tickDeviceWrite();
    return ret;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException)

Aggregations

AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint)67 INDArray (org.nd4j.linalg.api.ndarray.INDArray)33 Test (org.junit.Test)31 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)24 CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)15 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)11 ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException)11 AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator)7 BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer)7 Pointer (org.bytedeco.javacpp.Pointer)6 AllocationShape (org.nd4j.jita.allocator.impl.AllocationShape)5 PointersPair (org.nd4j.jita.allocator.pointers.PointersPair)5 MemoryWorkspace (org.nd4j.linalg.api.memory.MemoryWorkspace)4 JCublasNDArray (org.nd4j.linalg.jcublas.JCublasNDArray)3 CudaDoubleDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer)3 CompressedDataBuffer (org.nd4j.linalg.compression.CompressedDataBuffer)2 DeviceLocalNDArray (org.nd4j.linalg.util.DeviceLocalNDArray)2 DataInputStream (java.io.DataInputStream)1 DataOutputStream (java.io.DataOutputStream)1 FileInputStream (java.io.FileInputStream)1