Search in sources :

Example 1 with AtomicAllocator

use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

This example shows the method concat of the class JCublasNDArrayFactory.

/**
 * Concatenates the given arrays along {@code dimension} using the native CUDA concat kernel.
 *
 * @param dimension the dimension to concatenate along
 * @param toConcat  arrays to join; all dimensions other than {@code dimension} must match
 *                  the first array's shape
 * @return a newly allocated array containing the inputs joined along {@code dimension};
 *         when only one array is passed, that array is returned as-is
 * @throws IllegalArgumentException if any input disagrees with the expected shape on a
 *                                  non-concatenation dimension
 */
@Override
public INDArray concat(int dimension, INDArray... toConcat) {
    // Flush any queued grid ops before we start touching device memory directly.
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    if (toConcat.length == 1)
        return toConcat[0];
    // Total extent of the output along the concatenation dimension.
    int sumAlongDim = 0;
    for (int i = 0; i < toConcat.length; i++) {
        if (toConcat[i].isCompressed())
            Nd4j.getCompressor().decompressi(toConcat[i]);
        sumAlongDim += toConcat[i].size(dimension);
    }
    int[] outputShape = ArrayUtil.copy(toConcat[0].shape());
    outputShape[dimension] = sumAlongDim;
    INDArray ret = Nd4j.createUninitialized(outputShape, Nd4j.order());
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareAction(ret, toConcat);
    // Per-input device/host addresses handed to the native kernel.
    long[] shapeInfoPointers = new long[toConcat.length];
    long[] dataPointers = new long[toConcat.length];
    long[] tadPointers = new long[toConcat.length];
    long[] offsetsPointers = new long[toConcat.length];
    long[] hostShapeInfoPointers = new long[toConcat.length];
    TADManager tadManager = Nd4j.getExecutioner().getTADManager();
    for (int i = 0; i < toConcat.length; i++) {
        // Validate before pulling device pointers: every non-concat dimension must
        // match the computed output shape.
        for (int j = 0; j < toConcat[i].rank(); j++) {
            if (j != dimension && toConcat[i].size(j) != outputShape[j])
                throw new IllegalArgumentException("Illegal concatenation at array " + i + " and shape element " + j);
        }
        shapeInfoPointers[i] = AddressRetriever.retrieveDeviceAddress(toConcat[i].shapeInfoDataBuffer(), context);
        dataPointers[i] = AtomicAllocator.getInstance().getPointer(toConcat[i], context).address();
        hostShapeInfoPointers[i] = AtomicAllocator.getInstance().getHostPointer(toConcat[i].shapeInfoDataBuffer()).address();
        // TAD (tensor-along-dimension) shape info and offsets for this input.
        Pair<DataBuffer, DataBuffer> tadBuffers = tadManager.getTADOnlyShapeInfo(toConcat[i], new int[] { dimension });
        tadPointers[i] = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context).address();
        offsetsPointers[i] = AtomicAllocator.getInstance().getPointer(tadBuffers.getSecond(), context).address();
    }
    // TAD shape info for the result.
    Pair<DataBuffer, DataBuffer> zBuffers = tadManager.getTADOnlyShapeInfo(ret, new int[] { dimension });
    Pointer dZ = AtomicAllocator.getInstance().getPointer(ret, context);
    Pointer dZShapeInfo = AddressRetriever.retrieveDevicePointer(ret.shapeInfoDataBuffer(), context);
    // Double buffers are (ab)used here as raw 8-byte-per-element device storage for the
    // pointer tables above -- the contents are addresses, not doubles.
    CudaDoubleDataBuffer tempData = new CudaDoubleDataBuffer(toConcat.length);
    CudaDoubleDataBuffer tempShapes = new CudaDoubleDataBuffer(toConcat.length);
    CudaDoubleDataBuffer tempTAD = new CudaDoubleDataBuffer(toConcat.length);
    CudaDoubleDataBuffer tempOffsets = new CudaDoubleDataBuffer(toConcat.length);
    AtomicAllocator.getInstance().memcpyBlocking(tempData, new LongPointer(dataPointers), dataPointers.length * 8, 0);
    AtomicAllocator.getInstance().memcpyBlocking(tempShapes, new LongPointer(shapeInfoPointers), shapeInfoPointers.length * 8, 0);
    AtomicAllocator.getInstance().memcpyBlocking(tempTAD, new LongPointer(tadPointers), tadPointers.length * 8, 0);
    AtomicAllocator.getInstance().memcpyBlocking(tempOffsets, new LongPointer(offsetsPointers), offsetsPointers.length * 8, 0);
    Pointer dataPointer = AtomicAllocator.getInstance().getPointer(tempData, context);
    Pointer shapesPointer = AtomicAllocator.getInstance().getPointer(tempShapes, context);
    Pointer tadPointer = AtomicAllocator.getInstance().getPointer(tempTAD, context);
    Pointer offsetPointer = AtomicAllocator.getInstance().getPointer(tempOffsets, context);
    // extras: host shape info of the result, stream, device id, scratch buffers,
    // host shape infos of the inputs, and the result's TAD shape/offsets.
    PointerPointer extras = new PointerPointer(AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), context.getOldStream(), allocator.getDeviceIdPointer(), context.getBufferAllocation(), context.getBufferReduction(), context.getBufferScalar(), context.getBufferSpecial(), AddressRetriever.retrieveHostPointer(toConcat[0].shapeInfoDataBuffer()), AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), new LongPointer(hostShapeInfoPointers), // zTADShape
    AtomicAllocator.getInstance().getPointer(zBuffers.getFirst(), context), // zOffset
    AtomicAllocator.getInstance().getPointer(zBuffers.getSecond(), context));
    // Dispatch to the native kernel matching the result's data type.
    if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.concatDouble(extras, dimension, toConcat.length, new PointerPointer(new Pointer[] { dataPointer }), new PointerPointer(new Pointer[] { shapesPointer }), (DoublePointer) dZ, (IntPointer) dZShapeInfo, new PointerPointer(new Pointer[] { tadPointer }), new PointerPointer(new Pointer[] { offsetPointer }));
    } else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.concatFloat(extras, dimension, toConcat.length, new PointerPointer(new Pointer[] { dataPointer }), new PointerPointer(new Pointer[] { shapesPointer }), (FloatPointer) dZ, (IntPointer) dZShapeInfo, new PointerPointer(new Pointer[] { tadPointer }), new PointerPointer(new Pointer[] { offsetPointer }));
    } else {
        nativeOps.concatHalf(extras, dimension, toConcat.length, new PointerPointer(new Pointer[] { dataPointer }), new PointerPointer(new Pointer[] { shapesPointer }), (ShortPointer) dZ, (IntPointer) dZShapeInfo, new PointerPointer(new Pointer[] { tadPointer }), new PointerPointer(new Pointer[] { offsetPointer }));
    }
    allocator.registerAction(context, ret, toConcat);
    return ret;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) TADManager(org.nd4j.linalg.cache.TADManager) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) CudaIntDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer) CompressedDataBuffer(org.nd4j.linalg.compression.CompressedDataBuffer) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer)

Example 2 with AtomicAllocator

use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

This example shows the method toFlattened of the class JCublasNDArrayFactory.

/**
 * Flattens all matrices into a single 1 x N row vector, copying each input in the
 * iteration order of {@code matrices} using the requested element order.
 *
 * @param order    element order ('c' or 'f') for reading each input and for the output
 * @param matrices arrays to flatten and concatenate
 * @return a freshly created 1 x (sum of input lengths) row vector
 */
@Override
public INDArray toFlattened(char order, Collection<INDArray> matrices) {
    // Flush any queued grid ops before touching buffers directly.
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    // Total number of elements across all inputs.
    int length = 0;
    for (INDArray m : matrices) length += m.length();
    INDArray ret = Nd4j.create(new int[] { 1, length }, order);
    // Current write offset (in elements) into ret.
    int linearIndex = 0;
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    for (INDArray m : matrices) {
        CudaContext context = allocator.getFlowController().prepareAction(ret, m);
        if (m.ordering() == order && ret.elementWiseStride() == m.elementWiseStride() && ret.elementWiseStride() == 1) {
            // Fast path: layouts match and both are contiguous, so a raw memcpy suffices.
            // The byte offset is element offset * element size (8 double / 4 float / 2 half).
            allocator.memcpyAsync(ret.data(), new CudaPointer(allocator.getHostPointer(m).address()), AllocationUtils.getRequiredMemory(AllocationUtils.buildAllocationShape(m)), linearIndex * (m.data().dataType() == DataBuffer.Type.DOUBLE ? 8 : m.data().dataType() == DataBuffer.Type.FLOAT ? 4 : 2));
            linearIndex += m.length();
        } else {
            // Slow path: let the native flatten kernel reorder elements into ret.
            Pointer hostYShapeInfo = AddressRetriever.retrieveHostPointer(m.shapeInfoDataBuffer());
            // extras: host shape info of ret, stream, device id, scratch buffers,
            // host shape info of m, host shape info of ret again.
            PointerPointer extras = new PointerPointer(AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), context.getOldStream(), allocator.getDeviceIdPointer(), context.getBufferAllocation(), context.getBufferReduction(), context.getBufferScalar(), context.getBufferSpecial(), hostYShapeInfo, AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()));
            // Dispatch on the input's data type.
            if (m.data().dataType() == DataBuffer.Type.DOUBLE) {
                nativeOps.flattenDouble(extras, linearIndex, order, (DoublePointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (DoublePointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
            } else if (m.data().dataType() == DataBuffer.Type.FLOAT) {
                nativeOps.flattenFloat(extras, linearIndex, order, (FloatPointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (FloatPointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
            } else {
                nativeOps.flattenHalf(extras, linearIndex, order, (ShortPointer) allocator.getPointer(ret, context), (IntPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (ShortPointer) allocator.getPointer(m, context), (IntPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
            }
            linearIndex += m.length();
        }
        // ret is always non-null here; check kept from the original code.
        if (ret != null)
            allocator.registerAction(context, ret, m);
    }
    return ret;
}
Also used : GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 3 with AtomicAllocator

use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

This example shows the method average of the class JCublasNDArrayFactory.

/**
 * Computes the element-wise average of {@code arrays}, writing the result into
 * {@code target} (when non-null) and propagating the mean back into every input
 * array (the final {@code true} argument to the native average calls).
 *
 * Averaging runs on the GPU only when every device pair has P2P access and
 * cross-device access is allowed by configuration; otherwise it runs host-side.
 *
 * @param target destination array; may be null in the multi-array paths, in which
 *               case only the inputs are updated and null is returned
 * @param arrays arrays to average; must all be contiguous and of equal length
 * @return {@code target} (possibly null)
 * @throws ND4JIllegalStateException if any input is non-contiguous or lengths differ
 */
@Override
public INDArray average(INDArray target, INDArray[] arrays) {
    if (arrays == null || arrays.length == 0)
        throw new RuntimeException("Input arrays are missing");
    // Single array: the average is the array itself.
    // NOTE(review): this NPEs when target == null, although the multi-array paths
    // below tolerate a null target -- confirm intended contract.
    if (arrays.length == 1)
        return target.assign(arrays[0]);
    // we do averaging on GPU only if ALL devices have p2p links
    if (nativeOps.isP2PAvailable() && CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()) {
        Nd4j.getExecutioner().push();
        long len = target != null ? target.lengthLong() : arrays[0].lengthLong();
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
        // extras: [unused, stream, deviceId, CudaPointer(0)] -- the trailing pointer
        // presumably selects the device-side path (host path below passes 1); confirm.
        PointerPointer extras = new PointerPointer(null, context.getOldStream(), allocator.getDeviceIdPointer(), new CudaPointer(0));
        Pointer z = target == null ? null : AtomicAllocator.getInstance().getPointer(target, context);
        // Device addresses of every input buffer.
        long[] xPointers = new long[arrays.length];
        for (int i = 0; i < arrays.length; i++) {
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
            xPointers[i] = point.getPointers().getDevicePointer().address();
            // Inputs get overwritten with the mean (propagation), so mark device dirty.
            point.tickDeviceWrite();
        }
        // Double buffer reused as raw 8-byte-per-element storage for pointer addresses.
        CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
        allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
        PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
        // Dispatch on the inputs' data type.
        if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.averageDouble(extras, x, target == null ? null : (DoublePointer) z, arrays.length, len, true);
        } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.averageFloat(extras, x, target == null ? null : (FloatPointer) z, arrays.length, len, true);
        } else {
            nativeOps.averageHalf(extras, x, target == null ? null : (ShortPointer) z, arrays.length, len, true);
        }
        allocator.getFlowController().registerAction(context, target, arrays);
        // NOTE(review): presumably keeps tempX strongly reachable up to this point so
        // the pointer table isn't GC'd while the kernel may still use it -- confirm.
        tempX.address();
        return target;
    } else {
        // otherwise we do averaging on CPU side
        /**
         * We expect all operations are complete at this point
         */
        long len = target == null ? arrays[0].lengthLong() : target.lengthLong();
        CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
        PointerPointer dataPointers = new PointerPointer(arrays.length);
        // extras: [unused, stream, deviceId, CudaPointer(1)] -- trailing pointer
        // presumably selects the host-side path; confirm.
        PointerPointer extras = new PointerPointer(null, context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer(), new CudaPointer(1));
        for (int i = 0; i < arrays.length; i++) {
            Nd4j.getCompressor().autoDecompress(arrays[i]);
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            // Host pointers: this path reads/writes host-side buffers.
            dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
        }
        if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.averageDouble(extras, dataPointers, target == null ? null : (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
        } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.averageFloat(extras, dataPointers, target == null ? null : (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
        } else {
            nativeOps.averageHalf(extras, dataPointers, target == null ? null : (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
        }
        // Host buffers were written by the native call; mark them dirty host-side.
        if (target != null)
            AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
        // TODO: make propagation optional maybe?
        if (true) {
            for (int i = 0; i < arrays.length; i++) {
                AtomicAllocator.getInstance().getAllocationPoint(arrays[i]).tickHostWrite();
            }
        }
        return target;
    }
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)

Example 4 with AtomicAllocator

use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

This example shows the method testDelayedAllocation1 of the class DelayedMemoryTest.

@Test
public void testDelayedAllocation1() throws Exception {
    final AtomicAllocator allocator = AtomicAllocator.getInstance();
    final int workerCount = 6;
    final INDArray[] arrays = new INDArray[workerCount];
    final Thread[] threads = new Thread[workerCount];
    final int[] cards = new int[workerCount];
    // Phase 1: create every array up front. Nothing has touched the device yet,
    // so both the data buffer and the shape buffer must still live in host memory.
    for (int i = 0; i < workerCount; i++) {
        arrays[i] = Nd4j.create(new float[] { 1f, 2f, 3f, 4f, 5f });
        assertEquals(AllocationStatus.HOST, allocator.getAllocationPoint(arrays[i]).getAllocationStatus());
        assertEquals(AllocationStatus.HOST, allocator.getAllocationPoint(arrays[i].shapeInfoDataBuffer()).getAllocationStatus());
    }
    // Phase 2: run one reduction per array from its own thread, recording which
    // device each worker ended up on.
    for (int i = 0; i < workerCount; i++) {
        final int idx = i;
        threads[idx] = new Thread(new Runnable() {

            @Override
            public void run() {
                float sum = arrays[idx].sumNumber().floatValue();
                cards[idx] = allocator.getDeviceId();
                assertEquals("Failed on C: " + idx, 15f, sum, 0.001f);
            }
        });
        threads[idx].start();
    }
    for (Thread worker : threads) {
        worker.join();
    }
    // Phase 3: after device-side work, device and host pointers must differ for
    // both the data buffer and the shape buffer.
    for (int i = 0; i < workerCount; i++) {
        AllocationPoint dataPoint = allocator.getAllocationPoint(arrays[i]);
        AllocationPoint shapePoint = allocator.getAllocationPoint(arrays[i].shapeInfoDataBuffer());
        assertNotEquals(dataPoint.getPointers().getDevicePointer(), dataPoint.getPointers().getHostPointer());
        assertNotEquals(shapePoint.getPointers().getDevicePointer(), shapePoint.getPointers().getHostPointer());
    }
    // Every device present in the system should have been used by some worker.
    int numDevices = Nd4j.getAffinityManager().getNumberOfDevices();
    for (int c = 0; c < numDevices; c++) {
        assertTrue("Failed to find device [" + c + "] in used devices", ArrayUtils.contains(cards, c));
    }
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) Test(org.junit.Test)

Example 5 with AtomicAllocator

use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

This example shows the method collect of the class CudaMemoryManager.

/**
 * This method detaches off-heap memory from passed INDArray instances, and optionally stores them in cache for future reuse
 * PLEASE NOTE: Cache options depend on specific implementations
 *
 * @param arrays
 */
/**
 * Detaches off-heap memory from the passed INDArray instances without touching
 * the INDArray objects themselves; each allocation point ends up DEALLOCATED.
 *
 * @param arrays arrays whose backing memory should be released; nulls and views
 *               are skipped (views own no memory of their own)
 */
@Override
public void collect(INDArray... arrays) {
    // Make sure all pending ops are finished before freeing their buffers.
    Nd4j.getExecutioner().commit();
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    int cnt = -1;
    for (INDArray array : arrays) {
        cnt++;
        // views don't own memory, so there is nothing to collect for them
        if (array == null || array.isView())
            continue;
        AllocationPoint point = allocator.getAllocationPoint(array);
        switch (point.getAllocationStatus()) {
            case HOST:
                allocator.getMemoryHandler().free(point, AllocationStatus.HOST);
                break;
            case DEVICE:
                // device-resident buffers also have a host mirror to release
                allocator.getMemoryHandler().free(point, AllocationStatus.DEVICE);
                allocator.getMemoryHandler().free(point, AllocationStatus.HOST);
                break;
            case DEALLOCATED:
                // already released; nothing to free
                break;
            default:
                throw new RuntimeException("Unknown AllocationStatus: " + point.getAllocationStatus() + " for argument: " + cnt);
        }
        point.setAllocationStatus(AllocationStatus.DEALLOCATED);
    }
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)

Aggregations

AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator)14 AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint)12 INDArray (org.nd4j.linalg.api.ndarray.INDArray)11 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)10 CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)8 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)6 ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException)5 CudaDoubleDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer)5 GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)4 Test (org.junit.Test)3 TADManager (org.nd4j.linalg.cache.TADManager)3 CompressedDataBuffer (org.nd4j.linalg.compression.CompressedDataBuffer)3 CudaIntDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer)3 ArrayList (java.util.ArrayList)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 PointersPair (org.nd4j.jita.allocator.pointers.PointersPair)1 BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer)1 PagedPointer (org.nd4j.linalg.api.memory.pointers.PagedPointer)1 GridPointers (org.nd4j.linalg.api.ops.grid.GridPointers)1 Pair (org.nd4j.linalg.primitives.Pair)1