
Example 11 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From class CudaGridExecutioner, method pointerizeOp().

/**
 * Returns the given Op as the set of device pointers required to execute it.
 * @param op the op to pointerize
 * @param dimensions optional dimensions (used to build TAD buffers)
 * @return GridPointers holding pointers for the op's operands and their shape info
 */
protected GridPointers pointerizeOp(Op op, int... dimensions) {
    GridPointers pointers = new GridPointers(op, dimensions);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    // CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(op.z(), op.x(), op.y());
    // FIXME: do not leave it as is
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
    pointers.setX(allocator.getPointer(op.x(), context));
    pointers.setXShapeInfo(allocator.getPointer(op.x().shapeInfoDataBuffer(), context));
    pointers.setZ(allocator.getPointer(op.z(), context));
    pointers.setZShapeInfo(allocator.getPointer(op.z().shapeInfoDataBuffer(), context));
    pointers.setZLength(op.z().length());
    if (op.y() != null) {
        pointers.setY(allocator.getPointer(op.y(), context));
        pointers.setYShapeInfo(allocator.getPointer(op.y().shapeInfoDataBuffer(), context));
    }
    if (dimensions != null && dimensions.length > 0) {
        DataBuffer dimensionBuffer = Nd4j.getConstantHandler().getConstantBuffer(dimensions);
        pointers.setDimensions(allocator.getPointer(dimensionBuffer, context));
        pointers.setDimensionsLength(dimensions.length);
    }
    // we build TADs
    if (dimensions != null && dimensions.length > 0) {
        Pair<DataBuffer, DataBuffer> tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), dimensions);
        Pointer devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
        Pointer devTadOffsets = tadBuffers.getSecond() == null ? null : AtomicAllocator.getInstance().getPointer(tadBuffers.getSecond(), context);
        // we don't really care, if tadOffsets will be nulls
        pointers.setTadShape(devTadShapeInfo);
        pointers.setTadOffsets(devTadOffsets);
    }
    return pointers;
}
Also used: GridPointers(org.nd4j.linalg.api.ops.grid.GridPointers) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)
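
The same pointerization pattern can be applied outside CudaGridExecutioner whenever a device pointer for an INDArray and its shape-info buffer is needed. Below is a minimal sketch, not from the nd4j sources: the class name and array are illustrative, and the Pointer type is assumed to be org.bytedeco.javacpp.Pointer, which AtomicAllocator.getPointer is expected to return.

import org.bytedeco.javacpp.Pointer;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.jcublas.context.CudaContext;

public class PointerizeSketch {
    public static void main(String[] args) {
        // illustrative array; any INDArray created on the CUDA backend would do
        INDArray x = Nd4j.create(4, 4);

        AtomicAllocator allocator = AtomicAllocator.getInstance();
        // same context retrieval as in pointerizeOp above
        CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();

        // device pointers for the data buffer and the shape-info buffer
        Pointer xData = allocator.getPointer(x, context);
        Pointer xShape = allocator.getPointer(x.shapeInfoDataBuffer(), context);

        System.out.println("data pointer:  " + xData);
        System.out.println("shape pointer: " + xShape);
    }
}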

Example 12 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From class JCublasNDArray, method unsafeDuplication().

@Override
public INDArray unsafeDuplication(boolean blocking) {
    DataBuffer rb = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createSame(this.data, false) : Nd4j.getDataBufferFactory().createSame(this.data, false, Nd4j.getMemoryManager().getCurrentWorkspace());
    INDArray ret = Nd4j.createArrayFromShapeBuffer(rb, this.shapeInfoDataBuffer());
    if (blocking)
        Nd4j.getExecutioner().push();
    // Nd4j.getExecutioner().commit();
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
    AllocationPoint srcPoint = allocator.getAllocationPoint(this);
    AllocationPoint dstPoint = allocator.getAllocationPoint(ret);
    int route = 0;
    if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE && srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
        // d2d copy
        route = 1;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getDevicePointer(), srcPoint.getDevicePointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyDeviceToDevice, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickDeviceWrite();
    } else if (dstPoint.getAllocationStatus() == AllocationStatus.HOST && srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
        route = 2;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getHostPointer(), srcPoint.getDevicePointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyDeviceToHost, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickHostWrite();
    } else if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE && srcPoint.getAllocationStatus() == AllocationStatus.HOST) {
        route = 3;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getDevicePointer(), srcPoint.getHostPointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickDeviceWrite();
    } else {
        route = 4;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getHostPointer(), srcPoint.getHostPointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyHostToHost, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickHostWrite();
    }
    if (blocking)
        context.syncOldStream();
    else
        context.syncSpecialStream();
    /*
        long time2 = System.currentTimeMillis();

        long bytes = this.data.length() * this.data.getElementSize();
        long spent = time2 - time1;

        float bw = (1000 * bytes / spent) / 1024 / 1024.0f / 1024; //1000 / spent * bytes / 1024 / 1024 / 1024;

        log.info("Route: [{}]; Blocking: {}; {} bytes; {} ms; Bandwidth: {} GB/s", route, blocking, bytes, spent, String.format("%.2f", bw));
*/
    return ret;
}
Also used: INDArray(org.nd4j.linalg.api.ndarray.INDArray) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer)
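
unsafeDuplication above is essentially a raw memcpy of the backing buffer, choosing one of four routes (device-to-device, device-to-host, host-to-device, host-to-host) based on where source and destination currently live. As a rough usage illustration, not from the nd4j sources, and assuming unsafeDuplication(boolean) is exposed on INDArray as the @Override suggests:

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class UnsafeDupSketch {
    public static void main(String[] args) {
        INDArray source = Nd4j.linspace(1, 16, 16).reshape(4, 4);

        // blocking = true: pending ops are pushed to the executioner first and the copy
        // runs on the context's "old" stream, which is synchronized before returning
        INDArray copy = source.unsafeDuplication(true);

        // blocking = false: the copy is issued on the special stream instead
        // (still synchronized by the method itself, per the branch above)
        INDArray asyncCopy = source.unsafeDuplication(false);

        System.out.println(copy);
        System.out.println(asyncCopy);
    }
}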

Example 13 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From class AveragingTests, method testMultiDeviceAveraging().

/**
 * This test should only be run on a multi-GPU system; on a single-GPU system it will fail.
 * @throws Exception
 */
@Test
public void testMultiDeviceAveraging() throws Exception {
    final List<Pair<INDArray, INDArray>> pairs = new ArrayList<>();
    int numDevices = Nd4j.getAffinityManager().getNumberOfDevices();
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    for (int i = 0; i < THREADS; i++) {
        final int order = i;
        Thread thread = new Thread(new Runnable() {

            @Override
            public void run() {
                pairs.add(new Pair<INDArray, INDArray>(Nd4j.valueArrayOf(LENGTH, (double) order), null));
                try {
                    Thread.sleep(100);
                } catch (Exception e) {
                // 
                }
            }
        });
        thread.start();
        thread.join();
    }
    assertEquals(THREADS, pairs.size());
    final List<INDArray> arrays = new ArrayList<>();
    AtomicBoolean hasNonZero = new AtomicBoolean(false);
    for (int i = 0; i < THREADS; i++) {
        INDArray array = pairs.get(i).getKey();
        AllocationPoint point = allocator.getAllocationPoint(array.data());
        if (point.getDeviceId() != 0)
            hasNonZero.set(true);
        arrays.add(array);
    }
    assertEquals(true, hasNonZero.get());
    /*
        // old way of averaging, without further propagation
        INDArray z = Nd4j.create(LENGTH);
        long time1 = System.currentTimeMillis();
        for (int i = 0; i < THREADS; i++) {
            z.addi(arrays.get(i));
        }
        z.divi((float) THREADS);
        CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
        context.syncOldStream();
        long time2 = System.currentTimeMillis();
        System.out.println("Execution time: " + (time2 - time1));

*/
    long time1 = System.currentTimeMillis();
    INDArray z = Nd4j.averageAndPropagate(arrays);
    long time2 = System.currentTimeMillis();
    System.out.println("Execution time: " + (time2 - time1));
    assertEquals(7.5f, z.getFloat(0), 0.01f);
    assertEquals(7.5f, z.getFloat(10), 0.01f);
    for (int i = 0; i < THREADS; i++) {
        for (int x = 0; x < LENGTH; x++) {
            assertEquals("Failed on array [" + i + "], element [" + x + "]", z.getFloat(0), arrays.get(i).getFloat(x), 0.01f);
        }
    }
}
Also used: AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) ArrayList(java.util.ArrayList) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.nd4j.linalg.primitives.Pair) Test(org.junit.Test)
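
The call the test exercises is Nd4j.averageAndPropagate, which computes the element-wise average of the input arrays and writes the result back into each of them; that propagation is exactly what the final assertion loop checks. A minimal standalone sketch, not from the nd4j sources, with illustrative sizes and values:

import java.util.ArrayList;
import java.util.List;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class AveragingSketch {
    public static void main(String[] args) {
        List<INDArray> arrays = new ArrayList<>();
        for (int i = 0; i < 4; i++) {
            // four constant arrays with values 0, 1, 2, 3 -> average is 1.5
            arrays.add(Nd4j.valueArrayOf(100, (double) i));
        }

        // returns the average and propagates it back into every input array
        INDArray average = Nd4j.averageAndPropagate(arrays);

        System.out.println(average.getFloat(0));        // 1.5
        System.out.println(arrays.get(0).getFloat(0));  // also 1.5 after propagation
    }
}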

Example 14 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From class DelayedMemoryTest, method testDelayedAllocation2().

/**
 * This test should be run manually
 *
 * @throws Exception
 */
@Test
public void testDelayedAllocation2() throws Exception {
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    INDArray array = Nd4j.create(10, 10);
    AllocationPoint pointer = allocator.getAllocationPoint(array);
    PointersPair pair = pointer.getPointers();
    // pointers should be equal, device memory wasn't allocated yet
    assertEquals(pair.getDevicePointer(), pair.getHostPointer());
    // ////////////
    AllocationPoint shapePointer = allocator.getAllocationPoint(array.shapeInfoDataBuffer());
    // pointers should be equal, device memory wasn't allocated yet
    assertEquals(shapePointer.getPointers().getDevicePointer(), shapePointer.getPointers().getHostPointer());
    assertEquals(pointer.getAllocationStatus(), AllocationStatus.HOST);
    assertEquals(shapePointer.getAllocationStatus(), AllocationStatus.HOST);
    float sum = array.sumNumber().floatValue();
    assertEquals(0.0f, sum, 0.0001f);
    shapePointer = allocator.getAllocationPoint(array.shapeInfoDataBuffer());
    pointer = allocator.getAllocationPoint(array);
    assertEquals(AllocationStatus.CONSTANT, shapePointer.getAllocationStatus());
    assertEquals(AllocationStatus.DEVICE, pointer.getAllocationStatus());
    // at this point all pointers should be different, since we've executed an op (sumNumber)
    assertNotEquals(shapePointer.getPointers().getDevicePointer(), shapePointer.getPointers().getHostPointer());
}
Also used: INDArray(org.nd4j.linalg.api.ndarray.INDArray) PointersPair(org.nd4j.jita.allocator.pointers.PointersPair) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) Test(org.junit.Test)
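
The same AllocationPoint inspection can be used to observe the lazy allocation outside a test: right after creation an array is backed by host memory only, and the first op forces it onto the device. A minimal sketch built from the calls used in the test above (the class name is illustrative):

import org.nd4j.jita.allocator.impl.AllocationPoint;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class DelayedAllocationSketch {
    public static void main(String[] args) {
        AtomicAllocator allocator = AtomicAllocator.getInstance();

        INDArray array = Nd4j.create(10, 10);
        AllocationPoint point = allocator.getAllocationPoint(array);
        // freshly created: backed by host memory only
        System.out.println("before op: " + point.getAllocationStatus());   // HOST

        // any op forces the data onto the device
        array.sumNumber();

        point = allocator.getAllocationPoint(array);
        System.out.println("after op:  " + point.getAllocationStatus());   // DEVICE
    }
}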

Aggregations

AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator) 14
AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint) 12
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 11
CudaContext (org.nd4j.linalg.jcublas.context.CudaContext) 10
CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer) 8
DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer) 6
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException) 5
CudaDoubleDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) 5
GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner) 4
Test (org.junit.Test) 3
TADManager (org.nd4j.linalg.cache.TADManager) 3
CompressedDataBuffer (org.nd4j.linalg.compression.CompressedDataBuffer) 3
CudaIntDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer) 3
ArrayList (java.util.ArrayList) 1
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean) 1
PointersPair (org.nd4j.jita.allocator.pointers.PointersPair) 1
BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer) 1
PagedPointer (org.nd4j.linalg.api.memory.pointers.PagedPointer) 1
GridPointers (org.nd4j.linalg.api.ops.grid.GridPointers) 1
Pair (org.nd4j.linalg.primitives.Pair) 1