Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.
The class CudaGridExecutioner, method pointerizeOp.
/**
 * This method converts the given Op into the set of device pointers required to execute it.
 *
 * @param op the op to pointerize
 * @param dimensions optional dimensions, used for TAD-based ops
 * @return GridPointers holding the resolved device pointers
 */
protected GridPointers pointerizeOp(Op op, int... dimensions) {
    GridPointers pointers = new GridPointers(op, dimensions);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    // CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(op.z(), op.x(), op.y());
    // FIXME: do not leave it as is
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
    pointers.setX(allocator.getPointer(op.x(), context));
    pointers.setXShapeInfo(allocator.getPointer(op.x().shapeInfoDataBuffer(), context));
    pointers.setZ(allocator.getPointer(op.z(), context));
    pointers.setZShapeInfo(allocator.getPointer(op.z().shapeInfoDataBuffer(), context));
    pointers.setZLength(op.z().length());
    if (op.y() != null) {
        pointers.setY(allocator.getPointer(op.y(), context));
        pointers.setYShapeInfo(allocator.getPointer(op.y().shapeInfoDataBuffer(), context));
    }
    if (dimensions != null && dimensions.length > 0) {
        DataBuffer dimensionBuffer = Nd4j.getConstantHandler().getConstantBuffer(dimensions);
        pointers.setDimensions(allocator.getPointer(dimensionBuffer, context));
        pointers.setDimensionsLength(dimensions.length);
    }
    // we build TADs for dimension-based ops
    if (dimensions != null && dimensions.length > 0) {
        Pair<DataBuffer, DataBuffer> tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), dimensions);
        Pointer devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
        Pointer devTadOffsets = tadBuffers.getSecond() == null ? null : AtomicAllocator.getInstance().getPointer(tadBuffers.getSecond(), context);
        // we don't really care if tadOffsets ends up null here
        pointers.setTadShape(devTadShapeInfo);
        pointers.setTadOffsets(devTadOffsets);
    }
    return pointers;
}
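The same allocator calls can be reproduced outside the executioner. The sketch below is illustrative only: the wrapper class is hypothetical and the import paths are best-effort guesses for the nd4j-cuda backend. It resolves device pointers for an array and for its shape-info buffer, which is exactly what pointerizeOp does for op.x() and op.z().

import org.bytedeco.javacpp.Pointer;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.jcublas.context.CudaContext;

public class PointerizeSketch {
    public static void main(String[] args) {
        // array backed by the CUDA backend
        INDArray x = Nd4j.create(4, 4);

        AtomicAllocator allocator = AtomicAllocator.getInstance();
        // same context lookup the executioner uses above
        CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();

        // resolve device pointers for the data buffer and its shape information
        Pointer data = allocator.getPointer(x, context);
        Pointer shapeInfo = allocator.getPointer(x.shapeInfoDataBuffer(), context);

        System.out.println("data pointer:      " + data.address());
        System.out.println("shapeInfo pointer: " + shapeInfo.address());
    }
}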
Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.
The class JCublasNDArray, method unsafeDuplication.
@Override
public INDArray unsafeDuplication(boolean blocking) {
    DataBuffer rb = Nd4j.getMemoryManager().getCurrentWorkspace() == null
                    ? Nd4j.getDataBufferFactory().createSame(this.data, false)
                    : Nd4j.getDataBufferFactory().createSame(this.data, false, Nd4j.getMemoryManager().getCurrentWorkspace());
    INDArray ret = Nd4j.createArrayFromShapeBuffer(rb, this.shapeInfoDataBuffer());
    if (blocking)
        Nd4j.getExecutioner().push();
    // Nd4j.getExecutioner().commit();
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
    AllocationPoint srcPoint = allocator.getAllocationPoint(this);
    AllocationPoint dstPoint = allocator.getAllocationPoint(ret);
    int route = 0;
    if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE && srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
        // device-to-device copy
        route = 1;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getDevicePointer(), srcPoint.getDevicePointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyDeviceToDevice, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickDeviceWrite();
    } else if (dstPoint.getAllocationStatus() == AllocationStatus.HOST && srcPoint.getAllocationStatus() == AllocationStatus.DEVICE) {
        // device-to-host copy
        route = 2;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getHostPointer(), srcPoint.getDevicePointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyDeviceToHost, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickHostWrite();
    } else if (dstPoint.getAllocationStatus() == AllocationStatus.DEVICE && srcPoint.getAllocationStatus() == AllocationStatus.HOST) {
        // host-to-device copy
        route = 3;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getDevicePointer(), srcPoint.getHostPointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyHostToDevice, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickDeviceWrite();
    } else {
        // host-to-host copy
        route = 4;
        NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstPoint.getHostPointer(), srcPoint.getHostPointer(), this.data.length() * this.data.getElementSize(), CudaConstants.cudaMemcpyHostToHost, blocking ? context.getOldStream() : context.getSpecialStream());
        dstPoint.tickHostWrite();
    }
    if (blocking)
        context.syncOldStream();
    else
        context.syncSpecialStream();
    /*
    long time2 = System.currentTimeMillis();
    long bytes = this.data.length() * this.data.getElementSize();
    long spent = time2 - time1;
    float bw = (1000 * bytes / spent) / 1024 / 1024.0f / 1024; //1000 / spent * bytes / 1024 / 1024 / 1024;
    log.info("Route: [{}]; Blocking: {}; {} bytes; {} ms; Bandwidth: {} GB/s", route, blocking, bytes, spent, String.format("%.2f", bw));
    */
    return ret;
}
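A quick usage sketch for context (the wrapper class is hypothetical, and it assumes unsafeDuplication(boolean) is reachable through the INDArray interface, as the @Override above suggests): with blocking = true the executioner queue is flushed and the copy runs over the main stream, so the duplicate is safe to read as soon as the call returns; with blocking = false the special stream is used instead.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class UnsafeDuplicationSketch {
    public static void main(String[] args) {
        INDArray source = Nd4j.linspace(1, 16, 16).reshape(4, 4);

        // blocking duplication: the copy is issued and synchronized before returning
        INDArray copy = source.unsafeDuplication(true);

        // the duplicate owns its own buffer, so mutating it leaves the source intact
        copy.addi(1.0);
        System.out.println(source);
        System.out.println(copy);
    }
}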
Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.
The class AveragingTests, method testMultiDeviceAveraging.
/**
 * This test should be run on a multi-GPU system only; on a single-GPU system it will fail.
 *
 * @throws Exception
 */
@Test
public void testMultiDeviceAveraging() throws Exception {
    final List<Pair<INDArray, INDArray>> pairs = new ArrayList<>();
    int numDevices = Nd4j.getAffinityManager().getNumberOfDevices();
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    for (int i = 0; i < THREADS; i++) {
        final int order = i;
        Thread thread = new Thread(new Runnable() {
            @Override
            public void run() {
                pairs.add(new Pair<INDArray, INDArray>(Nd4j.valueArrayOf(LENGTH, (double) order), null));
                try {
                    Thread.sleep(100);
                } catch (Exception e) {
                    //
                }
            }
        });
        thread.start();
        thread.join();
    }
    assertEquals(THREADS, pairs.size());
    final List<INDArray> arrays = new ArrayList<>();
    AtomicBoolean hasNonZero = new AtomicBoolean(false);
    for (int i = 0; i < THREADS; i++) {
        INDArray array = pairs.get(i).getKey();
        AllocationPoint point = allocator.getAllocationPoint(array.data());
        if (point.getDeviceId() != 0)
            hasNonZero.set(true);
        arrays.add(array);
    }
    assertEquals(true, hasNonZero.get());
    /*
    // old way of averaging, without further propagation
    INDArray z = Nd4j.create(LENGTH);
    long time1 = System.currentTimeMillis();
    for (int i = 0; i < THREADS; i++) {
        z.addi(arrays.get(i));
    }
    z.divi((float) THREADS);
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
    context.syncOldStream();
    long time2 = System.currentTimeMillis();
    System.out.println("Execution time: " + (time2 - time1));
    */
    long time1 = System.currentTimeMillis();
    INDArray z = Nd4j.averageAndPropagate(arrays);
    long time2 = System.currentTimeMillis();
    System.out.println("Execution time: " + (time2 - time1));
    assertEquals(7.5f, z.getFloat(0), 0.01f);
    assertEquals(7.5f, z.getFloat(10), 0.01f);
    for (int i = 0; i < THREADS; i++) {
        for (int x = 0; x < LENGTH; x++) {
            assertEquals("Failed on array [" + i + "], element [" + x + "]", z.getFloat(0), arrays.get(i).getFloat(x), 0.01f);
        }
    }
}
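To make the final assertion concrete, here is a condensed single-process sketch (the wrapper class and sizes are made up for illustration): Nd4j.averageAndPropagate returns the element-wise mean of the supplied arrays and also writes that mean back into every input, which is why each arrays.get(i) is expected to match z in the loop above.

import java.util.ArrayList;
import java.util.List;

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class AveragingSketch {
    public static void main(String[] args) {
        List<INDArray> arrays = new ArrayList<>();
        // four arrays filled with the constants 0, 1, 2 and 3
        for (int i = 0; i < 4; i++)
            arrays.add(Nd4j.valueArrayOf(100, (double) i));

        // returns the mean and propagates it back into every input array
        INDArray mean = Nd4j.averageAndPropagate(arrays);

        System.out.println(mean.getFloat(0));           // 1.5
        System.out.println(arrays.get(0).getFloat(0));  // also 1.5 after propagation
    }
}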
Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.
The class DelayedMemoryTest, method testDelayedAllocation2.
/**
* This test should be run manually
*
* @throws Exception
*/
@Test
public void testDelayedAllocation2() throws Exception {
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    INDArray array = Nd4j.create(10, 10);
    AllocationPoint pointer = allocator.getAllocationPoint(array);
    PointersPair pair = pointer.getPointers();
    // pointers should be equal, device memory wasn't allocated yet
    assertEquals(pair.getDevicePointer(), pair.getHostPointer());
    // ////////////
    AllocationPoint shapePointer = allocator.getAllocationPoint(array.shapeInfoDataBuffer());
    // pointers should be equal, device memory wasn't allocated yet
    assertEquals(shapePointer.getPointers().getDevicePointer(), shapePointer.getPointers().getHostPointer());
    assertEquals(pointer.getAllocationStatus(), AllocationStatus.HOST);
    assertEquals(shapePointer.getAllocationStatus(), AllocationStatus.HOST);
    float sum = array.sumNumber().floatValue();
    assertEquals(0.0f, sum, 0.0001f);
    shapePointer = allocator.getAllocationPoint(array.shapeInfoDataBuffer());
    pointer = allocator.getAllocationPoint(array);
    assertEquals(AllocationStatus.CONSTANT, shapePointer.getAllocationStatus());
    assertEquals(AllocationStatus.DEVICE, pointer.getAllocationStatus());
    // at this point all pointers should be different, since we've executed an op (sumNumber)
    assertNotEquals(shapePointer.getPointers().getDevicePointer(), shapePointer.getPointers().getHostPointer());
}
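The lifecycle this test pins down can be summarized in a short sketch (imports and the wrapper class are best-effort assumptions for the CUDA backend): buffers of a freshly created array live host-side only, and the first op executed against the array is what promotes its data buffer to device memory.

import org.nd4j.jita.allocator.impl.AllocationPoint;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class DelayedAllocationSketch {
    public static void main(String[] args) {
        AtomicAllocator allocator = AtomicAllocator.getInstance();

        INDArray array = Nd4j.create(10, 10);
        AllocationPoint point = allocator.getAllocationPoint(array);
        // freshly created arrays stay on the host until an op needs them on the GPU
        System.out.println("before op: " + point.getAllocationStatus()); // HOST

        array.sumNumber(); // any op execution forces the data buffer onto the device

        point = allocator.getAllocationPoint(array);
        System.out.println("after op:  " + point.getAllocationStatus()); // DEVICE
    }
}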