Use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
The class CudaMemoryManager, method collect.
/**
 * This method detaches off-heap memory from the passed INDArray instances and optionally stores it in a cache for future reuse.
 * PLEASE NOTE: Cache options depend on the specific implementation.
 *
 * @param arrays arrays whose off-heap memory should be released (views are skipped)
 */
@Override
public void collect(INDArray... arrays) {
    // we basically want to free memory, without touching INDArray itself.
    // so we don't care when gc is going to release object: memory is already cached
    Nd4j.getExecutioner().commit();

    int cnt = -1;
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    for (INDArray array : arrays) {
        cnt++;
        // we don't collect views, since they don't have their own memory
        if (array == null || array.isView())
            continue;

        AllocationPoint point = allocator.getAllocationPoint(array);
        if (point.getAllocationStatus() == AllocationStatus.HOST)
            allocator.getMemoryHandler().free(point, AllocationStatus.HOST);
        else if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
            allocator.getMemoryHandler().free(point, AllocationStatus.DEVICE);
            allocator.getMemoryHandler().free(point, AllocationStatus.HOST);
        } else if (point.getAllocationStatus() == AllocationStatus.DEALLOCATED) {
            // do nothing
        } else
            throw new RuntimeException("Unknown AllocationStatus: " + point.getAllocationStatus() + " for argument: " + cnt);

        point.setAllocationStatus(AllocationStatus.DEALLOCATED);
    }
}
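For orientation, a minimal usage sketch (not part of the project source): it assumes the nd4j CUDA backend is active, so that Nd4j.getMemoryManager() returns this CudaMemoryManager; the array and its size are purely illustrative.

INDArray temp = Nd4j.create(1024, 1024);   // backed by off-heap host (and, once used on the GPU, device) memory
// ... temporary computations with temp ...
Nd4j.getMemoryManager().collect(temp);     // frees or caches the off-heap buffers; the Java object itself is left to the GC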
Use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
The class CudaMemoryManager, method memcpy.
/**
 * This method provides basic memcpy functionality with respect to the target environment.
 *
 * @param dstBuffer destination buffer
 * @param srcBuffer source buffer
 */
@Override
public void memcpy(DataBuffer dstBuffer, DataBuffer srcBuffer) {
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

    if (dstBuffer instanceof CompressedDataBuffer && !(srcBuffer instanceof CompressedDataBuffer)) {
        // destination is compressed, source isn't
        AllocationPoint srcPoint = AtomicAllocator.getInstance().getAllocationPoint(srcBuffer);
        long size = srcBuffer.getElementSize() * srcBuffer.length();
        if (!srcPoint.isActualOnHostSide()) {
            // copying device -> host
            AtomicAllocator.getInstance().synchronizeHostData(srcBuffer);
            // Pointer src = AtomicAllocator.getInstance().getPointer(srcBuffer, context);
            // NativeOpsHolder.getInstance().getDeviceNativeOps().memcpyAsync(dstBuffer.addressPointer(), src, size, 2, context.getSpecialStream());
            // context.syncSpecialStream();
        }
        // else {
        // copying host -> host
        Pointer src = AtomicAllocator.getInstance().getHostPointer(srcBuffer);
        Pointer.memcpy(dstBuffer.addressPointer(), src, size);
        // }
    } else if (!(dstBuffer instanceof CompressedDataBuffer) && srcBuffer instanceof CompressedDataBuffer) {
        // destination is NOT compressed, source is compressed
        AllocationPoint dstPoint = AtomicAllocator.getInstance().getAllocationPoint(dstBuffer);
        long size = srcBuffer.getElementSize() * srcBuffer.length();
        Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), size);
        dstPoint.tickHostWrite();
    } else if (dstBuffer instanceof CompressedDataBuffer && srcBuffer instanceof CompressedDataBuffer) {
        // both buffers are compressed, just fire memcpy
        Pointer.memcpy(dstBuffer.addressPointer(), srcBuffer.addressPointer(), srcBuffer.length() * srcBuffer.getElementSize());
    } else {
        // both buffers are NOT compressed
        AtomicAllocator.getInstance().memcpy(dstBuffer, srcBuffer);
    }
}
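A brief usage sketch, assuming both buffers are uncompressed and have equal length and element size, so the final branch delegates to AtomicAllocator.memcpy; the arrays are illustrative.

INDArray source = Nd4j.linspace(1, 16, 16);
INDArray destination = Nd4j.create(16);
// copies the contents of source's backing DataBuffer into destination's backing DataBuffer
Nd4j.getMemoryManager().memcpy(destination.data(), source.data());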
Use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
The class CudaFullCachingProvider, method malloc.
/**
 * This method provides a PointersPair for the memory chunk specified by AllocationShape.
 *
 * PLEASE NOTE: This method may ignore the malloc request and hand out a previously cached free memory chunk of equal shape.
 *
 * @param shape    shape of the desired memory chunk
 * @param point    target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return         PointersPair referencing the allocated (or cached) chunk
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.DEVICE && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumDeviceAllocation()) {
        int deviceId = AtomicAllocator.getInstance().getDeviceId();
        ensureDeviceCacheHolder(deviceId, shape);

        CacheHolder cache = deviceCache.get(deviceId).get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheDeviceHit.incrementAndGet();
                deviceCachedAmount.get(deviceId).addAndGet(-1 * reqMemory);

                PointersPair pair = new PointersPair();
                pair.setDevicePointer(pointer);

                point.setAllocationStatus(AllocationStatus.DEVICE);
                point.setDeviceId(deviceId);
                return pair;
            }
        }
        cacheDeviceMiss.incrementAndGet();
        return super.malloc(shape, point, location);
    }
    return super.malloc(shape, point, location);
}
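To make the caching behaviour easier to follow, here is a standalone, simplified sketch of the poll-or-allocate pattern above. None of these names come from the provider: it only models a size-keyed queue of previously freed chunks that is polled before falling back to a real allocation (CudaFullCachingProvider keys on AllocationShape and stores actual device pointers).

import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;

// Hypothetical model of a caching allocator: keys are required sizes in bytes,
// values are queues of previously freed chunk addresses.
class ChunkCacheSketch {
    private final Map<Long, Queue<Long>> freeChunks = new ConcurrentHashMap<>();

    // poll the cache first; only allocate when no chunk of this size is available
    long malloc(long requiredBytes) {
        Queue<Long> queue = freeChunks.get(requiredBytes);
        Long reused = (queue == null) ? null : queue.poll();
        if (reused != null)
            return reused;                  // cache hit: skip the real allocation
        return allocate(requiredBytes);     // cache miss: delegate, as super.malloc(...) does above
    }

    // instead of releasing the chunk, queue it for reuse by a later malloc of the same size
    void free(long requiredBytes, long chunkAddress) {
        freeChunks.computeIfAbsent(requiredBytes, k -> new ConcurrentLinkedQueue<>()).offer(chunkAddress);
    }

    private long allocate(long requiredBytes) {
        return requiredBytes;               // stand-in for an actual device allocation
    }
}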
Use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method accumulate.
public INDArray accumulate(INDArray target, INDArray... arrays) {
    if (arrays == null || arrays.length == 0)
        throw new RuntimeException("Input arrays are missing");

    if (arrays.length == 1)
        return target.assign(arrays[0]);

    // we do averaging on GPU only if ALL devices have p2p links
    if (CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && nativeOps.isP2PAvailable()) {
        Nd4j.getExecutioner().push();

        long len = target.lengthLong();
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = allocator.getFlowController().prepareAction(target, arrays);

        PointerPointer extras = new PointerPointer(null, // not used
                context.getOldStream(), allocator.getDeviceIdPointer(), new CudaPointer(0));

        Pointer z = AtomicAllocator.getInstance().getPointer(target, context);

        long[] xPointers = new long[arrays.length];
        for (int i = 0; i < arrays.length; i++) {
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");

            AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
            xPointers[i] = point.getPointers().getDevicePointer().address();
            point.tickDeviceWrite();
        }

        CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
        allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);

        PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));

        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, x, (DoublePointer) z, arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, x, (FloatPointer) z, arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, x, (ShortPointer) z, arrays.length, len);
        }

        allocator.getFlowController().registerAction(context, target, arrays);
        tempX.address();
        return target;
    } else {
        long len = target.lengthLong();
        Nd4j.getExecutioner().commit();

        CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

        PointerPointer dataPointers = new PointerPointer(arrays.length);
        PointerPointer extras = new PointerPointer(null, // not used
                context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer(), new CudaPointer(1));

        for (int i = 0; i < arrays.length; i++) {
            Nd4j.getCompressor().autoDecompress(arrays[i]);

            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");

            dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
        }

        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, dataPointers, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, dataPointers, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, dataPointers, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        }

        AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
        return target;
    }
}
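A minimal usage sketch for the typical path. It assumes this factory method is exposed through Nd4j.accumulate (treat the entry-point name as an assumption), and that all inputs have equal length and element-wise stride 1, as the checks above require.

INDArray a = Nd4j.ones(1000);
INDArray b = Nd4j.ones(1000);
INDArray target = Nd4j.create(1000);
// element-wise accumulation: target[i] = a[i] + b[i]
Nd4j.accumulate(target, a, b);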
Use of org.nd4j.jita.allocator.impl.AllocationPoint in project nd4j by deeplearning4j.
The class JCublasNDArrayFactory, method specialConcat.
@Override
public INDArray specialConcat(int dimension, INDArray... toConcat) {
    if (toConcat.length == 1)
        return toConcat[0];

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    PointerPointer shapeInfoPointers = new PointerPointer(toConcat.length);
    PointerPointer dataPointers = new PointerPointer(toConcat.length);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();

    int sumAlongDim = 0;
    int[] outputShape = ArrayUtil.copy(toConcat[0].shape());

    for (int i = 0; i < toConcat.length; i++) {
        if (toConcat[i].isCompressed())
            Nd4j.getCompressor().decompressi(toConcat[i]);

        allocator.synchronizeHostData(toConcat[i]);
        shapeInfoPointers.put(i, allocator.getHostPointer(toConcat[i].shapeInfoDataBuffer()));
        dataPointers.put(i, allocator.getHostPointer(toConcat[i].data()));
        sumAlongDim += toConcat[i].size(dimension);

        for (int j = 0; j < toConcat[i].rank(); j++) {
            if (j != dimension && toConcat[i].size(j) != outputShape[j]) {
                throw new IllegalArgumentException("Illegal concatenation at array " + i + " and shape element " + j);
            }
        }
    }
    outputShape[dimension] = sumAlongDim;

    PointerPointer dummy = new PointerPointer(new Pointer[] { null });
    INDArray ret = Nd4j.createUninitialized(outputShape, Nd4j.order());

    if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.specialConcatDouble(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (DoublePointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.specialConcatFloat(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (FloatPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else if (ret.data().dataType() == DataBuffer.Type.HALF) {
        nativeOps.specialConcatHalf(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (ShortPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else {
        throw new ND4JIllegalStateException("Unknown dataType: " + ret.data().dataType());
    }

    AllocationPoint point = allocator.getAllocationPoint(ret);
    nativeOps.memcpyAsync(point.getDevicePointer(), point.getHostPointer(), ret.lengthLong() * Nd4j.sizeOfDataType(ret.data().dataType()), CudaConstants.cudaMemcpyHostToDevice, context.getSpecialStream());
    context.getSpecialStream().synchronize();

    point.tickHostRead();
    point.tickDeviceWrite();
    return ret;
}
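A short usage sketch, assuming the method is reached via Nd4j.specialConcat (entry-point name assumed from the factory method above): concatenation happens host-side and is followed by the asynchronous host-to-device copy shown at the end. Shapes must agree on every dimension except the concatenation dimension, otherwise the IllegalArgumentException above is thrown.

INDArray first = Nd4j.ones(2, 3);
INDArray second = Nd4j.ones(4, 3);
// concatenates along dimension 0, producing an array of shape [6, 3]
INDArray stacked = Nd4j.specialConcat(0, first, second);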