Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class CudaExecutioner, method getEnvironmentInformation.
/**
 * This method returns a set of key/value
 * and key/key/value objects
 * describing the current environment.
 *
 * @return environment properties, including per-device CUDA information
 */
@Override
public synchronized Properties getEnvironmentInformation() {
    if (properties == null) {
        Properties props = super.getEnvironmentInformation();

        // First call: collect static and dynamic properties for every visible device
        List<Map<String, Object>> devicesList = new ArrayList<>();
        for (int i = 0; i < nativeOps.getAvailableDevices(); i++) {
            Map<String, Object> deviceProps = new HashMap<>();
            CudaPointer devPtr = new CudaPointer(i);

            deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_NAME_KEY, nativeOps.getDeviceName(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MAJOR_VERSION_KEY, (long) nativeOps.getDeviceMajor(devPtr));
            deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MINOR_VERSION_KEY, (long) nativeOps.getDeviceMinor(devPtr));

            devicesList.add(i, deviceProps);
        }

        props.put(Nd4jEnvironment.BACKEND_KEY, "CUDA");
        props.put(Nd4jEnvironment.CUDA_NUM_GPUS_KEY, nativeOps.getAvailableDevices());
        props.put(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY, devicesList);
        props.put(Nd4jEnvironment.BLAS_VENDOR_KEY, (Nd4j.factory().blas()).getBlasVendor().toString());
        props.put(Nd4jEnvironment.HOST_FREE_MEMORY_KEY, Pointer.maxBytes() - Pointer.totalBytes());

        properties = props;
    } else {
        // Subsequent calls: refresh only the values that change over time (free memory on host and devices)
        List<Map<String, Object>> devicesList = (List<Map<String, Object>>) properties.get(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY);
        for (int i = 0; i < nativeOps.getAvailableDevices(); i++) {
            Map<String, Object> dev = devicesList.get(i);
            CudaPointer devPtr = new CudaPointer(i);

            dev.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr));
            dev.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr));
        }
        properties.put(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY, devicesList);
        properties.put(Nd4jEnvironment.HOST_FREE_MEMORY_KEY, Pointer.maxBytes() - Pointer.totalBytes());
    }
    return properties;
}
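A minimal usage sketch for reading the report built above. The key constants and the device-list layout come straight from the method; the wrapping method name printCudaEnvironment is ours, not nd4j's.
// Hedged usage sketch (printCudaEnvironment is a hypothetical helper)
public static void printCudaEnvironment() {
    Properties env = Nd4j.getExecutioner().getEnvironmentInformation();
    System.out.println("Backend: " + env.get(Nd4jEnvironment.BACKEND_KEY));
    System.out.println("GPUs: " + env.get(Nd4jEnvironment.CUDA_NUM_GPUS_KEY));

    // The device list is stored as List<Map<String, Object>>, mirroring the code above
    @SuppressWarnings("unchecked")
    List<Map<String, Object>> devices =
            (List<Map<String, Object>>) env.get(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY);
    for (Map<String, Object> dev : devices) {
        System.out.println(dev.get(Nd4jEnvironment.CUDA_DEVICE_NAME_KEY)
                + ": " + dev.get(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY) + " bytes free");
    }
}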
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class CudaExecutioner, method exec(Aggregate).
@Override
public void exec(Aggregate op) {
    int numArguments = op.getArguments().size();
    int numShapeArguments = op.getShapes().size();
    int numIndexArguments = op.getIndexingArguments().size();
    int numIntArrays = op.getIntArrayArguments().size();
    int numRealArguments = op.getRealArguments().size();

    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();

    // Launch configuration passed to the native side: stream, number of aggregates,
    // threads per instance, and shared memory size
    PointerPointer extraArgs = new PointerPointer(32);
    extraArgs.put(0, null);
    extraArgs.put(1, context.getOldStream());
    extraArgs.put(2, new CudaPointer(1));
    extraArgs.put(3, new CudaPointer(op.getThreadsPerInstance()));
    extraArgs.put(4, new CudaPointer(op.getSharedMemorySize()));

    // Collect device addresses of the INDArray arguments
    long[] arguments = new long[numArguments];
    for (int x = 0; x < numArguments; x++) {
        arguments[x] = op.getArguments().get(x) == null ? 0
                        : AtomicAllocator.getInstance().getPointer(op.getArguments().get(x), context).address();
        if (op.getArguments().get(x) != null)
            AtomicAllocator.getInstance().getAllocationPoint(op.getArguments().get(x)).tickDeviceWrite();
    }
    DataBuffer tempX = AllocationUtils.getPointersBuffer(arguments);
    PointerPointer xPtr = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));

    // Collect device addresses of the shape buffers
    long[] shapes = new long[numShapeArguments];
    for (int x = 0; x < numShapeArguments; x++) {
        shapes[x] = op.getShapes().get(x) == null ? 0
                        : AtomicAllocator.getInstance().getPointer(op.getShapes().get(x), context).address();
        if (op.getShapes().get(x) != null)
            AtomicAllocator.getInstance().getAllocationPoint(op.getShapes().get(x)).tickDeviceWrite();
    }
    DataBuffer tempS = AllocationUtils.getPointersBuffer(shapes);
    PointerPointer sPtr = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempS, context));

    // Stage int[] arguments as device-accessible buffers
    long[] ints = new long[numIntArrays];
    for (int x = 0; x < numIntArrays; x++) {
        if (op.getIntArrayArguments().get(x) != null) {
            DataBuffer intBuf = Nd4j.getDataBufferFactory().createInt(op.getIntArrayArguments().get(x));
            ints[x] = AtomicAllocator.getInstance().getPointer(intBuf, context).address();
        }
    }
    DataBuffer tempI = AllocationUtils.getPointersBuffer(ints);
    PointerPointer iPtr = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempI, context));

    // Indexing arguments go into a plain int buffer
    int[] indexes = new int[numIndexArguments];
    for (int x = 0; x < numIndexArguments; x++) {
        indexes[x] = op.getIndexingArguments().get(x);
    }
    DataBuffer intBuffer = Nd4j.getDataBufferFactory().createInt(indexes);

    // Real-valued arguments are staged through a temporary INDArray
    double[] reals = new double[numRealArguments];
    for (int x = 0; x < numRealArguments; x++) {
        reals[x] = op.getRealArguments().get(x).doubleValue();
    }
    INDArray realsBuffer = Nd4j.create(reals);

    // Dispatch to the native kernel matching the current data type
    if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.execAggregateFloat(extraArgs, op.opNum(), xPtr, numArguments, sPtr, numShapeArguments,
                        (IntPointer) AtomicAllocator.getInstance().getPointer(intBuffer, context), numIndexArguments,
                        iPtr, numIntArrays,
                        (FloatPointer) AtomicAllocator.getInstance().getPointer(realsBuffer.data(), context),
                        numRealArguments);
    } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.execAggregateDouble(extraArgs, op.opNum(), xPtr, numArguments, sPtr, numShapeArguments,
                        (IntPointer) AtomicAllocator.getInstance().getPointer(intBuffer, context), numIndexArguments,
                        iPtr, numIntArrays,
                        (DoublePointer) AtomicAllocator.getInstance().getPointer(realsBuffer.data(), context),
                        numRealArguments);
    } else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
        nativeOps.execAggregateHalf(extraArgs, op.opNum(), xPtr, numArguments, sPtr, numShapeArguments,
                        (IntPointer) AtomicAllocator.getInstance().getPointer(intBuffer, context), numIndexArguments,
                        iPtr, numIntArrays,
                        (ShortPointer) AtomicAllocator.getInstance().getPointer(realsBuffer.data(), context),
                        numRealArguments);
    }
}
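For context, a hedged sketch of how an Aggregate typically reaches this method. AggregateAxpy (y = alpha * x + y) is assumed here as a concrete Aggregate implementation; verify the class and constructor against your nd4j version before relying on it.
// Hedged sketch: AggregateAxpy is assumed to exist with this constructor
INDArray x = Nd4j.create(new double[] {1, 2, 3});
INDArray y = Nd4j.create(new double[] {4, 5, 6});
Aggregate axpy = new AggregateAxpy(x, y, 2.0);
// On the CUDA backend this dispatches to CudaExecutioner.exec(Aggregate) shown above
Nd4j.getExecutioner().exec(axpy);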
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class LimitedContextPool, method fillPoolWithResources.
protected synchronized void fillPoolWithResources(int numResources, boolean restoreDevice) {
    List<Integer> devices = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices();

    // Remember the caller's device so it can be restored after the pool is filled
    int cDevice = 0;
    if (restoreDevice) {
        cDevice = AtomicAllocator.getInstance().getDeviceId();
    }

    NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();
    for (Integer device : devices) {
        nativeOps.setDevice(new CudaPointer(device));
        pool.put(device, new LinkedBlockingQueue<CudaContext>());

        // One cuBLAS and one cuSOLVER handle are shared by all contexts on this device
        cublasHandle_t handle = createNewCublasHandle();
        cusolverDnHandle_t solverHandle = createNewSolverHandle();
        for (int cnt = 0; cnt < numResources; cnt++) {
            CudaContext context = createNewStream(device);
            context.initOldStream();
            getDeviceBuffers(context, device);
            context.setHandle(handle);
            context.setSolverHandle(solverHandle);
            context.syncOldStream();
            pool.get(device).add(context);
        }
    }

    if (restoreDevice) {
        nativeOps.setDevice(new CudaPointer(cDevice));
    }
}
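A hedged sketch of consuming the pool filled above. acquireContextForDevice and releaseContext are assumed from nd4j's ContextPool contract; check the exact method names against your version.
// Hedged sketch: borrow a pooled context, use its stream, then hand it back
ContextPool contextPool = new LimitedContextPool();
CudaContext ctx = contextPool.acquireContextForDevice(0); // blocks if all contexts on device 0 are in use
try {
    // ... enqueue work on ctx.getOldStream() ...
    ctx.syncOldStream(); // wait for the queued work to finish
} finally {
    contextPool.releaseContext(ctx); // assumed release hook, so other threads can reuse the context
}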
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class BasicContextPool, method createNewStream.
protected CudaContext createNewStream(Integer deviceId) {
    log.debug("Creating new stream for thread: [{}], device: [{}]...", Thread.currentThread().getId(), deviceId);

    // Bind this thread to the target device before creating the stream
    // JCuda.cudaSetDevice(deviceId);
    nativeOps.setDevice(new CudaPointer(deviceId));

    CudaContext context = new CudaContext();
    context.initOldStream();
    return context;
}
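The ordering here matters: a CUDA stream is created on whichever device is current for the calling thread, which is why setDevice precedes initOldStream. A minimal sketch making that invariant explicit (streamOnDevice is a hypothetical helper, not part of nd4j):
// Hypothetical helper: bind the device first, then create the stream on it
public static CudaContext streamOnDevice(NativeOps nativeOps, int deviceId) {
    nativeOps.setDevice(new CudaPointer(deviceId)); // the stream below will belong to deviceId
    CudaContext context = new CudaContext();
    context.initOldStream();
    return context;
}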
Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
The class CudaCachingZeroProvider, method malloc.
/**
 * This method provides a PointersPair for the memory chunk specified by the given AllocationShape.
 *
 * PLEASE NOTE: this method may ignore the malloc request and instead hand out a previously
 * cached free memory chunk of equal shape.
 *
 * @param shape shape of the desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return pointers pair referencing the freshly allocated or cached memory chunk
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.HOST
                    && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCacheableLength()) {

        CacheHolder cache = zeroCache.get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheZeroHit.incrementAndGet();

                // since this memory chunk is going to be used now, remove its size from the cached amount counter
                zeroCachedAmount.addAndGet(-1 * reqMemory);

                PointersPair pair = new PointersPair();
                pair.setDevicePointer(new CudaPointer(pointer.address()));
                pair.setHostPointer(new CudaPointer(pointer.address()));
                point.setAllocationStatus(AllocationStatus.HOST);
                return pair;
            }
        }
        cacheZeroMiss.incrementAndGet();

        // Optionally warm the cache in the background for small allocations
        if (CudaEnvironment.getInstance().getConfiguration().isUsePreallocation()
                        && zeroCachedAmount.get() < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache() / 10
                        && reqMemory < 16 * 1024 * 1024L) {
            CachePreallocator preallocator = new CachePreallocator(shape, location,
                            CudaEnvironment.getInstance().getConfiguration().getPreallocationCalls());
            preallocator.start();
        }
        return super.malloc(shape, point, location);
    }
    return super.malloc(shape, point, location);
}
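To make the caching idea concrete, here is an illustrative, simplified reimplementation of the hit/miss decision in plain Java (not nd4j's API; reallyAllocate stands in for the real allocator): free host chunks are parked in per-shape queues, and a later request for the same shape is served from the queue instead of a fresh allocation.
// Illustrative only: a shape-keyed cache of free host pointers
private final Map<AllocationShape, Queue<Pointer>> cache = new ConcurrentHashMap<>();

Pointer allocate(AllocationShape shape) {
    Queue<Pointer> queue = cache.get(shape);
    Pointer cached = (queue == null) ? null : queue.poll();
    if (cached != null)
        return cached;            // hit: skip the real malloc, like the cacheZeroHit branch above
    return reallyAllocate(shape); // miss: delegate, like super.malloc(...) above
}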