use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class CudaCachingZeroProvider method free.
/**
* This method frees the specific chunk of memory described by the AllocationPoint passed in.
*
* PLEASE NOTE: This method can actually ignore the free request and keep the released memory chunk for future reuse.
*
* @param point
*/
@Override
public void free(AllocationPoint point) {
if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
super.free(point);
} else {
AllocationShape shape = point.getShape();
long reqMemory = AllocationUtils.getRequiredMemory(shape);
// we don't cache chunks that are too large, and we stop caching once the host cache limit is reached
if (reqMemory > CudaEnvironment.getInstance().getConfiguration().getMaximumHostCacheableLength() || zeroCachedAmount.get() >= CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache()) {
// log.info("HOST memory purging: {} bytes; MS: {}; MT: {}", reqMemory, MAX_SINGLE_ALLOCATION, MAX_CACHED_MEMORY);
super.free(point);
return;
}
ensureCacheHolder(shape);
// log.info("Saving DEVICE memory into cache...");
/*
Now we should decide if this object can be cached or not
*/
CacheHolder cache = zeroCache.get(shape);
// memory chunks < threshold will be cached no matter what
if (reqMemory <= FORCED_CACHE_THRESHOLD) {
Pointer.memset(point.getHostPointer(), 0, reqMemory);
cache.put(new CudaPointer(point.getHostPointer().address()));
} else {
long cacheEntries = cache.size();
long cacheHeight = zeroCache.size();
// total memory allocated within this bucket
long cacheDepth = cacheEntries * reqMemory;
// if (cacheDepth < MAX_CACHED_MEMORY / cacheHeight) {
Pointer.memset(point.getHostPointer(), 0, reqMemory);
cache.put(new CudaPointer(point.getHostPointer().address()));
// } else {
// super.free(point);
// }
}
}
}
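The CudaPointer usage here just wraps the raw host address of a released chunk so it can be parked in the per-shape cache. As a minimal illustration of the cache-or-free decision above, the sketch below uses made-up threshold constants in place of the real CudaEnvironment configuration values; shouldCache is a hypothetical helper, not part of nd4j.

// Illustrative sketch only: mirrors the cache-or-free decision in free(),
// with hypothetical constants instead of CudaEnvironment's configuration.
public class HostCachePolicySketch {
    // stand-ins for getMaximumHostCacheableLength() / getMaximumHostCache()
    static final long MAX_CACHEABLE_CHUNK = 1024L * 1024;        // 1 MB per chunk
    static final long MAX_TOTAL_HOST_CACHE = 256L * 1024 * 1024; // 256 MB overall

    // true if a released host chunk of reqMemory bytes should be kept for reuse
    static boolean shouldCache(long reqMemory, long currentlyCached) {
        return reqMemory <= MAX_CACHEABLE_CHUNK && currentlyCached < MAX_TOTAL_HOST_CACHE;
    }

    public static void main(String[] args) {
        System.out.println(shouldCache(4_096, 0));                     // true: small chunk, room in cache
        System.out.println(shouldCache(8L * 1024 * 1024, 0));          // false: chunk above the size limit
        System.out.println(shouldCache(4_096, MAX_TOTAL_HOST_CACHE));  // false: cache already full
    }
}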
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class JCublasNDArrayFactory method accumulate.
public INDArray accumulate(INDArray target, INDArray... arrays) {
if (arrays == null || arrays.length == 0)
throw new RuntimeException("Input arrays are missing");
if (arrays.length == 1)
return target.assign(arrays[0]);
// we do averaging on GPU only if ALL devices have p2p links
if (CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && nativeOps.isP2PAvailable()) {
Nd4j.getExecutioner().push();
long len = target.lengthLong();
AtomicAllocator allocator = AtomicAllocator.getInstance();
CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
PointerPointer extras = new PointerPointer(null, // not used
        context.getOldStream(), allocator.getDeviceIdPointer(), new CudaPointer(0));
Pointer z = AtomicAllocator.getInstance().getPointer(target, context);
long[] xPointers = new long[arrays.length];
for (int i = 0; i < arrays.length; i++) {
if (arrays[i].elementWiseStride() != 1)
throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
if (arrays[i].lengthLong() != len)
throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
xPointers[i] = point.getPointers().getDevicePointer().address();
point.tickDeviceWrite();
}
CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.accumulateDouble(extras, x, (DoublePointer) z, arrays.length, len);
} else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.accumulateFloat(extras, x, (FloatPointer) z, arrays.length, len);
} else {
nativeOps.accumulateHalf(extras, x, (ShortPointer) z, arrays.length, len);
}
allocator.getFlowController().registerAction(context, target, arrays);
tempX.address();
return target;
} else {
long len = target.lengthLong();
Nd4j.getExecutioner().commit();
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
PointerPointer dataPointers = new PointerPointer(arrays.length);
PointerPointer extras = new PointerPointer(null, // not used
        context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer(), new CudaPointer(1));
for (int i = 0; i < arrays.length; i++) {
Nd4j.getCompressor().autoDecompress(arrays[i]);
if (arrays[i].elementWiseStride() != 1)
throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
if (arrays[i].lengthLong() != len)
throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
}
if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.accumulateDouble(extras, dataPointers, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
} else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.accumulateFloat(extras, dataPointers, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
} else {
nativeOps.accumulateHalf(extras, dataPointers, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
}
AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
return target;
}
}
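Both branches above compute the same thing: each input array is added element-wise into target. The first branch packs the inputs' device addresses into a temporary buffer for a single native call, while the fallback passes host pointers. Below is a reference sketch of that result using only basic INDArray operations, purely for illustration and not the native accumulate path.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class AccumulateReferenceSketch {
    // plain equivalent of what accumulate() computes: target = element-wise sum of arrays
    static INDArray accumulateReference(INDArray target, INDArray... arrays) {
        target.assign(arrays[0]);
        for (int i = 1; i < arrays.length; i++) {
            target.addi(arrays[i]); // in-place element-wise add
        }
        return target;
    }

    public static void main(String[] args) {
        INDArray target = Nd4j.create(2, 3);
        INDArray a = Nd4j.ones(2, 3);
        INDArray b = Nd4j.ones(2, 3).muli(2);
        System.out.println(accumulateReference(target, a, b)); // every element is 3.0
    }
}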
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class CudaExecutioner method exec.
@Override
public <T extends Aggregate> void exec(Batch<T> batch) {
DataBuffer surfaceBuffer = getBuffer(batch);
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
IntPointer pointer = (IntPointer) new CudaPointer(AtomicAllocator.getInstance().getHostPointer(surfaceBuffer)).asIntPointer();
AllocationPoint surfacePoint = AtomicAllocator.getInstance().getAllocationPoint(surfaceBuffer);
int maxTypes = 5;
int maxIntArrays = batch.getSample().maxIntArrays();
int maxArraySize = batch.getSample().maxIntArraySize();
int indexPos = maxTypes * (Batch.getBatchLimit() * 16);
int intArraysPos = indexPos + (batch.getSample().maxIndexArguments() * (Batch.getBatchLimit() * 16));
int realPos = (intArraysPos + (maxIntArrays * maxArraySize * (Batch.getBatchLimit() * 16))) / (Nd4j.dataType() == DataBuffer.Type.DOUBLE ? 2 : 1);
if (Nd4j.dataType() == DataBuffer.Type.HALF)
realPos *= 2;
int argsPos = (realPos + (batch.getSample().maxRealArguments() * (Batch.getBatchLimit() * 16))) / (Nd4j.dataType() == DataBuffer.Type.FLOAT ? 2 : 1);
if (Nd4j.dataType() == DataBuffer.Type.HALF)
argsPos /= 4;
int shapesPos = argsPos + (batch.getSample().maxArguments() * (Batch.getBatchLimit() * 16));
for (int i = 0; i < batch.getNumAggregates(); i++) {
T op = batch.getAggregates().get(i);
// put num arguments
int idx = i * maxTypes;
pointer.put(idx, op.getArguments().size());
pointer.put(idx + 1, op.getShapes().size());
pointer.put(idx + 2, op.getIndexingArguments().size());
pointer.put(idx + 3, op.getRealArguments().size());
pointer.put(idx + 4, op.getIntArrayArguments().size());
// putting indexing arguments
for (int e = 0; e < op.getIndexingArguments().size(); e++) {
idx = indexPos + i * batch.getSample().maxIndexArguments();
pointer.put(idx + e, op.getIndexingArguments().get(e));
}
// putting intArray values
int bsize = maxIntArrays * maxArraySize;
for (int e = 0; e < op.getIntArrayArguments().size(); e++) {
int step = (i * bsize) + (e * maxArraySize);
if (op.getIntArrayArguments().get(e) != null)
for (int x = 0; x < op.getIntArrayArguments().get(e).length; x++) {
idx = intArraysPos + step + x;
pointer.put(idx, op.getIntArrayArguments().get(e)[x]);
}
}
// putting real arguments
if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
FloatPointer realPtr = new FloatPointer(pointer);
for (int e = 0; e < op.getRealArguments().size(); e++) {
idx = realPos + i * op.maxRealArguments();
realPtr.put(idx + e, op.getRealArguments().get(e).floatValue());
}
} else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
DoublePointer dPtr = new DoublePointer(pointer);
for (int e = 0; e < op.getRealArguments().size(); e++) {
idx = realPos + (i * op.maxRealArguments());
dPtr.put(idx + e, op.getRealArguments().get(e).doubleValue());
}
} else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
ShortPointer sPtr = new ShortPointer(pointer);
for (int e = 0; e < op.getRealArguments().size(); e++) {
idx = realPos + (i * op.maxRealArguments());
sPtr.put(idx + e, BaseDataBuffer.fromFloat(op.getRealArguments().get(e).floatValue()));
}
}
// putting arguments pointers
PointerPointer ptrPtr = new PointerPointer(pointer);
for (int e = 0; e < op.getArguments().size(); e++) {
idx = argsPos + i * batch.getSample().maxArguments();
if (op.getArguments().get(e) != null) {
ptrPtr.put(idx + e, AtomicAllocator.getInstance().getPointer(op.getArguments().get(e), context));
AtomicAllocator.getInstance().getAllocationPoint(op.getArguments().get(e)).tickDeviceWrite();
}
}
// putting shape pointers
for (int e = 0; e < op.getShapes().size(); e++) {
idx = shapesPos + i * batch.getSample().maxShapes();
if (op.getShapes().get(e) != null) {
ptrPtr.put(idx + e, AtomicAllocator.getInstance().getPointer(op.getShapes().get(e), context));
AtomicAllocator.getInstance().getAllocationPoint(op.getShapes().get(e)).tickDeviceWrite();
}
}
}
// trigger write, so getPointer request will force relocation to GPU
surfacePoint.tickHostWrite();
PointerPointer extraArgs = new PointerPointer(32);
extraArgs.put(0, null);
extraArgs.put(1, context.getOldStream());
extraArgs.put(2, new CudaPointer(Math.min(batch.getNumAggregates(), CudaEnvironment.getInstance().getConfiguration().getMaximumGridSize())));
extraArgs.put(3, new CudaPointer(batch.getSample().getThreadsPerInstance()));
extraArgs.put(4, new CudaPointer(batch.getSample().getSharedMemorySize()));
if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
nativeOps.execAggregateBatchFloat(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
} else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.execAggregateBatchDouble(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
} else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
nativeOps.execAggregateBatchHalf(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
}
surfacePoint.tickHostWrite();
}
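The offset arithmetic at the top of this method carves one flat int-indexed "surface" buffer into regions: per-op counters, index arguments, int arrays, real arguments, argument pointers and shape pointers. The divisors appear to re-express positions in the element width of each region (doubles, halfs, pointers). The sketch below recomputes those offsets for made-up per-op maxima and a float data type; all constants are illustrative stand-ins for the real Batch and sample-op values.

// Illustrative layout computation for the packed surface buffer used by exec(Batch).
// All values are made-up stand-ins for Batch.getBatchLimit() and the sample op's maxima.
public class BatchSurfaceLayoutSketch {
    public static void main(String[] args) {
        int batchLimit = 512;          // stand-in for Batch.getBatchLimit()
        int slots = batchLimit * 16;   // the method reserves 16x the batch limit per field
        int maxTypes = 5;              // per-op counters: arguments, shapes, index args, real args, int arrays
        int maxIndexArguments = 32;    // stand-in for sample.maxIndexArguments()
        int maxIntArrays = 2;          // stand-in for sample.maxIntArrays()
        int maxIntArraySize = 8;       // stand-in for sample.maxIntArraySize()
        int maxRealArguments = 4;      // stand-in for sample.maxRealArguments()
        int maxArguments = 4;          // stand-in for sample.maxArguments()

        int indexPos = maxTypes * slots;
        int intArraysPos = indexPos + maxIndexArguments * slots;
        // FLOAT case shown: realPos needs no rescaling into double/half units,
        // and pointer slots are two floats wide, hence the division by 2 for argsPos
        int realPos = intArraysPos + maxIntArrays * maxIntArraySize * slots;
        int argsPos = (realPos + maxRealArguments * slots) / 2;
        int shapesPos = argsPos + maxArguments * slots;

        System.out.printf("indexPos=%d intArraysPos=%d realPos=%d argsPos=%d shapesPos=%d%n",
                indexPos, intArraysPos, realPos, argsPos, shapesPos);
    }
}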
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class CudaExecutioner method exec.
/**
* This method executes the given CustomOp
*
* PLEASE NOTE: You're responsible for input/output validation
* PLEASE NOTE: right now these operations are executed on the CPU
* @param op
*/
public void exec(CustomOp op) {
Nd4j.getExecutioner().commit();
if (op.opName().equalsIgnoreCase("im2col")) {
val dtype = Nd4j.dataType();
val xArr = op.inputArguments()[0];
val zArr = op.outputArguments()[0];
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
PointerPointer xShapeHost = extraz.get().put(
        AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
        context.getOldStream(), // 1
        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
        context.getBufferAllocation(), // 3
        context.getBufferReduction(), // 4
        context.getBufferScalar(), // 5
        context.getBufferSpecial(),
        null,
        AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer()));
val x = AtomicAllocator.getInstance().getPointer(xArr, context);
val z = AtomicAllocator.getInstance().getPointer(zArr, context);
val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
double zeroPad = 0.0;
if (op.tArgs() != null && op.tArgs().length > 0) {
zeroPad = op.tArgs()[0];
}
val extrass = new double[] { op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7], op.iArgs()[8], zeroPad };
val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass);
val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
if (dtype == DataBuffer.Type.DOUBLE) {
nativeOps.execTransformDouble(xShapeHost, 37, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, (DoublePointer) extraArgs);
} else if (dtype == DataBuffer.Type.FLOAT) {
nativeOps.execTransformFloat(xShapeHost, 37, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, (FloatPointer) extraArgs);
} else if (dtype == DataBuffer.Type.HALF) {
nativeOps.execTransformHalf(xShapeHost, 37, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, (ShortPointer) extraArgs);
}
// AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
return;
} else if (op.opName().equalsIgnoreCase("col2im")) {
val dtype = Nd4j.dataType();
val xArr = op.inputArguments()[0];
val zArr = op.outputArguments()[0];
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
PointerPointer xShapeHost = extraz.get().put(
        AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
        context.getOldStream(), // 1
        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
        context.getBufferAllocation(), // 3
        context.getBufferReduction(), // 4
        context.getBufferScalar(), // 5
        context.getBufferSpecial(),
        null,
        AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer()));
val x = AtomicAllocator.getInstance().getPointer(xArr, context);
val z = AtomicAllocator.getInstance().getPointer(zArr, context);
val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
val extrass = new double[] { op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7] };
val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass);
val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
if (dtype == DataBuffer.Type.DOUBLE) {
nativeOps.execTransformDouble(xShapeHost, 36, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, (DoublePointer) extraArgs);
} else if (dtype == DataBuffer.Type.FLOAT) {
nativeOps.execTransformFloat(xShapeHost, 36, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, (FloatPointer) extraArgs);
} else if (dtype == DataBuffer.Type.HALF) {
nativeOps.execTransformHalf(xShapeHost, 36, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, (ShortPointer) extraArgs);
}
// AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
return;
} else if (op.opName().equalsIgnoreCase("pooling2d")) {
val dtype = Nd4j.dataType();
val xArr = op.inputArguments()[0];
val zArr = op.outputArguments()[0];
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
PointerPointer xShapeHost = extraz.get().put(
        AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
        context.getOldStream(), // 1
        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
        context.getBufferAllocation(), // 3
        context.getBufferReduction(), // 4
        context.getBufferScalar(), // 5
        context.getBufferSpecial(),
        null,
        AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer()));
val x = AtomicAllocator.getInstance().getPointer(xArr, context);
val z = AtomicAllocator.getInstance().getPointer(zArr, context);
val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
val extrass = new double[] { op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7], op.iArgs()[8] };
val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass);
val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
if (dtype == DataBuffer.Type.DOUBLE) {
nativeOps.execTransformDouble(xShapeHost, 71, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, (DoublePointer) extraArgs);
} else if (dtype == DataBuffer.Type.FLOAT) {
nativeOps.execTransformFloat(xShapeHost, 71, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, (FloatPointer) extraArgs);
} else if (dtype == DataBuffer.Type.HALF) {
nativeOps.execTransformHalf(xShapeHost, 71, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, (ShortPointer) extraArgs);
}
// AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
return;
}
Nd4j.getExecutioner().commit();
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
PointerPointer extras = extraz.get().put(new CudaPointer(1), context.getOldStream(), context.getBufferScalar(), context.getBufferReduction());
val outputArgs = op.outputArguments();
val inputArgs = op.inputArguments();
if (outputArgs.length == 0 && !op.isInplaceCall())
throw new ND4JIllegalStateException("You can't execute non-inplace CustomOp without outputs being specified");
val lc = op.opName().toLowerCase();
val hash = op.opHash();
val inputShapes = new PointerPointer<>(inputArgs.length * 2);
val inputBuffers = new PointerPointer<>(inputArgs.length * 2);
int cnt = 0;
for (val in : inputArgs) {
val hp = AtomicAllocator.getInstance().getHostPointer(in.shapeInfoDataBuffer());
inputBuffers.put(cnt, AtomicAllocator.getInstance().getHostPointer(in));
inputShapes.put(cnt, hp);
val dp = AtomicAllocator.getInstance().getPointer(in.shapeInfoDataBuffer(), context);
inputBuffers.put(cnt + inputArgs.length, AtomicAllocator.getInstance().getPointer(in, context));
inputShapes.put(cnt + inputArgs.length, dp);
if (op.isInplaceCall())
AtomicAllocator.getInstance().getAllocationPoint(in).tickHostWrite();
cnt++;
}
val outputShapes = new PointerPointer<>(outputArgs.length * 2);
val outputBuffers = new PointerPointer<>(outputArgs.length * 2);
cnt = 0;
for (val out : outputArgs) {
outputBuffers.put(cnt, AtomicAllocator.getInstance().getHostPointer(out));
outputShapes.put(cnt, AtomicAllocator.getInstance().getHostPointer(out.shapeInfoDataBuffer()));
outputBuffers.put(cnt + outputArgs.length, AtomicAllocator.getInstance().getPointer(out, context));
outputShapes.put(cnt + outputArgs.length, AtomicAllocator.getInstance().getPointer(out.shapeInfoDataBuffer(), context));
AtomicAllocator.getInstance().getAllocationPoint(out).tickHostWrite();
cnt++;
}
if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
val tArgs = op.tArgs().length > 0 ? new FloatPointer(op.tArgs().length) : null;
val iArgs = op.iArgs().length > 0 ? new IntPointer(op.iArgs().length) : null;
cnt = 0;
for (val t : op.tArgs()) tArgs.put(cnt++, (float) t);
cnt = 0;
for (val i : op.iArgs()) iArgs.put(cnt++, i);
val status = OpStatus.byNumber(nativeOps.execCustomOpFloat(extras, hash, inputBuffers, inputShapes, inputArgs.length, outputBuffers, outputShapes, outputArgs.length, tArgs, op.tArgs().length, iArgs, op.iArgs().length, op.isInplaceCall()));
if (status != OpStatus.ND4J_STATUS_OK)
throw new ND4JIllegalStateException("Op execution failed: " + status);
} else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
val tArgs = op.tArgs().length > 0 ? new DoublePointer(op.tArgs().length) : null;
val iArgs = op.iArgs().length > 0 ? new IntPointer(op.iArgs().length) : null;
cnt = 0;
for (val t : op.tArgs()) tArgs.put(cnt++, t);
cnt = 0; // reset before the integer arguments, matching the float and half branches
for (val i : op.iArgs()) iArgs.put(cnt++, i);
val status = OpStatus.byNumber(nativeOps.execCustomOpDouble(extras, hash, inputBuffers, inputShapes, inputArgs.length, outputBuffers, outputShapes, outputArgs.length, tArgs, op.tArgs().length, iArgs, op.iArgs().length, op.isInplaceCall()));
if (status != OpStatus.ND4J_STATUS_OK)
throw new ND4JIllegalStateException("Op execution failed: " + status);
} else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
val tArgs = op.tArgs().length > 0 ? new ShortPointer(op.tArgs().length) : null;
val iArgs = op.iArgs().length > 0 ? new IntPointer(op.iArgs().length) : null;
cnt = 0;
for (val t : op.tArgs()) tArgs.put(cnt++, ArrayUtil.toHalf((float) t));
cnt = 0;
for (val i : op.iArgs()) iArgs.put(cnt++, i);
val status = OpStatus.byNumber(nativeOps.execCustomOpHalf(extras, hash, inputBuffers, inputShapes, inputArgs.length, outputBuffers, outputShapes, outputArgs.length, tArgs, op.tArgs().length, iArgs, op.iArgs().length, op.isInplaceCall()));
if (status != OpStatus.ND4J_STATUS_OK)
throw new ND4JIllegalStateException("Op execution failed: " + status);
}
// AtomicAllocator.getInstance().getFlowController().prepareActionAllWrite(op.outputArguments());
}
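A detail worth calling out above: the inputBuffers/inputShapes and outputBuffers/outputShapes PointerPointers are allocated with twice as many slots as there are arrays, with host pointers in the first half and the corresponding device pointers in the second half. The sketch below shows just that packing convention, with plain Java arrays and labelled strings standing in for the actual pointers.

import java.util.Arrays;

// Sketch of the dual host/device packing used for inputBuffers and inputShapes:
// for N arrays the container has 2N slots, host pointers first, device pointers second.
public class DualPointerPackingSketch {
    public static void main(String[] args) {
        String[] hostPtrs = { "hostA", "hostB", "hostC" }; // illustrative labels, not real pointers
        String[] devPtrs  = { "devA", "devB", "devC" };

        int n = hostPtrs.length;
        Object[] inputBuffers = new Object[n * 2];
        for (int i = 0; i < n; i++) {
            inputBuffers[i] = hostPtrs[i];    // first half: host addresses
            inputBuffers[i + n] = devPtrs[i]; // second half: device addresses
        }
        System.out.println(Arrays.toString(inputBuffers)); // [hostA, hostB, hostC, devA, devB, devC]
    }
}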
use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.
the class CudaExecutioner method invoke.
protected CudaContext invoke(TransformOp op) {
long st = profilingHookIn(op);
checkForCompression(op);
validateDataType(Nd4j.dataType(), op);
AtomicAllocator allocator = AtomicAllocator.getInstance();
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
// Pow operations might be special
if (op.opNum() == 7) {
if (op.y() != null && op.y().isScalar()) {
Nd4j.getExecutioner().commit();
op.setY(Nd4j.valueArrayOf(op.x().shape(), op.y().getDouble(0)));
Nd4j.getExecutioner().commit();
}
}
CudaContext context = allocator.getFlowController().prepareAction(op.z(), op.x(), op.y());
if (CudaEnvironment.getInstance().getConfiguration().isDebug())
lastOp.set(op.opName());
// special temp array for IsMax along dimension
INDArray ret = null;
Pointer x = allocator.getPointer(op.x(), context);
Pointer xShapeInfo = allocator.getPointer(op.x().shapeInfoDataBuffer(), context);
Pointer extraArgs = op.extraArgs() != null ? allocator.getPointer(op.extraArgsDataBuff(), context) : null;
Pointer hostYShapeInfo = op.y() == null ? null : AddressRetriever.retrieveHostPointer(op.y().shapeInfoDataBuffer());
Pointer hostZShapeInfo = op.z() == null ? null : AddressRetriever.retrieveHostPointer(op.z().shapeInfoDataBuffer());
Pointer dimensionDevPointer = null;
Pointer dimensionHostPointer = null;
Pointer retPointer = null;
int[] dimension = null;
if (op.opNum() == 41 && op.extraArgs() != null) {
// for IsMax along dimension we need special temporary buffer
dimension = new int[(int) op.extraArgs()[0]];
for (int i = 0; i < dimension.length; i++) {
dimension[i] = (int) op.extraArgs()[i + 1];
}
for (int i = 0; i < dimension.length; i++) {
if (dimension[i] < 0)
dimension[i] += op.x().rank();
}
// do op along all dimensions
if (dimension.length == op.x().rank())
dimension = new int[] { Integer.MAX_VALUE };
int[] retShape = Shape.wholeArrayDimension(dimension) ? new int[] { 1, 1 } : ArrayUtil.removeIndex(op.x().shape(), dimension);
// ensure vector is proper shape
if (retShape.length == 1) {
if (dimension[0] == 0)
retShape = new int[] { 1, retShape[0] };
else
retShape = new int[] { retShape[0], 1 };
} else if (retShape.length == 0) {
retShape = new int[] { 1, 1 };
}
ret = Nd4j.zeros(retShape);
// FIXME: this may be a misleading use of this particular pointer
hostYShapeInfo = allocator.getPointer(ret.shapeInfoDataBuffer(), context);
// dimensionPointer = AtomicAllocator.getInstance().getPointer(Nd4j.createBuffer(dimension), context);
DataBuffer dimensionBuffer = allocator.getConstantBuffer(dimension);
dimensionDevPointer = allocator.getPointer(dimensionBuffer, context);
dimensionHostPointer = allocator.getHostPointer(dimensionBuffer);
retPointer = allocator.getPointer(ret, context);
}
Pointer hostTadShapeInfo = null;
Pointer devTadShapeInfo = null;
Pointer hostMaxTadShapeInfo = null;
Pointer devMaxTadShapeInfo = null;
Pair<DataBuffer, DataBuffer> tadBuffers;
Pair<DataBuffer, DataBuffer> tadMaxBuffers;
Pointer devTadOffsets = null;
Pointer devMaxTadOffsets = null;
if (op.opNum() >= 38 && op.opNum() <= 41) {
if (op.opNum() != 41) {
tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), new int[] { 0 });
tadMaxBuffers = tadManager.getTADOnlyShapeInfo(op.x(), new int[] { 1 });
hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
devTadShapeInfo = allocator.getPointer(tadBuffers.getFirst(), context);
hostMaxTadShapeInfo = AddressRetriever.retrieveHostPointer(tadMaxBuffers.getFirst());
devMaxTadShapeInfo = allocator.getPointer(tadMaxBuffers.getFirst(), context);
DataBuffer offsets = tadBuffers.getSecond();
devTadOffsets = offsets == null ? null : allocator.getPointer(offsets, context);
DataBuffer maxOffsets = tadMaxBuffers.getSecond();
devMaxTadOffsets = maxOffsets == null ? null : allocator.getPointer(maxOffsets, context);
} else {
tadBuffers = tadManager.getTADOnlyShapeInfo(op.z(), dimension);
hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
DataBuffer offsets = tadBuffers.getSecond();
devTadOffsets = offsets == null ? null : allocator.getPointer(offsets, context);
}
}
Pointer z = allocator.getPointer(op.z(), context);
Pointer zShapeInfo = allocator.getPointer(op.z().shapeInfoDataBuffer(), context);
PointerPointer xShapeInfoHostPointer = extraz.get().put(
        AddressRetriever.retrieveHostPointer(op.x().shapeInfoDataBuffer()), // 0
        context.getOldStream(), // 1
        allocator.getDeviceIdPointer(), // 2
        context.getBufferAllocation(), // 3
        context.getBufferReduction(), // 4
        context.getBufferScalar(), // 5
        context.getBufferSpecial(), // 6
        hostYShapeInfo, // 7
        hostZShapeInfo, // 8
        hostTadShapeInfo, // 9
        devTadShapeInfo, // 10
        devTadOffsets, // 11
        hostMaxTadShapeInfo, // 12
        devMaxTadShapeInfo, // 13
        devMaxTadOffsets, // 14
        dimensionDevPointer, // 15: special pointer for IsMax
        dimensionHostPointer, // 16: special pointer for IsMax
        retPointer, // 17: special pointer for IsMax
        new CudaPointer(dimension == null ? 0 : dimension.length));
if (op.y() != null) {
Pointer y = allocator.getPointer(op.y(), context);
Pointer yShapeInfo = allocator.getPointer(op.y().shapeInfoDataBuffer(), context);
int xEWS = op.x().elementWiseStride();
int yEWS = op.y().elementWiseStride();
int zEWS = op.z().elementWiseStride();
boolean xRow = op.x().isRowVector();
boolean yRow = op.y().isRowVector();
boolean zRow = op.z().isRowVector();
if (op.x().data().dataType() == DataBuffer.Type.DOUBLE) {
if ((xEWS >= 1 && yEWS >= 1 && zEWS >= 1 && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
nativeOps.execPairwiseTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, xEWS, (DoublePointer) y, yEWS, (DoublePointer) z, zEWS, (DoublePointer) extraArgs, op.n());
} else {
nativeOps.execPairwiseTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, (IntPointer) xShapeInfo, (DoublePointer) y, (IntPointer) yShapeInfo, (DoublePointer) z, (IntPointer) zShapeInfo, (DoublePointer) extraArgs);
}
} else if (op.x().data().dataType() == DataBuffer.Type.FLOAT) {
if ((xEWS >= 1 && yEWS >= 1 && xEWS == yEWS && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
nativeOps.execPairwiseTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, xEWS, (FloatPointer) y, yEWS, (FloatPointer) z, zEWS, (FloatPointer) extraArgs, op.n());
} else {
nativeOps.execPairwiseTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, (IntPointer) xShapeInfo, (FloatPointer) y, (IntPointer) yShapeInfo, (FloatPointer) z, (IntPointer) zShapeInfo, (FloatPointer) extraArgs);
}
} else {
if ((xEWS >= 1 && yEWS >= 1 && xEWS == op.y().elementWiseStride() && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
nativeOps.execPairwiseTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, xEWS, (ShortPointer) y, yEWS, (ShortPointer) z, zEWS, (ShortPointer) extraArgs, op.n());
} else {
nativeOps.execPairwiseTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, (IntPointer) xShapeInfo, (ShortPointer) y, (IntPointer) yShapeInfo, (ShortPointer) z, (IntPointer) zShapeInfo, (ShortPointer) extraArgs);
}
}
} else {
if (op.x().data().dataType() == DataBuffer.Type.DOUBLE) {
if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
nativeOps.execTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, op.x().elementWiseStride(), (DoublePointer) z, op.z().elementWiseStride(), (DoublePointer) extraArgs, op.n());
} else {
nativeOps.execTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, (IntPointer) xShapeInfo, (DoublePointer) z, (IntPointer) zShapeInfo, (DoublePointer) extraArgs);
}
} else if (op.x().data().dataType() == DataBuffer.Type.FLOAT) {
if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
nativeOps.execTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, op.x().elementWiseStride(), (FloatPointer) z, op.z().elementWiseStride(), (FloatPointer) extraArgs, op.n());
} else {
nativeOps.execTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, (IntPointer) xShapeInfo, (FloatPointer) z, (IntPointer) zShapeInfo, (FloatPointer) extraArgs);
}
} else {
if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
nativeOps.execTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, op.x().elementWiseStride(), (ShortPointer) z, op.z().elementWiseStride(), (ShortPointer) extraArgs, op.n());
} else {
nativeOps.execTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, (IntPointer) xShapeInfo, (ShortPointer) z, (IntPointer) zShapeInfo, (ShortPointer) extraArgs);
}
}
}
AtomicAllocator.getInstance().registerAction(context, op.z(), op.x(), op.y());
if (extraArgs != null)
extraArgs.address();
if (ret != null)
ret.elementWiseStride();
profilingHookOut(op, st);
return null;
}
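For the pairwise branch above, the executioner picks between two native overloads: a strided one when every operand can be walked with a plain element-wise stride and the orderings match, and a shape-info one otherwise. The sketch below is a simplified composite of those conditions (the three data type branches differ slightly in which strides they compare); the parameters stand in for the INDArray queries used in the real code.

// Simplified composite of the fast-path test used for pairwise transforms:
// strided kernel when all operands are plainly strided with matching ordering,
// shape-info kernel otherwise. Parameters stand in for the INDArray queries.
public class PairwiseFastPathSketch {
    static boolean useStridedKernel(int xEws, int yEws, int zEws,
                                    char xOrder, char yOrder, char zOrder,
                                    boolean execSpecial,
                                    boolean xRow, boolean yRow, boolean zRow) {
        boolean sameOrdering = xOrder == yOrder && xOrder == zOrder;
        boolean allStrided = xEws >= 1 && yEws >= 1 && zEws >= 1;
        boolean allRowVectorsSameStride = xEws >= 1 && yEws == xEws && zEws == xEws && xRow && yRow && zRow;
        return (allStrided && !execSpecial && sameOrdering) || allRowVectorsSameStride;
    }

    public static void main(String[] args) {
        // contiguous, c-ordered operands: strided kernel
        System.out.println(useStridedKernel(1, 1, 1, 'c', 'c', 'c', false, false, false, false)); // true
        // mixed ordering: fall back to the shape-info kernel
        System.out.println(useStridedKernel(1, 1, 1, 'c', 'f', 'c', false, false, false, false)); // false
    }
}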