
Example 26 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

The class CudaCachingZeroProvider, method free.

/**
 * This method frees the specific chunk of memory described by the AllocationPoint passed in.
 *
 * PLEASE NOTE: This method may ignore the free request and keep the released memory chunk for future reuse.
 *
 * @param point allocation point describing the memory chunk to release
 */
@Override
public void free(AllocationPoint point) {
    if (point.getAllocationStatus() == AllocationStatus.DEVICE) {
        super.free(point);
    } else {
        AllocationShape shape = point.getShape();
        long reqMemory = AllocationUtils.getRequiredMemory(shape);
        // we don't cache too big objects
        if (reqMemory > CudaEnvironment.getInstance().getConfiguration().getMaximumHostCacheableLength() || zeroCachedAmount.get() >= CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache()) {
            // log.info("HOST memory purging: {} bytes; MS: {}; MT: {}", reqMemory, MAX_SINGLE_ALLOCATION, MAX_CACHED_MEMORY);
            super.free(point);
            return;
        }
        ensureCacheHolder(shape);
        // log.info("Saving DEVICE memory into cache...");
        // now we should decide if this object can be cached or not
        CacheHolder cache = zeroCache.get(shape);
        // memory chunks < threshold will be cached no matter what
        if (reqMemory <= FORCED_CACHE_THRESHOLD) {
            Pointer.memset(point.getHostPointer(), 0, reqMemory);
            cache.put(new CudaPointer(point.getHostPointer().address()));
        } else {
            long cacheEntries = cache.size();
            long cacheHeight = zeroCache.size();
            // total memory allocated within this bucket
            long cacheDepth = cacheEntries * reqMemory;
            // if (cacheDepth < MAX_CACHED_MEMORY / cacheHeight) {
            Pointer.memset(point.getHostPointer(), 0, reqMemory);
            cache.put(new CudaPointer(point.getHostPointer().address()));
            // } else {
            //     super.free(point);
            // }
        }
    }
}
Also used : AllocationShape(org.nd4j.jita.allocator.impl.AllocationShape) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer)
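
For readers who want the caching policy above in isolation, here is a minimal, self-contained sketch of the same cache-or-free decision. Every name in it (HostCache, tryCache, maxCacheableBytes) is an illustrative stand-in rather than nd4j API, and the real provider keys its buckets by AllocationShape rather than by raw byte size.

import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicLong;

final class HostCache {
    private final long maxCacheableBytes; // per-allocation cap, mirrors getMaximumHostCacheableLength()
    private final long maxCacheBytes;     // total cache cap, mirrors getMaximumHostCache()
    private final AtomicLong cachedBytes = new AtomicLong();
    private final Map<Long, Queue<Long>> buckets = new ConcurrentHashMap<>();

    HostCache(long maxCacheableBytes, long maxCacheBytes) {
        this.maxCacheableBytes = maxCacheableBytes;
        this.maxCacheBytes = maxCacheBytes;
    }

    /** Returns true if the chunk was kept for reuse; false means the caller must really free it. */
    boolean tryCache(long address, long sizeBytes) {
        // we don't cache too-big objects, and we stop caching once the cache is full
        if (sizeBytes > maxCacheableBytes || cachedBytes.get() >= maxCacheBytes)
            return false;
        // past the guards, the chunk is always cached; the original's per-bucket
        // depth check is commented out, so both branches above end up caching too
        buckets.computeIfAbsent(sizeBytes, k -> new ConcurrentLinkedQueue<>()).add(address);
        cachedBytes.addAndGet(sizeBytes);
        return true;
    }
}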

Example 27 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

The class JCublasNDArrayFactory, method accumulate.

public INDArray accumulate(INDArray target, INDArray... arrays) {
    if (arrays == null || arrays.length == 0)
        throw new RuntimeException("Input arrays are missing");
    if (arrays.length == 1)
        return target.assign(arrays[0]);
    // we do averaging on GPU only if ALL devices have p2p links
    if (CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && nativeOps.isP2PAvailable()) {
        Nd4j.getExecutioner().push();
        long len = target.lengthLong();
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
        PointerPointer extras = new PointerPointer(
                        null, // not used
                        context.getOldStream(),
                        allocator.getDeviceIdPointer(),
                        new CudaPointer(0));
        Pointer z = AtomicAllocator.getInstance().getPointer(target, context);
        long[] xPointers = new long[arrays.length];
        for (int i = 0; i < arrays.length; i++) {
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
            xPointers[i] = point.getPointers().getDevicePointer().address();
            point.tickDeviceWrite();
        }
        CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
        allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
        PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, x, (DoublePointer) z, arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, x, (FloatPointer) z, arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, x, (ShortPointer) z, arrays.length, len);
        }
        allocator.getFlowController().registerAction(context, target, arrays);
        tempX.address(); // no-op read that keeps tempX reachable until the asynchronous native call completes
        return target;
    } else {
        long len = target.lengthLong();
        Nd4j.getExecutioner().commit();
        CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
        PointerPointer dataPointers = new PointerPointer(arrays.length);
        PointerPointer extras = new PointerPointer(
                        null, // not used
                        context.getOldStream(),
                        AtomicAllocator.getInstance().getDeviceIdPointer(),
                        new CudaPointer(1));
        for (int i = 0; i < arrays.length; i++) {
            Nd4j.getCompressor().autoDecompress(arrays[i]);
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
        }
        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, dataPointers, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, dataPointers, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, dataPointers, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        }
        AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
        return target;
    }
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException)
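
As a quick usage sketch, assuming the static Nd4j.accumulate helper of this nd4j generation delegates to the factory method above (if your version lacks it, call the factory method directly):

INDArray target = Nd4j.zeros(1, 1000);
INDArray a = Nd4j.ones(1, 1000);
INDArray b = Nd4j.ones(1, 1000);
// inputs must be contiguous (elementWiseStride == 1) and equal-length,
// otherwise the method above throws ND4JIllegalStateException
Nd4j.accumulate(target, a, b); // target[i] = a[i] + b[i], on GPU if all devices have P2P links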

Example 28 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

The class CudaExecutioner, method exec.

@Override
public <T extends Aggregate> void exec(Batch<T> batch) {
    DataBuffer surfaceBuffer = getBuffer(batch);
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
    IntPointer pointer = (IntPointer) new CudaPointer(AtomicAllocator.getInstance().getHostPointer(surfaceBuffer)).asIntPointer();
    AllocationPoint surfacePoint = AtomicAllocator.getInstance().getAllocationPoint(surfaceBuffer);
    int maxTypes = 5;
    int maxIntArrays = batch.getSample().maxIntArrays();
    int maxArraySize = batch.getSample().maxIntArraySize();
    int indexPos = maxTypes * (Batch.getBatchLimit() * 16);
    int intArraysPos = indexPos + (batch.getSample().maxIndexArguments() * (Batch.getBatchLimit() * 16));
    int realPos = (intArraysPos + (maxIntArrays * maxArraySize * (Batch.getBatchLimit() * 16))) / (Nd4j.dataType() == DataBuffer.Type.DOUBLE ? 2 : 1);
    if (Nd4j.dataType() == DataBuffer.Type.HALF)
        realPos *= 2;
    int argsPos = (realPos + (batch.getSample().maxRealArguments() * (Batch.getBatchLimit() * 16))) / (Nd4j.dataType() == DataBuffer.Type.FLOAT ? 2 : 1);
    if (Nd4j.dataType() == DataBuffer.Type.HALF)
        argsPos /= 4;
    int shapesPos = argsPos + (batch.getSample().maxArguments() * (Batch.getBatchLimit() * 16));
    for (int i = 0; i < batch.getNumAggregates(); i++) {
        T op = batch.getAggregates().get(i);
        // put num arguments
        int idx = i * maxTypes;
        pointer.put(idx, op.getArguments().size());
        pointer.put(idx + 1, op.getShapes().size());
        pointer.put(idx + 2, op.getIndexingArguments().size());
        pointer.put(idx + 3, op.getRealArguments().size());
        pointer.put(idx + 4, op.getIntArrayArguments().size());
        // putting indexing arguments
        for (int e = 0; e < op.getIndexingArguments().size(); e++) {
            idx = indexPos + i * batch.getSample().maxIndexArguments();
            pointer.put(idx + e, op.getIndexingArguments().get(e));
        }
        // putting intArray values
        int bsize = maxIntArrays * maxArraySize;
        for (int e = 0; e < op.getIntArrayArguments().size(); e++) {
            int step = (i * bsize) + (e * maxArraySize);
            if (op.getIntArrayArguments().get(e) != null)
                for (int x = 0; x < op.getIntArrayArguments().get(e).length; x++) {
                    idx = intArraysPos + step + x;
                    pointer.put(idx, op.getIntArrayArguments().get(e)[x]);
                }
        }
        // putting real arguments
        if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
            FloatPointer realPtr = new FloatPointer(pointer);
            for (int e = 0; e < op.getRealArguments().size(); e++) {
                idx = realPos + i * op.maxRealArguments();
                realPtr.put(idx + e, op.getRealArguments().get(e).floatValue());
            }
        } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
            DoublePointer dPtr = new DoublePointer(pointer);
            for (int e = 0; e < op.getRealArguments().size(); e++) {
                idx = realPos + (i * op.maxRealArguments());
                dPtr.put(idx + e, op.getRealArguments().get(e).doubleValue());
            }
        } else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
            ShortPointer sPtr = new ShortPointer(pointer);
            for (int e = 0; e < op.getRealArguments().size(); e++) {
                idx = realPos + (i * op.maxRealArguments());
                sPtr.put(idx + e, BaseDataBuffer.fromFloat(op.getRealArguments().get(e).floatValue()));
            }
        }
        // putting arguments pointers
        PointerPointer ptrPtr = new PointerPointer(pointer);
        for (int e = 0; e < op.getArguments().size(); e++) {
            idx = argsPos + i * batch.getSample().maxArguments();
            if (op.getArguments().get(e) != null) {
                ptrPtr.put(idx + e, AtomicAllocator.getInstance().getPointer(op.getArguments().get(e), context));
                AtomicAllocator.getInstance().getAllocationPoint(op.getArguments().get(e)).tickDeviceWrite();
            }
        }
        // putting shape pointers
        for (int e = 0; e < op.getShapes().size(); e++) {
            idx = shapesPos + i * batch.getSample().maxShapes();
            if (op.getShapes().get(e) != null) {
                ptrPtr.put(idx + e, AtomicAllocator.getInstance().getPointer(op.getShapes().get(e), context));
                AtomicAllocator.getInstance().getAllocationPoint(op.getShapes().get(e)).tickDeviceWrite();
            }
        }
    }
    // trigger write, so getPointer request will force relocation to GPU
    surfacePoint.tickHostWrite();
    PointerPointer extraArgs = new PointerPointer(32);
    extraArgs.put(0, null);
    extraArgs.put(1, context.getOldStream());
    extraArgs.put(2, new CudaPointer(Math.min(batch.getNumAggregates(), CudaEnvironment.getInstance().getConfiguration().getMaximumGridSize())));
    extraArgs.put(3, new CudaPointer(batch.getSample().getThreadsPerInstance()));
    extraArgs.put(4, new CudaPointer(batch.getSample().getSharedMemorySize()));
    if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.execAggregateBatchFloat(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
    } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.execAggregateBatchDouble(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
    } else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
        nativeOps.execAggregateBatchHalf(extraArgs, batch.getNumAggregates(), batch.opNum(), batch.getSample().maxArguments(), batch.getSample().maxShapes(), batch.getSample().maxIntArrays(), batch.getSample().maxIntArraySize(), batch.getSample().maxIndexArguments(), batch.getSample().maxRealArguments(), AtomicAllocator.getInstance().getPointer(surfaceBuffer, context));
    }
    surfacePoint.tickHostWrite();
}
Also used : CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) BaseDataBuffer(org.nd4j.linalg.api.buffer.BaseDataBuffer)
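
The offset arithmetic at the top of this method is easy to lose among the kernel-launch details, so here is the same computation gathered into one illustrative helper (not nd4j API; the parameters mirror the batch.getSample() getters used above):

import org.nd4j.linalg.api.buffer.DataBuffer;

static int[] surfaceLayout(int batchLimit, int maxIndexArgs, int maxIntArrays,
                           int maxArraySize, int maxRealArgs, int maxArgs,
                           DataBuffer.Type dtype) {
    final int maxTypes = 5;           // per-op header: argument/shape/index/real/intArray counts
    final int slot = batchLimit * 16; // stride of each section, in int elements
    int indexPos = maxTypes * slot;
    int intArraysPos = indexPos + maxIndexArgs * slot;
    // real arguments are stored in the current data type, so the int-based offset is rescaled:
    // doubles occupy two ints each, halves pack two values per int
    int realPos = (intArraysPos + maxIntArrays * maxArraySize * slot)
                    / (dtype == DataBuffer.Type.DOUBLE ? 2 : 1);
    if (dtype == DataBuffer.Type.HALF)
        realPos *= 2;
    int argsPos = (realPos + maxRealArgs * slot) / (dtype == DataBuffer.Type.FLOAT ? 2 : 1);
    if (dtype == DataBuffer.Type.HALF)
        argsPos /= 4;
    int shapesPos = argsPos + maxArgs * slot;
    return new int[] { indexPos, intArraysPos, realPos, argsPos, shapesPos };
}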

Example 29 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

The class CudaExecutioner, method exec.

/**
 * This method executes the given CustomOp.
 *
 * PLEASE NOTE: You are responsible for input/output validation.
 * PLEASE NOTE: Right now these operations are executed on the CPU.
 * @param op the CustomOp to execute
 */
public void exec(CustomOp op) {
    Nd4j.getExecutioner().commit();
    if (op.opName().equalsIgnoreCase("im2col")) {
        val dtype = Nd4j.dataType();
        val xArr = op.inputArguments()[0];
        val zArr = op.outputArguments()[0];
        CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
        if (extraz.get() == null)
            extraz.set(new PointerPointer(32));
        PointerPointer xShapeHost = extraz.get().put(
                        AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
                        context.getOldStream(), // 1
                        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
                        context.getBufferAllocation(), // 3
                        context.getBufferReduction(), // 4
                        context.getBufferScalar(), // 5
                        context.getBufferSpecial(), // 6
                        null, // 7
                        AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer())); // 8
        val x = AtomicAllocator.getInstance().getPointer(xArr, context);
        val z = AtomicAllocator.getInstance().getPointer(zArr, context);
        val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
        val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
        double zeroPad = 0.0;
        if (op.tArgs() != null && op.tArgs().length > 0) {
            zeroPad = op.tArgs()[0];
        }
        val extrass = new double[] { op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7], op.iArgs()[8], zeroPad };
        val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass);
        val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
        if (dtype == DataBuffer.Type.DOUBLE) {
            nativeOps.execTransformDouble(xShapeHost, 37, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, (DoublePointer) extraArgs);
        } else if (dtype == DataBuffer.Type.FLOAT) {
            nativeOps.execTransformFloat(xShapeHost, 37, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, (FloatPointer) extraArgs);
        } else if (dtype == DataBuffer.Type.HALF) {
            nativeOps.execTransformHalf(xShapeHost, 37, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, (ShortPointer) extraArgs);
        }
        // AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
        AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
        return;
    } else if (op.opName().equalsIgnoreCase("col2im")) {
        val dtype = Nd4j.dataType();
        val xArr = op.inputArguments()[0];
        val zArr = op.outputArguments()[0];
        CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
        if (extraz.get() == null)
            extraz.set(new PointerPointer(32));
        PointerPointer xShapeHost = extraz.get().put(
                        AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
                        context.getOldStream(), // 1
                        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
                        context.getBufferAllocation(), // 3
                        context.getBufferReduction(), // 4
                        context.getBufferScalar(), // 5
                        context.getBufferSpecial(), // 6
                        null, // 7
                        AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer())); // 8
        val x = AtomicAllocator.getInstance().getPointer(xArr, context);
        val z = AtomicAllocator.getInstance().getPointer(zArr, context);
        val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
        val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
        val extrass = new double[] { op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7] };
        val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass);
        val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
        if (dtype == DataBuffer.Type.DOUBLE) {
            nativeOps.execTransformDouble(xShapeHost, 36, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, (DoublePointer) extraArgs);
        } else if (dtype == DataBuffer.Type.FLOAT) {
            nativeOps.execTransformFloat(xShapeHost, 36, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, (FloatPointer) extraArgs);
        } else if (dtype == DataBuffer.Type.HALF) {
            nativeOps.execTransformHalf(xShapeHost, 36, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, (ShortPointer) extraArgs);
        }
        // AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
        AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
        return;
    } else if (op.opName().equalsIgnoreCase("pooling2d")) {
        val dtype = Nd4j.dataType();
        val xArr = op.inputArguments()[0];
        val zArr = op.outputArguments()[0];
        CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
        if (extraz.get() == null)
            extraz.set(new PointerPointer(32));
        PointerPointer xShapeHost = extraz.get().put(
                        AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
                        context.getOldStream(), // 1
                        AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
                        context.getBufferAllocation(), // 3
                        context.getBufferReduction(), // 4
                        context.getBufferScalar(), // 5
                        context.getBufferSpecial(), // 6
                        null, // 7
                        AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer())); // 8
        val x = AtomicAllocator.getInstance().getPointer(xArr, context);
        val z = AtomicAllocator.getInstance().getPointer(zArr, context);
        val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
        val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
        val extrass = new double[] { op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7], op.iArgs()[8] };
        val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass);
        val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
        if (dtype == DataBuffer.Type.DOUBLE) {
            nativeOps.execTransformDouble(xShapeHost, 71, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, (DoublePointer) extraArgs);
        } else if (dtype == DataBuffer.Type.FLOAT) {
            nativeOps.execTransformFloat(xShapeHost, 71, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, (FloatPointer) extraArgs);
        } else if (dtype == DataBuffer.Type.HALF) {
            nativeOps.execTransformHalf(xShapeHost, 71, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, (ShortPointer) extraArgs);
        }
        // AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
        AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
        return;
    }
    Nd4j.getExecutioner().commit();
    CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
    if (extraz.get() == null)
        extraz.set(new PointerPointer(32));
    PointerPointer extras = extraz.get().put(new CudaPointer(1), context.getOldStream(), context.getBufferScalar(), context.getBufferReduction());
    val outputArgs = op.outputArguments();
    val inputArgs = op.inputArguments();
    if (outputArgs.length == 0 && !op.isInplaceCall())
        throw new ND4JIllegalStateException("You can't execute non-inplace CustomOp without outputs being specified");
    val lc = op.opName().toLowerCase();
    val hash = op.opHash();
    val inputShapes = new PointerPointer<>(inputArgs.length * 2);
    val inputBuffers = new PointerPointer<>(inputArgs.length * 2);
    int cnt = 0;
    for (val in : inputArgs) {
        val hp = AtomicAllocator.getInstance().getHostPointer(in.shapeInfoDataBuffer());
        inputBuffers.put(cnt, AtomicAllocator.getInstance().getHostPointer(in));
        inputShapes.put(cnt, hp);
        val dp = AtomicAllocator.getInstance().getPointer(in.shapeInfoDataBuffer(), context);
        inputBuffers.put(cnt + inputArgs.length, AtomicAllocator.getInstance().getPointer(in, context));
        inputShapes.put(cnt + inputArgs.length, dp);
        if (op.isInplaceCall())
            AtomicAllocator.getInstance().getAllocationPoint(in).tickHostWrite();
        cnt++;
    }
    val outputShapes = new PointerPointer<>(outputArgs.length * 2);
    val outputBuffers = new PointerPointer<>(outputArgs.length * 2);
    cnt = 0;
    for (val out : outputArgs) {
        outputBuffers.put(cnt, AtomicAllocator.getInstance().getHostPointer(out));
        outputShapes.put(cnt, AtomicAllocator.getInstance().getHostPointer(out.shapeInfoDataBuffer()));
        outputBuffers.put(cnt + outputArgs.length, AtomicAllocator.getInstance().getPointer(out, context));
        outputShapes.put(cnt + outputArgs.length, AtomicAllocator.getInstance().getPointer(out.shapeInfoDataBuffer(), context));
        AtomicAllocator.getInstance().getAllocationPoint(out).tickHostWrite();
        cnt++;
    }
    if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
        val tArgs = op.tArgs().length > 0 ? new FloatPointer(op.tArgs().length) : null;
        val iArgs = op.iArgs().length > 0 ? new IntPointer(op.iArgs().length) : null;
        cnt = 0;
        for (val t : op.tArgs()) tArgs.put(cnt++, (float) t);
        cnt = 0;
        for (val i : op.iArgs()) iArgs.put(cnt++, i);
        val status = OpStatus.byNumber(nativeOps.execCustomOpFloat(extras, hash, inputBuffers, inputShapes, inputArgs.length, outputBuffers, outputShapes, outputArgs.length, tArgs, op.tArgs().length, iArgs, op.iArgs().length, op.isInplaceCall()));
        if (status != OpStatus.ND4J_STATUS_OK)
            throw new ND4JIllegalStateException("Op execution failed: " + status);
    } else if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
        val tArgs = op.tArgs().length > 0 ? new DoublePointer(op.tArgs().length) : null;
        val iArgs = op.iArgs().length > 0 ? new IntPointer(op.iArgs().length) : null;
        cnt = 0;
        for (val t : op.tArgs()) tArgs.put(cnt++, t);
        // reset the counter before filling iArgs, matching the FLOAT and HALF branches
        cnt = 0;
        for (val i : op.iArgs()) iArgs.put(cnt++, i);
        val status = OpStatus.byNumber(nativeOps.execCustomOpDouble(extras, hash, inputBuffers, inputShapes, inputArgs.length, outputBuffers, outputShapes, outputArgs.length, tArgs, op.tArgs().length, iArgs, op.iArgs().length, op.isInplaceCall()));
        if (status != OpStatus.ND4J_STATUS_OK)
            throw new ND4JIllegalStateException("Op execution failed: " + status);
    } else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
        val tArgs = op.tArgs().length > 0 ? new ShortPointer(op.tArgs().length) : null;
        val iArgs = op.iArgs().length > 0 ? new IntPointer(op.iArgs().length) : null;
        cnt = 0;
        for (val t : op.tArgs()) tArgs.put(cnt++, ArrayUtil.toHalf((float) t));
        cnt = 0;
        for (val i : op.iArgs()) iArgs.put(cnt++, i);
        val status = OpStatus.byNumber(nativeOps.execCustomOpHalf(extras, hash, inputBuffers, inputShapes, inputArgs.length, outputBuffers, outputShapes, outputArgs.length, tArgs, op.tArgs().length, iArgs, op.iArgs().length, op.isInplaceCall()));
        if (status != OpStatus.ND4J_STATUS_OK)
            throw new ND4JIllegalStateException("Op execution failed: " + status);
    }
// AtomicAllocator.getInstance().getFlowController().prepareActionAllWrite(op.outputArguments());
}
Also used : lombok.val(lombok.val) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint)
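
A hedged usage sketch for the im2col fast path above, assuming the DynamicCustomOp builder of this nd4j generation; the nine integer arguments follow the contract read from op.iArgs() in that branch (kernel, stride, padding, dilation, sameMode), and the 6d output shape is the conventional im2col layout. Verify both against your version before relying on them.

INDArray input = Nd4j.rand(new int[] { 1, 1, 8, 8 });
// im2col output layout: [minibatch, channels, kH, kW, outH, outW]
INDArray output = Nd4j.create(new int[] { 1, 1, 2, 2, 4, 4 });
CustomOp op = DynamicCustomOp.builder("im2col")
                .addInputs(input)
                .addOutputs(output)
                .addIntegerArguments(2, 2, 2, 2, 0, 0, 1, 1, 0) // kH, kW, sH, sW, pH, pW, dH, dW, sameMode
                .build();
Nd4j.getExecutioner().exec(op); // dispatches into the im2col branch shown above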

Example 30 with CudaPointer

Use of org.nd4j.jita.allocator.pointers.CudaPointer in project nd4j by deeplearning4j.

The class CudaExecutioner, method invoke.

protected CudaContext invoke(TransformOp op) {
    long st = profilingHookIn(op);
    checkForCompression(op);
    validateDataType(Nd4j.dataType(), op);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    if (extraz.get() == null)
        extraz.set(new PointerPointer(32));
    // Pow operations might be special
    if (op.opNum() == 7) {
        if (op.y() != null && op.y().isScalar()) {
            Nd4j.getExecutioner().commit();
            op.setY(Nd4j.valueArrayOf(op.x().shape(), op.y().getDouble(0)));
            Nd4j.getExecutioner().commit();
        }
    }
    CudaContext context = allocator.getFlowController().prepareAction(op.z(), op.x(), op.y());
    if (CudaEnvironment.getInstance().getConfiguration().isDebug())
        lastOp.set(op.opName());
    // special temp array for IsMax along dimension
    INDArray ret = null;
    Pointer x = allocator.getPointer(op.x(), context);
    Pointer xShapeInfo = allocator.getPointer(op.x().shapeInfoDataBuffer(), context);
    Pointer extraArgs = op.extraArgs() != null ? allocator.getPointer(op.extraArgsDataBuff(), context) : null;
    Pointer hostYShapeInfo = op.y() == null ? null : AddressRetriever.retrieveHostPointer(op.y().shapeInfoDataBuffer());
    Pointer hostZShapeInfo = op.z() == null ? null : AddressRetriever.retrieveHostPointer(op.z().shapeInfoDataBuffer());
    Pointer dimensionDevPointer = null;
    Pointer dimensionHostPointer = null;
    Pointer retPointer = null;
    int[] dimension = null;
    if (op.opNum() == 41 && op.extraArgs() != null) {
        // for IsMax along dimension we need special temporary buffer
        dimension = new int[(int) op.extraArgs()[0]];
        for (int i = 0; i < dimension.length; i++) {
            dimension[i] = (int) op.extraArgs()[i + 1];
        }
        for (int i = 0; i < dimension.length; i++) {
            if (dimension[i] < 0)
                dimension[i] += op.x().rank();
        }
        // do op along all dimensions
        if (dimension.length == op.x().rank())
            dimension = new int[] { Integer.MAX_VALUE };
        int[] retShape = Shape.wholeArrayDimension(dimension) ? new int[] { 1, 1 } : ArrayUtil.removeIndex(op.x().shape(), dimension);
        // ensure vector is proper shape
        if (retShape.length == 1) {
            if (dimension[0] == 0)
                retShape = new int[] { 1, retShape[0] };
            else
                retShape = new int[] { retShape[0], 1 };
        } else if (retShape.length == 0) {
            retShape = new int[] { 1, 1 };
        }
        ret = Nd4j.zeros(retShape);
        // FIXME: this may be a misleading use of this particular pointer
        hostYShapeInfo = allocator.getPointer(ret.shapeInfoDataBuffer(), context);
        // dimensionPointer = AtomicAllocator.getInstance().getPointer(Nd4j.createBuffer(dimension), context);
        DataBuffer dimensionBuffer = allocator.getConstantBuffer(dimension);
        dimensionDevPointer = allocator.getPointer(dimensionBuffer, context);
        dimensionHostPointer = allocator.getHostPointer(dimensionBuffer);
        retPointer = allocator.getPointer(ret, context);
    }
    Pointer hostTadShapeInfo = null;
    Pointer devTadShapeInfo = null;
    Pointer hostMaxTadShapeInfo = null;
    Pointer devMaxTadShapeInfo = null;
    Pair<DataBuffer, DataBuffer> tadBuffers;
    Pair<DataBuffer, DataBuffer> tadMaxBuffers;
    Pointer devTadOffsets = null;
    Pointer devMaxTadOffsets = null;
    if (op.opNum() >= 38 && op.opNum() <= 41) {
        if (op.opNum() != 41) {
            tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), new int[] { 0 });
            tadMaxBuffers = tadManager.getTADOnlyShapeInfo(op.x(), new int[] { 1 });
            hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
            devTadShapeInfo = allocator.getPointer(tadBuffers.getFirst(), context);
            hostMaxTadShapeInfo = AddressRetriever.retrieveHostPointer(tadMaxBuffers.getFirst());
            devMaxTadShapeInfo = allocator.getPointer(tadMaxBuffers.getFirst(), context);
            DataBuffer offsets = tadBuffers.getSecond();
            devTadOffsets = offsets == null ? null : allocator.getPointer(offsets, context);
            DataBuffer maxOffsets = tadMaxBuffers.getSecond();
            devMaxTadOffsets = maxOffsets == null ? null : allocator.getPointer(maxOffsets, context);
        } else {
            tadBuffers = tadManager.getTADOnlyShapeInfo(op.z(), dimension);
            hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
            devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
            DataBuffer offsets = tadBuffers.getSecond();
            devTadOffsets = offsets == null ? null : allocator.getPointer(offsets, context);
        }
    }
    Pointer z = allocator.getPointer(op.z(), context);
    Pointer zShapeInfo = allocator.getPointer(op.z().shapeInfoDataBuffer(), context);
    PointerPointer xShapeInfoHostPointer = extraz.get().put(
                    AddressRetriever.retrieveHostPointer(op.x().shapeInfoDataBuffer()), // 0
                    context.getOldStream(), // 1
                    allocator.getDeviceIdPointer(), // 2
                    context.getBufferAllocation(), // 3
                    context.getBufferReduction(), // 4
                    context.getBufferScalar(), // 5
                    context.getBufferSpecial(), // 6
                    hostYShapeInfo, // 7
                    hostZShapeInfo, // 8
                    hostTadShapeInfo, // 9
                    devTadShapeInfo, // 10
                    devTadOffsets, // 11
                    hostMaxTadShapeInfo, // 12
                    devMaxTadShapeInfo, // 13
                    devMaxTadOffsets, // 14
                    dimensionDevPointer, // special pointer for IsMax // 15
                    dimensionHostPointer, // special pointer for IsMax // 16
                    retPointer, // special pointer for IsMax // 17
                    new CudaPointer(dimension == null ? 0 : dimension.length)); // 18
    if (op.y() != null) {
        Pointer y = allocator.getPointer(op.y(), context);
        Pointer yShapeInfo = allocator.getPointer(op.y().shapeInfoDataBuffer(), context);
        int xEWS = op.x().elementWiseStride();
        int yEWS = op.y().elementWiseStride();
        int zEWS = op.z().elementWiseStride();
        boolean xRow = op.x().isRowVector();
        boolean yRow = op.y().isRowVector();
        boolean zRow = op.z().isRowVector();
        if (op.x().data().dataType() == DataBuffer.Type.DOUBLE) {
            if ((xEWS >= 1 && yEWS >= 1 && zEWS >= 1 && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
                nativeOps.execPairwiseTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, xEWS, (DoublePointer) y, yEWS, (DoublePointer) z, zEWS, (DoublePointer) extraArgs, op.n());
            } else {
                nativeOps.execPairwiseTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, (IntPointer) xShapeInfo, (DoublePointer) y, (IntPointer) yShapeInfo, (DoublePointer) z, (IntPointer) zShapeInfo, (DoublePointer) extraArgs);
            }
        } else if (op.x().data().dataType() == DataBuffer.Type.FLOAT) {
            if ((xEWS >= 1 && yEWS >= 1 && xEWS == yEWS && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
                nativeOps.execPairwiseTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, xEWS, (FloatPointer) y, yEWS, (FloatPointer) z, zEWS, (FloatPointer) extraArgs, op.n());
            } else {
                nativeOps.execPairwiseTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, (IntPointer) xShapeInfo, (FloatPointer) y, (IntPointer) yShapeInfo, (FloatPointer) z, (IntPointer) zShapeInfo, (FloatPointer) extraArgs);
            }
        } else {
            if ((xEWS >= 1 && yEWS >= 1 && xEWS == yEWS && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
                nativeOps.execPairwiseTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, xEWS, (ShortPointer) y, yEWS, (ShortPointer) z, zEWS, (ShortPointer) extraArgs, op.n());
            } else {
                nativeOps.execPairwiseTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, (IntPointer) xShapeInfo, (ShortPointer) y, (IntPointer) yShapeInfo, (ShortPointer) z, (IntPointer) zShapeInfo, (ShortPointer) extraArgs);
            }
        }
    } else {
        if (op.x().data().dataType() == DataBuffer.Type.DOUBLE) {
            if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
                nativeOps.execTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, op.x().elementWiseStride(), (DoublePointer) z, op.z().elementWiseStride(), (DoublePointer) extraArgs, op.n());
            } else {
                nativeOps.execTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, (IntPointer) xShapeInfo, (DoublePointer) z, (IntPointer) zShapeInfo, (DoublePointer) extraArgs);
            }
        } else if (op.x().data().dataType() == DataBuffer.Type.FLOAT) {
            if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
                nativeOps.execTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, op.x().elementWiseStride(), (FloatPointer) z, op.z().elementWiseStride(), (FloatPointer) extraArgs, op.n());
            } else {
                nativeOps.execTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, (IntPointer) xShapeInfo, (FloatPointer) z, (IntPointer) zShapeInfo, (FloatPointer) extraArgs);
            }
        } else {
            if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
                nativeOps.execTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, op.x().elementWiseStride(), (ShortPointer) z, op.z().elementWiseStride(), (ShortPointer) extraArgs, op.n());
            } else {
                nativeOps.execTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, (IntPointer) xShapeInfo, (ShortPointer) z, (IntPointer) zShapeInfo, (ShortPointer) extraArgs);
            }
        }
    }
    AtomicAllocator.getInstance().registerAction(context, op.z(), op.x(), op.y());
    if (extraArgs != null)
        extraArgs.address(); // no-op read that keeps extraArgs reachable until the async op completes
    if (ret != null)
        ret.elementWiseStride(); // likewise keeps the temporary IsMax result alive
    profilingHookOut(op, st);
    return null;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) PagedPointer(org.nd4j.linalg.api.memory.pointers.PagedPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) INDArray(org.nd4j.linalg.api.ndarray.INDArray) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) BaseDataBuffer(org.nd4j.linalg.api.buffer.BaseDataBuffer)
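
The pairwise branches above repeat one decision per data type: take the strided kernel when the operands are contiguous with a common ordering, otherwise fall back to the shape-aware kernel. Here is that predicate extracted as an illustrative helper (mirroring the DOUBLE branch; the FLOAT and HALF branches require xEWS == yEWS in place of zEWS >= 1):

static boolean pairwiseFastPath(INDArray x, INDArray y, INDArray z, boolean execSpecial) {
    int xEWS = x.elementWiseStride();
    int yEWS = y.elementWiseStride();
    int zEWS = z.elementWiseStride();
    boolean sameOrdering = x.ordering() == y.ordering() && x.ordering() == z.ordering();
    boolean allRowVectors = x.isRowVector() && y.isRowVector() && z.isRowVector();
    // contiguous operands with a common ordering, or identically-strided row vectors
    return (xEWS >= 1 && yEWS >= 1 && zEWS >= 1 && !execSpecial && sameOrdering)
                    || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && allRowVectors);
}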

Aggregations

CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer)47 CudaContext (org.nd4j.linalg.jcublas.context.CudaContext)27 AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint)20 Pointer (org.bytedeco.javacpp.Pointer)18 DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer)18 INDArray (org.nd4j.linalg.api.ndarray.INDArray)15 org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t (org.nd4j.jita.allocator.pointers.cuda.cusolverDnHandle_t)12 GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner)11 DoublePointer (org.bytedeco.javacpp.DoublePointer)10 FloatPointer (org.bytedeco.javacpp.FloatPointer)10 IntPointer (org.bytedeco.javacpp.IntPointer)10 CUstream_st (org.bytedeco.javacpp.cuda.CUstream_st)10 ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException)10 CublasPointer (org.nd4j.linalg.jcublas.CublasPointer)10 BlasException (org.nd4j.linalg.api.blas.BlasException)8 BaseCudaDataBuffer (org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer)7 AllocationShape (org.nd4j.jita.allocator.impl.AllocationShape)4 AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator)4 BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer)4 INDArrayIndex (org.nd4j.linalg.indexing.INDArrayIndex)4