
Example 6 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From the class JCublasNDArrayFactory, method accumulate.

public INDArray accumulate(INDArray target, INDArray... arrays) {
    if (arrays == null || arrays.length == 0)
        throw new RuntimeException("Input arrays are missing");
    if (arrays.length == 1)
        return target.assign(arrays[0]);
    // we do averaging on GPU only if ALL devices have p2p links
    if (CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && nativeOps.isP2PAvailable()) {
        Nd4j.getExecutioner().push();
        long len = target.lengthLong();
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
        PointerPointer extras = new PointerPointer(null, // not used
                context.getOldStream(), allocator.getDeviceIdPointer(), new CudaPointer(0));
        Pointer z = AtomicAllocator.getInstance().getPointer(target, context);
        long[] xPointers = new long[arrays.length];
        for (int i = 0; i < arrays.length; i++) {
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
            xPointers[i] = point.getPointers().getDevicePointer().address();
            point.tickDeviceWrite();
        }
        CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
        allocator.memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
        PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, x, (DoublePointer) z, arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, x, (FloatPointer) z, arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, x, (ShortPointer) z, arrays.length, len);
        }
        allocator.getFlowController().registerAction(context, target, arrays);
        // touch the buffer just to keep the reference alive until the native op completes
        tempX.address();
        return target;
    } else {
        long len = target.lengthLong();
        Nd4j.getExecutioner().commit();
        CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
        PointerPointer dataPointers = new PointerPointer(arrays.length);
        PointerPointer extras = new PointerPointer(null, // not used
                context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer(), new CudaPointer(1));
        for (int i = 0; i < arrays.length; i++) {
            Nd4j.getCompressor().autoDecompress(arrays[i]);
            if (arrays[i].elementWiseStride() != 1)
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            if (arrays[i].lengthLong() != len)
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            dataPointers.put(i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
        }
        if (target.data().dataType() == DataBuffer.Type.DOUBLE) {
            nativeOps.accumulateDouble(extras, dataPointers, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else if (target.data().dataType() == DataBuffer.Type.FLOAT) {
            nativeOps.accumulateFloat(extras, dataPointers, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        } else {
            nativeOps.accumulateHalf(extras, dataPointers, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
        }
        AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
        return target;
    }
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException)
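
For context, a minimal usage sketch of this accumulation path, assuming the static Nd4j.accumulate entry point delegates to the factory method above on the CUDA backend; the array names and sizes are illustrative. accumulate sums element-wise into the target, and the first branch above is only taken when every device pair has peer (P2P) access.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class AccumulateSketch {
    public static void main(String[] args) {
        // three equal-length, contiguous arrays to be summed element-wise
        INDArray a = Nd4j.ones(1, 1000);
        INDArray b = Nd4j.ones(1, 1000).muli(2);
        INDArray c = Nd4j.ones(1, 1000).muli(3);
        INDArray target = Nd4j.createUninitialized(new int[] {1, 1000});
        // writes the element-wise sum of a, b and c into target
        Nd4j.accumulate(target, a, b, c);
        System.out.println(target.getDouble(0)); // expect 6.0
    }
}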

Example 7 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From the class JCublasNDArrayFactory, method pullRows.

/**
 * This method produces a concatenated array consisting of tensors fetched
 * from the source array along the given dimension at the specified indexes.
 *
 * @param source          source tensor
 * @param sourceDimension dimension of the source tensor
 * @param indexes         indexes to fetch from the source array
 * @return a new array holding the fetched tensors in the requested order
 */
@Override
public INDArray pullRows(INDArray source, int sourceDimension, int[] indexes, char order) {
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    if (indexes == null || indexes.length < 1)
        throw new IllegalStateException("Indexes can't be null or zero-length");
    int[] shape = null;
    if (sourceDimension == 1)
        shape = new int[] { indexes.length, source.shape()[sourceDimension] };
    else if (sourceDimension == 0)
        shape = new int[] { source.shape()[sourceDimension], indexes.length };
    else
        throw new UnsupportedOperationException("2D input is expected");
    INDArray ret = Nd4j.createUninitialized(shape, order);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareAction(ret, source);
    Pointer x = AtomicAllocator.getInstance().getPointer(source, context);
    Pointer xShape = AtomicAllocator.getInstance().getPointer(source.shapeInfoDataBuffer(), context);
    Pointer z = AtomicAllocator.getInstance().getPointer(ret, context);
    Pointer zShape = AtomicAllocator.getInstance().getPointer(ret.shapeInfoDataBuffer(), context);
    PointerPointer extras = new PointerPointer(AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), context.getOldStream(), allocator.getDeviceIdPointer());
    CudaIntDataBuffer tempIndexes = new CudaIntDataBuffer(indexes.length);
    AtomicAllocator.getInstance().memcpyBlocking(tempIndexes, new IntPointer(indexes), indexes.length * 4, 0);
    Pointer pIndex = AtomicAllocator.getInstance().getPointer(tempIndexes, context);
    TADManager tadManager = Nd4j.getExecutioner().getTADManager();
    Pair<DataBuffer, DataBuffer> tadBuffers = tadManager.getTADOnlyShapeInfo(source, new int[] { sourceDimension });
    Pair<DataBuffer, DataBuffer> zTadBuffers = tadManager.getTADOnlyShapeInfo(ret, new int[] { sourceDimension });
    Pointer tadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
    Pointer zTadShapeInfo = AtomicAllocator.getInstance().getPointer(zTadBuffers.getFirst(), context);
    DataBuffer offsets = tadBuffers.getSecond();
    Pointer tadOffsets = AtomicAllocator.getInstance().getPointer(offsets, context);
    Pointer zTadOffsets = AtomicAllocator.getInstance().getPointer(zTadBuffers.getSecond(), context);
    if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.pullRowsDouble(extras, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, indexes.length, (IntPointer) pIndex, (IntPointer) tadShapeInfo, new LongPointerWrapper(tadOffsets), (IntPointer) zTadShapeInfo, new LongPointerWrapper(zTadOffsets));
    } else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.pullRowsFloat(extras, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, indexes.length, (IntPointer) pIndex, (IntPointer) tadShapeInfo, new LongPointerWrapper(tadOffsets), (IntPointer) zTadShapeInfo, new LongPointerWrapper(zTadOffsets));
    } else {
        nativeOps.pullRowsHalf(extras, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, indexes.length, (IntPointer) pIndex, (IntPointer) tadShapeInfo, new LongPointerWrapper(tadOffsets), (IntPointer) zTadShapeInfo, new LongPointerWrapper(zTadOffsets));
    }
    allocator.registerAction(context, ret, source);
    return ret;
}
Also used : ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CudaIntDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) LongPointerWrapper(org.nd4j.nativeblas.LongPointerWrapper) TADManager(org.nd4j.linalg.cache.TADManager) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) CompressedDataBuffer(org.nd4j.linalg.compression.CompressedDataBuffer) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer)
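
A minimal sketch of the public entry point for this method, assuming Nd4j.pullRows forwards to the factory implementation above; the data is illustrative. With sourceDimension == 1 the indexes select rows of a 2D array, and the result is a freshly allocated array holding those rows in the requested order.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class PullRowsSketch {
    public static void main(String[] args) {
        // 4x3 matrix, rows are [1,2,3], [4,5,6], [7,8,9], [10,11,12]
        INDArray source = Nd4j.linspace(1, 12, 12).reshape(4, 3);
        // pull rows 3 and 1, in that order
        INDArray picked = Nd4j.pullRows(source, 1, new int[] {3, 1});
        System.out.println(picked); // [[10,11,12], [4,5,6]]
    }
}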

Example 8 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From the class JCublasNDArrayFactory, method specialConcat.

@Override
public INDArray specialConcat(int dimension, INDArray... toConcat) {
    if (toConcat.length == 1)
        return toConcat[0];
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    PointerPointer shapeInfoPointers = new PointerPointer(toConcat.length);
    PointerPointer dataPointers = new PointerPointer(toConcat.length);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
    int sumAlongDim = 0;
    int[] outputShape = ArrayUtil.copy(toConcat[0].shape());
    for (int i = 0; i < toConcat.length; i++) {
        if (toConcat[i].isCompressed())
            Nd4j.getCompressor().decompressi(toConcat[i]);
        allocator.synchronizeHostData(toConcat[i]);
        shapeInfoPointers.put(i, allocator.getHostPointer(toConcat[i].shapeInfoDataBuffer()));
        dataPointers.put(i, allocator.getHostPointer(toConcat[i].data()));
        sumAlongDim += toConcat[i].size(dimension);
        for (int j = 0; j < toConcat[i].rank(); j++) if (j != dimension && toConcat[i].size(j) != outputShape[j]) {
            throw new IllegalArgumentException("Illegal concatenation at array " + i + " and shape element " + j);
        }
    }
    outputShape[dimension] = sumAlongDim;
    PointerPointer dummy = new PointerPointer(new Pointer[] { null });
    INDArray ret = Nd4j.createUninitialized(outputShape, Nd4j.order());
    if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.specialConcatDouble(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (DoublePointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.specialConcatFloat(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (FloatPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else if (ret.data().dataType() == DataBuffer.Type.HALF) {
        nativeOps.specialConcatHalf(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (ShortPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
    } else {
        throw new ND4JIllegalStateException("Unknown dataType: " + ret.data().dataType());
    }
    AllocationPoint point = allocator.getAllocationPoint(ret);
    nativeOps.memcpyAsync(point.getDevicePointer(), point.getHostPointer(), ret.lengthLong() * Nd4j.sizeOfDataType(ret.data().dataType()), CudaConstants.cudaMemcpyHostToDevice, context.getSpecialStream());
    context.getSpecialStream().synchronize();
    point.tickHostRead();
    point.tickDeviceWrite();
    return ret;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException)
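
A minimal sketch, assuming the Nd4j.specialConcat entry point that routes to the method above. Unlike plain concat, specialConcat assembles the result on the host (note the host pointers gathered above) and then copies it to the device in a single memcpyAsync on the special stream.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class SpecialConcatSketch {
    public static void main(String[] args) {
        INDArray first = Nd4j.ones(2, 3);
        INDArray second = Nd4j.zeros(2, 3);
        // concatenate along dimension 0: result shape is [4, 3]
        INDArray stacked = Nd4j.specialConcat(0, first, second);
        System.out.println(stacked.shapeInfoToString());
    }
}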

Example 9 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From the class JCublasNDArrayFactory, method shuffle.

/**
 * Symmetric in-place shuffle of a set of ndarrays along the specified
 * dimensions. Each array in the list should have its own dimension entry
 * at the same index of the dimensions list.
 *
 * @param arrays     the ndarrays to shuffle
 * @param rnd        the random source driving the shuffle
 * @param dimensions the dimensions to shuffle along
 */
@Override
public void shuffle(List<INDArray> arrays, Random rnd, List<int[]> dimensions) {
    // no dimension - no shuffle
    if (dimensions == null || dimensions.size() == 0)
        throw new RuntimeException("Dimension can't be null or 0-length");
    if (arrays == null || arrays.size() == 0)
        throw new RuntimeException("No input arrays provided");
    if (dimensions.size() > 1 && arrays.size() != dimensions.size())
        throw new IllegalStateException("Number of dimensions do not match number of arrays to shuffle");
    Nd4j.getExecutioner().push();
    // first we build TAD for input array and dimensions
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    CudaContext context = null;
    for (int x = 0; x < arrays.size(); x++) {
        context = allocator.getFlowController().prepareAction(arrays.get(x));
    }
    int tadLength = 1;
    for (int i = 0; i < dimensions.get(0).length; i++) {
        tadLength *= arrays.get(0).shape()[dimensions.get(0)[i]];
    }
    int numTads = arrays.get(0).length() / tadLength;
    int[] map = ArrayUtil.buildInterleavedVector(rnd, numTads);
    CudaIntDataBuffer shuffle = new CudaIntDataBuffer(map);
    Pointer shuffleMap = allocator.getPointer(shuffle, context);
    PointerPointer extras = new PointerPointer(null, // not used
            context.getOldStream(), allocator.getDeviceIdPointer());
    long[] xPointers = new long[arrays.size()];
    long[] xShapes = new long[arrays.size()];
    long[] tadShapes = new long[arrays.size()];
    long[] tadOffsets = new long[arrays.size()];
    for (int i = 0; i < arrays.size(); i++) {
        INDArray array = arrays.get(i);
        Pointer x = AtomicAllocator.getInstance().getPointer(array, context);
        Pointer xShapeInfo = AtomicAllocator.getInstance().getPointer(array.shapeInfoDataBuffer(), context);
        TADManager tadManager = Nd4j.getExecutioner().getTADManager();
        int[] dimension = dimensions.size() > 1 ? dimensions.get(i) : dimensions.get(0);
        Pair<DataBuffer, DataBuffer> tadBuffers = tadManager.getTADOnlyShapeInfo(array, dimension);
        // log.info("Original shape: {}; dimension: {}; TAD shape: {}", array.shapeInfoDataBuffer().asInt(), dimension, tadBuffers.getFirst().asInt());
        Pointer tadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
        DataBuffer offsets = tadBuffers.getSecond();
        if (offsets.length() != numTads)
            throw new ND4JIllegalStateException("Can't symmetrically shuffle arrays with non-equal number of TADs");
        Pointer tadOffset = AtomicAllocator.getInstance().getPointer(offsets, context);
        xPointers[i] = x.address();
        xShapes[i] = xShapeInfo.address();
        tadShapes[i] = tadShapeInfo.address();
        tadOffsets[i] = tadOffset.address();
    }
    CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.size());
    CudaDoubleDataBuffer tempShapes = new CudaDoubleDataBuffer(arrays.size());
    CudaDoubleDataBuffer tempTAD = new CudaDoubleDataBuffer(arrays.size());
    CudaDoubleDataBuffer tempOffsets = new CudaDoubleDataBuffer(arrays.size());
    AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(xPointers), xPointers.length * 8, 0);
    AtomicAllocator.getInstance().memcpyBlocking(tempShapes, new LongPointer(xShapes), xPointers.length * 8, 0);
    AtomicAllocator.getInstance().memcpyBlocking(tempTAD, new LongPointer(tadShapes), xPointers.length * 8, 0);
    AtomicAllocator.getInstance().memcpyBlocking(tempOffsets, new LongPointer(tadOffsets), xPointers.length * 8, 0);
    if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
        nativeOps.shuffleDouble(extras, new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), arrays.size(), (IntPointer) shuffleMap, new PointerPointer(allocator.getPointer(tempTAD, context)), new PointerPointer(allocator.getPointer(tempOffsets, context)));
    } else if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
        nativeOps.shuffleFloat(extras, new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), arrays.size(), (IntPointer) shuffleMap, new PointerPointer(allocator.getPointer(tempTAD, context)), new PointerPointer(allocator.getPointer(tempOffsets, context)));
    } else {
        // HALFs
        nativeOps.shuffleHalf(extras, new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), arrays.size(), (IntPointer) shuffleMap, new PointerPointer(allocator.getPointer(tempTAD, context)), new PointerPointer(allocator.getPointer(tempOffsets, context)));
    }
    for (int f = 0; f < arrays.size(); f++) {
        allocator.getFlowController().registerAction(context, arrays.get(f));
    }
    // touch these just to keep the references alive until the native op completes
    shuffle.address();
    tempX.dataType();
    tempShapes.dataType();
    tempOffsets.dataType();
    tempTAD.dataType();
}
Also used : ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) CudaIntDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CudaDoubleDataBuffer(org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) TADManager(org.nd4j.linalg.cache.TADManager) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) CompressedDataBuffer(org.nd4j.linalg.compression.CompressedDataBuffer)
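
A minimal sketch of the symmetric shuffle, assuming the Nd4j.shuffle(List, Random, List) overload that delegates here; the arrays and seed are illustrative. Because one permutation map is applied to every array, row i of the features still lines up with row i of the labels afterwards, which is the usual reason to use this method.

import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class ShuffleSketch {
    public static void main(String[] args) {
        INDArray features = Nd4j.linspace(1, 20, 20).reshape(10, 2);
        INDArray labels = Nd4j.linspace(1, 10, 10).reshape(10, 1);
        // shuffle rows (TADs along dimension 1) of both arrays with one permutation
        List<INDArray> arrays = Arrays.asList(features, labels);
        List<int[]> dimensions = Arrays.asList(new int[] {1}, new int[] {1});
        Nd4j.shuffle(arrays, new Random(119), dimensions);
    }
}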

Example 10 with AtomicAllocator

Use of org.nd4j.jita.allocator.impl.AtomicAllocator in project nd4j by deeplearning4j.

From the class CudaExecutioner, method invoke.

protected CudaContext invoke(TransformOp op) {
    long st = profilingHookIn(op);
    checkForCompression(op);
    validateDataType(Nd4j.dataType(), op);
    AtomicAllocator allocator = AtomicAllocator.getInstance();
    if (extraz.get() == null)
        extraz.set(new PointerPointer(32));
    // Pow operations might be special
    if (op.opNum() == 7) {
        if (op.y() != null && op.y().isScalar()) {
            Nd4j.getExecutioner().commit();
            op.setY(Nd4j.valueArrayOf(op.x().shape(), op.y().getDouble(0)));
            Nd4j.getExecutioner().commit();
        }
    }
    CudaContext context = allocator.getFlowController().prepareAction(op.z(), op.x(), op.y());
    if (CudaEnvironment.getInstance().getConfiguration().isDebug())
        lastOp.set(op.opName());
    // special temp array for IsMax along dimension
    INDArray ret = null;
    Pointer x = allocator.getPointer(op.x(), context);
    Pointer xShapeInfo = allocator.getPointer(op.x().shapeInfoDataBuffer(), context);
    Pointer extraArgs = op.extraArgs() != null ? allocator.getPointer(op.extraArgsDataBuff(), context) : null;
    Pointer hostYShapeInfo = op.y() == null ? null : AddressRetriever.retrieveHostPointer(op.y().shapeInfoDataBuffer());
    Pointer hostZShapeInfo = op.z() == null ? null : AddressRetriever.retrieveHostPointer(op.z().shapeInfoDataBuffer());
    Pointer dimensionDevPointer = null;
    Pointer dimensionHostPointer = null;
    Pointer retPointer = null;
    int[] dimension = null;
    if (op.opNum() == 41 && op.extraArgs() != null) {
        // for IsMax along dimension we need special temporary buffer
        dimension = new int[(int) op.extraArgs()[0]];
        for (int i = 0; i < dimension.length; i++) {
            dimension[i] = (int) op.extraArgs()[i + 1];
        }
        for (int i = 0; i < dimension.length; i++) {
            if (dimension[i] < 0)
                dimension[i] += op.x().rank();
        }
        // do op along all dimensions
        if (dimension.length == op.x().rank())
            dimension = new int[] { Integer.MAX_VALUE };
        int[] retShape = Shape.wholeArrayDimension(dimension) ? new int[] { 1, 1 } : ArrayUtil.removeIndex(op.x().shape(), dimension);
        // ensure vector is proper shape
        if (retShape.length == 1) {
            if (dimension[0] == 0)
                retShape = new int[] { 1, retShape[0] };
            else
                retShape = new int[] { retShape[0], 1 };
        } else if (retShape.length == 0) {
            retShape = new int[] { 1, 1 };
        }
        ret = Nd4j.zeros(retShape);
        // FIXME: this may be a misleading use of this particular pointer
        hostYShapeInfo = allocator.getPointer(ret.shapeInfoDataBuffer(), context);
        // dimensionPointer = AtomicAllocator.getInstance().getPointer(Nd4j.createBuffer(dimension), context);
        DataBuffer dimensionBuffer = allocator.getConstantBuffer(dimension);
        dimensionDevPointer = allocator.getPointer(dimensionBuffer, context);
        dimensionHostPointer = allocator.getHostPointer(dimensionBuffer);
        retPointer = allocator.getPointer(ret, context);
    }
    Pointer hostTadShapeInfo = null;
    Pointer devTadShapeInfo = null;
    Pointer hostMaxTadShapeInfo = null;
    Pointer devMaxTadShapeInfo = null;
    Pair<DataBuffer, DataBuffer> tadBuffers;
    Pair<DataBuffer, DataBuffer> tadMaxBuffers;
    Pointer devTadOffsets = null;
    Pointer devMaxTadOffsets = null;
    if (op.opNum() >= 38 && op.opNum() <= 41) {
        if (op.opNum() != 41) {
            tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), new int[] { 0 });
            tadMaxBuffers = tadManager.getTADOnlyShapeInfo(op.x(), new int[] { 1 });
            hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
            devTadShapeInfo = allocator.getPointer(tadBuffers.getFirst(), context);
            hostMaxTadShapeInfo = AddressRetriever.retrieveHostPointer(tadMaxBuffers.getFirst());
            devMaxTadShapeInfo = allocator.getPointer(tadMaxBuffers.getFirst(), context);
            DataBuffer offsets = tadBuffers.getSecond();
            devTadOffsets = offsets == null ? null : allocator.getPointer(offsets, context);
            DataBuffer maxOffsets = tadMaxBuffers.getSecond();
            devMaxTadOffsets = maxOffsets == null ? null : allocator.getPointer(maxOffsets, context);
        } else {
            tadBuffers = tadManager.getTADOnlyShapeInfo(op.z(), dimension);
            hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
            devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
            DataBuffer offsets = tadBuffers.getSecond();
            devTadOffsets = offsets == null ? null : allocator.getPointer(offsets, context);
        }
    }
    Pointer z = allocator.getPointer(op.z(), context);
    Pointer zShapeInfo = allocator.getPointer(op.z().shapeInfoDataBuffer(), context);
    PointerPointer xShapeInfoHostPointer = extraz.get().put(
            AddressRetriever.retrieveHostPointer(op.x().shapeInfoDataBuffer()), // 0
            context.getOldStream(), // 1
            allocator.getDeviceIdPointer(), // 2
            context.getBufferAllocation(), // 3
            context.getBufferReduction(), // 4
            context.getBufferScalar(), // 5
            context.getBufferSpecial(), // 6
            hostYShapeInfo, // 7
            hostZShapeInfo, // 8
            hostTadShapeInfo, // 9
            devTadShapeInfo, // 10
            devTadOffsets, // 11
            hostMaxTadShapeInfo, // 12
            devMaxTadShapeInfo, // 13
            devMaxTadOffsets, // 14
            dimensionDevPointer, // special pointer for IsMax // 15
            dimensionHostPointer, // special pointer for IsMax // 16
            retPointer, // special pointer for IsMax // 17
            new CudaPointer(dimension == null ? 0 : dimension.length));
    if (op.y() != null) {
        Pointer y = allocator.getPointer(op.y(), context);
        Pointer yShapeInfo = allocator.getPointer(op.y().shapeInfoDataBuffer(), context);
        int xEWS = op.x().elementWiseStride();
        int yEWS = op.y().elementWiseStride();
        int zEWS = op.z().elementWiseStride();
        boolean xRow = op.x().isRowVector();
        boolean yRow = op.y().isRowVector();
        boolean zRow = op.z().isRowVector();
        if (op.x().data().dataType() == DataBuffer.Type.DOUBLE) {
            if ((xEWS >= 1 && yEWS >= 1 && zEWS >= 1 && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
                nativeOps.execPairwiseTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, xEWS, (DoublePointer) y, yEWS, (DoublePointer) z, zEWS, (DoublePointer) extraArgs, op.n());
            } else {
                nativeOps.execPairwiseTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, (IntPointer) xShapeInfo, (DoublePointer) y, (IntPointer) yShapeInfo, (DoublePointer) z, (IntPointer) zShapeInfo, (DoublePointer) extraArgs);
            }
        } else if (op.x().data().dataType() == DataBuffer.Type.FLOAT) {
            if ((xEWS >= 1 && yEWS >= 1 && xEWS == yEWS && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
                nativeOps.execPairwiseTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, xEWS, (FloatPointer) y, yEWS, (FloatPointer) z, zEWS, (FloatPointer) extraArgs, op.n());
            } else {
                nativeOps.execPairwiseTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, (IntPointer) xShapeInfo, (FloatPointer) y, (IntPointer) yShapeInfo, (FloatPointer) z, (IntPointer) zShapeInfo, (FloatPointer) extraArgs);
            }
        } else {
            if ((xEWS >= 1 && yEWS >= 1 && xEWS == op.y().elementWiseStride() && !op.isExecSpecial() && op.x().ordering() == op.y().ordering() && op.x().ordering() == op.z().ordering()) || (xEWS >= 1 && yEWS == xEWS && zEWS == xEWS && xRow && yRow && zRow)) {
                nativeOps.execPairwiseTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, xEWS, (ShortPointer) y, yEWS, (ShortPointer) z, zEWS, (ShortPointer) extraArgs, op.n());
            } else {
                nativeOps.execPairwiseTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, (IntPointer) xShapeInfo, (ShortPointer) y, (IntPointer) yShapeInfo, (ShortPointer) z, (IntPointer) zShapeInfo, (ShortPointer) extraArgs);
            }
        }
    } else {
        if (op.x().data().dataType() == DataBuffer.Type.DOUBLE) {
            if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
                nativeOps.execTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, op.x().elementWiseStride(), (DoublePointer) z, op.z().elementWiseStride(), (DoublePointer) extraArgs, op.n());
            } else {
                nativeOps.execTransformDouble(xShapeInfoHostPointer, op.opNum(), (DoublePointer) x, (IntPointer) xShapeInfo, (DoublePointer) z, (IntPointer) zShapeInfo, (DoublePointer) extraArgs);
            }
        } else if (op.x().data().dataType() == DataBuffer.Type.FLOAT) {
            if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
                nativeOps.execTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, op.x().elementWiseStride(), (FloatPointer) z, op.z().elementWiseStride(), (FloatPointer) extraArgs, op.n());
            } else {
                nativeOps.execTransformFloat(xShapeInfoHostPointer, op.opNum(), (FloatPointer) x, (IntPointer) xShapeInfo, (FloatPointer) z, (IntPointer) zShapeInfo, (FloatPointer) extraArgs);
            }
        } else {
            if (op.x().elementWiseStride() >= 1 && !op.isExecSpecial() && op.z().ordering() == op.x().ordering()) {
                nativeOps.execTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, op.x().elementWiseStride(), (ShortPointer) z, op.z().elementWiseStride(), (ShortPointer) extraArgs, op.n());
            } else {
                nativeOps.execTransformHalf(xShapeInfoHostPointer, op.opNum(), (ShortPointer) x, (IntPointer) xShapeInfo, (ShortPointer) z, (IntPointer) zShapeInfo, (ShortPointer) extraArgs);
            }
        }
    }
    AtomicAllocator.getInstance().registerAction(context, op.z(), op.x(), op.y());
    // touch these just to keep the references alive until the native op completes
    if (extraArgs != null)
        extraArgs.address();
    if (ret != null)
        ret.elementWiseStride();
    profilingHookOut(op, st);
    return null;
}
Also used : AtomicAllocator(org.nd4j.jita.allocator.impl.AtomicAllocator) CudaContext(org.nd4j.linalg.jcublas.context.CudaContext) CudaPointer(org.nd4j.jita.allocator.pointers.CudaPointer) PagedPointer(org.nd4j.linalg.api.memory.pointers.PagedPointer) AllocationPoint(org.nd4j.jita.allocator.impl.AllocationPoint) INDArray(org.nd4j.linalg.api.ndarray.INDArray) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) BaseDataBuffer(org.nd4j.linalg.api.buffer.BaseDataBuffer)
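
For reference, a minimal sketch of what reaches this invoke path, assuming a CUDA backend where the registered executioner is (or wraps) CudaExecutioner; Exp is a standard element-wise TransformOp. Executing the op prepares device pointers for x and z and dispatches to the matching execTransform* native call.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.impl.transforms.Exp;
import org.nd4j.linalg.factory.Nd4j;

public class TransformInvokeSketch {
    public static void main(String[] args) {
        INDArray x = Nd4j.linspace(0, 3, 4);
        INDArray z = Nd4j.createUninitialized(new int[] {1, 4});
        // element-wise exponential: z[i] = exp(x[i])
        Nd4j.getExecutioner().exec(new Exp(x, z));
        System.out.println(z); // [1.0, e, e^2, e^3]
    }
}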

Aggregations

AtomicAllocator (org.nd4j.jita.allocator.impl.AtomicAllocator) 14
AllocationPoint (org.nd4j.jita.allocator.impl.AllocationPoint) 12
INDArray (org.nd4j.linalg.api.ndarray.INDArray) 11
CudaContext (org.nd4j.linalg.jcublas.context.CudaContext) 10
CudaPointer (org.nd4j.jita.allocator.pointers.CudaPointer) 8
DataBuffer (org.nd4j.linalg.api.buffer.DataBuffer) 6
ND4JIllegalStateException (org.nd4j.linalg.exception.ND4JIllegalStateException) 5
CudaDoubleDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer) 5
GridExecutioner (org.nd4j.linalg.api.ops.executioner.GridExecutioner) 4
Test (org.junit.Test) 3
TADManager (org.nd4j.linalg.cache.TADManager) 3
CompressedDataBuffer (org.nd4j.linalg.compression.CompressedDataBuffer) 3
CudaIntDataBuffer (org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer) 3
ArrayList (java.util.ArrayList) 1
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean) 1
PointersPair (org.nd4j.jita.allocator.pointers.PointersPair) 1
BaseDataBuffer (org.nd4j.linalg.api.buffer.BaseDataBuffer) 1
PagedPointer (org.nd4j.linalg.api.memory.pointers.PagedPointer) 1
GridPointers (org.nd4j.linalg.api.ops.grid.GridPointers) 1
Pair (org.nd4j.linalg.primitives.Pair) 1