Use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project nd4j by deeplearning4j.
Class JCublasNDArrayFactory, method pullRows:
/**
 * This method produces a concatenated array consisting of the tensors fetched from the source
 * array along the given dimension at the specified indexes.
 *
 * @param source source array
 * @param sourceDimension dimension of the source array to pull tensors along
 * @param indexes indexes of the tensors to fetch from the source array
 * @param order ordering ('c' or 'f') of the resulting array
 * @return the concatenated result
 */
@Override
public INDArray pullRows(INDArray source, int sourceDimension, int[] indexes, char order) {
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
if (indexes == null || indexes.length < 1)
throw new IllegalStateException("Indexes can't be null or zero-length");
int[] shape = null;
if (sourceDimension == 1)
shape = new int[] { indexes.length, source.shape()[sourceDimension] };
else if (sourceDimension == 0)
shape = new int[] { source.shape()[sourceDimension], indexes.length };
else
throw new UnsupportedOperationException("pullRows requires a 2D source array (sourceDimension must be 0 or 1)");
INDArray ret = Nd4j.createUninitialized(shape, order);
AtomicAllocator allocator = AtomicAllocator.getInstance();
CudaContext context = allocator.getFlowController().prepareAction(ret, source);
Pointer x = AtomicAllocator.getInstance().getPointer(source, context);
Pointer xShape = AtomicAllocator.getInstance().getPointer(source.shapeInfoDataBuffer(), context);
Pointer z = AtomicAllocator.getInstance().getPointer(ret, context);
Pointer zShape = AtomicAllocator.getInstance().getPointer(ret.shapeInfoDataBuffer(), context);
PointerPointer extras = new PointerPointer(AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), context.getOldStream(), allocator.getDeviceIdPointer());
CudaIntDataBuffer tempIndexes = new CudaIntDataBuffer(indexes.length);
AtomicAllocator.getInstance().memcpyBlocking(tempIndexes, new IntPointer(indexes), indexes.length * 4, 0);
Pointer pIndex = AtomicAllocator.getInstance().getPointer(tempIndexes, context);
TADManager tadManager = Nd4j.getExecutioner().getTADManager();
Pair<DataBuffer, DataBuffer> tadBuffers = tadManager.getTADOnlyShapeInfo(source, new int[] { sourceDimension });
Pair<DataBuffer, DataBuffer> zTadBuffers = tadManager.getTADOnlyShapeInfo(ret, new int[] { sourceDimension });
Pointer tadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
Pointer zTadShapeInfo = AtomicAllocator.getInstance().getPointer(zTadBuffers.getFirst(), context);
DataBuffer offsets = tadBuffers.getSecond();
Pointer tadOffsets = AtomicAllocator.getInstance().getPointer(offsets, context);
Pointer zTadOffsets = AtomicAllocator.getInstance().getPointer(zTadBuffers.getSecond(), context);
if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.pullRowsDouble(extras, (DoublePointer) x, (IntPointer) xShape, (DoublePointer) z, (IntPointer) zShape, indexes.length, (IntPointer) pIndex, (IntPointer) tadShapeInfo, new LongPointerWrapper(tadOffsets), (IntPointer) zTadShapeInfo, new LongPointerWrapper(zTadOffsets));
} else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.pullRowsFloat(extras, (FloatPointer) x, (IntPointer) xShape, (FloatPointer) z, (IntPointer) zShape, indexes.length, (IntPointer) pIndex, (IntPointer) tadShapeInfo, new LongPointerWrapper(tadOffsets), (IntPointer) zTadShapeInfo, new LongPointerWrapper(zTadOffsets));
} else {
nativeOps.pullRowsHalf(extras, (ShortPointer) x, (IntPointer) xShape, (ShortPointer) z, (IntPointer) zShape, indexes.length, (IntPointer) pIndex, (IntPointer) tadShapeInfo, new LongPointerWrapper(tadOffsets), (IntPointer) zTadShapeInfo, new LongPointerWrapper(zTadOffsets));
}
allocator.registerAction(context, ret, source);
return ret;
}
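For context, this factory method is normally reached through the public Nd4j API rather than called directly; the GridExecutioner flush at the top simply drains any queued ops so the source buffer is current before the native pullRows kernel runs. A minimal usage sketch (shapes and values are illustrative):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class PullRowsExample {
    public static void main(String[] args) {
        // 3x4 source matrix holding 1..12
        INDArray source = Nd4j.linspace(1, 12, 12).reshape(3, 4);
        // fetch rows 2 and 0 (in that order) along dimension 1
        INDArray picked = Nd4j.pullRows(source, 1, new int[] { 2, 0 });
        // result is a 2x4 array: row 2 of the source followed by row 0
        System.out.println(picked);
    }
}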
Use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project nd4j by deeplearning4j.
Class JCublasNDArrayFactory, method specialConcat:
@Override
public INDArray specialConcat(int dimension, INDArray... toConcat) {
if (toConcat.length == 1)
return toConcat[0];
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
PointerPointer shapeInfoPointers = new PointerPointer(toConcat.length);
PointerPointer dataPointers = new PointerPointer(toConcat.length);
AtomicAllocator allocator = AtomicAllocator.getInstance();
CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();
int sumAlongDim = 0;
int[] outputShape = ArrayUtil.copy(toConcat[0].shape());
for (int i = 0; i < toConcat.length; i++) {
if (toConcat[i].isCompressed())
Nd4j.getCompressor().decompressi(toConcat[i]);
allocator.synchronizeHostData(toConcat[i]);
shapeInfoPointers.put(i, allocator.getHostPointer(toConcat[i].shapeInfoDataBuffer()));
dataPointers.put(i, allocator.getHostPointer(toConcat[i].data()));
sumAlongDim += toConcat[i].size(dimension);
for (int j = 0; j < toConcat[i].rank(); j++) if (j != dimension && toConcat[i].size(j) != outputShape[j]) {
throw new IllegalArgumentException("Illegal concatenation at array " + i + " and shape element " + j);
}
}
outputShape[dimension] = sumAlongDim;
PointerPointer dummy = new PointerPointer(new Pointer[] { null });
INDArray ret = Nd4j.createUninitialized(outputShape, Nd4j.order());
if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
nativeOps.specialConcatDouble(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (DoublePointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
} else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
nativeOps.specialConcatFloat(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (FloatPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
} else if (ret.data().dataType() == DataBuffer.Type.HALF) {
nativeOps.specialConcatHalf(dummy, dimension, toConcat.length, dataPointers, shapeInfoPointers, (ShortPointer) ret.data().addressPointer(), (IntPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[] { null }), new PointerPointer(new Pointer[] { null }));
} else {
throw new ND4JIllegalStateException("Unknown dataType: " + ret.data().dataType());
}
AllocationPoint point = allocator.getAllocationPoint(ret);
nativeOps.memcpyAsync(point.getDevicePointer(), point.getHostPointer(), ret.lengthLong() * Nd4j.sizeOfDataType(ret.data().dataType()), CudaConstants.cudaMemcpyHostToDevice, context.getSpecialStream());
context.getSpecialStream().synchronize();
point.tickHostRead();
point.tickDeviceWrite();
return ret;
}
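A minimal usage sketch of the corresponding public entry point (shapes are illustrative). As the method above shows, specialConcat assembles the result host-side from the decompressed, synchronized inputs and then pushes it to the device with a single async memcpy:

import java.util.Arrays;

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class SpecialConcatExample {
    public static void main(String[] args) {
        INDArray a = Nd4j.ones(2, 3);
        INDArray b = Nd4j.zeros(2, 3);
        // concatenate along dimension 0 -> 4x3 result
        INDArray stacked = Nd4j.specialConcat(0, a, b);
        System.out.println(Arrays.toString(stacked.shape())); // [4, 3]
    }
}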
Use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project nd4j by deeplearning4j.
Class JcublasLapack, method sgeqrf:
// =========================
// Q R DECOMP
@Override
public void sgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
INDArray a = A;
INDArray r = R;
if (Nd4j.dataType() != DataBuffer.Type.FLOAT)
log.warn("FLOAT geqrf called in DOUBLE environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (R != null && R.ordering() == 'c')
r = R.dup('f');
INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createFloat(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }).getFirst());
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new IllegalStateException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xTauPointer = new CublasPointer(tau, ctx);
// this output indicates how much memory we'll need for the actual operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
// we intentionally use a host pointer for the worksize output
int stat = cusolverDnSgeqrf_bufferSize(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, (IntPointer) worksizeBuffer.addressPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgeqrf_bufferSize failed", stat);
}
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual QR decomp
stat = cusolverDnSgeqrf(solverDn, M, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSgeqrf failed", stat);
}
allocator.registerAction(ctx, a);
// allocator.registerAction(ctx, tau);
allocator.registerAction(ctx, INFO);
if (INFO.getInt(0) != 0) {
throw new BlasException("cusolverDnSgeqrf failed on INFO", INFO.getInt(0));
}
// Copy R (the upper-triangular part of the factored A) into the result
if (r != null) {
r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
INDArrayIndex[] ix = new INDArrayIndex[2];
for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
ix[0] = NDArrayIndex.point(i);
ix[1] = NDArrayIndex.interval(0, i);
r.put(ix, 0);
}
}
stat = cusolverDnSorgqr_bufferSize(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
worksize = worksizeBuffer.getInt(0);
workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
stat = cusolverDnSorgqr(solverDn, M, N, N, (FloatPointer) xAPointer.getDevicePointer(), M, (FloatPointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asFloatPointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnSorgqr failed", stat);
}
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
if (a != A)
A.assign(a);
if (r != null && r != R)
R.assign(r);
log.info("A: {}", A);
if (R != null)
log.info("R: {}", R);
}
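The float and double QR routines are usually reached through the backend-agnostic Lapack facade, which picks the precision from the current data type and allocates INFO internally. A minimal sketch, assuming the Nd4j.getBlasWrapper().lapack().geqrf(A, R) entry point (shapes are illustrative):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class QrExample {
    public static void main(String[] args) {
        INDArray A = Nd4j.rand(4, 3);   // 4x3 input; overwritten with the thin Q factor
        INDArray A0 = A.dup();          // keep a copy to check the factorization
        INDArray R = Nd4j.create(3, 3); // receives the upper-triangular R factor
        // dispatches to sgeqrf (float) or dgeqrf (double) depending on Nd4j.dataType()
        Nd4j.getBlasWrapper().lapack().geqrf(A, R);
        System.out.println(A.mmul(R).sub(A0)); // should be close to the zero matrix
    }
}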
Use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project nd4j by deeplearning4j.
Class JcublasLapack, method dgeqrf:
@Override
public void dgeqrf(int M, int N, INDArray A, INDArray R, INDArray INFO) {
INDArray a = A;
INDArray r = R;
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("DOUBLE geqrf called in FLOAT environment");
if (A.ordering() == 'c')
a = A.dup('f');
if (R != null && R.ordering() == 'c')
r = R.dup('f');
INDArray tau = Nd4j.createArrayFromShapeBuffer(Nd4j.getDataBufferFactory().createDouble(N), Nd4j.getShapeInfoProvider().createShapeInformation(new int[] { 1, N }).getFirst());
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
CublasPointer xTauPointer = new CublasPointer(tau, ctx);
// this output indicates how much memory we'll need for the actual operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
// we intentionally use a host pointer for the worksize output
int stat = cusolverDnDgeqrf_bufferSize(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, (IntPointer) worksizeBuffer.addressPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDgeqrf_bufferSize failed", stat);
}
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
// Do the actual QR decomp
stat = cusolverDnDgeqrf(solverDn, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDgeqrf failed", stat);
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, tau);
allocator.registerAction(ctx, INFO);
if (INFO.getInt(0) != 0) {
throw new BlasException("cusolverDnDgeqrf failed with info", INFO.getInt(0));
}
// Copy R (the upper-triangular part of the factored A) into the result
if (r != null) {
r.assign(a.get(NDArrayIndex.interval(0, a.columns()), NDArrayIndex.all()));
INDArrayIndex[] ix = new INDArrayIndex[2];
for (int i = 1; i < Math.min(a.rows(), a.columns()); i++) {
ix[0] = NDArrayIndex.point(i);
ix[1] = NDArrayIndex.interval(0, i);
r.put(ix, 0);
}
}
stat = cusolverDnDorgqr_bufferSize(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), (IntPointer) worksizeBuffer.addressPointer());
worksize = worksizeBuffer.getInt(0);
workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
stat = cusolverDnDorgqr(solverDn, M, N, N, (DoublePointer) xAPointer.getDevicePointer(), M, (DoublePointer) xTauPointer.getDevicePointer(), new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDorgqr failed", stat);
}
}
allocator.registerAction(ctx, a);
allocator.registerAction(ctx, INFO);
if (a != A)
A.assign(a);
if (r != null && r != R)
R.assign(r);
log.info("A: {}", A);
if (R != null)
log.info("R: {}", R);
}
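The recurring idiom across all of these methods is the guard that drains any lazily queued grid ops before raw native calls touch the buffers. A minimal sketch of that guard pulled into a helper (the helper class and method names are illustrative, not part of nd4j):

import org.nd4j.linalg.api.ops.executioner.GridExecutioner;
import org.nd4j.linalg.factory.Nd4j;

public final class ExecutionerUtil {

    private ExecutionerUtil() {}

    // If the current executioner batches ops into a grid (GridExecutioner), flush its queue
    // so that all pending writes have reached the device buffers before native code runs.
    public static void flushIfGrid() {
        if (Nd4j.getExecutioner() instanceof GridExecutioner)
            ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }
}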
Use of org.nd4j.linalg.api.ops.executioner.GridExecutioner in project nd4j by deeplearning4j.
Class JcublasLapack, method dgesvd:
@Override
public void dgesvd(byte jobu, byte jobvt, int M, int N, INDArray A, INDArray S, INDArray U, INDArray VT, INDArray INFO) {
INDArray a = A;
INDArray u = U;
INDArray vt = VT;
// cuSolver's gesvd requires M >= N, so if M < N we transpose A and swap/adjust the outputs.
// Since A = U S V', the transpose rule gives A' = V S' U'.
boolean hadToTransposeA = false;
if (M < N) {
hadToTransposeA = true;
int tmp1 = N;
N = M;
M = tmp1;
a = A.transpose().dup('f');
u = VT.dup('f');
vt = U.dup('f');
} else {
// cuSolver requires column ('f') ordering, so duplicate the inputs into 'f' order if needed
if (A.ordering() == 'c')
a = A.dup('f');
if (U != null && U.ordering() == 'c')
u = U.dup('f');
if (VT != null && VT.ordering() == 'c')
vt = VT.dup('f');
}
if (Nd4j.dataType() != DataBuffer.Type.DOUBLE)
log.warn("DOUBLE gesvd called in FLOAT environment");
if (Nd4j.getExecutioner() instanceof GridExecutioner)
((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
// Get context for current thread
CudaContext ctx = (CudaContext) allocator.getDeviceContext().getContext();
// setup the solver handles for cuSolver calls
cusolverDnHandle_t handle = ctx.getSolverHandle();
cusolverDnContext solverDn = new cusolverDnContext(handle);
// synchronized on the solver
synchronized (handle) {
int result = cusolverDnSetStream(new cusolverDnContext(handle), new CUstream_st(ctx.getOldStream()));
if (result != 0)
throw new BlasException("solverSetStream failed");
// transfer the INDArray into GPU memory
CublasPointer xAPointer = new CublasPointer(a, ctx);
// this output indicates how much memory we'll need for the actual operation
DataBuffer worksizeBuffer = Nd4j.getDataBufferFactory().createInt(1);
// we intentionally use a host pointer for the worksize output
int stat = cusolverDnDgesvd_bufferSize(solverDn, M, N, (IntPointer) worksizeBuffer.addressPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDgesvd_bufferSize failed", stat);
}
int worksize = worksizeBuffer.getInt(0);
// Now allocate memory for the workspace, the non-converging row buffer and a return code
Pointer workspace = new Workspace(worksize * Nd4j.sizeOfDataType());
DataBuffer rwork = Nd4j.getDataBufferFactory().createDouble((M < N ? M : N) - 1);
// Do the actual decomp
stat = cusolverDnDgesvd(solverDn, jobu, jobvt, M, N, (DoublePointer) xAPointer.getDevicePointer(), M, new CudaPointer(allocator.getPointer(S, ctx)).asDoublePointer(), U == null ? null : new CudaPointer(allocator.getPointer(u, ctx)).asDoublePointer(), M, VT == null ? null : new CudaPointer(allocator.getPointer(vt, ctx)).asDoublePointer(), N, new CudaPointer(workspace).asDoublePointer(), worksize, new CudaPointer(allocator.getPointer(rwork, ctx)).asDoublePointer(), new CudaPointer(allocator.getPointer(INFO, ctx)).asIntPointer());
if (stat != CUSOLVER_STATUS_SUCCESS) {
throw new BlasException("cusolverDnDgesvd failed" + stat);
}
}
allocator.registerAction(ctx, INFO);
allocator.registerAction(ctx, S);
allocator.registerAction(ctx, a);
if (U != null)
allocator.registerAction(ctx, u);
if (VT != null)
allocator.registerAction(ctx, vt);
// if we transposed A then swap & transpose U & V'
if (hadToTransposeA) {
U.assign(vt.transpose());
VT.assign(u.transpose());
} else {
if (u != U)
U.assign(u);
if (vt != VT)
VT.assign(vt);
}
}
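As with QR, the SVD is normally invoked through the Lapack facade, which chooses the float or double path and builds the jobu/jobvt flags from which outputs are requested. A minimal sketch, assuming a Nd4j.getBlasWrapper().lapack().gesvd(A, S, U, VT) entry point (shapes are illustrative):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class SvdExample {
    public static void main(String[] args) {
        int m = 5, n = 3;
        INDArray A = Nd4j.rand(m, n);
        INDArray S = Nd4j.create(n);     // singular values
        INDArray U = Nd4j.create(m, m);  // left singular vectors
        INDArray VT = Nd4j.create(n, n); // right singular vectors, transposed
        // dispatches to sgesvd or dgesvd; passing null for U or VT skips that output
        Nd4j.getBlasWrapper().lapack().gesvd(A, S, U, VT);
        System.out.println(S);
    }
}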