Example 11 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method unaryAggregate.

//********************************************************************/
//***************** END OF MATRIX MULTIPLY Functions *****************/
//********************************************************************/
//********************************************************************/
//****************  UNARY AGGREGATE Functions ************************/
//********************************************************************/
/**
	 * Entry point to perform Unary aggregate operations on the GPU.
	 * The execution context object is used to allocate memory for the GPU.
	 * @param ec			Instance of {@link ExecutionContext}, from which the output variable will be allocated
	 * @param gCtx    a valid {@link GPUContext}
	 * @param instName name of the invoking instruction, to record {@link Statistics}.
	 * @param in1			input matrix
	 * @param output	output matrix/scalar name
	 * @param op			Instance of {@link AggregateUnaryOperator} which encapsulates the direction of reduction/aggregation and the reduction operation.
	 * @throws DMLRuntimeException if {@link DMLRuntimeException} occurs
	 */
public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
    final int REDUCTION_ALL = 1;
    final int REDUCTION_ROW = 2;
    final int REDUCTION_COL = 3;
    final int REDUCTION_DIAG = 4;
    // A Kahan sum implementation is not provided. If a "uak+" or other Kahan operator is encountered,
    // it falls back to a regular summation reduction.
    final int OP_PLUS = 1;
    final int OP_PLUS_SQ = 2;
    final int OP_MEAN = 3;
    final int OP_VARIANCE = 4;
    final int OP_MULTIPLY = 5;
    final int OP_MAX = 6;
    final int OP_MIN = 7;
    final int OP_MAXINDEX = 8;
    final int OP_MININDEX = 9;
    // Sanity Checks
    if (!in1.getGPUObject(gCtx).isAllocated())
        throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated());
    boolean isSparse = in1.getGPUObject(gCtx).isSparse();
    IndexFunction indexFn = op.indexFn;
    AggregateOperator aggOp = op.aggOp;
    // Convert Reduction direction to a number to pass to CUDA kernel
    int reductionDirection = -1;
    if (indexFn instanceof ReduceAll) {
        reductionDirection = REDUCTION_ALL;
    } else if (indexFn instanceof ReduceRow) {
        reductionDirection = REDUCTION_ROW;
    } else if (indexFn instanceof ReduceCol) {
        reductionDirection = REDUCTION_COL;
    } else if (indexFn instanceof ReduceDiag) {
        reductionDirection = REDUCTION_DIAG;
    } else {
        throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
    }
    assert reductionDirection != -1 : "Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction";
    // Convert function type to a number to pass to the CUDA Kernel
    int opIndex = -1;
    if (aggOp.increOp.fn instanceof KahanPlus) {
        opIndex = OP_PLUS;
    } else if (aggOp.increOp.fn instanceof KahanPlusSq) {
        opIndex = OP_PLUS_SQ;
    } else if (aggOp.increOp.fn instanceof Mean) {
        opIndex = OP_MEAN;
    } else if (aggOp.increOp.fn instanceof CM) {
        assert ((CM) aggOp.increOp.fn).getAggOpType() == CMOperator.AggregateOperationTypes.VARIANCE : "Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU";
        opIndex = OP_VARIANCE;
    } else if (aggOp.increOp.fn instanceof Plus) {
        opIndex = OP_PLUS;
    } else if (aggOp.increOp.fn instanceof Multiply) {
        opIndex = OP_MULTIPLY;
    } else if (aggOp.increOp.fn instanceof Builtin) {
        Builtin b = (Builtin) aggOp.increOp.fn;
        switch(b.bFunc) {
            case MAX:
                opIndex = OP_MAX;
                break;
            case MIN:
                opIndex = OP_MIN;
                break;
            case MAXINDEX:
                opIndex = OP_MAXINDEX;
                break;
            case MININDEX:
                opIndex = OP_MININDEX;
                break;
            default:
                throw new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
        }
    } else {
        throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
    }
    assert opIndex != -1 : "Internal Error - Incorrect type of operation set for aggregate unary GPU instruction";
    int rlen = (int) in1.getNumRows();
    int clen = (int) in1.getNumColumns();
    if (isSparse) {
        // The strategy for the time being is to convert sparse to dense
        // until a sparse specific kernel is written.
        in1.getGPUObject(gCtx).sparseToDense(instName);
    // long nnz = in1.getNnz();
    // assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU";
    // MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
    // throw new DMLRuntimeException("Internal Error - Not implemented");
    }
    Pointer out = null;
    if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
        // Matrix output
        MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output);
        out = getDensePointer(gCtx, out1, instName);
    }
    Pointer in = getDensePointer(gCtx, in1, instName);
    int size = rlen * clen;
    // For scalars, set the scalar output in the Execution Context object
    switch(opIndex) {
        case OP_PLUS:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
                            reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_DIAG:
                        throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
                }
                break;
            }
        case OP_PLUS_SQ:
            {
                // Calculate the squares in a temporary object tmp
                Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
                squareMatrix(gCtx, instName, in, tmp, rlen, clen);
                // Then do the sum on the temporary object and free it
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
                            reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
                }
                gCtx.cudaFreeHelper(instName, tmp);
                break;
            }
        case OP_MEAN:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
                            double mean = result / size;
                            ec.setScalarOutput(output, new DoubleObject(mean));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
                }
                break;
            }
        case OP_MULTIPLY:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
                }
                break;
            }
        case OP_MAX:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_max", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
                }
                break;
            }
        case OP_MIN:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_min", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
                }
                break;
            }
        case OP_VARIANCE:
            {
                // Temporary GPU arrays for the intermediate results of the variance computation
                Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
                Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE);
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
                            double mean = result / size;
                            // Subtract mean from every element in the matrix
                            ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
                            matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
                            squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                            double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
                            double variance = result2 / (size - 1);
                            ec.setScalarOutput(output, new DoubleObject(variance));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                            // Subtract the row-wise mean from every element in the matrix
                            BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                            matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
                            squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                            Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE);
                            reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
                            ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
                            matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
                            gCtx.cudaFreeHelper(instName, tmpRow);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                            // Subtract the column-wise mean from every element in the matrix
                            BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                            matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
                            squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                            Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE);
                            reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
                            ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
                            matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
                            gCtx.cudaFreeHelper(instName, tmpCol);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
                }
                gCtx.cudaFreeHelper(instName, tmp);
                gCtx.cudaFreeHelper(instName, tmp2);
                break;
            }
        case OP_MAXINDEX:
            {
                switch(reductionDirection) {
                    case REDUCTION_COL:
                        throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
                }
            // break;
            }
        case OP_MININDEX:
            {
                switch(reductionDirection) {
                    case REDUCTION_COL:
                        throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
                }
            // break;
            }
        default:
            throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
    }
}
Also used : ReduceCol(org.apache.sysml.runtime.functionobjects.ReduceCol) ScalarOperator(org.apache.sysml.runtime.matrix.operators.ScalarOperator) LeftScalarOperator(org.apache.sysml.runtime.matrix.operators.LeftScalarOperator) RightScalarOperator(org.apache.sysml.runtime.matrix.operators.RightScalarOperator) ReduceAll(org.apache.sysml.runtime.functionobjects.ReduceAll) Mean(org.apache.sysml.runtime.functionobjects.Mean) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ReduceDiag(org.apache.sysml.runtime.functionobjects.ReduceDiag) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) CM(org.apache.sysml.runtime.functionobjects.CM) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) RightScalarOperator(org.apache.sysml.runtime.matrix.operators.RightScalarOperator) ReduceRow(org.apache.sysml.runtime.functionobjects.ReduceRow) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IndexFunction(org.apache.sysml.runtime.functionobjects.IndexFunction) Multiply(org.apache.sysml.runtime.functionobjects.Multiply) AggregateOperator(org.apache.sysml.runtime.matrix.operators.AggregateOperator) KahanPlus(org.apache.sysml.runtime.functionobjects.KahanPlus) KahanPlusSq(org.apache.sysml.runtime.functionobjects.KahanPlusSq) KahanPlus(org.apache.sysml.runtime.functionobjects.KahanPlus) Plus(org.apache.sysml.runtime.functionobjects.Plus) BinaryOperator(org.apache.sysml.runtime.matrix.operators.BinaryOperator) Builtin(org.apache.sysml.runtime.functionobjects.Builtin)
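
To make the dispatch above easier to follow, here is a small standalone sketch of how the integer codes chosen in unaryAggregate map to kernel names for the common sum/mean/max cases. It is illustrative only and not part of LibMatrixCUDA; the constants and kernel names are copied from the method above, including the deliberately swapped row/column naming noted in its comments.

// Illustrative sketch only; not part of LibMatrixCUDA. Mirrors the opIndex/reductionDirection
// dispatch in unaryAggregate for OP_PLUS, OP_MEAN and OP_MAX.
static String kernelNameFor(int opIndex, int reductionDirection) {
    final String base;
    switch (opIndex) {
        case 1:  base = "sum";  break;   // OP_PLUS (also used for KahanPlus)
        case 3:  base = "mean"; break;   // OP_MEAN
        case 6:  base = "max";  break;   // OP_MAX
        default: throw new IllegalArgumentException("op not covered by this sketch: " + opIndex);
    }
    switch (reductionDirection) {
        case 1:  return "reduce_" + base;      // REDUCTION_ALL: full reduction to a single scalar
        case 3:  return "reduce_row_" + base;  // REDUCTION_COL: per the comment above, reduces all elements in a column (via reduceRow)
        case 2:  return "reduce_col_" + base;  // REDUCTION_ROW: the symmetric case (via reduceCol)
        default: throw new IllegalArgumentException("direction not covered by this sketch: " + reductionDirection);
    }
}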

Example 12 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method dgeam.

/**
	 * Performs sparse and dense dgeam given two input matrices:
	 * C = alpha * op(A) + beta * op(B)
	 * where op is either the identity or a transpose (specified by isLeftTransposed and isRightTransposed).
	 * To indicate a transpose operation, make sure in1 == in2 and isLeftTransposed == isRightTransposed == true.
	 * @param ec execution context
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
	 * @param in1 left input matrix
	 * @param in2 right input matrix
	 * @param outputName output variable name
	 * @param isLeftTransposed true if left matrix is transposed
	 * @param isRightTransposed true if right matrix is transposed
	 * @param alpha alpha
	 * @param beta beta
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
    Pointer alphaPtr = pointerTo(alpha);
    Pointer betaPtr = pointerTo(beta);
    int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    MatrixObject out = ec.getMatrixObject(outputName);
    boolean isSparse1 = isInSparseFormat(gCtx, in1);
    boolean isSparse2 = isInSparseFormat(gCtx, in2);
    long t0 = 0, t1 = 0;
    // TODO: Implement sparse-dense matrix cublasDgeam kernel
    if (isSparse1 || isSparse2) {
        int m = (int) in1.getNumRows();
        int n = (int) in1.getNumColumns();
        // Perform sparse-sparse dgeam
        if (!isInSparseFormat(gCtx, in1)) {
            if (GPUStatistics.DISPLAY_STATISTICS)
                t0 = System.nanoTime();
            in1.getGPUObject(gCtx).denseToSparse();
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        if (!isInSparseFormat(gCtx, in2)) {
            if (GPUStatistics.DISPLAY_STATISTICS)
                t0 = System.nanoTime();
            in2.getGPUObject(gCtx).denseToSparse();
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        ec.allocateGPUMatrixObject(outputName);
        if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
            // Special case for transpose
            int nnz = (int) A.nnz;
            CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
        } else {
            // TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a transpose
            if (isLeftTransposed || isRightTransposed) {
                throw new DMLRuntimeException("Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
            }
            if (GPUStatistics.DISPLAY_STATISTICS)
                t1 = System.nanoTime();
            CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            //long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
            if (GPUStatistics.DISPLAY_STATISTICS)
                t0 = System.nanoTime();
            JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, betaPtr, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
            //cudaDeviceSynchronize;
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
        }
    } else {
        // Dense-Dense dgeam
        int lda = (int) in1.getNumColumns();
        int ldb = (int) in2.getNumColumns();
        int m = (int) in1.getNumColumns();
        int n = (int) in2.getNumRows();
        if (isLeftTransposed && isRightTransposed) {
            m = (int) in1.getNumRows();
            n = (int) in2.getNumColumns();
        } else if (isLeftTransposed) {
            m = (int) in1.getNumRows();
        } else if (isRightTransposed) {
            n = (int) in2.getNumColumns();
        }
        int ldc = m;
        Pointer A = getDensePointer(gCtx, in1, instName);
        Pointer B = getDensePointer(gCtx, in2, instName);
        // Allocated the dense output matrix
        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
        Pointer C = getDensePointer(gCtx, out, instName);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t0 = System.nanoTime();
        JCublas2.cublasDgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
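
For reference, the following minimal CPU sketch shows what C = alpha * op(A) + beta * op(B) means for dense inputs. It is illustrative only (the method name and the row-major layout are assumptions made here); the cublasDgeam call above works on column-major data and cusparseDcsrgeam on CSR data.

// Minimal CPU reference for C = alpha * op(A) + beta * op(B); illustrative only, row-major layout assumed.
// The result C is rows x cols; when transA/transB is set, the corresponding input is read transposed.
static double[] geamReference(double[] a, double[] b, int rows, int cols,
        boolean transA, boolean transB, double alpha, double beta) {
    double[] c = new double[rows * cols];
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            double av = transA ? a[j * rows + i] : a[i * cols + j];
            double bv = transB ? b[j * rows + i] : b[i * cols + j];
            c[i * cols + j] = alpha * av + beta * bv;
        }
    }
    return c;
}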

Example 13 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method reluBackward.

/**
	 * This method computes the backpropagation errors for the previous layer of a relu operation.
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
	 * @param input input image
	 * @param dout  error propagated from the next layer
	 * @param outputBlock output
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
    LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
    long rows = input.getNumRows();
    long cols = input.getNumColumns();
    Pointer imagePointer = getDensePointer(gCtx, input, instName);
    Pointer doutPointer = getDensePointer(gCtx, dout, instName);
    Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
    long t1 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t1 = System.nanoTime();
    getCudaKernels(gCtx).launchKernel("relu_backward", ExecutionConfig.getConfigForSimpleMatrixOperations((int) rows, (int) cols), imagePointer, doutPointer, outputPointer, (int) rows, (int) cols);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer)
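
For context, the relu_backward kernel launched above computes the standard ReLU gradient: the error from the next layer is passed through wherever the forward input was positive and zeroed elsewhere. A minimal CPU sketch of that element-wise rule (illustrative only, assuming flat row-major arrays):

// CPU reference for the element-wise rule behind relu_backward; illustrative only.
// grad[i] = dout[i] if input[i] > 0, else 0 -- the derivative of max(0, x) applied to the upstream error.
static double[] reluBackwardReference(double[] input, double[] dout) {
    double[] grad = new double[input.length];
    for (int i = 0; i < input.length; i++) {
        grad[i] = input[i] > 0 ? dout[i] : 0.0;
    }
    return grad;
}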

Example 14 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method denseSparseMatmult.

/**
	 * C = op(A) * op(B) where A is dense and B is sparse
	 * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
	 * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
	 * @param gCtx   a valid {@link GPUContext}
	 * @param instName the invoking instruction's name, for recording {@link Statistics}.
	 * @param left {@link MatrixObject} of A
	 * @param right {@link MatrixObject} of B
	 * @param output {@link MatrixObject} of the output matrix C
	 * @param isLeftTransposed whether matrix A needs to be transposed
	 * @param isRightTransposed whether matrix B needs to be transposed
	 * @param m ?
	 * @param n ?
	 * @param k ?
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
    // right sparse, left dense
    CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
    Pointer ADense = getDensePointer(gCtx, left, instName);
    if (B.isUltraSparse(k, n)) {
        LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
        // Convert left to CSR and do cuSparse matmul
        int rowsA = (int) left.getNumRows();
        int colsA = (int) left.getNumColumns();
        long t0 = 0, t1 = 0, t2 = 0;
        if (DMLScript.STATISTICS)
            t0 = System.nanoTime();
        Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
        sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t2 = System.nanoTime();
        A.deallocate();
        gCtx.cudaFreeHelper(AT);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
    } else {
        LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
        // Convert right to dense and do a cuBlas matmul
        // BDenseTransposed is a column major matrix
        // Note the arguments to denseDenseMatmult to accommodate for this.
        long t0 = 0, t1 = 0;
        if (DMLScript.STATISTICS)
            t0 = System.nanoTime();
        Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int) right.getNumRows(), (int) right.getNumColumns());
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
        if (DMLScript.STATISTICS)
            GPUStatistics.cudaSparseToDenseCount.getAndAdd(1);
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // To allocate the dense matrix
        boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();
        if (GPUStatistics.DISPLAY_STATISTICS && allocated)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
        Pointer C = getDensePointer(gCtx, output, instName);
        denseDenseMatmult(gCtx, instName, C, (int) left.getNumRows(), (int) left.getNumColumns(), (int) right.getNumColumns(), (int) right.getNumRows(), isLeftTransposed, !isRightTransposed, ADense, BDenseTransposed);
        gCtx.cudaFreeHelper(instName, BDenseTransposed);
    }
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)
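
To make the dense-times-CSR path more concrete, here is a minimal CPU sketch of C = A * B with A dense (row-major) and B in CSR form. It is illustrative only: the Csr holder below is a hypothetical plain-Java stand-in whose val/rowPtr/colInd fields mirror the CSRPointer accessors used above, and no transposes are applied.

// Hypothetical CSR holder for illustration; val/rowPtr/colInd mirror the fields of CSRPointer above.
static class Csr {
    double[] val;   // non-zero values
    int[] rowPtr;   // length rows+1; rowPtr[r]..rowPtr[r+1] index the non-zeros of row r
    int[] colInd;   // column index of each non-zero
}

// CPU reference for C = A * B, with A dense row-major (m x k) and B sparse CSR (k x n); illustrative only.
static double[] denseTimesCsrReference(double[] a, int m, int k, Csr b, int n) {
    double[] c = new double[m * n];
    for (int i = 0; i < m; i++) {
        for (int p = 0; p < k; p++) {
            double av = a[i * k + p];
            if (av == 0)
                continue;
            // row p of B contributes av * B[p, j] to C[i, j] for each stored non-zero
            for (int idx = b.rowPtr[p]; idx < b.rowPtr[p + 1]; idx++)
                c[i * n + b.colInd[idx]] += av * b.val[idx];
        }
    }
    return c;
}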

Example 15 with Pointer

use of jcuda.Pointer in project incubator-systemml by apache.

the class LibMatrixCUDA method setOutputToConstant.

/**
	 * Fills an array on the GPU with a given scalar value.
	 * @param ec					currently active instance of the {@link ExecutionContext}
	 * @param gCtx        a valid {@link GPUContext}
	 * @param instName    name of the invoking instruction, to record {@link Statistics}.
	 * @param constant		scalar value with which to fill the matrix
	 * @param outputName	(internal) name of the matrix that is to be filled
	 * @throws DMLRuntimeException if error
	 */
private static void setOutputToConstant(ExecutionContext ec, GPUContext gCtx, String instName, double constant, String outputName) throws DMLRuntimeException {
    if (ec.getGPUContext() != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (constant == 0) {
    // TODO: Create sparse empty block instead
    }
    MatrixObject out = ec.getMatrixObject(outputName);
    // Allocated the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
    Pointer A = getDensePointer(gCtx, out, instName);
    int rlen = (int) out.getNumRows();
    int clen = (int) out.getNumColumns();
    long t0 = 0;
    if (GPUStatistics.DISPLAY_STATISTICS)
        t0 = System.nanoTime();
    int size = rlen * clen;
    getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size);
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0);
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
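
The fill kernel launched above simply writes the scalar into every one of the rlen * clen output elements. A trivial CPU equivalent (illustrative only):

// CPU equivalent of the "fill" kernel above; illustrative only.
static double[] fillReference(int rlen, int clen, double constant) {
    double[] out = new double[rlen * clen];
    java.util.Arrays.fill(out, constant);
    return out;
}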

Aggregations

Pointer (jcuda.Pointer) - 44
CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) - 33
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) - 26
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject) - 12
jcuda.jcudnn.cudnnTensorDescriptor (jcuda.jcudnn.cudnnTensorDescriptor) - 11
CudaException (jcuda.CudaException) - 6
jcuda.jcudnn.cudnnConvolutionDescriptor (jcuda.jcudnn.cudnnConvolutionDescriptor) - 3
jcuda.jcudnn.cudnnFilterDescriptor (jcuda.jcudnn.cudnnFilterDescriptor) - 3
HashMap (java.util.HashMap) - 2
jcuda.jcudnn.cudnnPoolingDescriptor (jcuda.jcudnn.cudnnPoolingDescriptor) - 2
GPUObject (org.apache.sysml.runtime.instructions.gpu.context.GPUObject) - 2
LeftScalarOperator (org.apache.sysml.runtime.matrix.operators.LeftScalarOperator) - 2
RightScalarOperator (org.apache.sysml.runtime.matrix.operators.RightScalarOperator) - 2
LinkedList (java.util.LinkedList) - 1
Map (java.util.Map) - 1
CUfunction (jcuda.driver.CUfunction) - 1
jcuda.jcudnn.cudnnActivationDescriptor (jcuda.jcudnn.cudnnActivationDescriptor) - 1
jcuda.jcusparse.cusparseMatDescr (jcuda.jcusparse.cusparseMatDescr) - 1
Builtin (org.apache.sysml.runtime.functionobjects.Builtin) - 1
CM (org.apache.sysml.runtime.functionobjects.CM) - 1