Search in sources :

Example 91 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class LibMatrixCUDA method dgeam.

/**
 * Performs sparse and dense dgeam given two input matrices:
 * C = alpha * op( A ) + beta * op( B )
 * where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
 *
 * If either input is in sparse format on the GPU, the other input is converted to sparse and the
 * cuSPARSE path is taken; otherwise the dense cuBLAS path is taken.
 *
 * On the sparse path the only supported transposed form is the pure transpose: pass the same
 * object as in1 and in2 and set isLeftTransposed == isRightTransposed == true. Any other use of
 * a transpose flag with sparse inputs raises a {@link DMLRuntimeException}.
 *
 * The output dimensions are derived from in1 and isLeftTransposed only.
 *
 * @param ec execution context
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in1 left input matrix
 * @param in2 right input matrix
 * @param outputName output variable name
 * @param isLeftTransposed true if left matrix is transposed
 * @param isRightTransposed true if right matrix is transposed
 * @param alpha alpha
 * @param beta beta
 * @throws DMLRuntimeException if gCtx is not the GPUContext registered with ec, or if a
 *         transposed sparse dgeam other than the pure transpose is requested
 */
private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
    }
    Pointer alphaPtr = dataTypePointerTo(alpha);
    Pointer betaPtr = dataTypePointerTo(beta);
    int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows();
    long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns();
    MatrixObject out = ec.getMatrixObject(outputName);
    boolean isSparse1 = isInSparseFormat(gCtx, in1);
    boolean isSparse2 = isInSparseFormat(gCtx, in2);
    long t0 = 0, t1 = 0;
    // TODO: Implement sparse-dense matrix cublasDgeam kernel
    if (isSparse1 || isSparse2) {
        // Narrow through toInt (the helper already used for the dense dimensions below) rather than a
        // raw (int) cast, so an oversized dimension is not silently truncated before reaching cuSPARSE.
        // NOTE(review): toInt is presumed to reject values that do not fit in an int - confirm.
        int m = toInt(in1.getNumRows());
        int n = toInt(in1.getNumColumns());
        // Perform sparse-sparse dgeam: bring both operands into CSR form first.
        // The format is re-queried (instead of reusing isSparse1/isSparse2) because converting in1
        // also converts in2 when both refer to the same object.
        if (!isInSparseFormat(gCtx, in1)) {
            if (DMLScript.FINEGRAINED_STATISTICS)
                t0 = System.nanoTime();
            in1.getGPUObject(gCtx).denseToSparse();
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        if (!isInSparseFormat(gCtx, in2)) {
            if (DMLScript.FINEGRAINED_STATISTICS)
                t0 = System.nanoTime();
            in2.getGPUObject(gCtx).denseToSparse();
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
        if (in1 == in2 && isLeftTransposed && isRightTransposed) {
            // Special case for transpose: the CSR -> CSC conversion of A is written into C's CSR arrays
            // (note C.colInd / C.rowPtr are passed in the CSC index positions).
            int nnz = toInt(A.nnz);
            CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
        } else {
            // TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a transpose
            if (isLeftTransposed || isRightTransposed) {
                throw new DMLRuntimeException("Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
            }
            if (DMLScript.FINEGRAINED_STATISTICS)
                t1 = System.nanoTime();
            CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            if (DMLScript.FINEGRAINED_STATISTICS)
                t0 = System.nanoTime();
            cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr, B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
        }
    } else {
        // Dense-Dense dgeam
        // NOTE(review): m/n and the leading dimensions look swapped relative to the logical shapes,
        // presumably because the buffers are row-major while cuBLAS is column-major - confirm.
        int lda = toInt(in1.getNumColumns());
        int ldb = toInt(in2.getNumColumns());
        int m = toInt(in1.getNumColumns());
        int n = toInt(in2.getNumRows());
        if (isLeftTransposed && isRightTransposed) {
            m = toInt(in1.getNumRows());
            n = toInt(in2.getNumColumns());
        } else if (isLeftTransposed) {
            m = toInt(in1.getNumRows());
        } else if (isRightTransposed) {
            n = toInt(in2.getNumColumns());
        }
        int ldc = m;
        Pointer A = getDensePointer(gCtx, in1, instName);
        Pointer B = getDensePointer(gCtx, in2, instName);
        // Allocate the dense output matrix
        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
        Pointer C = getDensePointer(gCtx, out, instName);
        if (DMLScript.FINEGRAINED_STATISTICS)
            t0 = System.nanoTime();
        cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
        if (DMLScript.FINEGRAINED_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 92 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class LibMatrixCuDNN method relu.

/**
 * Performs the relu operation on the GPU.
 *
 * The dense output is allocated with the same dimensions as the input. When the number of
 * cells (rows * columns) is at least maxNumElementsOfCuDNNTensor, the custom "relu" CUDA kernel
 * is launched; otherwise the operation is delegated to cudnnReLU using an N x 1 x 1 x CHW tensor
 * descriptor.
 *
 * @param ec currently active {@link ExecutionContext}
 * @param gCtx   a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in input matrix
 * @param outputName	name of the output matrix
 * @throws DMLRuntimeException if gCtx is not the GPUContext registered with ec
 */
public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    long N = in.getNumRows();
    long CHW = in.getNumColumns();
    MatrixObject output = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, N, CHW);
    long t0 = 0;
    if (N * CHW >= maxNumElementsOfCuDNNTensor) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
        }
        // Invokes relu(double* A,  double* ret, int rlen, int clen)
        if (DMLScript.FINEGRAINED_STATISTICS)
            t0 = System.nanoTime();
        Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName);
        // TODO: FIXME: Add sparse kernel support for relu
        Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName);
        getCudaKernels(gCtx).launchKernel("relu", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)), srcData, dstData, toInt(N), toInt(CHW));
        if (DMLScript.FINEGRAINED_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
    } else {
        cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
        cudnnCreateTensorDescriptor(tensorDescriptor);
        try {
            cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
            cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
        } finally {
            // Release the descriptor even if configuring it or running the operation fails,
            // so a failing call does not leave the native descriptor undestroyed.
            cudnnDestroyTensorDescriptor(tensorDescriptor);
        }
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) jcuda.jcudnn.cudnnTensorDescriptor(jcuda.jcudnn.cudnnTensorDescriptor) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 93 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class LibMatrixCuMatMult method matmult.

/**
 * Multiplies two matrices on the GPU: C = op(A) x op(B).
 *
 * The storage format (sparse or dense) of each operand on the GPU selects
 * the code path:
 * sparse x sparse produces a sparse output through cuSPARSE; every other
 * combination allocates a dense output and dispatches to the matching
 * dense-sparse, sparse-dense or dense-dense routine.
 *
 * The user is expected to call
 * ec.releaseMatrixOutputForGPUInstruction(outputName);
 *
 * @param ec
 *            Current {@link ExecutionContext} instance
 * @param gCtx
 *            a valid {@link GPUContext}
 * @param instName
 *            name of the invoking instruction to record{@link Statistics}.
 * @param left
 *            Matrix A
 * @param right
 *            Matrix B
 * @param outputName
 *            Name of the output matrix C (in code generated after LOP
 *            layer)
 * @param isLeftTransposed
 *            op for A, transposed or not
 * @param isRightTransposed
 *            op for B, transposed or not
 * @return output of matrix multiply
 */
public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed) {
    final boolean leftSparse = isInSparseFormat(gCtx, left);
    final boolean rightSparse = isInSparseFormat(gCtx, right);
    final MatrixObject result = ec.getMatrixObject(outputName);
    final long resultRows = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
    final long resultCols = isRightTransposed ? right.getNumRows() : right.getNumColumns();
    final CuMatMultParameters params = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(), right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed);

    if (leftSparse && rightSparse) {
        // sparse x sparse: the output stays sparse
        params.validate();
        final int opLeft = cusparseOp(isLeftTransposed);
        final int opRight = cusparseOp(isRightTransposed);
        // Allocate the (sparse) output object before touching the operands
        ec.allocateGPUMatrixObject(outputName, resultRows, resultCols);
        final CSRPointer csrLeft = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        final CSRPointer csrRight = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        // Size and allocate the CSR result, timed separately from the multiply itself
        final long allocStart = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
        final CSRPointer csrResult = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), csrLeft, opLeft, csrRight, opRight, params.m, params.n, params.k);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - allocStart);
        }
        // Run the cuSPARSE multiply
        final long multStart = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
        cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), opLeft, opRight, params.m, params.n, params.k, csrLeft.descr, (int) csrLeft.nnz, csrLeft.val, csrLeft.rowPtr, csrLeft.colInd, csrRight.descr, (int) csrRight.nnz, csrRight.val, csrRight.rowPtr, csrRight.colInd, csrResult.descr, csrResult.val, csrResult.rowPtr, csrResult.colInd);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - multStart);
        }
        result.getGPUObject(gCtx).setSparseMatrixCudaPointer(csrResult);
        return result;
    }

    // At least one operand is dense: every remaining case writes a dense output
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, resultRows, resultCols);
    if (rightSparse) {
        // dense x sparse
        final Pointer denseLeft = getDensePointer(gCtx, left, instName);
        final CSRPointer csrRight = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        final Pointer denseResult = getDensePointer(gCtx, result, instName);
        denseSparseMatMult(getCusparseHandle(gCtx), instName, denseResult, denseLeft, csrRight, params);
    } else if (leftSparse) {
        // sparse x dense
        final CSRPointer csrLeft = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        final Pointer denseRight = getDensePointer(gCtx, right, instName);
        final Pointer denseResult = getDensePointer(gCtx, result, instName);
        sparseDenseMatMult(gCtx, instName, denseResult, csrLeft, denseRight, left.getNumRows(), left.getNumColumns(), right.getNumRows(), right.getNumColumns(), resultRows, resultCols, isLeftTransposed, isRightTransposed);
    } else {
        // dense x dense
        final Pointer denseLeft = getDensePointer(gCtx, left, instName);
        final Pointer denseRight = getDensePointer(gCtx, right, instName);
        final Pointer denseResult = getDensePointer(gCtx, result, instName);
        denseDenseMatMult(getCublasHandle(gCtx), instName, denseResult, denseLeft, denseRight, params);
    }
    return result;
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer)

Example 94 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class DataPartitionMR method processPartitionInstructions.

/**
 * Executes every "partition" instruction found in the shuffle instruction string.
 *
 * For each such instruction the input and output matrix objects are looked up by the indices
 * encoded in the instruction, a partition size N is derived from
 * DistributedCacheInput.PARTITION_SIZE and the input's dimensions/block sizes, and the input is
 * partitioned through a DataPartitionerRemoteMR. The characteristics of the k-th partitioned
 * output are stored in sts[k], counting only partition instructions.
 *
 * @param shuffleInst delimited shuffle instruction string
 * @param inputMatrices input matrix objects, indexed by the instruction's input index
 * @param resultIndices result indices used to locate the output matrix object
 * @param outputMatrices output matrix objects
 * @param numReducers number of reducers passed to the partitioner
 * @param replication replication factor passed to the partitioner
 * @param sts receives the matrix characteristics of each partitioned output
 */
private static void processPartitionInstructions(String shuffleInst, MatrixObject[] inputMatrices, byte[] resultIndices, MatrixObject[] outputMatrices, int numReducers, int replication, MatrixCharacteristics[] sts) {
    int nextStat = 0;
    final String[] instructions = shuffleInst.split(Instruction.INSTRUCTION_DELIM);
    for (int k = 0; k < instructions.length; k++) {
        final String inst = instructions[k];
        if (!InstructionUtils.getOpCode(inst).equalsIgnoreCase("partition")) {
            continue;
        }
        final String[] parts = InstructionUtils.getInstructionParts(inst);
        final int inputIndex = Integer.parseInt(parts[1]);
        final int outputIndex = Integer.parseInt(parts[2]);
        final MatrixObject source = inputMatrices[inputIndex];
        final MatrixObject target = outputMatrices[findResultIndex(resultIndices, outputIndex)];
        final PDataPartitionFormat pformat = PDataPartitionFormat.valueOf(parts[3]);
        final long rlen = source.getNumRows();
        final long clen = source.getNumColumns();
        final long brlen = source.getNumRowsPerBlock();
        final long bclen = source.getNumColumnsPerBlock();

        // Pick the dimension that bounds one partition and the block length along the partitioned axis.
        final long crossExtent;
        final long blockLen;
        switch(pformat) {
            case ROW_BLOCK_WISE_N:
                crossExtent = clen;
                blockLen = brlen;
                break;
            case COLUMN_BLOCK_WISE_N:
                crossExtent = rlen;
                blockLen = bclen;
                break;
            default:
                throw new DMLRuntimeException("Unsupported partition format for distributed cache input: " + pformat);
        }
        // Number of whole blocks per partition (rounded up), then the partition size in rows/columns.
        final long numBlocks = (long) Math.ceil(((double) DistributedCacheInput.PARTITION_SIZE) / crossExtent / blockLen);
        final long partitionSize = numBlocks * blockLen;

        final PartitionFormat format = new PartitionFormat(pformat, (int) partitionSize);
        final DataPartitioner partitioner = new DataPartitionerRemoteMR(format, -1, numReducers, replication, false, true);
        final MatrixObject partitioned = partitioner.createPartitionedMatrixObject(source, target, true);
        sts[nextStat++] = partitioned.getMatrixCharacteristics();
    }
}
Also used : PDataPartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) DataPartitioner(org.apache.sysml.runtime.controlprogram.parfor.DataPartitioner) DataPartitionerRemoteMR(org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerRemoteMR) PartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PartitionFormat) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 95 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class MLContextTest method testOutputMatrixObjectDML.

/**
 * Runs a DML script that builds a 2x2 matrix, fetches it as a MatrixObject and checks that
 * its CSV string representation yields the two expected rows in order.
 */
@Test
public void testOutputMatrixObjectDML() {
    System.out.println("MLContextTest - output matrix object DML");
    final String script = "M = matrix('1 2 3 4', rows=2, cols=2);";
    final MatrixObject matrixM = ml.execute(dml(script).out("M")).getMatrixObject("M");
    final Iterator<String> csvRows = MLContextConversionUtil.matrixObjectToRDDStringCSV(matrixM).toLocalIterator();
    // Rows are expected in matrix order, one CSV line per row.
    final String[] expectedRows = { "1.0,2.0", "3.0,4.0" };
    for (String expectedRow : expectedRows) {
        Assert.assertEquals(expectedRow, csvRows.next());
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Test(org.junit.Test)

Aggregations

MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)201 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)74 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)45 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)39 Data (org.apache.sysml.runtime.instructions.cp.Data)37 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)26 Pointer (jcuda.Pointer)20 CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)20 IOException (java.io.IOException)17 ArrayList (java.util.ArrayList)16 ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject)14 OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo)13 CacheableData (org.apache.sysml.runtime.controlprogram.caching.CacheableData)12 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)12 Hop (org.apache.sysml.hops.Hop)11 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)11 ParForProgramBlock (org.apache.sysml.runtime.controlprogram.ParForProgramBlock)10 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)10 Path (org.apache.hadoop.fs.Path)9 LongWritable (org.apache.hadoop.io.LongWritable)9