use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
the class LibMatrixCUDA method dgeam.
/**
 * Performs sparse and dense dgeam given two input matrices:
 * C = alpha * op( A ) + beta * op( B )
 * where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
 * To indicate a pure transpose operation, pass in1 == in2 and isLeftTransposed == isRightTransposed == true.
 * <p>
 * If either input is reported as sparse, both inputs are processed through their sparse (CSR) pointers;
 * an input that is not sparse is first converted with denseToSparse(). In this sparse path only two
 * cases are handled: the pure transpose described above (alpha and beta are not applied there) and
 * the addition without any transposition. Every other combination raises a DMLRuntimeException.
 * <p>
 * The output dimensions are derived from in1 and isLeftTransposed only.
 *
 * @param ec execution context
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in1 left input matrix
 * @param in2 right input matrix
 * @param outputName output variable name
 * @param isLeftTransposed true if left matrix is transposed
 * @param isRightTransposed true if right matrix is transposed
 * @param alpha alpha
 * @param beta beta
 * @throws DMLRuntimeException if gCtx is not the GPU context registered with ec, or if a
 *         transposed sparse dgeam other than the pure transpose is requested
 */
private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
    }
    Pointer alphaPtr = dataTypePointerTo(alpha);
    Pointer betaPtr = dataTypePointerTo(beta);
    int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
    long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows();
    long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns();
    MatrixObject out = ec.getMatrixObject(outputName);
    boolean isSparse1 = isInSparseFormat(gCtx, in1);
    boolean isSparse2 = isInSparseFormat(gCtx, in2);
    long t0 = 0, t1 = 0;
    // TODO: Implement sparse-dense matrix cublasDgeam kernel
    if (isSparse1 || isSparse2) {
        // Narrow through toInt, exactly as the dense branch and the csrgeam call below do,
        // instead of raw (int) casts that would silently wrap for out-of-range values.
        // NOTE(review): toInt is assumed to reject values outside the int range - confirm.
        int m = toInt(in1.getNumRows());
        int n = toInt(in1.getNumColumns());
        // Perform sparse-sparse dgeam: bring both operands into sparse format first.
        // The format is re-queried here (rather than reusing isSparse1/isSparse2) as in the
        // original implementation.
        if (!isInSparseFormat(gCtx, in1)) {
            if (DMLScript.FINEGRAINED_STATISTICS)
                t0 = System.nanoTime();
            in1.getGPUObject(gCtx).denseToSparse();
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        if (!isInSparseFormat(gCtx, in2)) {
            if (DMLScript.FINEGRAINED_STATISTICS)
                t0 = System.nanoTime();
            in2.getGPUObject(gCtx).denseToSparse();
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
        }
        CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
        if (in1 == in2 && isLeftTransposed && isRightTransposed) {
            // Special case for transpose: alpha and beta are not used on this path.
            int nnz = toInt(A.nnz);
            CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            // C's colInd and rowPtr are passed in swapped order relative to A's rowPtr and colInd:
            // the CSC form of A is used as the CSR form of the transposed result.
            cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
        } else {
            // TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a transpose
            if (isLeftTransposed || isRightTransposed) {
                throw new DMLRuntimeException("Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
            }
            // Allocate the sparse output for A + B, timed separately from the addition itself.
            if (DMLScript.FINEGRAINED_STATISTICS)
                t1 = System.nanoTime();
            CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
            out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
            if (DMLScript.FINEGRAINED_STATISTICS)
                t0 = System.nanoTime();
            cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr, B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
        }
    } else {
        // Dense-Dense dgeam
        int lda = toInt(in1.getNumColumns());
        int ldb = toInt(in2.getNumColumns());
        // Defaults for the non-transposed case; overridden below per transposed operand.
        int m = toInt(in1.getNumColumns());
        int n = toInt(in2.getNumRows());
        if (isLeftTransposed && isRightTransposed) {
            m = toInt(in1.getNumRows());
            n = toInt(in2.getNumColumns());
        } else if (isLeftTransposed) {
            m = toInt(in1.getNumRows());
        } else if (isRightTransposed) {
            n = toInt(in2.getNumColumns());
        }
        int ldc = m;
        Pointer A = getDensePointer(gCtx, in1, instName);
        Pointer B = getDensePointer(gCtx, in2, instName);
        // Allocate the dense output matrix
        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
        Pointer C = getDensePointer(gCtx, out, instName);
        if (DMLScript.FINEGRAINED_STATISTICS)
            t0 = System.nanoTime();
        cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
        if (DMLScript.FINEGRAINED_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
    }
}
use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
the class LibMatrixCuDNN method relu.
/**
 * Performs the relu operation on the GPU.
 * <p>
 * If the number of cells (rows * columns) of the input is at least maxNumElementsOfCuDNNTensor,
 * the custom "relu" kernel is launched on the dense input; otherwise the input is described as
 * an N x 1 x 1 x CHW tensor and handed to cudnnReLU. The tensor descriptor created for the
 * cuDNN path is always destroyed, even when the cuDNN call fails.
 *
 * @param ec currently active {@link ExecutionContext}
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in input matrix
 * @param outputName name of the output matrix
 * @throws DMLRuntimeException if gCtx is not the GPU context registered with ec
 */
public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    long N = in.getNumRows();
    long CHW = in.getNumColumns();
    MatrixObject output = ec.getMatrixObject(outputName);
    // Allocate the dense output matrix with the same dimensions as the input
    getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, N, CHW);
    long t0 = 0;
    if (N * CHW >= maxNumElementsOfCuDNNTensor) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
        }
        // Invokes relu(double* A, double* ret, int rlen, int clen)
        if (DMLScript.FINEGRAINED_STATISTICS)
            t0 = System.nanoTime();
        Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName);
        // TODO: FIXME: Add sparse kernel support for relu
        Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName);
        getCudaKernels(gCtx).launchKernel("relu", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)), srcData, dstData, toInt(N), toInt(CHW));
        if (DMLScript.FINEGRAINED_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
    } else {
        cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
        cudnnCreateTensorDescriptor(tensorDescriptor);
        try {
            cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
            cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
        } finally {
            // Release the descriptor on every exit path so a failing cuDNN call does not leak it.
            cudnnDestroyTensorDescriptor(tensorDescriptor);
        }
    }
}
use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
the class LibMatrixCuMatMult method matmult.
/**
 * GPU matrix multiplication, C = op(A) x op(B).
 * <p>
 * The sparsity of both operands is inspected and the call is routed to one of four paths:
 * sparse-sparse (cuSPARSE csrgemm, sparse output), dense-sparse, sparse-dense and dense-dense
 * (the latter three produce a dense output).
 * <p>
 * The user is expected to call
 * ec.releaseMatrixOutputForGPUInstruction(outputName);
 *
 * @param ec
 *            Current {@link ExecutionContext} instance
 * @param gCtx
 *            a valid {@link GPUContext}
 * @param instName
 *            name of the invoking instruction to record {@link Statistics}.
 * @param left
 *            Matrix A
 * @param right
 *            Matrix B
 * @param outputName
 *            Name of the output matrix C (in code generated after LOP
 *            layer)
 * @param isLeftTransposed
 *            op for A, transposed or not
 * @param isRightTransposed
 *            op for B, transposed or not
 * @return the matrix object registered under outputName, holding the product
 */
public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed) {
    boolean leftSparse = isInSparseFormat(gCtx, left);
    boolean rightSparse = isInSparseFormat(gCtx, right);
    MatrixObject result = ec.getMatrixObject(outputName);

    // Result shape follows from the (possibly transposed) operand shapes.
    long resultRows = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
    long resultCols = isRightTransposed ? right.getNumRows() : right.getNumColumns();
    CuMatMultParameters mmParams = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(), right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed);

    if (leftSparse) {
        if (rightSparse) {
            // ---- sparse x sparse -> sparse ----
            mmParams.validate();
            int opLeft = cusparseOp(isLeftTransposed);
            int opRight = cusparseOp(isRightTransposed);

            // Register the output, then fetch the CSR handles of both operands.
            ec.allocateGPUMatrixObject(outputName, resultRows, resultCols);
            CSRPointer csrLeft = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
            CSRPointer csrRight = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();

            // Allocate the CSR structure of the product (timed as library allocation).
            long allocStart = 0;
            if (DMLScript.FINEGRAINED_STATISTICS)
                allocStart = System.nanoTime();
            CSRPointer csrOut = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), csrLeft, opLeft, csrRight, opRight, mmParams.m, mmParams.n, mmParams.k);
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - allocStart);

            // Run the multiplication itself.
            long gemmStart = 0;
            if (DMLScript.FINEGRAINED_STATISTICS)
                gemmStart = System.nanoTime();
            cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), opLeft, opRight, mmParams.m, mmParams.n, mmParams.k, csrLeft.descr, (int) csrLeft.nnz, csrLeft.val, csrLeft.rowPtr, csrLeft.colInd, csrRight.descr, (int) csrRight.nnz, csrRight.val, csrRight.rowPtr, csrRight.colInd, csrOut.descr, csrOut.val, csrOut.rowPtr, csrOut.colInd);
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - gemmStart);

            // Attach the computed CSR result to the output object.
            result.getGPUObject(gCtx).setSparseMatrixCudaPointer(csrOut);
        } else {
            // ---- sparse x dense -> dense ----
            getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, resultRows, resultCols);
            CSRPointer csrLeft = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
            Pointer denseRight = getDensePointer(gCtx, right, instName);
            Pointer denseOut = getDensePointer(gCtx, result, instName);
            sparseDenseMatMult(gCtx, instName, denseOut, csrLeft, denseRight, left.getNumRows(), left.getNumColumns(), right.getNumRows(), right.getNumColumns(), resultRows, resultCols, isLeftTransposed, isRightTransposed);
        }
    } else if (rightSparse) {
        // ---- dense x sparse -> dense ----
        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, resultRows, resultCols);
        Pointer denseLeft = getDensePointer(gCtx, left, instName);
        CSRPointer csrRight = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        Pointer denseOut = getDensePointer(gCtx, result, instName);
        denseSparseMatMult(getCusparseHandle(gCtx), instName, denseOut, denseLeft, csrRight, mmParams);
    } else {
        // ---- dense x dense -> dense ----
        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, resultRows, resultCols);
        Pointer denseLeft = getDensePointer(gCtx, left, instName);
        Pointer denseRight = getDensePointer(gCtx, right, instName);
        Pointer denseOut = getDensePointer(gCtx, result, instName);
        denseDenseMatMult(getCublasHandle(gCtx), instName, denseOut, denseLeft, denseRight, mmParams);
    }
    return result;
}
use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
the class DataPartitionMR method processPartitionInstructions.
/**
 * Executes every "partition" instruction contained in the given shuffle instruction string.
 * <p>
 * The string is split on Instruction.INSTRUCTION_DELIM; instructions whose opcode is not
 * "partition" (case-insensitive) are skipped. For a partition instruction, part 1 is the index
 * into inputMatrices, part 2 is the result index that is resolved via findResultIndex into
 * outputMatrices, and part 3 names the PDataPartitionFormat. Only ROW_BLOCK_WISE_N and
 * COLUMN_BLOCK_WISE_N are supported. The matrix characteristics of each partitioned output are
 * stored consecutively in sts, in the order the partition instructions are encountered.
 *
 * @param shuffleInst delimited instruction string
 * @param inputMatrices input matrices, addressed by the instruction's input index
 * @param resultIndices result indices used to locate the output matrix
 * @param outputMatrices output matrices
 * @param numReducers number of reducers handed to the remote partitioner
 * @param replication replication factor handed to the remote partitioner
 * @param sts receives the characteristics of each partitioned output
 * @throws DMLRuntimeException if an instruction requests an unsupported partition format
 */
private static void processPartitionInstructions(String shuffleInst, MatrixObject[] inputMatrices, byte[] resultIndices, MatrixObject[] outputMatrices, int numReducers, int replication, MatrixCharacteristics[] sts) {
    int nextStat = 0;
    String[] instructions = shuffleInst.split(Instruction.INSTRUCTION_DELIM);
    for (String inst : instructions) {
        // Only partition instructions are of interest here.
        if (!InstructionUtils.getOpCode(inst).equalsIgnoreCase("partition")) {
            continue;
        }

        String[] parts = InstructionUtils.getInstructionParts(inst);
        int inputIdx = Integer.parseInt(parts[1]);
        int outputIdx = Integer.parseInt(parts[2]);
        MatrixObject source = inputMatrices[inputIdx];
        MatrixObject target = outputMatrices[findResultIndex(resultIndices, outputIdx)];
        PDataPartitionFormat pformat = PDataPartitionFormat.valueOf(parts[3]);

        long rows = source.getNumRows();
        long cols = source.getNumColumns();
        long rowsPerBlock = source.getNumRowsPerBlock();
        long colsPerBlock = source.getNumColumnsPerBlock();

        // Size N of a partition along the partitioned dimension, a whole number of blocks.
        long partitionLen = -1;
        switch (pformat) {
            case ROW_BLOCK_WISE_N:
                partitionLen = rowsPerBlock * (long) Math.ceil(((double) DistributedCacheInput.PARTITION_SIZE) / cols / rowsPerBlock);
                break;
            case COLUMN_BLOCK_WISE_N:
                partitionLen = colsPerBlock * (long) Math.ceil(((double) DistributedCacheInput.PARTITION_SIZE) / rows / colsPerBlock);
                break;
            default:
                throw new DMLRuntimeException("Unsupported partition format for distributed cache input: " + pformat);
        }

        PartitionFormat format = new PartitionFormat(pformat, (int) partitionLen);
        DataPartitioner partitioner = new DataPartitionerRemoteMR(format, -1, numReducers, replication, false, true);
        target = partitioner.createPartitionedMatrixObject(source, target, true);
        sts[nextStat++] = target.getMatrixCharacteristics();
    }
}
use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
the class MLContextTest method testOutputMatrixObjectDML.
/**
 * Creates a 2x2 matrix in DML, retrieves it as a MatrixObject and checks its CSV rendering
 * row by row.
 */
@Test
public void testOutputMatrixObjectDML() {
    System.out.println("MLContextTest - output matrix object DML");
    // Execute the script and fetch the registered output "M".
    MatrixObject matrix = ml.execute(dml("M = matrix('1 2 3 4', rows=2, cols=2);").out("M")).getMatrixObject("M");
    // Convert to CSV lines and walk them in order.
    Iterator<String> csvRows = MLContextConversionUtil.matrixObjectToRDDStringCSV(matrix).toLocalIterator();
    Assert.assertEquals("1.0,2.0", csvRows.next());
    Assert.assertEquals("3.0,4.0", csvRows.next());
}
Aggregations