use of org.apache.sysml.runtime.DMLRuntimeException in project incubator-systemml by apache.
the class LibMatrixCUDA method denseDenseMatmult.
/**
* Dense-dense matrix multiply
* C = op(A) * op(B), A and B are dense matrices
* On the host, the matrices are in row-major format; cuBLAS expects them in column-major format.
* What we have as input is t(A) and t(B), t(X) = transpose of X.
* We do t(B) %*% t(A) to get t(C);
* If we were to calculate t(t(C), we would get the resultant matrix C, but this would be in column-major format.
* What we really want is t(C). This we already have as the result of t(B) %*% t(A).
* @param gCtx a valid {@link GPUContext}
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param output output allocated on GPU in column major format
* @param leftRows1 number of rows in A
* @param leftCols1 number of cols in A
* @param rightRows1 number of rows in B
* @param rightCols1 number of cols in B
* @param isLeftTransposed1 op for A, transposed or not
* @param isRightTransposed1 op for B, transposed or not
* @param leftPtr A allocated on the GPU in row-major format
* @param rightPtr B allocated on the GPU in row-major format
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException {
LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
Pointer A = rightPtr;
Pointer B = leftPtr;
// To compensate for the input matrices being in row-major format instead of column-major (the way cublas expects)
int leftRows = rightCols1;
int leftCols = rightRows1;
int rightRows = leftCols1;
int rightCols = leftRows1;
boolean isLeftTransposed = isRightTransposed1;
boolean isRightTransposed = isLeftTransposed1;
// Note: the dimensions are swapped
int m = isLeftTransposed ? leftCols : leftRows;
int n = isRightTransposed ? rightRows : rightCols;
int k = isLeftTransposed ? leftRows : leftCols;
int k1 = isRightTransposed ? rightCols : rightRows;
if (k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
if (m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
double[] one = { 1 };
double[] zero = { 0 };
//int lda = leftRows;
//int ldb = leftCols;
int lda = isLeftTransposed ? k : m;
int ldb = isRightTransposed ? n : k;
int ldc = m;
int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
long t0 = 0;
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
Pointer C = output;
if (m == 1 && n == 1) {
// Vector product
LOG.debug(" GPU Dense-dense Vector Product");
double[] result = { 0 };
JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(result));
// By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST.
// This means that scalar values passed are on host (as opposed to on device).
// The result is copied from the host back to the device so that the rest of
// infrastructure can treat it uniformly.
cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
} else if (m == 1) {
// Vector-matrix multiply
LOG.debug(" GPU Dense Vector-Matrix Multiply");
transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0);
} else if (n == 1) {
// Matrix-vector multiply
LOG.debug(" GPU Dense Matrix-Vector Multiply");
JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0);
} else {
LOG.debug(" GPU Dense-Dense Matrix Multiply ");
JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0);
}
}
use of org.apache.sysml.runtime.DMLRuntimeException in project incubator-systemml by apache.
the class LibMatrixCUDA method eitherSparseMatmult.
/**
* One of the matrices is sparse, the other dense
* C = op(A) x op(B)
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output allocated output object for C on host to which GPU output will be attached
* @param left Matrix A on host
* @param right Matrix B on host
* @param isLeftTransposed op for A, tranposed or not
* @param isRightTransposed op for B, transposed or not
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
int n = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
int k1 = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
if (k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
if (m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
if (left.getGPUObject(gCtx).isSparse()) {
// Left sparse, right dense
sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k);
} else {
// Left dense, right sparse
denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k);
}
}
use of org.apache.sysml.runtime.DMLRuntimeException in project incubator-systemml by apache.
the class LibMatrixCUDA method allocateNCHWDescriptors.
/**
* Convenient utility for batch normalization that returns a NCHW descriptor
* @param gCtx a valid {@link GPUContext}
* @param N number of images
* @param C number of channels
* @param CHW channels*height*width
* @param input input matrix objects
* @param output output matrix objects
* @return one of the NCHW descriptor
* @throws DMLRuntimeException if error occurs
*/
private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject[] input, MatrixObject[] output) throws DMLRuntimeException {
// Return any one
cudnnTensorDescriptor ret = null;
if (CHW > ((long) Integer.MAX_VALUE) * C) {
throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
}
cudnnTensorDescriptor knownNCHWdescriptor = null;
int H = -1;
int W = -1;
for (int i = 0; i < input.length; i++) {
knownNCHWdescriptor = input[i].getGPUObject(gCtx).getTensorDescriptor();
if (knownNCHWdescriptor != null) {
int[] shape = input[i].getGPUObject(gCtx).getTensorShape();
if (shape[0] != N || shape[1] != C) {
throw new DMLRuntimeException("Incorrect N and C:" + shape[0] + " != " + N + " || " + shape[1] + " != " + C);
}
H = shape[2];
W = shape[3];
break;
}
}
if (knownNCHWdescriptor != null) {
// We precisely know N, C, H, W
for (int i = 0; i < input.length; i++) {
ret = allocateTensorDescriptor(gCtx, input[i], N, C, H, W);
}
for (int i = 0; i < output.length; i++) {
ret = allocateTensorDescriptor(gCtx, output[i], N, C, H, W);
}
} else {
int HW = (int) (CHW / C);
// If not known
H = HW;
// If not known
W = 1;
double potentialH = Math.sqrt(HW);
if (potentialH == ((int) potentialH)) {
H = (int) potentialH;
W = H;
}
// We are not sure about H and W, hence don't allocate them.
ret = new cudnnTensorDescriptor();
cudnnCreateTensorDescriptor(ret);
cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
}
return ret;
}
use of org.apache.sysml.runtime.DMLRuntimeException in project incubator-systemml by apache.
the class ReaderBinaryBlockParallel method readBinaryBlockMatrixFromHDFS.
private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
//set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
try {
//create read tasks for all files
ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
ArrayList<ReadFileTask> tasks = new ArrayList<ReadFileTask>();
for (Path lpath : getSequenceFilePaths(fs, path)) {
ReadFileTask t = new ReadFileTask(lpath, job, fs, dest, rlen, clen, brlen, bclen);
tasks.add(t);
}
//wait until all tasks have been executed
List<Future<Object>> rt = pool.invokeAll(tasks);
//check for exceptions and aggregate nnz
long lnnz = 0;
for (Future<Object> task : rt) lnnz += (Long) task.get();
//post-processing
dest.setNonZeros(lnnz);
if (dest.isInSparseFormat() && clen > bclen)
sortSparseRowsParallel(dest, rlen, _numThreads, pool);
pool.shutdown();
} catch (Exception e) {
throw new IOException("Failed parallel read of binary block input.", e);
}
}
use of org.apache.sysml.runtime.DMLRuntimeException in project incubator-systemml by apache.
the class DMLProgram method createRuntimeProgramBlock.
public ProgramBlock createRuntimeProgramBlock(Program prog, StatementBlock sb, DMLConfig config) throws IOException, LopsException, DMLRuntimeException {
Dag<Lop> dag = null;
Dag<Lop> pred_dag = null;
ArrayList<Instruction> instruct;
ArrayList<Instruction> pred_instruct = null;
ProgramBlock retPB = null;
// process While Statement - add runtime program blocks to program
if (sb instanceof WhileStatementBlock) {
// create DAG for loop predicates
pred_dag = new Dag<Lop>();
((WhileStatementBlock) sb).get_predicateLops().addToDag(pred_dag);
// create instructions for loop predicates
pred_instruct = new ArrayList<Instruction>();
ArrayList<Instruction> pInst = pred_dag.getJobs(null, config);
for (Instruction i : pInst) {
pred_instruct.add(i);
}
// create while program block
WhileProgramBlock rtpb = new WhileProgramBlock(prog, pred_instruct);
if (rtpb.getPredicateResultVar() == null) {
// e.g case : WHILE(continue)
if (((WhileStatementBlock) sb).get_predicateLops().getExecLocation() == LopProperties.ExecLocation.Data) {
String resultVar = ((WhileStatementBlock) sb).get_predicateLops().getOutputParameters().getLabel();
rtpb.setPredicateResultVar(resultVar);
} else {
LOG.error(sb.printBlockErrorLocation() + "Error in translating the WHILE predicate.");
throw new LopsException(sb.printBlockErrorLocation() + "Error in translating the WHILE predicate.");
}
}
//// process the body of the while statement block ////
WhileStatementBlock wsb = (WhileStatementBlock) sb;
if (wsb.getNumStatements() > 1) {
LOG.error(wsb.printBlockErrorLocation() + "WhileStatementBlock should only have 1 statement");
throw new LopsException(wsb.printBlockErrorLocation() + "WhileStatementBlock should only have 1 statement");
}
WhileStatement wstmt = (WhileStatement) wsb.getStatement(0);
for (StatementBlock sblock : wstmt.getBody()) {
// process the body
ProgramBlock childBlock = createRuntimeProgramBlock(prog, sblock, config);
rtpb.addProgramBlock(childBlock);
}
// check there are actually Lops in to process (loop stmt body will not have any)
if (wsb.getLops() != null && !wsb.getLops().isEmpty()) {
LOG.error(wsb.printBlockErrorLocation() + "WhileStatementBlock should have no Lops");
throw new LopsException(wsb.printBlockErrorLocation() + "WhileStatementBlock should have no Lops");
}
retPB = rtpb;
//post processing for generating missing instructions
//retPB = verifyAndCorrectProgramBlock(sb.liveIn(), sb.liveOut(), sb._kill, retPB);
// add statement block
retPB.setStatementBlock(sb);
// add location information
retPB.setAllPositions(sb.getBeginLine(), sb.getBeginColumn(), sb.getEndLine(), sb.getEndColumn());
} else // process If Statement - add runtime program blocks to program
if (sb instanceof IfStatementBlock) {
// create DAG for loop predicates
pred_dag = new Dag<Lop>();
((IfStatementBlock) sb).get_predicateLops().addToDag(pred_dag);
// create instructions for loop predicates
pred_instruct = new ArrayList<Instruction>();
ArrayList<Instruction> pInst = pred_dag.getJobs(null, config);
for (Instruction i : pInst) {
pred_instruct.add(i);
}
// create if program block
IfProgramBlock rtpb = new IfProgramBlock(prog, pred_instruct);
if (rtpb.getPredicateResultVar() == null) {
// e.g case : If(continue)
if (((IfStatementBlock) sb).get_predicateLops().getExecLocation() == LopProperties.ExecLocation.Data) {
String resultVar = ((IfStatementBlock) sb).get_predicateLops().getOutputParameters().getLabel();
rtpb.setPredicateResultVar(resultVar);
} else {
LOG.error(sb.printBlockErrorLocation() + "Error in translating the IF predicate.");
throw new LopsException(sb.printBlockErrorLocation() + "Error in translating the IF predicate.");
}
}
// process the body of the if statement block
IfStatementBlock isb = (IfStatementBlock) sb;
if (isb.getNumStatements() > 1) {
LOG.error(isb.printBlockErrorLocation() + "IfStatementBlock should have only 1 statement");
throw new LopsException(isb.printBlockErrorLocation() + "IfStatementBlock should have only 1 statement");
}
IfStatement istmt = (IfStatement) isb.getStatement(0);
// process the if body
for (StatementBlock sblock : istmt.getIfBody()) {
ProgramBlock childBlock = createRuntimeProgramBlock(prog, sblock, config);
rtpb.addProgramBlockIfBody(childBlock);
}
// process the else body
for (StatementBlock sblock : istmt.getElseBody()) {
ProgramBlock childBlock = createRuntimeProgramBlock(prog, sblock, config);
rtpb.addProgramBlockElseBody(childBlock);
}
// check there are actually Lops in to process (loop stmt body will not have any)
if (isb.getLops() != null && !isb.getLops().isEmpty()) {
LOG.error(isb.printBlockErrorLocation() + "IfStatementBlock should have no Lops");
throw new LopsException(isb.printBlockErrorLocation() + "IfStatementBlock should have no Lops");
}
retPB = rtpb;
//post processing for generating missing instructions
//retPB = verifyAndCorrectProgramBlock(sb.liveIn(), sb.liveOut(), sb._kill, retPB);
// add statement block
retPB.setStatementBlock(sb);
// add location information
retPB.setAllPositions(sb.getBeginLine(), sb.getBeginColumn(), sb.getEndLine(), sb.getEndColumn());
} else // NOTE: applies to ForStatementBlock and ParForStatementBlock
if (sb instanceof ForStatementBlock) {
ForStatementBlock fsb = (ForStatementBlock) sb;
// create DAGs for loop predicates
Dag<Lop> fromDag = new Dag<Lop>();
Dag<Lop> toDag = new Dag<Lop>();
Dag<Lop> incrementDag = new Dag<Lop>();
if (fsb.getFromHops() != null)
fsb.getFromLops().addToDag(fromDag);
if (fsb.getToHops() != null)
fsb.getToLops().addToDag(toDag);
if (fsb.getIncrementHops() != null)
fsb.getIncrementLops().addToDag(incrementDag);
// create instructions for loop predicates
ArrayList<Instruction> fromInstructions = fromDag.getJobs(null, config);
ArrayList<Instruction> toInstructions = toDag.getJobs(null, config);
ArrayList<Instruction> incrementInstructions = incrementDag.getJobs(null, config);
// create for program block
String sbName = null;
ForProgramBlock rtpb = null;
IterablePredicate iterPred = fsb.getIterPredicate();
String[] iterPredData = IterablePredicate.createIterablePredicateVariables(iterPred.getIterVar().getName(), fsb.getFromLops(), fsb.getToLops(), fsb.getIncrementLops());
if (sb instanceof ParForStatementBlock) {
sbName = "ParForStatementBlock";
rtpb = new ParForProgramBlock(prog, iterPredData, iterPred.getParForParams());
ParForProgramBlock pfrtpb = (ParForProgramBlock) rtpb;
pfrtpb.setResultVariables(((ParForStatementBlock) sb).getResultVariables());
//used for optimization and creating unscoped variables
pfrtpb.setStatementBlock((ParForStatementBlock) sb);
} else //ForStatementBlock
{
sbName = "ForStatementBlock";
rtpb = new ForProgramBlock(prog, iterPredData);
}
rtpb.setFromInstructions(fromInstructions);
rtpb.setToInstructions(toInstructions);
rtpb.setIncrementInstructions(incrementInstructions);
rtpb.setIterablePredicateVars(iterPredData);
// process the body of the for statement block
if (fsb.getNumStatements() > 1) {
LOG.error(fsb.printBlockErrorLocation() + " " + sbName + " should have 1 statement");
throw new LopsException(fsb.printBlockErrorLocation() + " " + sbName + " should have 1 statement");
}
ForStatement fs = (ForStatement) fsb.getStatement(0);
for (StatementBlock sblock : fs.getBody()) {
ProgramBlock childBlock = createRuntimeProgramBlock(prog, sblock, config);
rtpb.addProgramBlock(childBlock);
}
// check there are actually Lops in to process (loop stmt body will not have any)
if (fsb.getLops() != null && !fsb.getLops().isEmpty()) {
LOG.error(fsb.printBlockErrorLocation() + sbName + " should have no Lops");
throw new LopsException(fsb.printBlockErrorLocation() + sbName + " should have no Lops");
}
retPB = rtpb;
//post processing for generating missing instructions
//retPB = verifyAndCorrectProgramBlock(sb.liveIn(), sb.liveOut(), sb._kill, retPB);
// add statement block
retPB.setStatementBlock(sb);
// add location information
retPB.setAllPositions(sb.getBeginLine(), sb.getBeginColumn(), sb.getEndLine(), sb.getEndColumn());
} else // process function statement block - add runtime program blocks to program
if (sb instanceof FunctionStatementBlock) {
FunctionStatementBlock fsb = (FunctionStatementBlock) sb;
if (fsb.getNumStatements() > 1) {
LOG.error(fsb.printBlockErrorLocation() + "FunctionStatementBlock should only have 1 statement");
throw new LopsException(fsb.printBlockErrorLocation() + "FunctionStatementBlock should only have 1 statement");
}
FunctionStatement fstmt = (FunctionStatement) fsb.getStatement(0);
FunctionProgramBlock rtpb = null;
if (fstmt instanceof ExternalFunctionStatement) {
// create external function program block
String execType = ((ExternalFunctionStatement) fstmt).getOtherParams().get(ExternalFunctionStatement.EXEC_TYPE);
boolean isCP = (execType.equals(ExternalFunctionStatement.IN_MEMORY)) ? true : false;
String scratchSpaceLoc = null;
try {
scratchSpaceLoc = config.getTextValue(DMLConfig.SCRATCH_SPACE);
} catch (Exception e) {
LOG.error(fsb.printBlockErrorLocation() + "could not retrieve parameter " + DMLConfig.SCRATCH_SPACE + " from DMLConfig");
}
StringBuilder buff = new StringBuilder();
buff.append(scratchSpaceLoc);
buff.append(Lop.FILE_SEPARATOR);
buff.append(Lop.PROCESS_PREFIX);
buff.append(DMLScript.getUUID());
buff.append(Lop.FILE_SEPARATOR);
buff.append(ProgramConverter.CP_ROOT_THREAD_ID);
buff.append(Lop.FILE_SEPARATOR);
buff.append("PackageSupport");
buff.append(Lop.FILE_SEPARATOR);
String basedir = buff.toString();
if (isCP) {
rtpb = new ExternalFunctionProgramBlockCP(prog, fstmt.getInputParams(), fstmt.getOutputParams(), ((ExternalFunctionStatement) fstmt).getOtherParams(), basedir);
} else {
rtpb = new ExternalFunctionProgramBlock(prog, fstmt.getInputParams(), fstmt.getOutputParams(), ((ExternalFunctionStatement) fstmt).getOtherParams(), basedir);
}
if (!fstmt.getBody().isEmpty()) {
LOG.error(fstmt.printErrorLocation() + "ExternalFunctionStatementBlock should have no statement blocks in body");
throw new LopsException(fstmt.printErrorLocation() + "ExternalFunctionStatementBlock should have no statement blocks in body");
}
} else {
// create function program block
rtpb = new FunctionProgramBlock(prog, fstmt.getInputParams(), fstmt.getOutputParams());
// process the function statement body
for (StatementBlock sblock : fstmt.getBody()) {
// process the body
ProgramBlock childBlock = createRuntimeProgramBlock(prog, sblock, config);
rtpb.addProgramBlock(childBlock);
}
}
// check there are actually Lops in to process (loop stmt body will not have any)
if (fsb.getLops() != null && !fsb.getLops().isEmpty()) {
LOG.error(fsb.printBlockErrorLocation() + "FunctionStatementBlock should have no Lops");
throw new LopsException(fsb.printBlockErrorLocation() + "FunctionStatementBlock should have no Lops");
}
retPB = rtpb;
// add location information
retPB.setAllPositions(sb.getBeginLine(), sb.getBeginColumn(), sb.getEndLine(), sb.getEndColumn());
} else {
// handle general case
ProgramBlock rtpb = new ProgramBlock(prog);
// DAGs for Lops
dag = new Dag<Lop>();
// check there are actually Lops in to process (loop stmt body will not have any)
if (sb.getLops() != null && !sb.getLops().isEmpty()) {
for (Lop l : sb.getLops()) {
l.addToDag(dag);
}
// Instructions for Lobs DAGs
instruct = dag.getJobs(sb, config);
rtpb.addInstructions(instruct);
}
/*// TODO: check with Doug
// add instruction for a function call
if (sb.getFunctionCallInst() != null){
rtpb.addInstruction(sb.getFunctionCallInst());
}*/
retPB = rtpb;
//post processing for generating missing instructions
//retPB = verifyAndCorrectProgramBlock(sb.liveIn(), sb.liveOut(), sb._kill, retPB);
// add statement block
retPB.setStatementBlock(sb);
// add location information
retPB.setAllPositions(sb.getBeginLine(), sb.getBeginColumn(), sb.getEndLine(), sb.getEndColumn());
}
return retPB;
}
Aggregations