use of org.apache.sysml.runtime.instructions.gpu.context.GPUObject in project incubator-systemml by apache.
the class LibMatrixCUDA method solve.
/**
* Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
*
* @param ec a valid {@link ExecutionContext}
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name, for recording {@link Statistics}.
* @param in1 input matrix A
* @param in2 input matrix B
* @param outputName name of the output matrix
* @throws DMLRuntimeException if an error occurs
*/
public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
if (ec.getGPUContext() != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
// x = solve(A, b)
LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
long t0 = -1;
if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) {
// Both dense
GPUObject Aobj = in1.getGPUObject(gCtx);
GPUObject bobj = in2.getGPUObject(gCtx);
int m = (int) in1.getNumRows();
int n = (int) in1.getNumColumns();
if ((int) in2.getNumRows() != m)
throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
if ((int) in2.getNumColumns() != 1)
throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
// steps 1 and 2: clone the inputs and convert them from row-major to column-major,
// since cuSOLVER and cuBLAS expect column-major data and are destructive to their inputs
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
GPUObject ATobj = (GPUObject) Aobj.clone();
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
ATobj.denseRowMajorToColumnMajor();
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
Pointer A = ATobj.getJcudaDenseMatrixPtr();
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
GPUObject bTobj = (GPUObject) bobj.clone();
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
bTobj.denseRowMajorToColumnMajor();
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
Pointer b = bTobj.getJcudaDenseMatrixPtr();
// step 3: query working space of geqrf and ormqr
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
int[] lwork = { 0 };
JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
// step 4: compute QR factorization
Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
// geqrf writes min(m, n) Householder scalars into tau; max(m, n) over-allocates safely
Pointer tau = gCtx.allocate(instName, Math.max(m, n) * Sizeof.DOUBLE);
Pointer devInfo = gCtx.allocate(Sizeof.INT);
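// cuSOLVER devInfo convention: 0 on success, -i if the i-th argument was invalid (checked after each call below)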
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
int[] qrError = { -1 };
cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
if (qrError[0] != 0) {
throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
}
// step 5: compute Q^T*B
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
if (qrError[0] != 0) {
throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
}
// step 6: compute x = R \ Q^T*B
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
JCublas2.cublasDtrsm(gCtx.getCublasHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, n, 1, pointerTo(1.0), A, m, b, m);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS)
t0 = System.nanoTime();
bTobj.denseColumnMajorToRowMajor();
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
// TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
// There is an avoidable copy happening here
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
gCtx.cudaFreeHelper(instName, work);
gCtx.cudaFreeHelper(instName, tau);
ATobj.clearData();
bTobj.clearData();
//debugPrintMatrix(b, n, 1);
} else if (isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) {
// Both sparse
throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
} else if (!isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) {
// Pointer B = getDensePointer(gCtx, in2, instName);
throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
} else if (isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) {
// A is sparse, b is dense
throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
}
}
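To make the cuSOLVER pipeline above concrete, here is a minimal, self-contained sketch of the same geqrf -> ormqr -> trsm sequence on a tiny 3x2 least-squares system, using raw JCuda instead of SystemML's GPUObject plumbing. It assumes the jcuda, jcublas, and jcusolver libraries on the classpath and a CUDA-capable device; it is an illustration under those assumptions, not SystemML code.

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcusolver.JCusolver;
import jcuda.jcusolver.JCusolverDn;
import jcuda.jcusolver.cusolverDnHandle;
import static jcuda.jcublas.cublasDiagType.CUBLAS_DIAG_NON_UNIT;
import static jcuda.jcublas.cublasFillMode.CUBLAS_FILL_MODE_UPPER;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.jcublas.cublasSideMode.CUBLAS_SIDE_LEFT;
import static jcuda.runtime.JCuda.*;
import static jcuda.runtime.cudaMemcpyKind.*;

public class QrSolveSketch {
    public static void main(String[] args) {
        JCusolver.setExceptionsEnabled(true);
        JCublas2.setExceptionsEnabled(true);

        // Overdetermined system: A is 3x2 (column-major), b is 3x1; expected solution x ~= [8, -3]
        int m = 3, n = 2;
        double[] hostA = { 1, 1, 1,  1, 2, 3 };
        double[] hostB = { 6, 0, 0 };

        cusolverDnHandle solver = new cusolverDnHandle();
        JCusolverDn.cusolverDnCreate(solver);
        cublasHandle blas = new cublasHandle();
        JCublas2.cublasCreate(blas);

        Pointer dA = new Pointer(), dB = new Pointer(), dTau = new Pointer(), dInfo = new Pointer();
        cudaMalloc(dA, m * n * Sizeof.DOUBLE);
        cudaMalloc(dB, m * Sizeof.DOUBLE);
        cudaMalloc(dTau, Math.min(m, n) * Sizeof.DOUBLE); // geqrf writes min(m, n) Householder scalars
        cudaMalloc(dInfo, Sizeof.INT);
        cudaMemcpy(dA, Pointer.to(hostA), m * n * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
        cudaMemcpy(dB, Pointer.to(hostB), m * Sizeof.DOUBLE, cudaMemcpyHostToDevice);

        // query working space of geqrf (mirrors step 3 above)
        int[] lwork = { 0 };
        JCusolverDn.cusolverDnDgeqrf_bufferSize(solver, m, n, dA, m, lwork);
        Pointer dWork = new Pointer();
        cudaMalloc(dWork, lwork[0] * Sizeof.DOUBLE);

        // QR factorization, b := Q^T b, then x := R \ (Q^T b)  (mirrors steps 4-6 above)
        JCusolverDn.cusolverDnDgeqrf(solver, m, n, dA, m, dTau, dWork, lwork[0], dInfo);
        int[] info = { -1 };
        cudaMemcpy(Pointer.to(info), dInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
        if (info[0] != 0)
            throw new RuntimeException("geqrf failed, devInfo = " + info[0]);
        JCusolverDn.cusolverDnDormqr(solver, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
                dA, m, dTau, dB, m, dWork, lwork[0], dInfo);
        JCublas2.cublasDtrsm(blas, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                CUBLAS_DIAG_NON_UNIT, n, 1, Pointer.to(new double[] { 1.0 }), dA, m, dB, m);

        // x is the first n entries of b after the triangular solve
        double[] x = new double[n];
        cudaMemcpy(Pointer.to(x), dB, n * Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
        System.out.println("x = [" + x[0] + ", " + x[1] + "]");

        cudaFree(dA); cudaFree(dB); cudaFree(dTau); cudaFree(dWork); cudaFree(dInfo);
        JCusolverDn.cusolverDnDestroy(solver);
        JCublas2.cublasDestroy(blas);
    }
}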
use of org.apache.sysml.runtime.instructions.gpu.context.GPUObject in project incubator-systemml by apache.
the class CacheableData method exportData.
/**
* Synchronized because there might be parallel threads (parfor local) that
* access the same object (in case it was created before the loop).
* If all threads export the same data object concurrently it results in errors
* because they all write to the same file. Efficiency for loops and parallel threads
* is achieved by checking if the in-memory block is dirty.
*
* NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore
* the output format and most importantly would bypass reblocking during write (which affects the
* potential degree of parallelism). However, we copy files on HDFS if certain criteria are given.
*
* @param fName file name
* @param outputFormat format
* @param replication HDFS file replication factor
* @param formatProperties file format properties
* @throws CacheException if CacheException occurs
*/
public synchronized void exportData(String fName, String outputFormat, int replication, FileFormatProperties formatProperties) throws CacheException {
if (LOG.isTraceEnabled())
LOG.trace("Export data " + getVarName() + " " + fName);
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
//prevent concurrent modifications
if (!isAvailableToRead())
throw new CacheException("MatrixObject not available to read.");
LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat);
//TODO remove
boolean copiedFromGPU = false;
for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
GPUObject gObj = kv.getValue();
if (gObj != null && copiedFromGPU && gObj.isDirty()) {
LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
} else if (gObj != null) {
copiedFromGPU = gObj.acquireHostRead();
if (_data == null)
getCache();
}
}
//determine persistent write flag: pWrite <=> !fName.equals(_hdfsFileName)
boolean pWrite = false;
if (fName.equals(_hdfsFileName)) {
setHDFSFileExists(true);
pWrite = false;
} else {
// i.e., export is called from "write" instruction
pWrite = true;
}
//actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism))
//use dirty for skipping parallel exports
if (isDirty() || (pWrite && !isEqualOutputFormat(outputFormat))) {
// a) get the matrix
if (isEmpty(true)) {
//note: for large rdd outputs, we compile dedicated write SPInstructions (no need to handle this here)
try {
if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead())
_data = readBlobFromHDFS(_hdfsFileName);
else
_data = readBlobFromRDD(getRDDHandle(), new MutableBoolean());
setDirty(false);
} catch (IOException e) {
throw new CacheException("Reading of " + _hdfsFileName + " (" + getVarName() + ") failed.", e);
}
}
//get object from cache
if (_data == null)
getCache();
//incl. read matrix if evicted
acquire(false, _data == null);
// b) write the matrix
try {
writeMetaData(fName, outputFormat, formatProperties);
writeBlobToHDFS(fName, outputFormat, replication, formatProperties);
if (!pWrite)
setDirty(false);
} catch (Exception e) {
throw new CacheException("Export to " + fName + " failed.", e);
} finally {
release();
}
} else if (pWrite) {
//pwrite with same output format
//CASE 2: matrix already in same format but different file on hdfs (copy matrix to fname)
try {
MapReduceTool.deleteFileIfExistOnHDFS(fName);
MapReduceTool.deleteFileIfExistOnHDFS(fName + ".mtd");
if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead())
MapReduceTool.copyFileOnHDFS(_hdfsFileName, fName);
else
//write might trigger rdd operations and nnz maintenance
writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
writeMetaData(fName, outputFormat, formatProperties);
} catch (Exception e) {
throw new CacheException("Export to " + fName + " failed.", e);
}
} else if (getRDDHandle() != null && getRDDHandle().isPending() && !getRDDHandle().isHDFSFile() && !getRDDHandle().allowsShortCircuitRead()) {
//CASE 3: pending rdd operation (other than checkpoints)
try {
//write matrix or frame
writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
writeMetaData(fName, outputFormat, formatProperties);
//update rdd status
getRDDHandle().setPending(false);
} catch (Exception e) {
throw new CacheException("Export to " + fName + " failed.", e);
}
} else {
//CASE 4: data already in hdfs (do nothing, no need for export)
LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists.");
}
if (DMLScript.STATISTICS) {
long t1 = System.nanoTime();
CacheStatistics.incrementExportTime(t1 - t0);
}
}
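For orientation, a hedged sketch of the calling side: a persistent write (a DML write() statement) passes a file name different from the object's canonical HDFS file name, which makes pWrite true above. The variable name, path, and format below are illustrative, not taken from SystemML.

static void persistentWrite(ExecutionContext ec) throws DMLRuntimeException {
    MatrixObject mo = ec.getMatrixObject("_mVar3"); // hypothetical variable name
    // fName differs from mo's canonical HDFS file name, so pWrite == true inside exportData;
    // a dirty in-memory block takes the write path, otherwise the file is copied on HDFS (CASE 2)
    mo.exportData("hdfs:/user/out/result", "textcell", 1, null); // null: no extra format properties
}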
use of org.apache.sysml.runtime.instructions.gpu.context.GPUObject in project incubator-systemml by apache.
the class CacheableData method acquireRead.
// *********************************************
// *** ***
// *** HIGH-LEVEL METHODS THAT SPECIFY ***
// *** THE LOCKING AND CACHING INTERFACE ***
// *** ***
// *********************************************
/**
* Acquires a shared "read-only" lock, produces the reference to the cache block,
* restores the cache block to main memory, reads from HDFS if needed.
*
* Synchronized because there might be parallel threads (parfor local) that
* access the same object (in case it was created before the loop).
*
* In-Status: EMPTY, EVICTABLE, EVICTED, READ;
* Out-Status: READ(+1).
*
* @return cacheable data
* @throws CacheException if CacheException occurs
*/
public synchronized T acquireRead() throws CacheException {
if (LOG.isTraceEnabled())
LOG.trace("Acquire read " + getVarName());
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
if (!isAvailableToRead())
throw new CacheException("MatrixObject not available to read.");
//get object from cache
if (_data == null)
getCache();
//call acquireHostRead if gpuHandle is set as well as is allocated
boolean copiedFromGPU = false;
for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
GPUObject gObj = kv.getValue();
if (gObj != null && copiedFromGPU && gObj.isDirty()) {
LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
} else if (gObj != null) {
copiedFromGPU = gObj.acquireHostRead();
if (_data == null)
getCache();
}
}
//(probe data for cache_nowrite / jvm_reuse)
if (isEmpty(true) && _data == null) {
try {
if (DMLScript.STATISTICS)
CacheStatistics.incrementHDFSHits();
if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead()) {
//check filename
if (_hdfsFileName == null)
throw new CacheException("Cannot read matrix for empty filename.");
//read cacheable data from hdfs
_data = readBlobFromHDFS(_hdfsFileName);
//mark for initial local write despite read operation
_requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
} else {
//read matrix from rdd (incl execute pending rdd operations)
MutableBoolean writeStatus = new MutableBoolean();
_data = readBlobFromRDD(getRDDHandle(), writeStatus);
//mark for initial local write (prevent repeated execution of rdd operations)
_requiresLocalWrite = writeStatus.booleanValue() ? CACHING_WRITE_CACHE_ON_READ : true;
}
setDirty(false);
} catch (IOException e) {
throw new CacheException("Reading of " + _hdfsFileName + " (" + getVarName() + ") failed.", e);
}
_isAcquireFromEmpty = true;
} else if (DMLScript.STATISTICS) {
if (_data != null)
CacheStatistics.incrementMemHits();
}
//cache status maintenance
acquire(false, _data == null);
updateStatusPinned(true);
if (DMLScript.STATISTICS) {
long t1 = System.nanoTime();
CacheStatistics.incrementAcquireRTime(t1 - t0);
}
return _data;
}
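A hedged sketch of the lock discipline this method implies: every successful acquireRead() must be paired with a release(), otherwise the block stays pinned in READ state and cannot be evicted. The variable name is made up, and the snippet assumes it lives inside SystemML runtime code with the usual imports available.

static long readNnz(ExecutionContext ec) throws DMLRuntimeException {
    MatrixObject mo = ec.getMatrixObject("_mVar7"); // hypothetical variable name
    MatrixBlock mb = mo.acquireRead(); // In: EMPTY/EVICTABLE/EVICTED/READ, Out: READ(+1)
    try {
        return mb.getNonZeros(); // safe to use the block while it is pinned
    } finally {
        mo.release(); // READ(-1): the block becomes evictable again
    }
}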
use of org.apache.sysml.runtime.instructions.gpu.context.GPUObject in project incubator-systemml by apache.
the class ExecutionContext method allocateGPUMatrixObject.
/**
* Allocates the {@link GPUObject} for a given LOPS variable (e.g., _mVar3)
* @param varName variable name
* @return matrix object
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public MatrixObject allocateGPUMatrixObject(String varName) throws DMLRuntimeException {
MatrixObject mo = getMatrixObject(varName);
if (mo.getGPUObject(getGPUContext()) == null) {
GPUObject newGObj = getGPUContext().createGPUObject(mo);
// The lock is added here for an output block
// so that any block currently in use is not deallocated by eviction on the GPU
newGObj.addLock();
mo.setGPUObject(getGPUContext(), newGObj);
}
return mo;
}
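A hedged sketch of how a GPU instruction might use this method for its output; the variable name is illustrative and the zero-fill is just one possible initialization.

static void prepareGPUOutput(ExecutionContext ec) throws DMLRuntimeException {
    // attaches a new, locked GPUObject if none exists yet (the lock prevents eviction)
    MatrixObject out = ec.allocateGPUMatrixObject("_mVar9"); // hypothetical variable name
    GPUObject gOut = out.getGPUObject(ec.getGPUContext());
    gOut.allocateAndFillDense(0.0); // allocate a dense device buffer, zero-initialized
}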
use of org.apache.sysml.runtime.instructions.gpu.context.GPUObject in project incubator-systemml by apache.
the class LibMatrixCUDA method unaryOp.
/**
* A helper function for all unary ops (sqrt, abs, sin, etc.)
* @param ec valid execution context
* @param gCtx a valid {@link GPUContext}
* @param in1 input matrix
* @param kernel name of CUDA kernel for the unary op to execute
* @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block
* @param outputName output matrix name
* @param instName the invoking instruction's name, for recording {@link Statistics}.
* @param kernelTimer the name of the timer to measure the kernel invocation
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException {
if (ec.getGPUContext() != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
GPUObject in = in1.getGPUObject(gCtx);
boolean isSparseAndEmpty = in.isSparseAndEmpty();
long t1 = 0;
if (isSparseAndEmpty) {
MatrixObject out = ec.getMatrixObject(outputName);
ec.allocateGPUMatrixObject(outputName);
out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
} else {
// Dense
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
Pointer output = getDensePointer(gCtx, out, instName);
Pointer input = getDensePointer(gCtx, in1, instName);
int size = (int) (in1.getNumColumns() * in1.getNumRows());
if (GPUStatistics.DISPLAY_STATISTICS)
t1 = System.nanoTime();
getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1);
}
}
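For context, this mirrors how a concrete unary op such as sqrt is wired up in LibMatrixCUDA; the kernel name "matrix_sqrt" and the timer constant are assumed from the same codebase, and sqrt(0) = 0 motivates the 0.0 fill value for sparse-and-empty inputs.

public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
    unaryOp(ec, gCtx, in1, "matrix_sqrt", 0.0, outputName, instName, GPUInstruction.MISC_TIMER_SQRT_KERNEL);
}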