use of org.apache.sysml.runtime.instructions.gpu.context.GPUContext in project incubator-systemml by apache.
the class GPUTests method clearGPUMemory.
/**
* Clear out the memory on all GPUs
*/
protected void clearGPUMemory() {
try {
int count = GPUContextPool.getDeviceCount();
int freeCount = GPUContextPool.getAvailableCount();
Assert.assertTrue("All GPUContexts have not been returned to the GPUContextPool", count == freeCount);
ArrayList<GPUContext> gpuContexts = new ArrayList<>();
for (int i = 0; i < count; i++) {
GPUContext gCtx = GPUContextPool.getFromPool();
gCtx.initializeThread();
gCtx.clearMemory();
gpuContexts.add(gCtx);
}
for (GPUContext gCtx : gpuContexts) {
GPUContextPool.returnToPool(gCtx);
}
} catch (DMLRuntimeException e) {
// Ignore
}
}
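For reference, the borrow/return contract that this helper (and the assertion above) relies on can be sketched as follows; the try/finally pattern is an illustrative recommendation, not code from the project, but the pool and context calls are the ones used above:

// Minimal sketch of the GPUContextPool contract assumed by clearGPUMemory():
// every context taken via getFromPool() must be bound to the current thread
// and eventually returned, or the count == freeCount assertion above fails.
GPUContext gCtx = GPUContextPool.getFromPool();
try {
    gCtx.initializeThread(); // bind this context to the calling thread
    // ... run GPU work against gCtx ...
} finally {
    GPUContextPool.returnToPool(gCtx); // always return, even on failure
}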
use of org.apache.sysml.runtime.instructions.gpu.context.GPUContext in project incubator-systemml by apache.
the class ParForProgramBlock method executeLocalParFor.
/**
* Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
* This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
* below for details of the realization.
*
* @param ec execution context
* @param itervar iteration variable of the parfor loop
* @param from lower bound of the iteration range
* @param to upper bound of the iteration range
* @param incr increment of the iteration variable
* @throws DMLRuntimeException if DMLRuntimeException occurs
* @throws InterruptedException if InterruptedException occurs
*/
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, InterruptedException {
LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
/* Step 1) init parallel workers, task queue and threads
* start threads (from now on waiting for tasks)
* Step 2) create tasks
* put tasks into queue
* mark end of task input stream
* Step 3) join all threads (wait for finished work)
* Step 4) collect results from each parallel worker
*/
Timing time = new Timing(true);
int numExecutedTasks = 0;
int numExecutedIterations = 0;
//restrict recompilation to thread local memory
setMemoryBudget();
//enable runtime piggybacking if required
if (_enableRuntimePiggybacking)
RuntimePiggybacking.start(_numThreads); //default piggybacking worker
try {
// Step 1) init parallel workers, task queue and threads
LocalTaskQueue<Task> queue = new LocalTaskQueue<Task>();
Thread[] threads = new Thread[_numThreads];
LocalParWorker[] workers = new LocalParWorker[_numThreads];
for (int i = 0; i < _numThreads; i++) {
//create parallel workers as (lazy) deep copies
//including preparation of update-in-place variables
workers[i] = createParallelWorker(_pwIDs[i], queue, ec);
threads[i] = new Thread(workers[i]);
threads[i].setPriority(Thread.MAX_PRIORITY);
}
// start threads (from now on waiting for tasks)
for (Thread thread : threads) thread.start();
//maintain statistics
long tinit = (long) time.stop();
if (DMLScript.STATISTICS)
Statistics.incrementParForInitTime(tinit);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
// Step 2) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
long numIterations = partitioner.getNumIterations();
long numCreatedTasks = -1;
if (USE_STREAMING_TASK_CREATION) {
//put tasks into queue (parworker start work on first tasks while creating tasks)
numCreatedTasks = partitioner.createTasks(queue);
} else {
List<Task> tasks = partitioner.createTasks();
numCreatedTasks = tasks.size();
// put tasks into queue
for (Task t : tasks) queue.enqueueTask(t);
// mark end of task input stream
queue.closeInput();
}
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
// Step 3) join all threads (wait for finished work)
for (Thread thread : threads) thread.join();
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 4) collect results from each parallel worker
//obtain results
LocalVariableMap[] localVariables = new LocalVariableMap[_numThreads];
for (int i = 0; i < _numThreads; i++) {
localVariables[i] = workers[i].getVariables();
numExecutedTasks += workers[i].getExecutedTasks();
numExecutedIterations += workers[i].getExecutedIterations();
}
//consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, localVariables);
// Step 5) cleanup local parworkers (e.g., remove created functions)
for (int i = 0; i < _numThreads; i++) {
Collection<String> fnNames = workers[i].getFunctionNames();
if (fnNames != null && !fnNames.isEmpty())
for (String fn : fnNames) {
String[] parts = DMLProgram.splitFunctionKey(fn);
_prog.removeFunctionProgramBlock(parts[0], parts[1]);
}
}
// return the GPUContexts used by the parworkers to the pool and re-acquire one for the main thread
if (DMLScript.USE_ACCELERATOR) {
for (int i = 0; i < _numThreads; i++) {
GPUContext gCtx = workers[i].getExecutionContext().getGPUContext();
GPUContextPool.returnToPool(gCtx);
}
ec.setGPUContext(GPUContextPool.getFromPool());
ec.getGPUContext().initializeThread();
}
} finally {
//remove thread-local memory budget (reset to original budget)
//(in finally to prevent error side effects for multiple scripts in one jvm)
resetMemoryBudget();
//disable runtime piggybacking
if (_enableRuntimePiggybacking)
RuntimePiggybacking.stop();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
}
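The GPU handoff at the end of the method mirrors a per-worker setup that can be sketched as follows; the helper name prepareWorkerGPUContext is hypothetical, but the pool and ExecutionContext calls are the ones used in executeLocalParFor above:

// Hypothetical helper illustrating the per-worker GPUContext wiring that the
// cleanup code in executeLocalParFor undoes: each parworker borrows its own
// context from the pool and binds it to its ExecutionContext.
private static void prepareWorkerGPUContext(ExecutionContext workerEc) throws DMLRuntimeException {
    GPUContext gCtx = GPUContextPool.getFromPool(); // one context per worker
    workerEc.setGPUContext(gCtx);
    // note: initializeThread() must run on the worker's own thread, so the
    // worker would typically call workerEc.getGPUContext().initializeThread()
    // at the start of its run() method.
}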
use of org.apache.sysml.runtime.instructions.gpu.context.GPUContext in project incubator-systemml by apache.
the class CacheableData method exportData.
/**
* Synchronized because there might be parallel threads (parfor local) that
* access the same object (in case it was created before the loop).
* If all threads export the same data object concurrently, it results in errors
* because they all write to the same file. Efficiency for loops and parallel threads
* is achieved by checking if the in-memory block is dirty.
*
* NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore
* the output format and, most importantly, would bypass reblocking during write (which affects the
* potential degree of parallelism). However, we copy files on HDFS if certain criteria are met.
*
* @param fName file name
* @param outputFormat format
* @param replication HDFS replication factor
* @param formatProperties file format properties
* @throws CacheException if CacheException occurs
*/
public synchronized void exportData(String fName, String outputFormat, int replication, FileFormatProperties formatProperties) throws CacheException {
if (LOG.isTraceEnabled())
LOG.trace("Export data " + getVarName() + " " + fName);
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
//prevent concurrent modifications
if (!isAvailableToRead())
throw new CacheException("MatrixObject not available to read.");
LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat);
//TODO remove
boolean copiedFromGPU = false;
for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
GPUObject gObj = kv.getValue();
if (gObj != null && copiedFromGPU && gObj.isDirty()) {
LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
} else if (gObj != null) {
copiedFromGPU = gObj.acquireHostRead();
if (_data == null)
getCache();
}
}
boolean pWrite = false; // !fName.equals(_hdfsFileName); //persistent write flag
if (fName.equals(_hdfsFileName)) {
setHDFSFileExists(true);
pWrite = false;
} else {
// i.e., export is called from "write" instruction
pWrite = true;
}
//actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism))
if (//use dirty for skipping parallel exports
isDirty() || (pWrite && !isEqualOutputFormat(outputFormat))) {
// a) get the matrix
if (isEmpty(true)) {
//note: for large rdd outputs, we compile dedicated write spark instructions (no need to handle this here)
try {
if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead())
_data = readBlobFromHDFS(_hdfsFileName);
else
_data = readBlobFromRDD(getRDDHandle(), new MutableBoolean());
setDirty(false);
} catch (IOException e) {
throw new CacheException("Reading of " + _hdfsFileName + " (" + getVarName() + ") failed.", e);
}
}
//get object from cache
if (_data == null)
getCache();
//incl. read matrix if evicted
acquire(false, _data == null);
// b) write the matrix
try {
writeMetaData(fName, outputFormat, formatProperties);
writeBlobToHDFS(fName, outputFormat, replication, formatProperties);
if (!pWrite)
setDirty(false);
} catch (Exception e) {
throw new CacheException("Export to " + fName + " failed.", e);
} finally {
release();
}
} else if (pWrite) { // pwrite with same output format
//CASE 2: matrix already in same format but different file on hdfs (copy matrix to fname)
try {
MapReduceTool.deleteFileIfExistOnHDFS(fName);
MapReduceTool.deleteFileIfExistOnHDFS(fName + ".mtd");
if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead())
MapReduceTool.copyFileOnHDFS(_hdfsFileName, fName);
else
//write might trigger rdd operations and nnz maintenance
writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
writeMetaData(fName, outputFormat, formatProperties);
} catch (Exception e) {
throw new CacheException("Export to " + fName + " failed.", e);
}
} else if (getRDDHandle() != null && getRDDHandle().isPending() && !getRDDHandle().isHDFSFile() && !getRDDHandle().allowsShortCircuitRead()) {
//CASE 3: pending rdd operation (other than checkpoints)
try {
//write matrix or frame
writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
writeMetaData(fName, outputFormat, formatProperties);
//update rdd status
getRDDHandle().setPending(false);
} catch (Exception e) {
throw new CacheException("Export to " + fName + " failed.", e);
}
} else {
//CASE 4: data already in hdfs (do nothing, no need for export)
LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists.");
}
if (DMLScript.STATISTICS) {
long t1 = System.nanoTime();
CacheStatistics.incrementExportTime(t1 - t0);
}
}
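A hedged usage sketch of this method; the file name, output format, and replication factor below are illustrative values, not taken from the project:

// Assuming mo is a MatrixObject bound in the symbol table: export it to a new
// HDFS file ("write" instruction semantics, i.e., pWrite == true above). If
// fName equaled the object's own HDFS file and the in-memory block were not
// dirty, the call would be a no-op (CASE 4).
mo.exportData("hdfs:/user/out/result.csv", "csv", 1, null);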
use of org.apache.sysml.runtime.instructions.gpu.context.GPUContext in project incubator-systemml by apache.
the class CacheableData method acquireRead.
// *********************************************
// *** ***
// *** HIGH-LEVEL METHODS THAT SPECIFY ***
// *** THE LOCKING AND CACHING INTERFACE ***
// *** ***
// *********************************************
/**
* Acquires a shared "read-only" lock, produces the reference to the cache block,
* restores the cache block to main memory, reads from HDFS if needed.
*
* Synchronized because there might be parallel threads (parfor local) that
* access the same object (in case it was created before the loop).
*
* In-Status: EMPTY, EVICTABLE, EVICTED, READ;
* Out-Status: READ(+1).
*
* @return cacheable data
* @throws CacheException if CacheException occurs
*/
public synchronized T acquireRead() throws CacheException {
if (LOG.isTraceEnabled())
LOG.trace("Acquire read " + getVarName());
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
if (!isAvailableToRead())
throw new CacheException("MatrixObject not available to read.");
//get object from cache
if (_data == null)
getCache();
//call acquireHostRead if a GPU handle is set and the device copy is allocated
boolean copiedFromGPU = false;
for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
GPUObject gObj = kv.getValue();
if (gObj != null && copiedFromGPU && gObj.isDirty()) {
LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
} else if (gObj != null) {
copiedFromGPU = gObj.acquireHostRead();
if (_data == null)
getCache();
}
}
//(probe data for cache_nowrite / jvm_reuse)
if (isEmpty(true) && _data == null) {
try {
if (DMLScript.STATISTICS)
CacheStatistics.incrementHDFSHits();
if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead()) {
//check filename
if (_hdfsFileName == null)
throw new CacheException("Cannot read matrix for empty filename.");
//read cacheable data from hdfs
_data = readBlobFromHDFS(_hdfsFileName);
//mark for initial local write despite read operation
_requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
} else {
//read matrix from rdd (incl execute pending rdd operations)
MutableBoolean writeStatus = new MutableBoolean();
_data = readBlobFromRDD(getRDDHandle(), writeStatus);
//mark for initial local write (prevent repeated execution of rdd operations)
_requiresLocalWrite = writeStatus.booleanValue() ? CACHING_WRITE_CACHE_ON_READ : true;
}
setDirty(false);
} catch (IOException e) {
throw new CacheException("Reading of " + _hdfsFileName + " (" + getVarName() + ") failed.", e);
}
_isAcquireFromEmpty = true;
} else if (DMLScript.STATISTICS) {
if (_data != null)
CacheStatistics.incrementMemHits();
}
//cache status maintenance
acquire(false, _data == null);
updateStatusPinned(true);
if (DMLScript.STATISTICS) {
long t1 = System.nanoTime();
CacheStatistics.incrementAcquireRTime(t1 - t0);
}
return _data;
}
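The status transitions in the javadoc imply a strict pairing of acquire and release; a minimal sketch of the intended call discipline, assuming mo is a MatrixObject (whose cache block type T is MatrixBlock):

MatrixBlock mb = mo.acquireRead(); // READ(+1): pins the block in memory
try {
    // ... read-only access to mb; the block cannot be evicted while pinned ...
} finally {
    mo.release(); // READ(-1): unpin; the block becomes evictable again
}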
use of org.apache.sysml.runtime.instructions.gpu.context.GPUContext in project incubator-systemml by apache.
the class ExecutionContext method getMatrixInputForGPUInstruction.
public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName) throws DMLRuntimeException {
GPUContext gCtx = getGPUContext();
boolean copied = false;
MatrixObject mo = getMatrixObject(varName);
if (mo == null) {
throw new DMLRuntimeException("No matrix object available for variable:" + varName);
}
boolean acquired = false;
if (mo.getGPUObject(gCtx) == null) {
GPUObject newGObj = gCtx.createGPUObject(mo);
mo.setGPUObject(gCtx, newGObj);
} else if (!mo.getGPUObject(gCtx).isInputAllocated()) {
mo.acquireRead();
acquired = true;
}
copied = mo.getGPUObject(gCtx).acquireDeviceRead();
if (acquired) {
mo.release();
}
return new Pair<MatrixObject, Boolean>(mo, copied);
}
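A hedged sketch of how a GPU instruction might consume this helper; the variable name "X" is illustrative, and the Pair accessor names and the release method are assumptions inferred from the acquire/release symmetry elsewhere in ExecutionContext:

Pair<MatrixObject, Boolean> input = ec.getMatrixInputForGPUInstruction("X"); // "X" illustrative
MatrixObject mo = input.getKey();   // Pair accessor names assumed
boolean copied = input.getValue();  // true if acquireDeviceRead copied host data to the device
// ... launch GPU kernels against mo.getGPUObject(ec.getGPUContext()) ...
// release the input afterwards; method name assumed:
ec.releaseMatrixInputForGPUInstruction("X");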