use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project incubator-systemml by apache.
the class ParForProgramBlock method createParallelWorker.
/**
 * Builds a local parallel worker for this parfor, either from scratch or by reusing
 * child program blocks that were deep copied in a previous execution.
 * <p>
 * The child program blocks are deep copied, or taken from the program block cache when
 * {@code USE_PB_CACHE} is set and the cache already holds an entry for {@code pwID}.
 * The execution context is copied via {@code ProgramConverter.createDeepCopyExecutionContext};
 * entries of the symbol table are not deep copied because they are replaced anyway on
 * the next write.
 *
 * @param pwID parworker id, also used as the key into the program block cache
 * @param queue task queue handed to the new worker
 * @param ec execution context that is copied for the worker
 * @param index the index of the worker, used to pick the GPU context from {@code ec}
 * @return local parworker
 */
private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index) {
    try {
        // function names collected during the deep copy; stays empty on a cache hit
        HashSet<String> createdFunctions = new HashSet<>();

        // worker-private child blocks: reuse the cached copy or create (and cache) one
        // NOTE(review): executeLocalParFor calls this method from a parallel stream, so
        // _pbcache is accessed concurrently - confirm that its type tolerates that.
        final boolean cacheHit = USE_PB_CACHE && _pbcache.containsKey(pwID);
        ArrayList<ProgramBlock> workerBlocks;
        if (cacheHit) {
            workerBlocks = _pbcache.get(pwID);
        }
        else {
            workerBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), createdFunctions, false, false);
            if (USE_PB_CACHE) {
                _pbcache.put(pwID, workerBlocks);
            }
        }

        // worker-private execution context
        ExecutionContext workerEc = ProgramConverter.createDeepCopyExecutionContext(ec);
        if (DMLScript.USE_ACCELERATOR) {
            // hand the GPU context at position 'index' of the parfor's context to this worker
            workerEc.setGPUContexts(Arrays.asList(ec.getGPUContext(index)));
        }

        // prepare basic update-in-place variables (vars dropped on result merge)
        prepareUpdateInPlaceVariables(workerEc, pwID);

        // compiler configuration is passed along explicitly (for jmlc w/o global config)
        CompilerConfig compilerConfig = ConfigurationManager.getCompilerConfig();

        // assemble the actual parallel worker
        ParForBody workerBody = new ParForBody(workerBlocks, _resultVars, workerEc);
        LocalParWorker worker = new LocalParWorker(pwID, queue, workerBody, compilerConfig, MAX_RETRYS_ON_ERROR, _monitor);
        worker.setFunctionNames(createdFunctions);
        return worker;
    }
    catch (Exception ex) {
        // any failure during worker setup is surfaced as an unchecked exception
        throw new RuntimeException(ex);
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project incubator-systemml by apache.
the class ParForProgramBlock method executeLocalParFor.
/**
 * Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
 * This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
 * below for details of the realization.
 *
 * @param ec execution context; copied per worker and target of the result consolidation
 * @param itervar iteration variable (not referenced by this method)
 * @param from lower bound, passed to the task partitioner
 * @param to upper bound, passed to the task partitioner
 * @param incr increment, passed to the task partitioner
 * @throws InterruptedException if interrupted, e.g., while joining the worker threads
 */
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws InterruptedException {
    LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
    /* Step 1) init parallel workers, task queue and threads
     *         start threads (from now on waiting for tasks)
     * Step 2) create tasks
     *         put tasks into queue
     *         mark end of task input stream
     * Step 3) join all threads (wait for finished work)
     * Step 4) collect results from each parallel worker
     * Step 5) cleanup local parworkers (remove created functions)
     */
    Timing time = new Timing(true);
    int numExecutedTasks = 0;
    int numExecutedIterations = 0;
    // restrict recompilation to thread local memory
    setMemoryBudget();
    // enable runtime piggybacking if required
    if (_enableRuntimePiggybacking) {
        // default piggybacking worker
        RuntimePiggybacking.start(_numThreads);
    }
    try {
        // Step 1) create task queue and init workers in parallel
        // (including preparation of update-in-place variables)
        LocalTaskQueue<Task> queue = new LocalTaskQueue<>();
        Thread[] threads = new Thread[_numThreads];
        LocalParWorker[] workers = new LocalParWorker[_numThreads];
        IntStream.range(0, _numThreads).parallel().forEach(i -> {
            workers[i] = createParallelWorker(_pwIDs[i], queue, ec, i);
            threads[i] = new Thread(workers[i]);
            threads[i].setPriority(Thread.MAX_PRIORITY);
        });
        // start threads (from now on waiting for tasks)
        for (Thread thread : threads) {
            thread.start();
        }
        // maintain statistics
        long tinit = (long) time.stop();
        if (DMLScript.STATISTICS) {
            Statistics.incrementParForInitTime(tinit);
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
        }
        // Step 2) create tasks
        // NOTE(review): if task creation throws after the threads were started, this method
        // does not close the queue - confirm that the worker threads still terminate.
        TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
        long numIterations = partitioner.getNumIterations();
        long numCreatedTasks = -1;
        if (USE_STREAMING_TASK_CREATION) {
            // put tasks into queue (parworker start work on first tasks while creating tasks)
            numCreatedTasks = partitioner.createTasks(queue);
        } else {
            List<Task> tasks = partitioner.createTasks();
            numCreatedTasks = tasks.size();
            // put tasks into queue
            for (Task t : tasks) {
                queue.enqueueTask(t);
            }
            // mark end of task input stream
            queue.closeInput();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
        }
        // Step 3) join all threads (wait for finished work)
        for (Thread thread : threads) {
            thread.join();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
        }
        // Step 4) collecting results from each parallel worker
        // obtain results and cleanup other intermediates before result merge;
        // the set of result variable names is identical for all workers, so build it once
        final java.util.Set<String> resultVarNames = _resultVars.stream().map(v -> v._name).collect(Collectors.toSet());
        LocalVariableMap[] localVariables = new LocalVariableMap[_numThreads];
        for (int i = 0; i < _numThreads; i++) {
            localVariables[i] = workers[i].getVariables();
            localVariables[i].removeAllNotIn(resultVarNames);
            numExecutedTasks += workers[i].getExecutedTasks();
            numExecutedIterations += workers[i].getExecutedIterations();
        }
        // consolidate results into global symbol table
        consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, localVariables);
        // Step 5) cleanup local parworkers (e.g., remove created functions)
        for (int i = 0; i < _numThreads; i++) {
            Collection<String> fnNames = workers[i].getFunctionNames();
            if (fnNames != null && !fnNames.isEmpty()) {
                for (String fn : fnNames) {
                    String[] parts = DMLProgram.splitFunctionKey(fn);
                    _prog.removeFunctionProgramBlock(parts[0], parts[1]);
                }
            }
        }
        // the main thread to use the GPUContext
        if (DMLScript.USE_ACCELERATOR) {
            ec.getGPUContext(0).initializeThread();
        }
    } finally {
        // remove thread-local memory budget (reset to original budget)
        // (in finally to prevent error side effects for multiple scripts in one jvm)
        resetMemoryBudget();
        // disable runtime piggybacking
        if (_enableRuntimePiggybacking) {
            RuntimePiggybacking.stop();
        }
        if (_monitor) {
            // counters stay at 0 if execution failed before results were collected
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
        }
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project incubator-systemml by apache.
the class ParForProgramBlock method createParallelWorker.
/**
 * Creates a local parallel worker, newly built or partially recycled from an earlier
 * execution of this parfor.
 * <p>
 * Child program blocks are deep copied unless {@code USE_PB_CACHE} is set and the cache
 * already contains a copy for {@code pwID}. The execution context is copied through
 * {@code ProgramConverter.createDeepCopyExecutionContext}; entries of the symbol table
 * are not deep copied because they are replaced anyway on the next write.
 *
 * @param pwID parworker id, also used as the key into the program block cache
 * @param queue task queue handed to the new worker
 * @param ec execution context that is copied for the worker
 * @return local parworker
 * @throws DMLRuntimeException wrapping any exception raised while setting up the worker
 */
private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec) throws DMLRuntimeException {
    LocalParWorker worker;
    try {
        // names of functions created by the deep copy (left empty when blocks are recycled)
        HashSet<String> copiedFnNames = new HashSet<String>();

        // deep copies of the child blocks, recycled from the cache where possible
        ArrayList<ProgramBlock> blocks;
        if (!USE_PB_CACHE) {
            blocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), copiedFnNames, false, false);
        }
        else if (_pbcache.containsKey(pwID)) {
            blocks = _pbcache.get(pwID);
        }
        else {
            blocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), copiedFnNames, false, false);
            _pbcache.put(pwID, blocks);
        }

        // deep copy execution context (including prepare parfor update-in-place)
        ExecutionContext ecCopy = ProgramConverter.createDeepCopyExecutionContext(ec);
        if (DMLScript.USE_ACCELERATOR) {
            // each worker gets its own GPU context taken from the pool
            ecCopy.setGPUContext(GPUContextPool.getFromPool());
        }

        // prepare basic update-in-place variables (vars dropped on result merge)
        prepareUpdateInPlaceVariables(ecCopy, pwID);

        // copy compiler configuration (for jmlc w/o global config)
        CompilerConfig config = ConfigurationManager.getCompilerConfig();

        // create the actual parallel worker
        worker = new LocalParWorker(pwID, queue, new ParForBody(blocks, _resultVars, ecCopy), config, MAX_RETRYS_ON_ERROR, _monitor);
        worker.setFunctionNames(copiedFnNames);
    }
    catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return worker;
}
use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project systemml by apache.
the class ParForProgramBlock method executeLocalParFor.
/**
 * Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
 * This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
 * below for details of the realization.
 *
 * @param ec execution context; copied per worker and target of the result consolidation
 * @param itervar iteration variable (not referenced by this method)
 * @param from lower bound, passed to the task partitioner
 * @param to upper bound, passed to the task partitioner
 * @param incr increment, passed to the task partitioner
 * @throws InterruptedException if interrupted, e.g., while joining the worker threads
 */
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws InterruptedException {
    LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
    /* Step 1) init parallel workers, task queue and threads
     *         start threads (from now on waiting for tasks)
     * Step 2) create tasks
     *         put tasks into queue
     *         mark end of task input stream
     * Step 3) join all threads (wait for finished work)
     * Step 4) collect results from each parallel worker
     * Step 5) cleanup local parworkers (remove created functions)
     */
    Timing time = new Timing(true);
    int numExecutedTasks = 0;
    int numExecutedIterations = 0;
    // restrict recompilation to thread local memory
    setMemoryBudget();
    // enable runtime piggybacking if required
    if (_enableRuntimePiggybacking) {
        // default piggybacking worker
        RuntimePiggybacking.start(_numThreads);
    }
    try {
        // Step 1) create task queue and init workers in parallel
        // (including preparation of update-in-place variables)
        LocalTaskQueue<Task> queue = new LocalTaskQueue<>();
        Thread[] threads = new Thread[_numThreads];
        LocalParWorker[] workers = new LocalParWorker[_numThreads];
        IntStream.range(0, _numThreads).parallel().forEach(i -> {
            workers[i] = createParallelWorker(_pwIDs[i], queue, ec, i);
            threads[i] = new Thread(workers[i]);
            threads[i].setPriority(Thread.MAX_PRIORITY);
        });
        // start threads (from now on waiting for tasks)
        for (Thread thread : threads) {
            thread.start();
        }
        // maintain statistics
        long tinit = (long) time.stop();
        if (DMLScript.STATISTICS) {
            Statistics.incrementParForInitTime(tinit);
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
        }
        // Step 2) create tasks
        // NOTE(review): if task creation throws after the threads were started, this method
        // does not close the queue - confirm that the worker threads still terminate.
        TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
        long numIterations = partitioner.getNumIterations();
        long numCreatedTasks = -1;
        if (USE_STREAMING_TASK_CREATION) {
            // put tasks into queue (parworker start work on first tasks while creating tasks)
            numCreatedTasks = partitioner.createTasks(queue);
        } else {
            List<Task> tasks = partitioner.createTasks();
            numCreatedTasks = tasks.size();
            // put tasks into queue
            for (Task t : tasks) {
                queue.enqueueTask(t);
            }
            // mark end of task input stream
            queue.closeInput();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
        }
        // Step 3) join all threads (wait for finished work)
        for (Thread thread : threads) {
            thread.join();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
        }
        // Step 4) collecting results from each parallel worker
        // obtain results and cleanup other intermediates before result merge;
        // the set of result variable names is identical for all workers, so build it once
        final java.util.Set<String> resultVarNames = _resultVars.stream().map(v -> v._name).collect(Collectors.toSet());
        LocalVariableMap[] localVariables = new LocalVariableMap[_numThreads];
        for (int i = 0; i < _numThreads; i++) {
            localVariables[i] = workers[i].getVariables();
            localVariables[i].removeAllNotIn(resultVarNames);
            numExecutedTasks += workers[i].getExecutedTasks();
            numExecutedIterations += workers[i].getExecutedIterations();
        }
        // consolidate results into global symbol table
        consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, localVariables);
        // Step 5) cleanup local parworkers (e.g., remove created functions)
        for (int i = 0; i < _numThreads; i++) {
            Collection<String> fnNames = workers[i].getFunctionNames();
            if (fnNames != null && !fnNames.isEmpty()) {
                for (String fn : fnNames) {
                    String[] parts = DMLProgram.splitFunctionKey(fn);
                    _prog.removeFunctionProgramBlock(parts[0], parts[1]);
                }
            }
        }
        // the main thread to use the GPUContext
        if (DMLScript.USE_ACCELERATOR) {
            ec.getGPUContext(0).initializeThread();
        }
    } finally {
        // remove thread-local memory budget (reset to original budget)
        // (in finally to prevent error side effects for multiple scripts in one jvm)
        resetMemoryBudget();
        // disable runtime piggybacking
        if (_enableRuntimePiggybacking) {
            RuntimePiggybacking.stop();
        }
        if (_monitor) {
            // counters stay at 0 if execution failed before results were collected
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
        }
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project systemml by apache.
the class ParForProgramBlock method createParallelWorker.
/**
 * Sets up one local parallel worker for this parfor, recycling previously deep-copied
 * child program blocks where the program block cache allows it.
 * <p>
 * When {@code USE_PB_CACHE} is set and the cache holds an entry for {@code pwID}, that
 * entry is reused; otherwise the child program blocks are deep copied (and cached if
 * caching is enabled). The execution context is copied through
 * {@code ProgramConverter.createDeepCopyExecutionContext}; entries of the symbol table
 * are not deep copied because they are replaced anyway on the next write.
 *
 * @param pwID parworker id, also used as the key into the program block cache
 * @param queue task queue handed to the new worker
 * @param ec execution context that is copied for the worker
 * @param index the index of the worker, used to pick the GPU context from {@code ec}
 * @return local parworker
 */
private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index) {
    LocalParWorker result;
    try {
        // filled by the deep copy with the names of created functions; empty when recycling
        HashSet<String> functionNames = new HashSet<>();

        // pick the recycled copy if there is one, otherwise deep copy the child blocks
        boolean recycle = USE_PB_CACHE && _pbcache.containsKey(pwID);
        ArrayList<ProgramBlock> childCopies = recycle
            ? _pbcache.get(pwID)
            : ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), functionNames, false, false);
        if (USE_PB_CACHE && !recycle) {
            // remember the fresh copy for later executions of this parfor
            _pbcache.put(pwID, childCopies);
        }

        // deep copy execution context (including prepare parfor update-in-place)
        ExecutionContext ecCopy = ProgramConverter.createDeepCopyExecutionContext(ec);
        if (DMLScript.USE_ACCELERATOR) {
            // the worker uses the GPU context at its own index of the parfor's context
            ecCopy.setGPUContexts(Arrays.asList(ec.getGPUContext(index)));
        }

        // prepare basic update-in-place variables (vars dropped on result merge)
        prepareUpdateInPlaceVariables(ecCopy, pwID);

        // copy compiler configuration (for jmlc w/o global config)
        CompilerConfig conf = ConfigurationManager.getCompilerConfig();

        // create the actual parallel worker
        ParForBody parforBody = new ParForBody(childCopies, _resultVars, ecCopy);
        result = new LocalParWorker(pwID, queue, parforBody, conf, MAX_RETRYS_ON_ERROR, _monitor);
        result.setFunctionNames(functionNames);
    }
    catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    return result;
}
Aggregations