use of org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue in project incubator-systemml by apache.
the class ParForProgramBlock method consolidateAndCheckResults.
/**
 * Merges the result variables produced by the parfor workers into the given execution
 * context and checks the executed task/iteration counters against the expected ones.
 *
 * <p>Only result variables that are bound to a {@code MatrixObject} in {@code ec} are merged;
 * all other result variables (e.g. scalars) are skipped. Depending on
 * {@code checkParallelRemoteResultMerge()}, the merge is either performed by a set of
 * {@code ResultMergeWorker}s fed from a task queue, or sequentially in the calling thread.
 *
 * @param ec       execution context that holds the result variables and receives the merged outputs
 * @param expIters expected number of iterations
 * @param expTasks expected number of tasks
 * @param numIters number of iterations reported as executed
 * @param numTasks number of tasks reported as executed
 * @param results  per-worker variable maps from which the partial results are read
 * @throws DMLRuntimeException if the result merge fails, if the calling thread is interrupted
 *         during the parallel merge, or if the executed counters differ from the expected ones
 */
private void consolidateAndCheckResults(ExecutionContext ec, long expIters, long expTasks, long numIters, long numTasks, LocalVariableMap[] results) throws DMLRuntimeException {
    Timing time = new Timing(true);
    //result merge
    if (checkParallelRemoteResultMerge()) {
        //execute result merge in parallel for all result vars
        int par = Math.min(_resultVars.size(), InfrastructureAnalyzer.getLocalParallelism());
        if (InfrastructureAnalyzer.isLocalMode()) {
            int parmem = (int) Math.floor(OptimizerUtils.getLocalMemBudget() / InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer());
            //reduce k if necessary (the memory-derived bound is clamped to at least 1)
            par = Math.min(par, Math.max(parmem, 1));
        }
        try {
            //enqueue all result vars as tasks
            LocalTaskQueue<String> q = new LocalTaskQueue<String>();
            for (String var : _resultVars) {
                //robustness scalars: only matrix objects are merged
                if (ec.getVariable(var) instanceof MatrixObject) {
                    q.enqueueTask(var);
                }
            }
            q.closeInput();
            //run result merge workers
            ResultMergeWorker[] rmWorkers = new ResultMergeWorker[par];
            for (int i = 0; i < par; i++) {
                rmWorkers[i] = new ResultMergeWorker(q, results, ec);
            }
            //start all
            for (int i = 0; i < par; i++) {
                rmWorkers[i].start();
            }
            //wait for ALL workers before inspecting their status, so that no worker is
            //still running against the shared execution context when an error is reported
            for (int i = 0; i < par; i++) {
                rmWorkers[i].join();
            }
            for (int i = 0; i < par; i++) {
                if (!rmWorkers[i].finishedNoError()) {
                    throw new DMLRuntimeException("Error occured in parallel result merge worker.");
                }
            }
        } catch (DMLRuntimeException ex) {
            //already the declared exception type: rethrow as-is instead of wrapping it again
            throw ex;
        } catch (Exception ex) {
            //keep the interrupt visible to callers further up the stack
            if (ex instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            throw new DMLRuntimeException(ex);
        }
    } else {
        //execute result merge sequentially for all result vars
        for (String var : _resultVars) {
            Data dat = ec.getVariable(var);
            //robustness scalars: only matrix objects are merged
            if (dat instanceof MatrixObject) {
                MatrixObject out = (MatrixObject) dat;
                //collect the partial result of every worker for this variable
                MatrixObject[] in = new MatrixObject[results.length];
                for (int i = 0; i < results.length; i++) {
                    in[i] = (MatrixObject) results[i].get(var);
                }
                String fname = constructResultMergeFileName();
                ResultMerge rm = createResultMerge(_resultMerge, out, in, fname, ec);
                MatrixObject outNew = null;
                if (USE_PARALLEL_RESULT_MERGE) {
                    outNew = rm.executeParallelMerge(_numThreads);
                } else {
                    outNew = rm.executeSerialMerge();
                }
                //cleanup existing var (unless it is the merged output itself)
                Data exdata = ec.removeVariable(var);
                if (exdata != null && exdata != outNew && exdata instanceof MatrixObject) {
                    ec.cleanupMatrixObject((MatrixObject) exdata);
                }
                //cleanup of intermediate result variables
                cleanWorkerResultVariables(ec, out, in);
                //set merged result variable
                ec.setVariable(var, outNew);
            }
        }
    }
    //handle unscoped variables (vars created in parfor, but potentially used afterwards)
    ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
    //sb might be null for nested parallelism
    if (CREATE_UNSCOPED_RESULTVARS && sb != null && ec.getVariables() != null) {
        createEmptyUnscopedVariables(ec.getVariables(), sb);
    }
    //check expected counters (consistency check)
    if (numTasks != expTasks || numIters != expIters) {
        throw new DMLRuntimeException("PARFOR: Number of executed tasks does not match the number of created tasks: tasks " + numTasks + "/" + expTasks + ", iters " + numIters + "/" + expIters + ".");
    }
    if (DMLScript.STATISTICS) {
        Statistics.incrementParForMergeTime((long) time.stop());
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue in project incubator-systemml by apache.
the class ParForProgramBlock method executeLocalParFor.
/**
 * Runs the parfor with {@code _numThreads} local worker threads that pull tasks from a
 * shared in-memory task queue.
 *
 * <p>The workers are created and started first, then the tasks are created and enqueued,
 * then all threads are joined, and finally the per-worker results are consolidated into
 * {@code ec}. The memory budget and runtime piggybacking are reset in a {@code finally}
 * block regardless of the outcome.
 *
 * @param ec execution context that receives the consolidated results
 * @param itervar iteration variable (not read by this method)
 * @param from lower bound handed to the task partitioner
 * @param to upper bound handed to the task partitioner
 * @param incr increment handed to the task partitioner
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 * @throws InterruptedException if InterruptedException occurs
 */
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, InterruptedException {
    LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
    /* Outline:
     *  1) create workers, task queue and threads; start the threads (they wait for tasks)
     *  2) create the tasks, enqueue them and mark the end of the task stream
     *  3) join all threads
     *  4) collect and consolidate the results of every worker
     *  5) cleanup of worker-specific state
     */
    Timing time = new Timing(true);
    int executedTasks = 0;
    int executedIters = 0;

    //restrict recompilation to thread local memory
    setMemoryBudget();

    //enable runtime piggybacking if required (default piggybacking worker)
    if (_enableRuntimePiggybacking) {
        RuntimePiggybacking.start(_numThreads);
    }

    try {
        // Step 1) workers are created as (lazy) deep copies, including preparation
        // of update-in-place variables
        LocalTaskQueue<Task> taskQueue = new LocalTaskQueue<Task>();
        Thread[] workerThreads = new Thread[_numThreads];
        LocalParWorker[] parWorkers = new LocalParWorker[_numThreads];
        for (int w = 0; w < _numThreads; w++) {
            parWorkers[w] = createParallelWorker(_pwIDs[w], taskQueue, ec);
            Thread t = new Thread(parWorkers[w]);
            workerThreads[w] = t;
            t.setPriority(Thread.MAX_PRIORITY);
        }

        // from now on the threads are waiting for tasks
        for (int w = 0; w < workerThreads.length; w++) {
            workerThreads[w].start();
        }

        //maintain statistics
        long initTime = (long) time.stop();
        if (DMLScript.STATISTICS) {
            Statistics.incrementParForInitTime(initTime);
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, initTime);
        }

        // Step 2) create tasks
        TaskPartitioner taskPartitioner = createTaskPartitioner(from, to, incr);
        long numIterations = taskPartitioner.getNumIterations();
        long numCreatedTasks = -1;
        if (!USE_STREAMING_TASK_CREATION) {
            //materialize all tasks, then enqueue them and close the stream
            List<Task> allTasks = taskPartitioner.createTasks();
            numCreatedTasks = allTasks.size();
            for (Task task : allTasks) {
                taskQueue.enqueueTask(task);
            }
            taskQueue.closeInput();
        } else {
            //streaming: workers start on the first tasks while later tasks are still created
            numCreatedTasks = taskPartitioner.createTasks(taskQueue);
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
        }

        // Step 3) wait for finished work
        for (int w = 0; w < workerThreads.length; w++) {
            workerThreads[w].join();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
        }

        // Step 4) obtain the variables and counters of every worker
        LocalVariableMap[] workerVars = new LocalVariableMap[_numThreads];
        for (int w = 0; w < _numThreads; w++) {
            LocalParWorker worker = parWorkers[w];
            workerVars[w] = worker.getVariables();
            executedTasks += worker.getExecutedTasks();
            executedIters += worker.getExecutedIterations();
        }

        //consolidate results into global symbol table
        consolidateAndCheckResults(ec, numIterations, numCreatedTasks, executedIters, executedTasks, workerVars);

        // Step 5) cleanup local parworkers (e.g., remove created functions)
        for (int w = 0; w < _numThreads; w++) {
            Collection<String> createdFunctions = parWorkers[w].getFunctionNames();
            if (createdFunctions == null || createdFunctions.isEmpty()) {
                continue;
            }
            for (String fkey : createdFunctions) {
                String[] keyParts = DMLProgram.splitFunctionKey(fkey);
                _prog.removeFunctionProgramBlock(keyParts[0], keyParts[1]);
            }
        }

        // hand the workers' GPU contexts back to the pool, then bind a pooled
        // context to ec and initialize it for the current thread
        if (DMLScript.USE_ACCELERATOR) {
            for (int w = 0; w < _numThreads; w++) {
                GPUContextPool.returnToPool(parWorkers[w].getExecutionContext().getGPUContext());
            }
            ec.setGPUContext(GPUContextPool.getFromPool());
            ec.getGPUContext().initializeThread();
        }
    } finally {
        //remove thread-local memory budget (reset to original budget)
        //(in finally to prevent error side effects for multiple scripts in one jvm)
        resetMemoryBudget();

        //disable runtime piggybacking
        if (_enableRuntimePiggybacking) {
            RuntimePiggybacking.stop();
        }

        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, executedTasks);
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, executedIters);
        }
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue in project incubator-systemml by apache.
the class ParForProgramBlock method executeRemoteMRParFor.
/**
 * Runs the parfor as a remote MR job: the parfor body is serialized, the tasks are written
 * to a task file, the job is submitted via {@code RemoteParForMR.runJob}, and the returned
 * results are consolidated into {@code ec}.
 *
 * @param ec execution context used for serialization, matrix export and result consolidation
 * @param itervar iteration variable (not read by this method)
 * @param from lower bound handed to the task partitioner
 * @param to upper bound handed to the task partitioner; also used to derive the digit count
 *        passed to {@code writeTasksToFile}
 * @param incr increment handed to the task partitioner
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 * @throws IOException if IOException occurs
 */
private void executeRemoteMRParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, IOException {
    /* Outline:
     *  0) check and recompile MR instructions to CP if forced
     *  1) serialize child program blocks and instructions
     *  2) create and serialize tasks
     *  3) submit the MR job and wait for its results
     *  4) collect and consolidate the results
     */
    //the timer only exists when monitoring is enabled; every use below is guarded by _monitor
    Timing time = null;
    if (_monitor) {
        time = new Timing(true);
    }

    // Step 0) check and compile to CP (if forced remote parfor)
    boolean noOptimizer = (_optMode == POptMode.NONE);
    boolean constrainedRemoteMR = (_optMode == POptMode.CONSTRAINED && _execMode == PExecMode.REMOTE_MR);
    boolean flagForced = false;
    if (FORCE_CP_ON_REMOTE_MR && (noOptimizer || constrainedRemoteMR)) {
        //tid = 0 because replaced in remote parworker
        flagForced = checkMRAndRecompileToCP(0);
    }

    // Step 1) init parallel workers (serialize PBs)
    // NOTES: each mapper changes filenames with regard to his ID as we submit a single job,
    //        cannot reuse serialized string, since variables are serialized as well.
    String program = ProgramConverter.serializeParForBody(new ParForBody(_childBlocks, _resultVars, ec));
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
    }

    // Step 2) create tasks
    TaskPartitioner taskPartitioner = createTaskPartitioner(from, to, incr);
    String taskFile = constructTaskFileName();
    String resultFile = constructResultFileName();
    long numIterations = taskPartitioner.getNumIterations();
    // NOTE(review): for to == 0, Math.log10 returns -Infinity and this evaluates to
    // Integer.MIN_VALUE + 1 — confirm that writeTasksToFile tolerates a non-positive value
    int maxDigits = (int) Math.log10(to.getLongValue()) + 1;
    long numCreatedTasks = -1;
    if (!USE_STREAMING_TASK_CREATION) {
        //sequentially create tasks and write to disk
        List<Task> allTasks = taskPartitioner.createTasks();
        numCreatedTasks = allTasks.size();
        taskFile = writeTasksToFile(taskFile, allTasks, maxDigits);
    } else {
        //put tasks into queue and start writing to taskFile
        LocalTaskQueue<Task> taskQueue = new LocalTaskQueue<Task>();
        numCreatedTasks = taskPartitioner.createTasks(taskQueue);
        taskFile = writeTasksToFile(taskFile, taskQueue, maxDigits);
    }
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
    }

    //write matrices to HDFS
    exportMatricesToHDFS(ec);

    // Step 3) submit MR job (wait for finished work)
    MatrixObject colocatedDPMatrixObj = null;
    if (_colocatedDPMatrix != null) {
        colocatedDPMatrixObj = ec.getMatrixObject(_colocatedDPMatrix);
    }
    RemoteParForJobReturn jobReturn = RemoteParForMR.runJob(_ID, program, taskFile, resultFile, colocatedDPMatrixObj, _enableCPCaching, _numThreads, WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR, getMinMemory(ec), (ALLOW_REUSE_MR_JVMS & _jvmReuse));
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
    }

    // Step 4) collecting results from each parallel worker
    int executedTasks = jobReturn.getNumExecutedTasks();
    int executedIters = jobReturn.getNumExecutedIterations();

    //consolidate results into global symbol table
    consolidateAndCheckResults(ec, numIterations, numCreatedTasks, executedIters, executedTasks, jobReturn.getVariables());

    //undo the forced recompile of step 0
    if (flagForced) {
        releaseForcedRecompile(0);
    }

    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, executedTasks);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, executedIters);
    }
}
Aggregations