use of org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner in project incubator-systemml by apache.
the class ParForProgramBlock method executeLocalParFor.
/**
* Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
* This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
* below for details of the realization.
*
* @param ec execution context
* @param itervar ?
* @param from ?
* @param to ?
* @param incr ?
* @throws DMLRuntimeException if DMLRuntimeException occurs
* @throws InterruptedException if InterruptedException occurs
*/
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, InterruptedException {
LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
/* Step 1) init parallel workers, task queue and threads
* start threads (from now on waiting for tasks)
* Step 2) create tasks
* put tasks into queue
* mark end of task input stream
* Step 3) join all threads (wait for finished work)
* Step 4) collect results from each parallel worker
*/
Timing time = new Timing(true);
int numExecutedTasks = 0;
int numExecutedIterations = 0;
//restrict recompilation to thread local memory
setMemoryBudget();
//enable runtime piggybacking if required
if (_enableRuntimePiggybacking)
//default piggybacking worker
RuntimePiggybacking.start(_numThreads);
try {
// Step 1) init parallel workers, task queue and threads
LocalTaskQueue<Task> queue = new LocalTaskQueue<Task>();
Thread[] threads = new Thread[_numThreads];
LocalParWorker[] workers = new LocalParWorker[_numThreads];
for (int i = 0; i < _numThreads; i++) {
//create parallel workers as (lazy) deep copies
//including preparation of update-in-place variables
workers[i] = createParallelWorker(_pwIDs[i], queue, ec);
threads[i] = new Thread(workers[i]);
threads[i].setPriority(Thread.MAX_PRIORITY);
}
// start threads (from now on waiting for tasks)
for (Thread thread : threads) thread.start();
//maintain statistics
long tinit = (long) time.stop();
if (DMLScript.STATISTICS)
Statistics.incrementParForInitTime(tinit);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
// Step 2) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
long numIterations = partitioner.getNumIterations();
long numCreatedTasks = -1;
if (USE_STREAMING_TASK_CREATION) {
//put tasks into queue (parworker start work on first tasks while creating tasks)
numCreatedTasks = partitioner.createTasks(queue);
} else {
List<Task> tasks = partitioner.createTasks();
numCreatedTasks = tasks.size();
// put tasks into queue
for (Task t : tasks) queue.enqueueTask(t);
// mark end of task input stream
queue.closeInput();
}
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
// Step 3) join all threads (wait for finished work)
for (Thread thread : threads) thread.join();
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 4) collecting results from each parallel worker
//obtain results
LocalVariableMap[] localVariables = new LocalVariableMap[_numThreads];
for (int i = 0; i < _numThreads; i++) {
localVariables[i] = workers[i].getVariables();
numExecutedTasks += workers[i].getExecutedTasks();
numExecutedIterations += workers[i].getExecutedIterations();
}
//consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, localVariables);
// Step 5) cleanup local parworkers (e.g., remove created functions)
for (int i = 0; i < _numThreads; i++) {
Collection<String> fnNames = workers[i].getFunctionNames();
if (fnNames != null && !fnNames.isEmpty())
for (String fn : fnNames) {
String[] parts = DMLProgram.splitFunctionKey(fn);
_prog.removeFunctionProgramBlock(parts[0], parts[1]);
}
}
// the main thread to use the GPUContext
if (DMLScript.USE_ACCELERATOR) {
for (int i = 0; i < _numThreads; i++) {
GPUContext gCtx = workers[i].getExecutionContext().getGPUContext();
GPUContextPool.returnToPool(gCtx);
}
ec.setGPUContext(GPUContextPool.getFromPool());
ec.getGPUContext().initializeThread();
}
} finally {
//remove thread-local memory budget (reset to original budget)
//(in finally to prevent error side effects for multiple scripts in one jvm)
resetMemoryBudget();
//disable runtime piggybacking
if (_enableRuntimePiggybacking)
RuntimePiggybacking.stop();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
}
use of org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner in project incubator-systemml by apache.
the class ParForProgramBlock method executeRemoteSparkParFor.
private void executeRemoteSparkParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException {
Timing time = (_monitor ? new Timing(true) : null);
// Step 0) check and compile to CP (if forced remote parfor)
boolean flagForced = false;
if (FORCE_CP_ON_REMOTE_MR && (_optMode == POptMode.NONE || (_optMode == POptMode.CONSTRAINED && _execMode == PExecMode.REMOTE_SPARK))) {
//tid = 0 because replaced in remote parworker
flagForced = checkMRAndRecompileToCP(0);
}
// Step 1) init parallel workers (serialize PBs)
// NOTES: each mapper changes filenames with regard to his ID as we submit a single job,
// cannot reuse serialized string, since variables are serialized as well.
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
HashMap<String, byte[]> clsMap = new HashMap<String, byte[]>();
String program = ProgramConverter.serializeParForBody(body, clsMap);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
// Step 2) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
long numIterations = partitioner.getNumIterations();
//sequentially create tasks as input to parfor job
List<Task> tasks = partitioner.createTasks();
long numCreatedTasks = tasks.size();
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
//write matrices to HDFS
exportMatricesToHDFS(ec);
// Step 3) submit Spark parfor job (no lazy evaluation, since collect on result)
//MatrixObject colocatedDPMatrixObj = (_colocatedDPMatrix!=null)? (MatrixObject)ec.getVariable(_colocatedDPMatrix) : null;
RemoteParForJobReturn ret = RemoteParForSpark.runJob(_ID, program, clsMap, tasks, ec, _enableCPCaching, _numThreads);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 4) collecting results from each parallel worker
int numExecutedTasks = ret.getNumExecutedTasks();
int numExecutedIterations = ret.getNumExecutedIterations();
//consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
if (//see step 0
flagForced)
releaseForcedRecompile(0);
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
use of org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner in project incubator-systemml by apache.
the class ParForProgramBlock method executeRemoteSparkParForDP.
private void executeRemoteSparkParForDP(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, IOException {
Timing time = (_monitor ? new Timing(true) : null);
// Step 0) check and compile to CP (if forced remote parfor)
boolean flagForced = checkMRAndRecompileToCP(0);
// Step 1) prepare partitioned input matrix (needs to happen before serializing the progam)
ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix);
PartitionFormat inputDPF = sb.determineDataPartitionFormat(_colocatedDPMatrix);
//mark matrix var as partitioned
inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N);
// Step 2) init parallel workers (serialize PBs)
// NOTES: each mapper changes filenames with regard to his ID as we submit a single job,
// cannot reuse serialized string, since variables are serialized as well.
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
HashMap<String, byte[]> clsMap = new HashMap<String, byte[]>();
String program = ProgramConverter.serializeParForBody(body, clsMap);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
// Step 3) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
String resultFile = constructResultFileName();
long numIterations = partitioner.getNumIterations();
//partitioner.createTasks().size();
long numCreatedTasks = numIterations;
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
//write matrices to HDFS, except DP matrix which is the input to the RemoteDPParForSpark job
exportMatricesToHDFS(ec, _colocatedDPMatrix);
// Step 4) submit MR job (wait for finished work)
//TODO runtime support for binary cell partitioning
//OutputInfo inputOI = ((inputMatrix.getSparsity()<0.1 && inputDPF==PDataPartitionFormat.COLUMN_WISE)||
// (inputMatrix.getSparsity()<0.001 && inputDPF==PDataPartitionFormat.ROW_WISE))?
// OutputInfo.BinaryCellOutputInfo : OutputInfo.BinaryBlockOutputInfo;
OutputInfo inputOI = OutputInfo.BinaryBlockOutputInfo;
RemoteParForJobReturn ret = RemoteDPParForSpark.runJob(_ID, itervar.getName(), _colocatedDPMatrix, program, clsMap, resultFile, inputMatrix, ec, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 5) collecting results from each parallel worker
int numExecutedTasks = ret.getNumExecutedTasks();
int numExecutedIterations = ret.getNumExecutedIterations();
//consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
if (//see step 0
flagForced)
releaseForcedRecompile(0);
inputMatrix.unsetPartitioned();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
use of org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner in project incubator-systemml by apache.
the class ParForProgramBlock method executeRemoteMRParFor.
private void executeRemoteMRParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, IOException {
/* Step 0) check and recompile MR inst
* Step 1) serialize child PB and inst
* Step 2) create tasks
* serialize tasks
* Step 3) submit MR Jobs and wait for results
* Step 4) collect results from each parallel worker
*/
Timing time = (_monitor ? new Timing(true) : null);
// Step 0) check and compile to CP (if forced remote parfor)
boolean flagForced = false;
if (FORCE_CP_ON_REMOTE_MR && (_optMode == POptMode.NONE || (_optMode == POptMode.CONSTRAINED && _execMode == PExecMode.REMOTE_MR))) {
//tid = 0 because replaced in remote parworker
flagForced = checkMRAndRecompileToCP(0);
}
// Step 1) init parallel workers (serialize PBs)
// NOTES: each mapper changes filenames with regard to his ID as we submit a single job,
// cannot reuse serialized string, since variables are serialized as well.
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
String program = ProgramConverter.serializeParForBody(body);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
// Step 2) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
String taskFile = constructTaskFileName();
String resultFile = constructResultFileName();
long numIterations = partitioner.getNumIterations();
int maxDigits = (int) Math.log10(to.getLongValue()) + 1;
long numCreatedTasks = -1;
if (USE_STREAMING_TASK_CREATION) {
LocalTaskQueue<Task> queue = new LocalTaskQueue<Task>();
//put tasks into queue and start writing to taskFile
numCreatedTasks = partitioner.createTasks(queue);
taskFile = writeTasksToFile(taskFile, queue, maxDigits);
} else {
//sequentially create tasks and write to disk
List<Task> tasks = partitioner.createTasks();
numCreatedTasks = tasks.size();
taskFile = writeTasksToFile(taskFile, tasks, maxDigits);
}
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
//write matrices to HDFS
exportMatricesToHDFS(ec);
// Step 3) submit MR job (wait for finished work)
MatrixObject colocatedDPMatrixObj = (_colocatedDPMatrix != null) ? ec.getMatrixObject(_colocatedDPMatrix) : null;
RemoteParForJobReturn ret = RemoteParForMR.runJob(_ID, program, taskFile, resultFile, colocatedDPMatrixObj, _enableCPCaching, _numThreads, WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR, getMinMemory(ec), (ALLOW_REUSE_MR_JVMS & _jvmReuse));
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 4) collecting results from each parallel worker
int numExecutedTasks = ret.getNumExecutedTasks();
int numExecutedIterations = ret.getNumExecutedIterations();
//consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
if (//see step 0
flagForced)
releaseForcedRecompile(0);
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
use of org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner in project incubator-systemml by apache.
the class ParForProgramBlock method executeRemoteMRParForDP.
private void executeRemoteMRParForDP(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, IOException {
/* Step 0) check and recompile MR inst
* Step 1) serialize child PB and inst
* Step 2) create tasks
* serialize tasks
* Step 3) submit MR Jobs and wait for results
* Step 4) collect results from each parallel worker
*/
Timing time = (_monitor ? new Timing(true) : null);
// Step 0) check and compile to CP (if forced remote parfor)
boolean flagForced = checkMRAndRecompileToCP(0);
// Step 1) prepare partitioned input matrix (needs to happen before serializing the program)
ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix);
PartitionFormat inputDPF = sb.determineDataPartitionFormat(_colocatedDPMatrix);
//mark matrix var as partitioned
inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N);
// Step 2) init parallel workers (serialize PBs)
// NOTES: each mapper changes filenames with regard to his ID as we submit a single job,
// cannot reuse serialized string, since variables are serialized as well.
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
String program = ProgramConverter.serializeParForBody(body);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
// Step 3) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
String resultFile = constructResultFileName();
long numIterations = partitioner.getNumIterations();
//partitioner.createTasks().size();
long numCreatedTasks = numIterations;
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
//write matrices to HDFS
exportMatricesToHDFS(ec);
// Step 4) submit MR job (wait for finished work)
OutputInfo inputOI = ((inputMatrix.getSparsity() < 0.1 && inputDPF == PartitionFormat.COLUMN_WISE) || (inputMatrix.getSparsity() < 0.001 && inputDPF == PartitionFormat.ROW_WISE)) ? OutputInfo.BinaryCellOutputInfo : OutputInfo.BinaryBlockOutputInfo;
RemoteParForJobReturn ret = RemoteDPParForMR.runJob(_ID, itervar.getName(), _colocatedDPMatrix, program, resultFile, inputMatrix, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads, _replicationDP);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 5) collecting results from each parallel worker
int numExecutedTasks = ret.getNumExecutedTasks();
int numExecutedIterations = ret.getNumExecutedIterations();
//consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
if (//see step 0
flagForced)
releaseForcedRecompile(0);
inputMatrix.unsetPartitioned();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
Aggregations