use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class ParForProgramBlock method consolidateAndCheckResults.
/**
 * Merges the per-worker result variables back into the given execution context and
 * verifies that the executed task/iteration counters match the expected ones.
 *
 * <p>Only result variables that are currently bound to a {@code MatrixObject} in
 * {@code ec} are merged. Depending on {@code checkParallelRemoteResultMerge()}, the merge
 * either runs through a pool of {@code ResultMergeWorker} threads or sequentially in the
 * calling thread. If {@code DMLScript.STATISTICS} is set, the elapsed time of the whole
 * method is added to the parfor merge time statistic.
 *
 * @param ec       execution context that holds the result variables
 * @param expIters expected number of iterations
 * @param expTasks expected number of tasks
 * @param numIters number of iterations actually executed
 * @param numTasks number of tasks actually executed
 * @param results  one variable map per worker, holding that worker's partial results
 * @throws DMLRuntimeException if a parallel merge worker fails or if the counters
 *         do not match the expected values
 */
private void consolidateAndCheckResults(ExecutionContext ec, long expIters, long expTasks, long numIters, long numTasks, LocalVariableMap[] results) {
    Timing mergeTimer = new Timing(true);

    if (!checkParallelRemoteResultMerge()) {
        // sequential result merge, one result variable after the other
        for (ResultVar rvar : _resultVars) {
            Data current = ec.getVariable(rvar._name);
            if (!(current instanceof MatrixObject)) {
                // robustness: scalars (and unbound names) are not merged
                continue;
            }
            MatrixObject target = (MatrixObject) current;

            // collect the partial result of every worker for this variable
            MatrixObject[] partials = new MatrixObject[results.length];
            for (int w = 0; w < results.length; w++) {
                partials[w] = (MatrixObject) results[w].get(rvar._name);
            }

            String fname = constructResultMergeFileName();
            ResultMerge merger = createResultMerge(_resultMerge, target, partials, fname, rvar._isAccum, ec);
            MatrixObject merged = USE_PARALLEL_RESULT_MERGE
                ? merger.executeParallelMerge(_numThreads)
                : merger.executeSerialMerge();

            // drop the previous binding and free it unless it is the merge output itself
            Data previous = ec.removeVariable(rvar._name);
            if (previous instanceof MatrixObject && previous != merged) {
                ec.cleanupCacheableData((MatrixObject) previous);
            }

            // cleanup of intermediate (per-worker) result variables
            cleanWorkerResultVariables(ec, target, partials);

            // bind the merged result
            ec.setVariable(rvar._name, merged);
        }
    } else {
        // parallel result merge: at most one worker per result variable,
        // bounded by the local degree of parallelism
        int numWorkers = Math.min(_resultVars.size(), InfrastructureAnalyzer.getLocalParallelism());
        if (InfrastructureAnalyzer.isLocalMode()) {
            // in local mode additionally bound the worker count by the memory budget
            int memBound = (int) Math.floor(OptimizerUtils.getLocalMemBudget() / InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer());
            numWorkers = Math.min(numWorkers, Math.max(memBound, 1));
        }

        try {
            // enqueue every matrix result variable as one merge task
            LocalTaskQueue<ResultVar> queue = new LocalTaskQueue<>();
            for (ResultVar rvar : _resultVars) {
                if (ec.getVariable(rvar._name) instanceof MatrixObject) {
                    queue.enqueueTask(rvar);
                }
            }
            queue.closeInput();

            // create all workers first, then start them, then wait for each of them
            ResultMergeWorker[] workers = new ResultMergeWorker[numWorkers];
            for (int w = 0; w < numWorkers; w++) {
                workers[w] = new ResultMergeWorker(queue, results, ec);
            }
            for (ResultMergeWorker worker : workers) {
                worker.start();
            }
            for (ResultMergeWorker worker : workers) {
                worker.join();
                if (!worker.finishedNoError()) {
                    throw new DMLRuntimeException("Error occured in parallel result merge worker.");
                }
            }
        } catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    // handle unscoped variables (vars created in parfor, but potentially used afterwards);
    // the statement block might be null for nested parallelism
    ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
    if (CREATE_UNSCOPED_RESULTVARS && sb != null && ec.getVariables() != null) {
        createEmptyUnscopedVariables(ec.getVariables(), sb);
    }

    // consistency check of executed vs. expected counters
    boolean countersMatch = (numTasks == expTasks) && (numIters == expIters);
    if (!countersMatch) {
        throw new DMLRuntimeException("PARFOR: Number of executed tasks does not match the number of created tasks: tasks " + numTasks + "/" + expTasks + ", iters " + numIters + "/" + expIters + ".");
    }

    if (DMLScript.STATISTICS) {
        Statistics.incrementParForMergeTime((long) mergeTimer.stop());
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class ParForProgramBlock method execute.
/**
 * Executes this parfor block in the given execution context.
 *
 * <p>Steps, in order: evaluate the from/to/increment predicates once, return early if
 * there are no iterations, optionally run the optimizer, prepare the inputs (data
 * partitioning, Spark repartitioning, Spark eager caching), pin the current variables,
 * dispatch to the executor for the configured execution mode, unpin and clean up the
 * shared variables, set the iteration variable to the TO value, restore the original
 * (unpartitioned) inputs, release forced execution types, reset the optimizer flags and
 * finally run the exit instructions.
 *
 * @param ec execution context the loop runs in
 * @throws DMLRuntimeException if the increment evaluates to zero, if the execution mode
 *         is undefined, or if the parallel execution fails
 */
@Override
public void execute(ExecutionContext ec) {
    ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();

    // evaluate from, to, incr only once (assumption: known at for entry)
    IntObject fromObj = executePredicateInstructions(1, _fromInstructions, ec);
    IntObject toObj = executePredicateInstructions(2, _toInstructions, ec);
    IntObject incrObj;
    if (_incrementInstructions == null || _incrementInstructions.isEmpty()) {
        // no explicit increment: step towards 'to'
        int defaultStep = (fromObj.getLongValue() <= toObj.getLongValue()) ? 1 : -1;
        incrObj = new IntObject(defaultStep);
    } else {
        incrObj = executePredicateInstructions(3, _incrementInstructions, ec);
    }

    // a zero increment would produce an infinite loop
    if (incrObj.getLongValue() == 0) {
        throw new DMLRuntimeException(this.printBlockErrorLocation() + "Expression for increment " + "of variable '" + _iterPredVar + "' must evaluate to a non-zero value.");
    }

    // early exit on zero iterations (avoids unnecessary optimization/initialization)
    _numIterations = computeNumIterations(fromObj, toObj, incrObj);
    if (_numIterations <= 0) {
        return;
    }

    // OPTIMIZATION of the parfor plan
    if (_optMode != POptMode.NONE) {
        OptimizationWrapper.setLogLevel(_optLogLevel);
        OptimizationWrapper.optimize(_optMode, sb, this, ec, _monitor);
    }

    // DATA PARTITIONING of read-only parent variables of type (matrix,unpartitioned)
    Timing initTimer = _monitor ? new Timing(true) : null;

    // partitioning on demand (note: for fused data partitioning and execute the optimizer
    // sets the data partitioner to NONE in order to prevent any side effects)
    handleDataPartitioning(ec);
    // repartitioning of variables for spark cpmm/zipmm in order to prevent unnecessary shuffle
    handleSparkRepartitioning(ec);
    // eager rdd caching of variables for spark in order to prevent read/write contention
    handleSparkEagerCaching(ec);

    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_DATA_T, initTimer.stop());
    }

    // initialize the iteration variable to the FROM value
    IntObject iterVar = new IntObject(fromObj.getLongValue());

    // begin PARALLEL EXECUTION of (PAR)FOR body
    LOG.trace("EXECUTE PARFOR ID = " + _ID + " with mode = " + _execMode + ", numThreads = " + _numThreads + ", taskpartitioner = " + _taskPartitioner);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTHREADS, _numThreads);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKSIZE, _taskSize);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKPARTITIONER, _taskPartitioner.ordinal());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_DATAPARTITIONER, _dataPartitioner.ordinal());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_EXECMODE, _execMode.ordinal());
    }

    // pin shared input/result variables so they are preserved from cleanup
    ArrayList<String> pinnedNames = ec.getVarList();
    boolean[] pinnedState = ec.pinVariables(pinnedNames);

    try {
        switch (_execMode) {
            case LOCAL:
                // parworkers as local threads
                executeLocalParFor(ec, iterVar, fromObj, toObj, incrObj);
                break;
            case REMOTE_MR:
                // parworkers as MR tasks (one job per parfor)
                executeRemoteMRParFor(ec, iterVar, fromObj, toObj, incrObj);
                break;
            case REMOTE_MR_DP:
                // parworkers as MR tasks (one job per parfor), fused with data partitioning
                executeRemoteMRParForDP(ec, iterVar, fromObj, toObj, incrObj);
                break;
            case REMOTE_SPARK:
                // parworkers as Spark tasks (one job per parfor)
                executeRemoteSparkParFor(ec, iterVar, fromObj, toObj, incrObj);
                break;
            case REMOTE_SPARK_DP:
                // parworkers as Spark tasks (one job per parfor), fused with data partitioning
                executeRemoteSparkParForDP(ec, iterVar, fromObj, toObj, incrObj);
                break;
            default:
                throw new DMLRuntimeException("Undefined execution mode: '" + _execMode + "'.");
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("PARFOR: Failed to execute loop in parallel.", ex);
    }

    // reset state of shared input/result variables, then clean up what got unpinned
    ec.unpinVariables(pinnedNames, pinnedState);
    cleanupSharedVariables(ec, pinnedState);

    // set the iteration variable to the TO value (consistent with for)
    iterVar = new IntObject(toObj.getLongValue());
    ec.setVariable(_iterPredVar, iterVar);

    // restore the original inputs; safe because partitioning is only applied
    // to read-only matrices
    for (String dpName : _variablesDPOriginal.keySet()) {
        // remove the partitioned matrix unless it is kept for reuse
        if (!_variablesDPReuse.keySet().contains(dpName)) {
            VariableCPInstruction.processRemoveVariableInstruction(ec, dpName);
        }
        // rebind the original matrix
        MatrixObject originalMo = (MatrixObject) _variablesDPOriginal.get(dpName);
        ec.setVariable(dpName, originalMo);
    }

    // print profiling report (only if top-level parfor because otherwise in parallel context)
    if (_monitorReport) {
        LOG.info("\n" + StatisticMonitor.createReport());
    }

    // TODO reset of hop parallelism constraint (e.g., ba+*)
    // release forced exectypes of partitioned variables
    for (String dpName : _variablesDPOriginal.keySet()) {
        ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, dpName, ec, false);
    }
    // release forced exectypes for fused dp/exec
    if (_execMode == PExecMode.REMOTE_MR_DP || _execMode == PExecMode.REMOTE_SPARK_DP) {
        ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, _colocatedDPMatrix, ec, false);
    }

    // after release, deletes dp_varnames
    resetOptimizerFlags();

    // execute exit instructions (usually empty)
    executeInstructions(_exitInstructions, ec);
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class ParForProgramBlock method handleDataPartitioning.
/**
 * Partitions the read-only parent matrix variables of this parfor, if a data
 * partitioner is configured.
 *
 * <p>For every read-only parent variable that is bound to a {@code MatrixObject} and has
 * a partition format other than {@code PartitionFormat.NONE}, the variable is rebound to
 * a partitioned matrix (reused from {@code _variablesDPReuse} when available, otherwise
 * newly created), the parfor body is recompiled for that variable, and the original
 * matrix is remembered in {@code _variablesDPOriginal}. If the partitioner hands back
 * the input matrix itself, the variable is left untouched.
 *
 * @param ec execution context that holds the variables to partition
 * @throws DMLRuntimeException if a partitioner is configured but no statement block exists
 */
private void handleDataPartitioning(ExecutionContext ec) {
    PDataPartitioner partitioner = _dataPartitioner;
    if (partitioner == PDataPartitioner.NONE) {
        return;
    }

    ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
    if (sb == null) {
        throw new DMLRuntimeException("ParFor statement block required for reasoning about data partitioning.");
    }

    for (String varName : sb.getReadOnlyParentVars()) {
        Data bound = ec.getVariable(varName);
        if (!(bound instanceof MatrixObject)) {
            // skip inputs that are not bound or are not matrices
            continue;
        }

        // unpartitioned input
        MatrixObject original = (MatrixObject) bound;
        PartitionFormat dpf = sb.determineDataPartitionFormat(varName);
        LOG.trace("PARFOR ID = " + _ID + ", Partitioning read-only input variable " + varName + " (format=" + dpf + ", mode=" + _dataPartitioner + ")");
        if (dpf == PartitionFormat.NONE) {
            continue;
        }

        // blockwise formats are always partitioned via REMOTE_SPARK; the switch
        // stays in effect for all remaining variables of this call
        if (partitioner != PDataPartitioner.REMOTE_SPARK && dpf.isBlockwise()) {
            LOG.warn("PARFOR ID = " + _ID + ", Switching data partitioner from " + partitioner + " to " + PDataPartitioner.REMOTE_SPARK.name() + " for blockwise-n partitioning.");
            partitioner = PDataPartitioner.REMOTE_SPARK;
        }

        Timing ltime = new Timing(true);

        // input data partitioning (reuse if possible)
        Data partitioned = _variablesDPReuse.get(varName);
        if (partitioned == null) {
            // no reuse opportunity
            DataPartitioner dp = createDataPartitioner(dpf, partitioner, ec);
            // disable binary cell for sparse if consumed by MR jobs
            // TODO support for binarycell
            if (!OptimizerRuleBased.allowsBinaryCellPartitions(original, dpf) || OptimizerUtils.isSparkExecutionMode()) {
                dp.disableBinaryCell();
            }
            MatrixObject fresh = dp.createPartitionedMatrixObject(original, constructDataPartitionsFileName());
            // skip remaining partitioning logic if not partitioned (e.g., too small)
            if (fresh == original) {
                continue;
            }
            partitioned = fresh;
        }
        ec.setVariable(varName, partitioned);

        // recompile parfor body program
        ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, varName, ec, true);

        // store original and partitioned matrix (for reuse if applicable)
        _variablesDPOriginal.put(varName, original);
        if (ALLOW_REUSE_PARTITION_VARS && ProgramRecompiler.isApplicableForReuseVariable(sb.getDMLProg(), sb, varName)) {
            _variablesDPReuse.put(varName, partitioned);
        }

        LOG.trace("Partitioning and recompilation done in " + ltime.stop() + "ms");
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class LocalParWorker method run.
/**
 * Worker main loop: repeatedly dequeues tasks from the task queue and executes them
 * until the worker is stopped, the queue signals that there are no more tasks, or
 * reading from the queue fails.
 *
 * <p>Failures are logged rather than thrown, so that a thread joining this worker is
 * not blocked. A failed task is re-executed up to {@code _max_retry} additional times.
 * If monitoring is enabled, the number of tasks and iterations and the total execution
 * time are reported to the {@code StatisticMonitor} at the end.
 */
@Override
public void run() {
// monitoring start (timer only exists if monitoring is enabled)
Timing time1 = (_monitor ? new Timing(true) : null);
// if a spark context has already been created, assign a worker-specific
// scheduler pool to this thread
if (OptimizerUtils.isSparkExecutionMode() && SparkExecutionContext.isSparkContextCreated()) {
SparkExecutionContext sec = (SparkExecutionContext) _ec;
sec.setThreadLocalSchedulerPool("parforPool" + _workerID);
}
// Initialize this GPUContext to this thread
if (DMLScript.USE_ACCELERATOR) {
try {
_ec.getGPUContext(0).initializeThread();
} catch (DMLRuntimeException e) {
// GPU initialization failed: log and end the worker without executing any task
// (note: this early return also skips the monitoring statistics below)
LOG.error("Error executing task because of failure in GPU backend: ", e);
LOG.error("Stopping LocalParWorker.");
return;
}
}
// setup compiler config for worker thread
ConfigurationManager.setLocalConfig(_cconf);
// continuous execution (execute tasks until (1) stopped or (2) no more tasks)
Task lTask = null;
while (!_stopped) {
// dequeue the next task (abort on NO_MORE_TASKS or error)
try {
lTask = _taskQueue.dequeueTask();
if (// task queue closed (no more tasks)
lTask == LocalTaskQueue.NO_MORE_TASKS)
// normal end of parallel worker
break;
} catch (Exception ex) {
// abort on taskqueue error
LOG.warn("Error reading from task queue: " + ex.getMessage());
LOG.warn("Stopping LocalParWorker.");
// no exception thrown to prevent blocking on join
break;
}
// execute the task sequentially (re-try on error)
boolean success = false;
// number of remaining re-executions for the current task
int retrys = _max_retry;
while (!success) {
try {
// /////
// core execution (see ParWorker)
executeTask(lTask);
success = true;
} catch (Exception ex) {
LOG.error("Failed to execute " + lTask.toString() + ", retry:" + retrys, ex);
if (retrys > 0)
// retry on task error
retrys--;
else {
// abort on no remaining retrys
LOG.error("Error executing task: ", ex);
LOG.error("Stopping LocalParWorker.");
// no exception thrown to prevent blocking on join
// NOTE(review): this break only leaves the retry loop; the outer loop then
// dequeues the next task, i.e., the worker keeps running despite the
// "Stopping" log message above - confirm that this is intended
break;
}
}
}
}
// cleanup the thread-local scheduler pool of this worker thread
if (OptimizerUtils.isSparkExecutionMode() && SparkExecutionContext.isSparkContextCreated()) {
SparkExecutionContext sec = (SparkExecutionContext) _ec;
sec.cleanupThreadLocalSchedulerPool();
}
// report worker statistics (time1 is non-null here iff _monitor was set at method entry)
if (_monitor) {
StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_NUMTASKS, _numTasks);
StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_NUMITERS, _numIters);
StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_EXEC_T, time1.stop());
}
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class CompressedMatrixBlock method aggregateBinaryOperations.
/**
 * Matrix multiplication {@code m1 %*% m2} where this compressed block is one of the
 * two inputs.
 *
 * <p>Dispatch order:
 * <ul>
 *   <li>not compressed: delegate to the uncompressed implementation of the super class;</li>
 *   <li>single uncompressed column group: delegate to the wrapped matrix block, passing
 *       it in place of whichever input is this block;</li>
 *   <li>{@code m1} has more than one row and {@code m2} a single column: right
 *       matrix-vector multiply;</li>
 *   <li>{@code m1} has a single row and {@code m2} more than one column: left
 *       vector-matrix multiply;</li>
 *   <li>otherwise: matrix-matrix multiply, computed as a sequence of matrix-vector
 *       multiplies over the rows of the other input (transposed first if this block is
 *       the left input), with each partial result written into {@code ret} in place.</li>
 * </ul>
 *
 * @param m1  left input
 * @param m2  right input
 * @param ret output block to reuse, or {@code null} to allocate a new one
 * @param op  aggregate binary operator; its thread count selects the multi-threaded kernels
 * @return the result block ({@code ret} if it was non-null)
 */
@Override
public MatrixBlock aggregateBinaryOperations(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, AggregateBinaryOperator op) {
    // call uncompressed matrix mult if necessary
    if (!isCompressed()) {
        return super.aggregateBinaryOperations(m1, m2, ret, op);
    }
    // multi-threaded mm of single uncompressed colgroup
    if (isSingleUncompressedGroup()) {
        MatrixBlock tmp = ((ColGroupUncompressed) _colGroups.get(0)).getData();
        return tmp.aggregateBinaryOperations(this == m1 ? tmp : m1, this == m2 ? tmp : m2, ret, op);
    }
    // timer is only created if debug logging is enabled at method entry
    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
    // setup meta data (dimensions, sparsity)
    int rl = m1.getNumRows();
    int cl = m2.getNumColumns();
    // number of output cells; computed in long arithmetic because the int product
    // rl * cl silently overflows for large outputs
    long estnnz = (long) rl * cl;
    // create output matrix block
    if (ret == null)
        ret = new MatrixBlock(rl, cl, false, estnnz);
    else
        ret.reset(rl, cl, false, estnnz);
    // compute matrix mult
    if (m1.getNumRows() > 1 && m2.getNumColumns() == 1) {
        // MV right
        CompressedMatrixBlock cmb = (CompressedMatrixBlock) m1;
        if (op.getNumThreads() > 1)
            cmb.rightMultByVector(m2, ret, op.getNumThreads());
        else
            cmb.rightMultByVector(m2, ret);
    } else if (m1.getNumRows() == 1 && m2.getNumColumns() > 1) {
        // MV left
        if (op.getNumThreads() > 1)
            leftMultByVectorTranspose(_colGroups, m1, ret, false, op.getNumThreads());
        else
            leftMultByVectorTranspose(_colGroups, m1, ret, false, true);
    } else {
        // MM
        // prepare the other input (including decompression if necessary)
        boolean right = (m1 == this);
        MatrixBlock that = right ? m2 : m1;
        if (that instanceof CompressedMatrixBlock) {
            that = ((CompressedMatrixBlock) that).isCompressed() ? ((CompressedMatrixBlock) that).decompress() : that;
        }
        // transpose for sequential repeated column access
        if (right) {
            that = LibMatrixReorg.transpose(that, new MatrixBlock(that.getNumColumns(), that.getNumRows(), that.isInSparseFormat()), op.getNumThreads());
        }
        // reusable buffers for one input row and one partial output vector
        MatrixBlock tmpIn = new MatrixBlock(1, that.getNumColumns(), false).allocateBlock();
        MatrixBlock tmpOut = new MatrixBlock(right ? rl : 1, right ? 1 : cl, false).allocateBlock();
        if (right) {
            // MM right: each row of the transposed input is one column of m2
            for (int i = 0; i < that.getNumRows(); i++) {
                // on transpose
                tmpIn = that.slice(i, i, 0, that.getNumColumns() - 1, tmpIn);
                // meta data op
                MatrixBlock tmpIn2 = LibMatrixReorg.transpose(tmpIn, new MatrixBlock(tmpIn.getNumColumns(), tmpIn.getNumRows(), false));
                tmpOut.reset(tmpOut.getNumRows(), tmpOut.getNumColumns());
                if (op.getNumThreads() > 1)
                    rightMultByVector(tmpIn2, tmpOut, op.getNumThreads());
                else
                    rightMultByVector(tmpIn2, tmpOut);
                // write the result vector into column i of the output
                ret.leftIndexingOperations(tmpOut, 0, ret.getNumRows() - 1, i, i, ret, UpdateType.INPLACE);
            }
        } else {
            // MM left: each row of m1 yields one row of the output
            for (int i = 0; i < that.getNumRows(); i++) {
                tmpIn = that.slice(i, i, 0, that.getNumColumns() - 1, tmpIn);
                if (op.getNumThreads() > 1)
                    leftMultByVectorTranspose(_colGroups, tmpIn, tmpOut, false, op.getNumThreads());
                else
                    leftMultByVectorTranspose(_colGroups, tmpIn, tmpOut, false, true);
                // write the result vector into row i of the output
                ret.leftIndexingOperations(tmpOut, i, i, 0, ret.getNumColumns() - 1, ret, UpdateType.INPLACE);
            }
        }
    }
    // guard on the timer itself: the log level may have been switched to debug
    // after method entry, in which case no timer exists
    if (time != null && LOG.isDebugEnabled())
        LOG.debug("Compressed MM in " + time.stop());
    return ret;
}
Aggregations