Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in the Apache incubator-systemml project: class CompressedMatrixBlock, method chainMatrixMultOperations.
/**
 * Multi-threaded matrix-multiplication chain t(X) %*% (w * (X %*% v)) over a
 * compressed matrix block, delegating to the uncompressed code path whenever
 * the block is not (or only trivially) compressed.
 *
 * @param v     right-hand-side column vector
 * @param w     weight vector (only used for ChainType.XtwXv)
 * @param out   optional pre-allocated output block (reset and reused if non-null)
 * @param ctype chain type (weighted or unweighted)
 * @param k     degree of parallelism
 * @return the result vector as a MatrixBlock (clen x 1)
 */
@Override
public MatrixBlock chainMatrixMultOperations(MatrixBlock v, MatrixBlock w, MatrixBlock out, ChainType ctype, int k) {
    // call uncompressed matrix mult if necessary
    if (!isCompressed()) {
        return super.chainMatrixMultOperations(v, w, out, ctype, k);
    }

    // multi-threaded mmchain of single uncompressed colgroup
    if (isSingleUncompressedGroup()) {
        return ((ColGroupUncompressed) _colGroups.get(0)).getData().chainMatrixMultOperations(v, w, out, ctype, k);
    }

    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;

    // prepare result (single output column)
    if (out != null)
        out.reset(clen, 1, false);
    else
        out = new MatrixBlock(clen, 1, false);

    // empty block handling
    if (isEmptyBlock(false))
        return out;

    // compute matrix mult: tmp = X %*% v
    MatrixBlock tmp = new MatrixBlock(rlen, 1, false);
    rightMultByVector(v, tmp, k);
    if (ctype == ChainType.XtwXv) {
        // element-wise weighting for the weighted chain: tmp = w * tmp
        BinaryOperator bop = new BinaryOperator(Multiply.getMultiplyFnObject());
        LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
    }
    // out = t(X) %*% tmp
    leftMultByVectorTranspose(_colGroups, tmp, out, true, k);

    // BUGFIX: also check time != null — the debug level may have been enabled
    // after the timer-creation check above, which previously risked an NPE here
    if (time != null && LOG.isDebugEnabled())
        LOG.debug("Compressed MMChain k=" + k + " in " + time.stop());

    return out;
}
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in the Apache incubator-systemml project: class CompressedMatrixBlock, method decompress.
/**
 * Decompress block.
 *
 * @return a new uncompressed matrix block containing the contents of this
 * block
 */
public MatrixBlock decompress() {
    // early abort for not yet compressed blocks
    if (!isCompressed())
        return new MatrixBlock(this);

    // CONSISTENCY: create the timer only when its result is actually logged,
    // matching the isDebugEnabled() guard used by the other methods here
    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;

    // preallocate sparse rows to avoid repeated reallocations during append
    MatrixBlock ret = new MatrixBlock(getNumRows(), getNumColumns(), isInSparseFormat(), getNonZeros());
    if (ret.isInSparseFormat()) {
        // count per-row non-zeros across all column groups first, then size rows
        int[] rnnz = new int[rlen];
        for (ColGroup grp : _colGroups)
            grp.countNonZerosPerRow(rnnz, 0, rlen);
        ret.allocateSparseRowsBlock();
        SparseBlock rows = ret.getSparseBlock();
        for (int i = 0; i < rlen; i++)
            rows.allocate(i, rnnz[i]);
    }

    // core decompression (append if sparse)
    for (ColGroup grp : _colGroups)
        grp.decompressToBlock(ret, 0, rlen);

    // post-processing (for append in decompress)
    ret.setNonZeros(nonZeros);
    if (ret.isInSparseFormat())
        ret.sortSparseRows();

    if (time != null)
        LOG.debug("decompressed block in " + time.stop() + "ms.");

    return ret;
}
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in the Apache incubator-systemml project: class ParForProgramBlock, method executeRemoteMRParFor.
/**
 * Executes this parfor as a single remote MR job: optionally recompiles the
 * body to CP, serializes the program, creates and writes the task file,
 * submits the RemoteParForMR job, and consolidates worker results into the
 * global symbol table.
 *
 * @param ec      execution context holding the symbol table
 * @param itervar iteration variable (replaced per task in the remote workers)
 * @param from    loop lower bound
 * @param to      loop upper bound
 * @param incr    loop increment
 * @throws IOException if task-file or HDFS export I/O fails
 */
private void executeRemoteMRParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws IOException {
    /* Step 0) check and recompile MR inst
     * Step 1) serialize child PB and inst
     * Step 2) create and serialize tasks
     * Step 3) submit MR Jobs and wait for results
     * Step 4) collect results from each parallel worker
     */
    Timing time = (_monitor ? new Timing(true) : null);

    // Step 0) check and compile to CP (if forced remote parfor)
    boolean flagForced = false;
    if (FORCE_CP_ON_REMOTE_MR && (_optMode == POptMode.NONE
        || (_optMode == POptMode.CONSTRAINED && _execMode == PExecMode.REMOTE_MR))) {
        // tid = 0 because replaced in remote parworker
        flagForced = checkMRAndRecompileToCP(0);
    }

    // Step 1) init parallel workers (serialize PBs)
    // NOTES: each mapper changes filenames with regard to his ID as we submit a single
    // job, cannot reuse serialized string, since variables are serialized as well.
    ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
    String program = ProgramConverter.serializeParForBody(body);
    if (_monitor)
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());

    // Step 2) create tasks
    TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
    String taskFile = constructTaskFileName();
    String resultFile = constructResultFileName();
    long numIterations = partitioner.getNumIterations();
    // BUGFIX: number of digits for zero-padded task IDs, derived from the decimal
    // string representation; the previous (int) Math.log10(to) + 1 produced
    // NaN / -Infinity artifacts for to <= 0 (identical result for all to > 0)
    int maxDigits = String.valueOf(Math.max(to.getLongValue(), 1L)).length();
    long numCreatedTasks = -1;
    if (USE_STREAMING_TASK_CREATION) {
        LocalTaskQueue<Task> queue = new LocalTaskQueue<>();
        // put tasks into queue and start writing to taskFile
        numCreatedTasks = partitioner.createTasks(queue);
        taskFile = writeTasksToFile(taskFile, queue, maxDigits);
    } else {
        // sequentially create tasks and write to disk
        List<Task> tasks = partitioner.createTasks();
        numCreatedTasks = tasks.size();
        taskFile = writeTasksToFile(taskFile, tasks, maxDigits);
    }
    if (_monitor)
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());

    // write matrices to HDFS
    exportMatricesToHDFS(ec);

    // Step 3) submit MR job (wait for finished work)
    MatrixObject colocatedDPMatrixObj = (_colocatedDPMatrix != null) ? ec.getMatrixObject(_colocatedDPMatrix) : null;
    RemoteParForJobReturn ret = RemoteParForMR.runJob(_ID, program, taskFile, resultFile, colocatedDPMatrixObj,
        _enableCPCaching, _numThreads, WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR,
        getMinMemory(ec), (ALLOW_REUSE_MR_JVMS & _jvmReuse));
    if (_monitor)
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());

    // Step 4) collecting results from each parallel worker
    int numExecutedTasks = ret.getNumExecutedTasks();
    int numExecutedIterations = ret.getNumExecutedIterations();
    // consolidate results into global symbol table
    consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
    if (flagForced) // see step 0
        releaseForcedRecompile(0);

    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
    }
}
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in the Apache incubator-systemml project: class ParForProgramBlock, method executeRemoteMRParForDP.
// Executes this parfor as a single fused data-partitioning + execution MR job
// (REMOTE_MR_DP): partitions the colocated input matrix, serializes the parfor
// body, submits one RemoteDPParForMR job, and consolidates worker results.
// NOTE(review): statement order is load-bearing here — the input matrix must be
// marked partitioned BEFORE the program is serialized, since variables are
// serialized along with the body (see NOTES below).
private void executeRemoteMRParForDP(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws IOException {
/* Step 0) check and recompile MR inst
 * Step 1) serialize child PB and inst
 * Step 2) create and serialize tasks
 * Step 3) submit MR Jobs and wait for results
 * Step 4) collect results from each parallel worker
 */
// timer only active when monitoring is enabled
Timing time = (_monitor ? new Timing(true) : null);
// Step 0) check and compile to CP (if forced remote parfor)
// unconditional here (unlike executeRemoteMRParFor) — presumably because the
// fused DP/exec job always runs the body in CP; TODO confirm against RemoteDPParForMR
boolean flagForced = checkMRAndRecompileToCP(0);
// Step 1) prepare partitioned input matrix (needs to happen before serializing the program)
ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix);
PartitionFormat inputDPF = sb.determineDataPartitionFormat(_colocatedDPMatrix);
// mark matrix var as partitioned
inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N);
// Step 2) init parallel workers (serialize PBs)
// NOTES: each mapper changes filenames with regard to his ID as we submit a single
// job, cannot reuse serialized string, since variables are serialized as well.
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
String program = ProgramConverter.serializeParForBody(body);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
// Step 3) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
String resultFile = constructResultFileName();
long numIterations = partitioner.getNumIterations();
// with fused DP/exec every iteration becomes one task, so no task file is
// written here (equivalent to partitioner.createTasks().size())
// partitioner.createTasks().size();
long numCreatedTasks = numIterations;
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
// write matrices to HDFS
exportMatricesToHDFS(ec);
// Step 4) submit MR job (wait for finished work)
// choose binary-cell output for very sparse partitioned inputs (thresholds 0.1
// column-wise / 0.001 row-wise), otherwise binary-block
// NOTE(review): inputDPF is compared with == — assumes PartitionFormat exposes
// canonical singleton instances (enum-like); verify against PartitionFormat
OutputInfo inputOI = ((inputMatrix.getSparsity() < 0.1 && inputDPF == PartitionFormat.COLUMN_WISE) || (inputMatrix.getSparsity() < 0.001 && inputDPF == PartitionFormat.ROW_WISE)) ? OutputInfo.BinaryCellOutputInfo : OutputInfo.BinaryBlockOutputInfo;
RemoteParForJobReturn ret = RemoteDPParForMR.runJob(_ID, _iterPredVar, _colocatedDPMatrix, program, resultFile, inputMatrix, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads, _replicationDP);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 5) collecting results from each parallel worker
int numExecutedTasks = ret.getNumExecutedTasks();
int numExecutedIterations = ret.getNumExecutedIterations();
// consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
if (// see step 0
flagForced)
releaseForcedRecompile(0);
// restore the unpartitioned state of the input matrix variable
inputMatrix.unsetPartitioned();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in the Apache incubator-systemml project: class ParForProgramBlock, method execute.
/**
 * Executes this parfor program block: evaluates from/to/increment once,
 * optionally runs the parfor optimizer, performs data partitioning and Spark
 * repartitioning/caching, dispatches the loop body to the configured backend
 * (local threads, MR, or Spark), and finally restores shared-variable state
 * and executes exit instructions.
 *
 * NOTE(review): the unpin/cleanup after the try-catch is NOT in a finally
 * block, so a rethrown DMLRuntimeException leaves variables pinned — confirm
 * whether callers depend on this (e.g., to avoid cleanup of corrupt state).
 */
@Override
public void execute(ExecutionContext ec) {
ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
// evaluate from, to, incr only once (assumption: known at for entry)
// default increment is +1 or -1 depending on loop direction when none given
IntObject from = executePredicateInstructions(1, _fromInstructions, ec);
IntObject to = executePredicateInstructions(2, _toInstructions, ec);
IntObject incr = (_incrementInstructions == null || _incrementInstructions.isEmpty()) ? new IntObject((from.getLongValue() <= to.getLongValue()) ? 1 : -1) : executePredicateInstructions(3, _incrementInstructions, ec);
if (// would produce infinite loop
incr.getLongValue() == 0)
throw new DMLRuntimeException(this.printBlockErrorLocation() + "Expression for increment " + "of variable '" + _iterPredVar + "' must evaluate to a non-zero value.");
// early exit on num iterations = zero
_numIterations = computeNumIterations(from, to, incr);
if (_numIterations <= 0)
// avoid unnecessary optimization/initialization
return;
// /////
if (_optMode != POptMode.NONE) {
// set optimizer log level
OptimizationWrapper.setLogLevel(_optLogLevel);
// core optimize
OptimizationWrapper.optimize(_optMode, sb, this, ec, _monitor);
}
// /////
// DATA PARTITIONING of read-only parent variables of type (matrix,unpartitioned)
// /////
// timer only active when monitoring is enabled
Timing time = _monitor ? new Timing(true) : null;
// partitioning on demand (note: for fused data partitioning and execute the optimizer set
// the data partitioner to NONE in order to prevent any side effects)
handleDataPartitioning(ec);
// repartitioning of variables for spark cpmm/zipmm in order prevent unnecessary shuffle
handleSparkRepartitioning(ec);
// eager rdd caching of variables for spark in order prevent read/write contention
handleSparkEagerCaching(ec);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_DATA_T, time.stop());
// initialize iter var to form value
IntObject iterVar = new IntObject(from.getLongValue());
// /////
// begin PARALLEL EXECUTION of (PAR)FOR body
// /////
LOG.trace("EXECUTE PARFOR ID = " + _ID + " with mode = " + _execMode + ", numThreads = " + _numThreads + ", taskpartitioner = " + _taskPartitioner);
if (_monitor) {
// record configuration stats (enums recorded via their ordinal)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTHREADS, _numThreads);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKSIZE, _taskSize);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKPARTITIONER, _taskPartitioner.ordinal());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_DATAPARTITIONER, _dataPartitioner.ordinal());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_EXECMODE, _execMode.ordinal());
}
// preserve shared input/result variables of cleanup
ArrayList<String> varList = ec.getVarList();
boolean[] varState = ec.pinVariables(varList);
try {
// dispatch to the backend selected by the optimizer / configuration
switch(_execMode) {
case // create parworkers as local threads
LOCAL:
executeLocalParFor(ec, iterVar, from, to, incr);
break;
case // create parworkers as MR tasks (one job per parfor)
REMOTE_MR:
executeRemoteMRParFor(ec, iterVar, from, to, incr);
break;
case // create parworkers as MR tasks (one job per parfor)
REMOTE_MR_DP:
executeRemoteMRParForDP(ec, iterVar, from, to, incr);
break;
case // create parworkers as Spark tasks (one job per parfor)
REMOTE_SPARK:
executeRemoteSparkParFor(ec, iterVar, from, to, incr);
break;
case // create parworkers as Spark tasks (one job per parfor)
REMOTE_SPARK_DP:
executeRemoteSparkParForDP(ec, iterVar, from, to, incr);
break;
default:
throw new DMLRuntimeException("Undefined execution mode: '" + _execMode + "'.");
}
} catch (Exception ex) {
// wrap with parfor context; original exception preserved as cause
throw new DMLRuntimeException("PARFOR: Failed to execute loop in parallel.", ex);
}
// reset state of shared input/result variables
ec.unpinVariables(varList, varState);
// cleanup unpinned shared variables
cleanupSharedVariables(ec, varState);
// set iteration var to TO value (+ increment) for FOR equivalence
// consistent with for
iterVar = new IntObject(to.getLongValue());
ec.setVariable(_iterPredVar, iterVar);
// we can replace those variables, because partitioning only applied for read-only matrices
for (String var : _variablesDPOriginal.keySet()) {
// cleanup partitioned matrix (if not reused)
if (!_variablesDPReuse.keySet().contains(var))
VariableCPInstruction.processRemoveVariableInstruction(ec, var);
// reset to original matrix
MatrixObject mo = (MatrixObject) _variablesDPOriginal.get(var);
ec.setVariable(var, mo);
}
// print profiling report (only if top-level parfor because otherwise in parallel context)
if (_monitorReport)
LOG.info("\n" + StatisticMonitor.createReport());
// TODO reset of hop parallelism constraint (e.g., ba+*)
for (// release forced exectypes
String dpvar : // release forced exectypes
_variablesDPOriginal.keySet()) ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, dpvar, ec, false);
// release forced exectypes for fused dp/exec
if (_execMode == PExecMode.REMOTE_MR_DP || _execMode == PExecMode.REMOTE_SPARK_DP)
ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, _colocatedDPMatrix, ec, false);
// after release, deletes dp_varnames
resetOptimizerFlags();
// execute exit instructions (usually empty)
executeInstructions(_exitInstructions, ec);
}
Aggregations