Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.
The class ParForProgramBlock, method consolidateAndCheckResults.
private void consolidateAndCheckResults(ExecutionContext ec, long expIters, long expTasks, long numIters, long numTasks, LocalVariableMap[] results) {
Timing time = new Timing(true);
// result merge
if (checkParallelRemoteResultMerge()) {
// execute result merge in parallel for all result vars
int par = Math.min(_resultVars.size(), InfrastructureAnalyzer.getLocalParallelism());
if (InfrastructureAnalyzer.isLocalMode()) {
int parmem = (int) Math.floor(OptimizerUtils.getLocalMemBudget() / InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer());
// reduce k if necessary
par = Math.min(par, Math.max(parmem, 1));
}
try {
// enqueue all result vars as tasks
LocalTaskQueue<ResultVar> q = new LocalTaskQueue<>();
for (ResultVar var : _resultVars) {
// foreach non-local write
// robustness scalars
if (ec.getVariable(var._name) instanceof MatrixObject)
q.enqueueTask(var);
}
q.closeInput();
// run result merge workers
ResultMergeWorker[] rmWorkers = new ResultMergeWorker[par];
for (int i = 0; i < par; i++) rmWorkers[i] = new ResultMergeWorker(q, results, ec);
// start all
for (int i = 0; i < par; i++) rmWorkers[i].start();
for (int i = 0; i < par; i++) {
// wait for all
rmWorkers[i].join();
if (!rmWorkers[i].finishedNoError())
throw new DMLRuntimeException("Error occured in parallel result merge worker.");
}
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
} else {
// execute result merge sequentially for all result vars
// foreach non-local write
for (ResultVar var : _resultVars) {
Data dat = ec.getVariable(var._name);
// robustness scalars
if (dat instanceof MatrixObject) {
MatrixObject out = (MatrixObject) dat;
MatrixObject[] in = new MatrixObject[results.length];
for (int i = 0; i < results.length; i++) in[i] = (MatrixObject) results[i].get(var._name);
String fname = constructResultMergeFileName();
ResultMerge rm = createResultMerge(_resultMerge, out, in, fname, var._isAccum, ec);
MatrixObject outNew = null;
if (USE_PARALLEL_RESULT_MERGE)
outNew = rm.executeParallelMerge(_numThreads);
else
outNew = rm.executeSerialMerge();
// cleanup existing var
Data exdata = ec.removeVariable(var._name);
if (exdata != null && exdata != outNew && exdata instanceof MatrixObject)
ec.cleanupCacheableData((MatrixObject) exdata);
// cleanup of intermediate result variables
cleanWorkerResultVariables(ec, out, in);
// set merged result variable
ec.setVariable(var._name, outNew);
}
}
}
// handle unscoped variables (vars created in parfor, but potentially used afterwards)
ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
// sb might be null for nested parallelism
if (CREATE_UNSCOPED_RESULTVARS && sb != null && ec.getVariables() != null)
createEmptyUnscopedVariables(ec.getVariables(), sb);
// check expected counters
// consistency check
if (numTasks != expTasks || numIters != expIters)
throw new DMLRuntimeException("PARFOR: Number of executed tasks does not match the number of created tasks: tasks " + numTasks + "/" + expTasks + ", iters " + numIters + "/" + expIters + ".");
if (DMLScript.STATISTICS)
Statistics.incrementParForMergeTime((long) time.stop());
}
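All of these snippets use Timing the same way: construct it with true to start measuring immediately, then call stop() to read the elapsed time (a double, apparently in milliseconds given the cast to long before it is added to the statistics counters). A minimal sketch of that pattern, assuming the same imports as the snippet above; doWork() is a hypothetical placeholder for the measured code:

Timing time = new Timing(true); // passing 'true' starts the measurement immediately
doWork();                       // hypothetical placeholder for the code being timed
double elapsed = time.stop();   // elapsed time since construction
if (DMLScript.STATISTICS)
    Statistics.incrementParForMergeTime((long) elapsed);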
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.
The class ParForProgramBlock, method executeLocalParFor.
/**
* Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
* This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
* below for details of the realization.
*
* @param ec execution context
* @param itervar iteration variable of the parfor loop
* @param from loop lower bound (from value)
* @param to loop upper bound (to value)
* @param incr loop increment
* @throws InterruptedException if InterruptedException occurs
*/
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws InterruptedException {
LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
/* Step 1) init parallel workers, task queue and threads
* start threads (from now on waiting for tasks)
* Step 2) create tasks
* put tasks into queue
* mark end of task input stream
* Step 3) join all threads (wait for finished work)
* Step 4) collect results from each parallel worker
*/
Timing time = new Timing(true);
int numExecutedTasks = 0;
int numExecutedIterations = 0;
// restrict recompilation to thread local memory
setMemoryBudget();
// enable runtime piggybacking if required
if (_enableRuntimePiggybacking)
// default piggybacking worker
RuntimePiggybacking.start(_numThreads);
try {
// Step 1) create task queue and init workers in parallel
// (including preparation of update-in-place variables)
LocalTaskQueue<Task> queue = new LocalTaskQueue<>();
Thread[] threads = new Thread[_numThreads];
LocalParWorker[] workers = new LocalParWorker[_numThreads];
IntStream.range(0, _numThreads).parallel().forEach(i -> {
workers[i] = createParallelWorker(_pwIDs[i], queue, ec, i);
threads[i] = new Thread(workers[i]);
threads[i].setPriority(Thread.MAX_PRIORITY);
});
// start threads (from now on waiting for tasks)
for (Thread thread : threads) thread.start();
// maintain statistics
long tinit = (long) time.stop();
if (DMLScript.STATISTICS)
Statistics.incrementParForInitTime(tinit);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
// Step 2) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
long numIterations = partitioner.getNumIterations();
long numCreatedTasks = -1;
if (USE_STREAMING_TASK_CREATION) {
// put tasks into queue (parworkers start working on the first tasks while the remaining tasks are created)
numCreatedTasks = partitioner.createTasks(queue);
} else {
List<Task> tasks = partitioner.createTasks();
numCreatedTasks = tasks.size();
// put tasks into queue
for (Task t : tasks) queue.enqueueTask(t);
// mark end of task input stream
queue.closeInput();
}
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
// Step 3) join all threads (wait for finished work)
for (Thread thread : threads) thread.join();
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 4) collecting results from each parallel worker
// obtain results and cleanup other intermediates before result merge
LocalVariableMap[] localVariables = new LocalVariableMap[_numThreads];
for (int i = 0; i < _numThreads; i++) {
localVariables[i] = workers[i].getVariables();
localVariables[i].removeAllNotIn(_resultVars.stream().map(v -> v._name).collect(Collectors.toSet()));
numExecutedTasks += workers[i].getExecutedTasks();
numExecutedIterations += workers[i].getExecutedIterations();
}
// consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, localVariables);
// Step 5) cleanup local parworkers (e.g., remove created functions)
for (int i = 0; i < _numThreads; i++) {
Collection<String> fnNames = workers[i].getFunctionNames();
if (fnNames != null && !fnNames.isEmpty())
for (String fn : fnNames) {
String[] parts = DMLProgram.splitFunctionKey(fn);
_prog.removeFunctionProgramBlock(parts[0], parts[1]);
}
}
// set up the main thread to use the GPUContext
if (DMLScript.USE_ACCELERATOR) {
ec.getGPUContext(0).initializeThread();
}
} finally {
// remove thread-local memory budget (reset to original budget)
// (in finally to prevent error side effects for multiple scripts in one jvm)
resetMemoryBudget();
// disable runtime piggybacking
if (_enableRuntimePiggybacking)
RuntimePiggybacking.stop();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
}
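The Step 1) to 4) comments describe a producer/consumer setup: the worker threads are started first and block on the shared task queue, the partitioner then streams tasks into the queue, closeInput() marks the end of the stream, and the main thread joins the workers before collecting their local variables. A rough, self-contained sketch of the same structure using plain java.util.concurrent rather than SystemML's LocalTaskQueue (a poison pill stands in for closeInput()):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class LocalParForSketch {
    private static final Runnable POISON = () -> { };

    public static void main(String[] args) throws InterruptedException {
        final int numThreads = 4;
        final BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>();
        // Step 1) start workers; from now on they wait for tasks on the queue
        Thread[] workers = new Thread[numThreads];
        for (int i = 0; i < numThreads; i++) {
            workers[i] = new Thread(() -> {
                try {
                    for (Runnable task; (task = queue.take()) != POISON; )
                        task.run();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
            workers[i].start();
        }
        // Step 2) create tasks and put them into the queue
        for (int t = 0; t < 100; t++) {
            final int id = t;
            queue.put(() -> System.out.println("task " + id));
        }
        // mark end of task input stream (one poison pill per worker)
        for (int i = 0; i < numThreads; i++)
            queue.put(POISON);
        // Step 3) join all threads (wait for finished work)
        for (Thread w : workers)
            w.join();
    }
}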
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.
The class CompressedMatrixBlock, method compress.
/**
* Compress block.
*
* @param k number of threads
* @return compressed matrix block or original block if incompressible
*/
public MatrixBlock compress(int k) {
// check for redundant compression
if (isCompressed()) {
throw new DMLRuntimeException("Redundant compression, block already compressed.");
}
Timing time = new Timing(true);
_stats = new CompressionStatistics();
// SAMPLE-BASED DECISIONS:
// Decisions such as testing if a column is amenable to bitmap
// compression or evaluating co-coding potentials are made based on a
// subset of the rows. For large datasets, sampling might take a
// significant amount of time. So, we generate only one sample and use
// it for the entire compression process.
// prepare basic meta data and deep copy / transpose input
final int numRows = getNumRows();
final int numCols = getNumColumns();
final boolean sparse = isInSparseFormat();
MatrixBlock rawblock = !TRANSPOSE_INPUT ? new MatrixBlock(this) : LibMatrixReorg.transpose(this, new MatrixBlock(numCols, numRows, sparse), k);
// construct sample-based size estimator
CompressedSizeEstimator bitmapSizeEstimator = SizeEstimatorFactory.getSizeEstimator(rawblock, numRows);
// PHASE 1: Classify columns by compression type
// We start by determining which columns are amenable to compression
List<Integer> colsC = new ArrayList<>();
List<Integer> colsUC = new ArrayList<>();
HashMap<Integer, Double> compRatios = new HashMap<>();
// Classify columns according to ratio (size uncompressed / size compressed),
// where a column is compressible if ratio > 1.
CompressedSizeInfo[] sizeInfos = (k > 1) ? computeCompressedSizeInfos(bitmapSizeEstimator, numCols, k) : computeCompressedSizeInfos(bitmapSizeEstimator, numCols);
long nnzUC = 0;
for (int col = 0; col < numCols; col++) {
double uncompSize = getUncompressedSize(numRows, 1, OptimizerUtils.getSparsity(numRows, 1, sizeInfos[col].getEstNnz()));
double compRatio = uncompSize / sizeInfos[col].getMinSize();
if (compRatio > 1) {
colsC.add(col);
compRatios.put(col, compRatio);
} else {
colsUC.add(col);
nnzUC += sizeInfos[col].getEstNnz();
}
}
// correction of column classification (reevaluate dense estimates if necessary)
boolean sparseUC = MatrixBlock.evalSparseFormatInMemory(numRows, colsUC.size(), nnzUC);
if (!sparseUC && !colsUC.isEmpty()) {
for (int i = 0; i < colsUC.size(); i++) {
int col = colsUC.get(i);
double uncompSize = getUncompressedSize(numRows, 1, 1.0);
double compRatio = uncompSize / sizeInfos[col].getMinSize();
if (compRatio > 1) {
colsC.add(col);
colsUC.remove(i);
i--;
compRatios.put(col, compRatio);
nnzUC -= sizeInfos[col].getEstNnz();
}
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("C: " + Arrays.toString(colsC.toArray(new Integer[0])));
LOG.trace("-- compression ratios: " + Arrays.toString(colsC.stream().map(c -> compRatios.get(c)).toArray()));
LOG.trace("UC: " + Arrays.toString(colsUC.toArray(new Integer[0])));
LOG.trace("-- compression ratios: " + Arrays.toString(colsUC.stream().map(c -> compRatios.get(c)).toArray()));
}
if (LOG.isDebugEnabled()) {
_stats.timePhase1 = time.stop();
LOG.debug("Compression statistics:");
LOG.debug("--compression phase 1: " + _stats.timePhase1);
}
if (colsC.isEmpty()) {
if (LOG.isDebugEnabled())
LOG.debug("Abort block compression because all columns are incompressible.");
return new MatrixBlock().copyShallow(this);
}
// PHASE 2: Grouping columns
// Divide the bitmap columns into column groups.
List<int[]> bitmapColGrps = PlanningCoCoder.findCocodesByPartitioning(bitmapSizeEstimator, colsC, sizeInfos, numRows, k);
if (LOG.isDebugEnabled()) {
_stats.timePhase2 = time.stop();
LOG.debug("--compression phase 2: " + _stats.timePhase2);
}
if (INVESTIGATE_ESTIMATES) {
double est = 0;
for (int[] groupIndices : bitmapColGrps) est += bitmapSizeEstimator.estimateCompressedColGroupSize(groupIndices).getMinSize();
est += MatrixBlock.estimateSizeInMemory(numRows, colsUC.size(), OptimizerUtils.getSparsity(numRows, colsUC.size(), nnzUC));
_stats.estSize = est;
}
// PHASE 3: Compress and correct sample-based decisions
ColGroup[] colGroups = (k > 1) ? compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty(), k) : compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty());
allocateColGroupList();
HashSet<Integer> remainingCols = seq(0, numCols - 1, 1);
for (int j = 0; j < colGroups.length; j++) {
if (colGroups[j] != null) {
for (int col : colGroups[j].getColIndices()) remainingCols.remove(col);
_colGroups.add(colGroups[j]);
}
}
if (LOG.isDebugEnabled()) {
_stats.timePhase3 = time.stop();
LOG.debug("--compression phase 3: " + _stats.timePhase3);
}
// PHASE 4: Best-effort dictionary sharing for DDC1 single-col groups
double[] dict = createSharedDDC1Dictionary(_colGroups);
if (dict != null) {
applySharedDDC1Dictionary(_colGroups, dict);
_sharedDDC1Dict = true;
}
if (LOG.isDebugEnabled()) {
_stats.timePhase4 = time.stop();
LOG.debug("--compression phase 4: " + _stats.timePhase4);
}
// The remaining columns are stored uncompressed as one big column group
if (!remainingCols.isEmpty()) {
ArrayList<Integer> list = new ArrayList<>(remainingCols);
ColGroupUncompressed ucgroup = new ColGroupUncompressed(list, rawblock);
_colGroups.add(ucgroup);
}
_stats.size = estimateCompressedSizeInMemory();
_stats.ratio = estimateSizeInMemory() / _stats.size;
if (_stats.ratio < 1) {
if (LOG.isDebugEnabled())
LOG.debug("Abort block compression because compression ratio is less than 1.");
return new MatrixBlock().copyShallow(this);
}
// final cleanup (discard uncompressed block)
rawblock.cleanupBlock(true, true);
this.cleanupBlock(true, true);
if (LOG.isDebugEnabled()) {
_stats.timePhase5 = time.stop();
int[] counts = getColGroupCounts(_colGroups);
LOG.debug("--compression phase 5: " + _stats.timePhase5);
LOG.debug("--num col groups: " + _colGroups.size());
LOG.debug("--col groups types (OLE,RLE,DDC1,DDC2,UC): " + counts[2] + "," + counts[1] + "," + counts[3] + "," + counts[4] + "," + counts[0]);
LOG.debug("--col groups sizes (OLE,RLE,DDC1,DDC2,UC): " + counts[7] + "," + counts[6] + "," + counts[8] + "," + counts[9] + "," + counts[5]);
LOG.debug("--compressed size: " + _stats.size);
LOG.debug("--compression ratio: " + _stats.ratio);
}
return this;
}
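As a usage sketch (not taken from the project itself): compress(k) is called on a CompressedMatrixBlock that wraps an uncompressed MatrixBlock, and callers should inspect the return value, since the method hands back a plain uncompressed block when no column is compressible or the achieved ratio falls below 1. The wrapping constructor and MatrixBlock.randOperations(...) are assumed from elsewhere in the code base:

// assumed APIs: CompressedMatrixBlock(MatrixBlock) and MatrixBlock.randOperations(...)
MatrixBlock mb = MatrixBlock.randOperations(10000, 20, 1.0, 0, 7, "uniform", 7);
CompressedMatrixBlock cmb = new CompressedMatrixBlock(mb);
MatrixBlock ret = cmb.compress(4); // k = 4 threads
// per the method above, ret is either the compressed block itself or an
// uncompressed shallow copy if compression does not pay off
boolean wasCompressed = ret instanceof CompressedMatrixBlock;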
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.
The class CompressedMatrixBlock, method aggregateUnaryOperations.
@Override
public MatrixValue aggregateUnaryOperations(AggregateUnaryOperator op, MatrixValue result, int blockingFactorRow, int blockingFactorCol, MatrixIndexes indexesIn, boolean inCP) {
// call uncompressed unary aggregate if necessary
if (!isCompressed()) {
return super.aggregateUnaryOperations(op, result, blockingFactorRow, blockingFactorCol, indexesIn, inCP);
}
// check for supported operations
if (!(op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof KahanPlusSq || (op.aggOp.increOp.fn instanceof Builtin && (((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MIN || ((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MAX)))) {
throw new DMLRuntimeException("Unary aggregates other than sum/sumsq/min/max not supported yet.");
}
Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
// prepare output dimensions
CellIndex tempCellIndex = new CellIndex(-1, -1);
op.indexFn.computeDimension(rlen, clen, tempCellIndex);
if (op.aggOp.correctionExists) {
switch(op.aggOp.correctionLocation) {
case LASTROW:
tempCellIndex.row++;
break;
case LASTCOLUMN:
tempCellIndex.column++;
break;
case LASTTWOROWS:
tempCellIndex.row += 2;
break;
case LASTTWOCOLUMNS:
tempCellIndex.column += 2;
break;
default:
throw new DMLRuntimeException("unrecognized correctionLocation: " + op.aggOp.correctionLocation);
}
}
// initialize and allocate the result
if (result == null)
result = new MatrixBlock(tempCellIndex.row, tempCellIndex.column, false);
else
result.reset(tempCellIndex.row, tempCellIndex.column, false);
MatrixBlock ret = (MatrixBlock) result;
ret.allocateDenseBlock();
// special handling init value for rowmins/rowmax
if (op.indexFn instanceof ReduceCol && op.aggOp.increOp.fn instanceof Builtin) {
double val = (((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MAX) ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
ret.getDenseBlock().set(val);
}
// core unary aggregate
if (op.getNumThreads() > 1 && getExactSizeOnDisk() > MIN_PAR_AGG_THRESHOLD) {
// multi-threaded execution of all groups
ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning((op.indexFn instanceof ReduceCol) ? 1 : op.getNumThreads(), false);
ColGroupUncompressed uc = getUncompressedColGroup();
try {
// compute uncompressed column group in parallel (otherwise bottleneck)
if (uc != null)
uc.unaryAggregateOperations(op, ret);
// compute all compressed column groups
ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
ArrayList<UnaryAggregateTask> tasks = new ArrayList<>();
if (op.indexFn instanceof ReduceCol && grpParts.length > 0) {
int blklen = BitmapEncoder.getAlignedBlocksize((int) (Math.ceil((double) rlen / op.getNumThreads())));
for (int i = 0; i < op.getNumThreads() & i * blklen < rlen; i++) tasks.add(new UnaryAggregateTask(grpParts[0], ret, i * blklen, Math.min((i + 1) * blklen, rlen), op));
} else
for (ArrayList<ColGroup> grp : grpParts) tasks.add(new UnaryAggregateTask(grp, ret, 0, rlen, op));
List<Future<MatrixBlock>> rtasks = pool.invokeAll(tasks);
pool.shutdown();
// aggregate partial results
if (op.indexFn instanceof ReduceAll) {
if (op.aggOp.increOp.fn instanceof KahanFunction) {
KahanObject kbuff = new KahanObject(ret.quickGetValue(0, 0), 0);
for (Future<MatrixBlock> rtask : rtasks) {
double tmp = rtask.get().quickGetValue(0, 0);
((KahanFunction) op.aggOp.increOp.fn).execute2(kbuff, tmp);
}
ret.quickSetValue(0, 0, kbuff._sum);
} else {
double val = ret.quickGetValue(0, 0);
for (Future<MatrixBlock> rtask : rtasks) {
double tmp = rtask.get().quickGetValue(0, 0);
val = op.aggOp.increOp.fn.execute(val, tmp);
}
ret.quickSetValue(0, 0, val);
}
}
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
} else {
// process UC column group
for (ColGroup grp : _colGroups) if (grp instanceof ColGroupUncompressed)
grp.unaryAggregateOperations(op, ret);
// process OLE/RLE column groups
aggregateUnaryOperations(op, _colGroups, ret, 0, rlen);
}
// special handling zeros for rowmins/rowmax
if (op.indexFn instanceof ReduceCol && op.aggOp.increOp.fn instanceof Builtin) {
int[] rnnz = new int[rlen];
for (ColGroup grp : _colGroups) grp.countNonZerosPerRow(rnnz, 0, rlen);
Builtin builtin = (Builtin) op.aggOp.increOp.fn;
for (int i = 0; i < rlen; i++) if (rnnz[i] < clen)
ret.quickSetValue(i, 0, builtin.execute2(ret.quickGetValue(i, 0), 0));
}
// drop correction if necessary
if (op.aggOp.correctionExists && inCP)
ret.dropLastRowsOrColumns(op.aggOp.correctionLocation);
// post-processing
ret.recomputeNonZeros();
if (LOG.isDebugEnabled())
LOG.debug("Compressed uagg k=" + op.getNumThreads() + " in " + time.stop());
return ret;
}
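When the full aggregate (ReduceAll) path runs multi-threaded, each UnaryAggregateTask returns a 1x1 partial result and the main thread folds these partials together through a KahanObject so that low-order bits are not lost. A small self-contained sketch of the compensated summation that KahanPlus effectively performs over such partials (sum and correction mirror KahanObject._sum and its internal correction term):

// Compensated (Kahan) summation over partial results; illustrative only.
static double kahanSum(double[] partials) {
    double sum = 0, correction = 0;
    for (double v : partials) {
        double corrected = v + correction;       // add the carried-over error first
        double newSum = sum + corrected;
        correction = corrected - (newSum - sum); // low-order bits lost by this addition
        sum = newSum;
    }
    return sum;
}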
Use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.
The class CompressedMatrixBlock, method transposeSelfMatrixMultOperations.
@Override
public MatrixBlock transposeSelfMatrixMultOperations(MatrixBlock out, MMTSJType tstype, int k) {
// call uncompressed matrix mult if necessary
if (!isCompressed()) {
return super.transposeSelfMatrixMultOperations(out, tstype, k);
}
// multi-threaded tsmm of single uncompressed colgroup
if (isSingleUncompressedGroup()) {
return ((ColGroupUncompressed) _colGroups.get(0)).getData().transposeSelfMatrixMultOperations(out, tstype, k);
}
Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
// check for transpose type
// right not supported yet
if (tstype != MMTSJType.LEFT)
throw new DMLRuntimeException("Invalid MMTSJ type '" + tstype.toString() + "'.");
// create output matrix block
if (out == null)
out = new MatrixBlock(clen, clen, false);
else
out.reset(clen, clen, false);
out.allocateDenseBlock();
if (!isEmptyBlock(false)) {
// compute matrix mult
try {
ExecutorService pool = CommonThreadPool.get(k);
ArrayList<MatrixMultTransposeTask> tasks = new ArrayList<>();
int numgrp = _colGroups.size();
int blklen = (int) (Math.ceil((double) numgrp / (2 * k)));
for (int i = 0; i < 2 * k & i * blklen < clen; i++) tasks.add(new MatrixMultTransposeTask(_colGroups, out, i * blklen, Math.min((i + 1) * blklen, numgrp)));
List<Future<Object>> ret = pool.invokeAll(tasks);
for (Future<Object> tret : ret) // check for errors
tret.get();
pool.shutdown();
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
// post-processing
out.setNonZeros(LinearAlgebraUtils.copyUpperToLowerTriangle(out));
}
if (LOG.isDebugEnabled())
LOG.debug("Compressed TSMM k=" + k + " in " + time.stop());
return out;
}
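The post-processing step exploits symmetry: for the supported LEFT case, t(X) %*% X is a symmetric clen x clen matrix, so the parallel tasks only need to fill the upper triangle, and copyUpperToLowerTriangle mirrors it while producing the nonzero count passed to setNonZeros. A rough sketch of that mirroring on a row-major n x n array (not the actual LinearAlgebraUtils implementation):

// Mirror the upper triangle into the lower triangle and count nonzeros.
static long mirrorUpperToLower(double[] c, int n) {
    long nnz = 0;
    for (int i = 0; i < n; i++)
        for (int j = i; j < n; j++) {
            c[j * n + i] = c[i * n + j];  // copy each upper cell to its mirror position
            if (c[i * n + j] != 0)
                nnz += (i == j) ? 1 : 2;  // diagonal counted once, off-diagonal twice
        }
    return nnz;
}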