use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class CompressedMatrixBlock method chainMatrixMultOperations.
/**
 * Matrix-multiplication chain over this (possibly compressed) block.
 *
 * <p>Falls back to the uncompressed implementation when this block is not
 * compressed, and to the wrapped uncompressed data when the block consists of a
 * single uncompressed column group. Otherwise the chain is evaluated as a
 * right multiplication by {@code v} into a {@code rlen x 1} intermediate,
 * an optional in-place element-wise multiplication with {@code w}
 * (only for {@link ChainType#XtwXv}), and a transposed left multiplication
 * of that intermediate into {@code out}.
 *
 * @param v     right-hand-side vector
 * @param w     weight vector, only read when {@code ctype == ChainType.XtwXv}
 * @param out   output block to reuse; reset to {@code clen x 1}, or allocated if {@code null}
 * @param ctype chain type
 * @param k     degree of parallelism handed to the multiplication kernels
 * @return the {@code clen x 1} result block
 */
@Override
public MatrixBlock chainMatrixMultOperations(MatrixBlock v, MatrixBlock w, MatrixBlock out, ChainType ctype, int k) {
    // not compressed: the uncompressed implementation handles everything
    if (!isCompressed()) {
        return super.chainMatrixMultOperations(v, w, out, ctype, k);
    }

    // exactly one uncompressed column group: run the (multi-threaded) mmchain on its data
    if (isSingleUncompressedGroup()) {
        ColGroupUncompressed onlyGroup = (ColGroupUncompressed) _colGroups.get(0);
        return onlyGroup.getData().chainMatrixMultOperations(v, w, out, ctype, k);
    }

    // timing is only collected when it will actually be logged
    Timing timer = null;
    if (LOG.isDebugEnabled()) {
        timer = new Timing(true);
    }

    // prepare the clen x 1 dense result
    if (out == null) {
        out = new MatrixBlock(clen, 1, false);
    } else {
        out.reset(clen, 1, false);
    }

    // nothing to compute for an empty block
    if (isEmptyBlock(false)) {
        return out;
    }

    // step 1: right multiplication by v into an rlen x 1 intermediate
    MatrixBlock intermediate = new MatrixBlock(rlen, 1, false);
    rightMultByVector(v, intermediate, k);

    // step 2 (weighted chain only): scale the intermediate element-wise by w
    if (ctype == ChainType.XtwXv) {
        LibMatrixBincell.bincellOpInPlace(intermediate, w, new BinaryOperator(Multiply.getMultiplyFnObject()));
    }

    // step 3: transposed left multiplication of the intermediate into the result
    leftMultByVectorTranspose(_colGroups, intermediate, out, true, k);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Compressed MMChain k=" + k + " in " + timer.stop());
    }
    return out;
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class CompressedMatrixBlock method compress.
/**
 * Compress block.
 *
 * <p>The compression runs in phases: (1) classify each column as compressible
 * or not based on estimated sizes, (2) partition the compressible columns into
 * co-coded column groups, (3) compress the groups, (4) attempt to share one
 * dictionary across DDC1 single-column groups, and finally store all columns
 * not covered by a compressed group as one uncompressed column group.
 *
 * <p>Compression is abandoned, and a shallow copy of this block is returned
 * in a new {@code MatrixBlock}, when no column is classified as compressible
 * or when the final compression ratio is below 1.
 *
 * @param k number of threads; values greater than 1 select the multi-threaded
 *          variants of size estimation and column-group compression
 * @return compressed matrix block or original block if incompressible
 * @throws DMLRuntimeException if this block is already compressed
 */
public MatrixBlock compress(int k) {
// check for redundant compression
if (isCompressed()) {
throw new DMLRuntimeException("Redundant compression, block already compressed.");
}
// one timer is shared by all phases; presumably each stop() returns the time
// elapsed since the previous stop() — TODO confirm against Timing
Timing time = new Timing(true);
_stats = new CompressionStatistics();
// SAMPLE-BASED DECISIONS:
// Decisions such as testing if a column is amenable to bitmap
// compression or evaluating co-coding potentials are made based on a
// subset of the rows. For large datasets, sampling might take a
// significant amount of time. So, we generate only one sample and use
// it for the entire compression process.
// prepare basic meta data and deep copy / transpose input
final int numRows = getNumRows();
final int numCols = getNumColumns();
final boolean sparse = isInSparseFormat();
// rawblock is either a copy of this block or, if TRANSPOSE_INPUT is set, its transpose (numCols x numRows)
MatrixBlock rawblock = !TRANSPOSE_INPUT ? new MatrixBlock(this) : LibMatrixReorg.transpose(this, new MatrixBlock(numCols, numRows, sparse), k);
// construct sample-based size estimator
CompressedSizeEstimator bitmapSizeEstimator = SizeEstimatorFactory.getSizeEstimator(rawblock, numRows);
// PHASE 1: Classify columns by compression type
// We start by determining which columns are amenable to compression
// colsC: columns classified as compressible; colsUC: columns kept uncompressed
List<Integer> colsC = new ArrayList<>();
List<Integer> colsUC = new ArrayList<>();
// estimated compression ratio per compressible column (no entries for colsUC)
HashMap<Integer, Double> compRatios = new HashMap<>();
// Classify columns according to ratio (size uncompressed / size compressed),
// where a column is compressible if ratio > 1.
CompressedSizeInfo[] sizeInfos = (k > 1) ? computeCompressedSizeInfos(bitmapSizeEstimator, numCols, k) : computeCompressedSizeInfos(bitmapSizeEstimator, numCols);
// running total of estimated non-zeros over all uncompressed columns
long nnzUC = 0;
for (int col = 0; col < numCols; col++) {
double uncompSize = getUncompressedSize(numRows, 1, OptimizerUtils.getSparsity(numRows, 1, sizeInfos[col].getEstNnz()));
double compRatio = uncompSize / sizeInfos[col].getMinSize();
if (compRatio > 1) {
colsC.add(col);
compRatios.put(col, compRatio);
} else {
colsUC.add(col);
nnzUC += sizeInfos[col].getEstNnz();
}
}
// correction of column classification (reevaluate dense estimates if necessary)
// If the uncompressed columns together would be held in dense format, the
// per-column estimate above (which used the column's own sparsity) is redone
// with sparsity 1.0, which may move further columns to the compressible set.
boolean sparseUC = MatrixBlock.evalSparseFormatInMemory(numRows, colsUC.size(), nnzUC);
if (!sparseUC && !colsUC.isEmpty()) {
for (int i = 0; i < colsUC.size(); i++) {
int col = colsUC.get(i);
double uncompSize = getUncompressedSize(numRows, 1, 1.0);
double compRatio = uncompSize / sizeInfos[col].getMinSize();
if (compRatio > 1) {
colsC.add(col);
// i is an int, so this is the index-based List.remove(int), not remove(Object)
colsUC.remove(i);
// step back so the element shifted into position i is not skipped
i--;
compRatios.put(col, compRatio);
nnzUC -= sizeInfos[col].getEstNnz();
}
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("C: " + Arrays.toString(colsC.toArray(new Integer[0])));
LOG.trace("-- compression ratios: " + Arrays.toString(colsC.stream().map(c -> compRatios.get(c)).toArray()));
LOG.trace("UC: " + Arrays.toString(colsUC.toArray(new Integer[0])));
// NOTE(review): compRatios is only populated for columns in colsC, so this lookup yields null for every UC column
LOG.trace("-- compression ratios: " + Arrays.toString(colsUC.stream().map(c -> compRatios.get(c)).toArray()));
}
// phase timings are only recorded in _stats when debug logging is enabled
if (LOG.isDebugEnabled()) {
_stats.timePhase1 = time.stop();
LOG.debug("Compression statistics:");
LOG.debug("--compression phase 1: " + _stats.timePhase1);
}
// early abort: nothing to compress, hand back a shallow copy of the uncompressed block
if (colsC.isEmpty()) {
if (LOG.isDebugEnabled())
LOG.debug("Abort block compression because all columns are incompressible.");
return new MatrixBlock().copyShallow(this);
}
// PHASE 2: Grouping columns
// Divide the bitmap columns into column groups.
List<int[]> bitmapColGrps = PlanningCoCoder.findCocodesByPartitioning(bitmapSizeEstimator, colsC, sizeInfos, numRows, k);
if (LOG.isDebugEnabled()) {
_stats.timePhase2 = time.stop();
LOG.debug("--compression phase 2: " + _stats.timePhase2);
}
// optional diagnostics: record the estimated total size (compressed groups + uncompressed remainder)
if (INVESTIGATE_ESTIMATES) {
double est = 0;
for (int[] groupIndices : bitmapColGrps) est += bitmapSizeEstimator.estimateCompressedColGroupSize(groupIndices).getMinSize();
est += MatrixBlock.estimateSizeInMemory(numRows, colsUC.size(), OptimizerUtils.getSparsity(numRows, colsUC.size(), nnzUC));
_stats.estSize = est;
}
// PHASE 3: Compress and correct sample-based decisions
ColGroup[] colGroups = (k > 1) ? compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty(), k) : compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty());
allocateColGroupList();
// start from all column indices and strike out those covered by a compressed group;
// null entries in colGroups are skipped, so their columns stay in remainingCols
HashSet<Integer> remainingCols = seq(0, numCols - 1, 1);
for (int j = 0; j < colGroups.length; j++) {
if (colGroups[j] != null) {
for (int col : colGroups[j].getColIndices()) remainingCols.remove(col);
_colGroups.add(colGroups[j]);
}
}
if (LOG.isDebugEnabled()) {
_stats.timePhase3 = time.stop();
LOG.debug("--compression phase 3: " + _stats.timePhase3);
}
// PHASE 4: Best-effort dictionary sharing for DDC1 single-col groups
// a null dictionary means no sharing is applied
double[] dict = createSharedDDC1Dictionary(_colGroups);
if (dict != null) {
applySharedDDC1Dictionary(_colGroups, dict);
_sharedDDC1Dict = true;
}
if (LOG.isDebugEnabled()) {
_stats.timePhase4 = time.stop();
LOG.debug("--compression phase 4: " + _stats.timePhase4);
}
// The remaining columns are stored uncompressed as one big column group
if (!remainingCols.isEmpty()) {
ArrayList<Integer> list = new ArrayList<>(remainingCols);
ColGroupUncompressed ucgroup = new ColGroupUncompressed(list, rawblock);
_colGroups.add(ucgroup);
}
// compare the uncompressed in-memory size estimate against the compressed one
_stats.size = estimateCompressedSizeInMemory();
_stats.ratio = estimateSizeInMemory() / _stats.size;
if (_stats.ratio < 1) {
if (LOG.isDebugEnabled())
LOG.debug("Abort block compression because compression ratio is less than 1.");
// NOTE(review): _colGroups has already been populated at this point and is not
// reset before returning — confirm callers discard this instance on abort
return new MatrixBlock().copyShallow(this);
}
// final cleanup (discard uncompressed block)
rawblock.cleanupBlock(true, true);
this.cleanupBlock(true, true);
if (LOG.isDebugEnabled()) {
_stats.timePhase5 = time.stop();
// counts layout as consumed below: indices 0-4 feed the "types" line and
// indices 5-9 the "sizes" line, each printed in the order OLE,RLE,DDC1,DDC2,UC
int[] counts = getColGroupCounts(_colGroups);
LOG.debug("--compression phase 5: " + _stats.timePhase5);
LOG.debug("--num col groups: " + _colGroups.size());
LOG.debug("--col groups types (OLE,RLE,DDC1,DDC2,UC): " + counts[2] + "," + counts[1] + "," + counts[3] + "," + counts[4] + "," + counts[0]);
LOG.debug("--col groups sizes (OLE,RLE,DDC1,DDC2,UC): " + counts[7] + "," + counts[6] + "," + counts[8] + "," + counts[9] + "," + counts[5]);
LOG.debug("--compressed size: " + _stats.size);
LOG.debug("--compression ratio: " + _stats.ratio);
}
return this;
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class CompressedMatrixBlock method transposeSelfMatrixMultOperations.
/**
 * Transpose-self matrix multiplication over this (possibly compressed) block.
 *
 * <p>Falls back to the uncompressed implementation when this block is not
 * compressed, and to the wrapped uncompressed data when the block consists of a
 * single uncompressed column group. Otherwise the column groups are split into
 * at most {@code 2 * k} contiguous ranges that are processed in parallel, and
 * the upper triangle of the result is afterwards copied to the lower triangle.
 *
 * @param out    output block to reuse; reset to {@code clen x clen}, or allocated if {@code null}
 * @param tstype transpose-self type; only {@link MMTSJType#LEFT} is supported on compressed data
 * @param k      degree of parallelism
 * @return the dense {@code clen x clen} result block
 * @throws DMLRuntimeException if {@code tstype} is not {@code LEFT}, or if any
 *         parallel task fails or the calling thread is interrupted
 */
@Override
public MatrixBlock transposeSelfMatrixMultOperations(MatrixBlock out, MMTSJType tstype, int k) {
    // call uncompressed matrix mult if necessary
    if (!isCompressed()) {
        return super.transposeSelfMatrixMultOperations(out, tstype, k);
    }
    // multi-threaded tsmm of single uncompressed colgroup
    if (isSingleUncompressedGroup()) {
        return ((ColGroupUncompressed) _colGroups.get(0)).getData().transposeSelfMatrixMultOperations(out, tstype, k);
    }
    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
    // check for transpose type (right not supported yet)
    if (tstype != MMTSJType.LEFT) {
        throw new DMLRuntimeException("Invalid MMTSJ type '" + tstype.toString() + "'.");
    }
    // create output matrix block
    if (out == null) {
        out = new MatrixBlock(clen, clen, false);
    } else {
        out.reset(clen, clen, false);
    }
    out.allocateDenseBlock();
    if (!isEmptyBlock(false)) {
        // compute matrix mult
        ExecutorService pool = null;
        try {
            pool = CommonThreadPool.get(k);
            ArrayList<MatrixMultTransposeTask> tasks = new ArrayList<>();
            int numgrp = _colGroups.size();
            int blklen = (int) (Math.ceil((double) numgrp / (2 * k)));
            // Partition the column GROUPS (not columns): the range start must stay
            // below numgrp, otherwise empty or inverted ranges would be scheduled
            // whenever co-coding makes numgrp smaller than clen.
            for (int i = 0; i < 2 * k && i * blklen < numgrp; i++) {
                tasks.add(new MatrixMultTransposeTask(_colGroups, out, i * blklen, Math.min((i + 1) * blklen, numgrp)));
            }
            List<Future<Object>> ret = pool.invokeAll(tasks);
            // check for errors
            for (Future<Object> tret : ret) {
                tret.get();
            }
        } catch (Exception ex) {
            // keep the interrupt visible to callers further up the stack
            if (ex instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            throw new DMLRuntimeException(ex);
        } finally {
            // release the pool on failure as well as on success
            if (pool != null) {
                pool.shutdown();
            }
        }
        // post-processing: mirror the upper triangle and record the non-zero count
        out.setNonZeros(LinearAlgebraUtils.copyUpperToLowerTriangle(out));
    }
    // the timer only exists if debug logging was enabled on entry
    if (time != null && LOG.isDebugEnabled()) {
        LOG.debug("Compressed TSMM k=" + k + " in " + time.stop());
    }
    return out;
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class ParWorker method executeRangeTask.
/**
 * Executes a range task: the task's iteration list is read as the triple
 * (from, to, increment), and all child program blocks are run once per
 * value of the iteration variable in {@code [from, to]}.
 *
 * <p>When monitoring is enabled, a per-iteration time, the task size and the
 * overall task time are reported to the {@link StatisticMonitor}.
 *
 * @param task range task providing the iteration variable name and bounds
 */
private void executeRangeTask(Task task) {
    // monitoring start (timers exist only when monitoring is on)
    Timing iterTimer = null;
    Timing taskTimer = null;
    if (_monitor) {
        iterTimer = new Timing(true);
        taskTimer = new Timing(true);
    }

    // decode the range: element 0 = from, 1 = to (inclusive), 2 = increment
    List<IntObject> range = task.getIterations();
    String iterVarName = task.getVarName();
    long first = range.get(0).getLongValue();
    long last = range.get(1).getLongValue();
    long step = range.get(2).getLongValue();

    // core execution
    long current = first;
    while (current <= last) {
        // bind the iteration variable for this pass
        _ec.setVariable(iterVarName, new IntObject(current));

        // run every child program block
        for (ProgramBlock block : _childBlocks) {
            block.execute(_ec);
        }
        _numIters++;

        if (_monitor) {
            StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_ITER_T, iterTimer.stop());
        }
        current += step;
    }
    _numTasks++;

    // monitoring end
    if (_monitor) {
        StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_TASKSIZE, task.size());
        StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_TASK_T, taskTimer.stop());
    }
}
use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.
the class ParForProgramBlock method executeRemoteMRParFor.
/**
 * Runs this parfor as a remote MR job.
 *
 * <ul>
 *   <li>Step 0) check and recompile MR instructions to CP (if forced)</li>
 *   <li>Step 1) serialize the parfor body (child program blocks and result variables)</li>
 *   <li>Step 2) create the tasks and write them to the task file</li>
 *   <li>Step 3) export matrices to HDFS, submit the MR job and wait for it</li>
 *   <li>Step 4) consolidate the results of all parallel workers</li>
 * </ul>
 *
 * <p>When monitoring is enabled, the duration of each stage and the executed
 * task and iteration counts are reported to the {@link StatisticMonitor}.
 *
 * @param ec      execution context
 * @param itervar iteration variable (not referenced in this method body)
 * @param from    lower loop bound
 * @param to      upper loop bound; also determines the digit width passed to the task writer
 * @param incr    loop increment
 * @throws IOException declared by the signature; presumably raised by task-file or HDFS I/O
 */
private void executeRemoteMRParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws IOException {
    Timing timer = null;
    if (_monitor) {
        timer = new Timing(true);
    }

    // Step 0) check and compile to CP (if forced remote parfor)
    boolean forcedRecompile = false;
    if (FORCE_CP_ON_REMOTE_MR) {
        boolean noOptimizer = (_optMode == POptMode.NONE);
        boolean constrainedRemoteMR = (_optMode == POptMode.CONSTRAINED && _execMode == PExecMode.REMOTE_MR);
        if (noOptimizer || constrainedRemoteMR) {
            // tid = 0 because replaced in remote parworker
            forcedRecompile = checkMRAndRecompileToCP(0);
        }
    }

    // Step 1) init parallel workers (serialize PBs)
    // NOTES: each mapper changes filenames with regard to his ID as we submit a single
    // job, cannot reuse serialized string, since variables are serialized as well.
    ParForBody parforBody = new ParForBody(_childBlocks, _resultVars, ec);
    String program = ProgramConverter.serializeParForBody(parforBody);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, timer.stop());
    }

    // Step 2) create tasks
    TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
    String taskFile = constructTaskFileName();
    String resultFile = constructResultFileName();
    long numIterations = partitioner.getNumIterations();
    // number of decimal digits of the upper bound
    // NOTE(review): Math.log10 is NaN / -Infinity for non-positive values of 'to' — confirm callers guarantee to >= 1
    int maxDigits = (int) Math.log10(to.getLongValue()) + 1;
    long numCreatedTasks = -1;
    if (!USE_STREAMING_TASK_CREATION) {
        // sequentially create tasks and write to disk
        List<Task> taskList = partitioner.createTasks();
        numCreatedTasks = taskList.size();
        taskFile = writeTasksToFile(taskFile, taskList, maxDigits);
    } else {
        // put tasks into queue and start writing to taskFile
        LocalTaskQueue<Task> taskQueue = new LocalTaskQueue<>();
        numCreatedTasks = partitioner.createTasks(taskQueue);
        taskFile = writeTasksToFile(taskFile, taskQueue, maxDigits);
    }
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, timer.stop());
    }

    // write matrices to HDFS
    exportMatricesToHDFS(ec);

    // Step 3) submit MR job (wait for finished work)
    MatrixObject colocatedDPMatrixObj = null;
    if (_colocatedDPMatrix != null) {
        colocatedDPMatrixObj = ec.getMatrixObject(_colocatedDPMatrix);
    }
    RemoteParForJobReturn jobResult = RemoteParForMR.runJob(
        _ID, program, taskFile, resultFile, colocatedDPMatrixObj,
        _enableCPCaching, _numThreads, WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR,
        getMinMemory(ec), (ALLOW_REUSE_MR_JVMS & _jvmReuse));
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, timer.stop());
    }

    // Step 4) collecting results from each parallel worker
    int numExecutedTasks = jobResult.getNumExecutedTasks();
    int numExecutedIterations = jobResult.getNumExecutedIterations();
    // consolidate results into global symbol table
    consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, jobResult.getVariables());

    // undo the forced recompile of step 0
    if (forcedRecompile) {
        releaseForcedRecompile(0);
    }

    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, timer.stop());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
    }
}
Aggregations