Use of org.apache.sysml.runtime.compress.estim.CompressedSizeInfo in project SystemML by Apache.
The class CompressedMatrixBlock, method compress.
/**
 * Compress block.
 *
 * <p>The compression runs in phases: (1) classify every column as compressible
 * or incompressible based on estimated sizes, (2) partition the compressible
 * columns into co-coded column groups, (3) compress those groups, (4) attempt
 * to share a single dictionary across DDC1 single-column groups, and finally
 * store all remaining columns in one uncompressed column group.
 *
 * <p>Compression is abandoned, and a new {@code MatrixBlock} obtained via
 * {@code copyShallow(this)} is returned, when no column is estimated to be
 * compressible or when the final compression ratio is below 1. On success the
 * uncompressed data of this block is cleaned up and {@code this} is returned.
 *
 * <p>Phase timings in the compression statistics are only recorded when debug
 * logging is enabled; the estimated size is only recorded when
 * {@code INVESTIGATE_ESTIMATES} is set.
 *
 * @param k number of threads; values greater than 1 select the multi-threaded
 *          variants of size estimation and column group compression
 * @return compressed matrix block or original block if incompressible
 * @throws DMLRuntimeException if this block is already compressed
 */
public MatrixBlock compress(int k) {
	// check for redundant compression
	if (isCompressed()) {
		throw new DMLRuntimeException("Redundant compression, block already compressed.");
	}
	Timing time = new Timing(true);
	_stats = new CompressionStatistics();

	// SAMPLE-BASED DECISIONS:
	// Decisions such as testing if a column is amenable to bitmap
	// compression or evaluating co-coding potentials are made based on a
	// subset of the rows. For large datasets, sampling might take a
	// significant amount of time. So, we generate only one sample and use
	// it for the entire compression process.

	// prepare basic meta data and deep copy / transpose input
	final int numRows = getNumRows();
	final int numCols = getNumColumns();
	final boolean sparse = isInSparseFormat();
	MatrixBlock rawblock = !TRANSPOSE_INPUT ? new MatrixBlock(this) : LibMatrixReorg.transpose(this, new MatrixBlock(numCols, numRows, sparse), k);

	// construct sample-based size estimator
	CompressedSizeEstimator bitmapSizeEstimator = SizeEstimatorFactory.getSizeEstimator(rawblock, numRows);

	// PHASE 1: Classify columns by compression type
	// We start by determining which columns are amenable to compression
	List<Integer> colsC = new ArrayList<>();
	List<Integer> colsUC = new ArrayList<>();
	// holds ratios of compressible columns only; also consumed by phase 3
	HashMap<Integer, Double> compRatios = new HashMap<>();

	// Classify columns according to ratio (size uncompressed / size compressed),
	// where a column is compressible if ratio > 1.
	final CompressedSizeInfo[] sizeInfos = (k > 1) ? computeCompressedSizeInfos(bitmapSizeEstimator, numCols, k) : computeCompressedSizeInfos(bitmapSizeEstimator, numCols);
	long nnzUC = 0;
	for (int col = 0; col < numCols; col++) {
		double uncompSize = getUncompressedSize(numRows, 1, OptimizerUtils.getSparsity(numRows, 1, sizeInfos[col].getEstNnz()));
		double compRatio = uncompSize / sizeInfos[col].getMinSize();
		if (compRatio > 1) {
			colsC.add(col);
			compRatios.put(col, compRatio);
		} else {
			colsUC.add(col);
			nnzUC += sizeInfos[col].getEstNnz();
		}
	}

	// correction of column classification (reevaluate dense estimates if necessary):
	// if the uncompressed remainder would be held in dense format, the per-column
	// sparse size estimate used above was too optimistic, so re-check every
	// incompressible column against a dense (sparsity 1.0) uncompressed size
	final boolean sparseUC = MatrixBlock.evalSparseFormatInMemory(numRows, colsUC.size(), nnzUC);
	if (!sparseUC && !colsUC.isEmpty()) {
		for (int i = 0; i < colsUC.size(); i++) {
			int col = colsUC.get(i);
			double uncompSize = getUncompressedSize(numRows, 1, 1.0);
			double compRatio = uncompSize / sizeInfos[col].getMinSize();
			if (compRatio > 1) {
				colsC.add(col);
				// remove by position and step back so the shifted element is not skipped
				colsUC.remove(i);
				i--;
				compRatios.put(col, compRatio);
				nnzUC -= sizeInfos[col].getEstNnz();
			}
		}
	}

	if (LOG.isTraceEnabled()) {
		LOG.trace("C: " + Arrays.toString(colsC.toArray(new Integer[0])));
		LOG.trace("-- compression ratios: " + Arrays.toString(colsC.stream().map(c -> compRatios.get(c)).toArray()));
		LOG.trace("UC: " + Arrays.toString(colsUC.toArray(new Integer[0])));
		// compRatios has no entries for incompressible columns, so their ratios
		// are recomputed here on the same basis the classification ended on:
		// the dense estimate if the correction above applied, else the sparse one
		LOG.trace("-- compression ratios: " + Arrays.toString(colsUC.stream().map(c -> {
			double sparsity = sparseUC ? OptimizerUtils.getSparsity(numRows, 1, sizeInfos[c].getEstNnz()) : 1.0;
			double uncompSize = getUncompressedSize(numRows, 1, sparsity);
			return uncompSize / sizeInfos[c].getMinSize();
		}).toArray()));
	}
	if (LOG.isDebugEnabled()) {
		_stats.timePhase1 = time.stop();
		LOG.debug("Compression statistics:");
		LOG.debug("--compression phase 1: " + _stats.timePhase1);
	}
	if (colsC.isEmpty()) {
		if (LOG.isDebugEnabled()) {
			LOG.debug("Abort block compression because all columns are incompressible.");
		}
		return new MatrixBlock().copyShallow(this);
	}

	// PHASE 2: Grouping columns
	// Divide the bitmap columns into column groups.
	List<int[]> bitmapColGrps = PlanningCoCoder.findCocodesByPartitioning(bitmapSizeEstimator, colsC, sizeInfos, numRows, k);
	if (LOG.isDebugEnabled()) {
		_stats.timePhase2 = time.stop();
		LOG.debug("--compression phase 2: " + _stats.timePhase2);
	}
	if (INVESTIGATE_ESTIMATES) {
		// estimated total size: all planned column groups plus the uncompressed remainder
		double est = 0;
		for (int[] groupIndices : bitmapColGrps) {
			est += bitmapSizeEstimator.estimateCompressedColGroupSize(groupIndices).getMinSize();
		}
		est += MatrixBlock.estimateSizeInMemory(numRows, colsUC.size(), OptimizerUtils.getSparsity(numRows, colsUC.size(), nnzUC));
		_stats.estSize = est;
	}

	// PHASE 3: Compress and correct sample-based decisions
	ColGroup[] colGroups = (k > 1) ? compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty(), k) : compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty());
	allocateColGroupList();
	// start from all column indexes and drop those covered by a compressed group;
	// null entries in colGroups are skipped, leaving their columns in the remainder
	HashSet<Integer> remainingCols = seq(0, numCols - 1, 1);
	for (int j = 0; j < colGroups.length; j++) {
		if (colGroups[j] != null) {
			for (int col : colGroups[j].getColIndices()) {
				remainingCols.remove(col);
			}
			_colGroups.add(colGroups[j]);
		}
	}
	if (LOG.isDebugEnabled()) {
		_stats.timePhase3 = time.stop();
		LOG.debug("--compression phase 3: " + _stats.timePhase3);
	}

	// PHASE 4: Best-effort dictionary sharing for DDC1 single-col groups
	double[] dict = createSharedDDC1Dictionary(_colGroups);
	if (dict != null) {
		applySharedDDC1Dictionary(_colGroups, dict);
		_sharedDDC1Dict = true;
	}
	if (LOG.isDebugEnabled()) {
		_stats.timePhase4 = time.stop();
		LOG.debug("--compression phase 4: " + _stats.timePhase4);
	}

	// The remaining columns are stored uncompressed as one big column group
	if (!remainingCols.isEmpty()) {
		ArrayList<Integer> list = new ArrayList<>(remainingCols);
		ColGroupUncompressed ucgroup = new ColGroupUncompressed(list, rawblock);
		_colGroups.add(ucgroup);
	}

	_stats.size = estimateCompressedSizeInMemory();
	_stats.ratio = estimateSizeInMemory() / _stats.size;
	if (_stats.ratio < 1) {
		if (LOG.isDebugEnabled()) {
			LOG.debug("Abort block compression because compression ratio is less than 1.");
		}
		// NOTE(review): the column groups added above are not reset on this path
		return new MatrixBlock().copyShallow(this);
	}

	// final cleanup (discard uncompressed block)
	rawblock.cleanupBlock(true, true);
	this.cleanupBlock(true, true);
	if (LOG.isDebugEnabled()) {
		_stats.timePhase5 = time.stop();
		int[] counts = getColGroupCounts(_colGroups);
		LOG.debug("--compression phase 5: " + _stats.timePhase5);
		LOG.debug("--num col groups: " + _colGroups.size());
		LOG.debug("--col groups types (OLE,RLE,DDC1,DDC2,UC): " + counts[2] + "," + counts[1] + "," + counts[3] + "," + counts[4] + "," + counts[0]);
		LOG.debug("--col groups sizes (OLE,RLE,DDC1,DDC2,UC): " + counts[7] + "," + counts[6] + "," + counts[8] + "," + counts[9] + "," + counts[5]);
		LOG.debug("--compressed size: " + _stats.size);
		LOG.debug("--compression ratio: " + _stats.ratio);
	}
	return this;
}
Aggregations