Search in sources :

Example 1 with CompressedSizeEstimator

Use of org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator in project incubator-systemml by apache.

From the class CompressedMatrixBlock, method compress.

/**
 * Compresses this matrix block into a list of column groups.
 *
 * <p>The work is organised in phases: (1) classify every column as
 * compressible or uncompressible from estimated sizes, (2) partition the
 * compressible columns into co-coded groups, (3) build the column groups,
 * (4) try to share a dictionary among the DDC1 groups, and finally collect
 * all columns not covered by a group into one uncompressed group. Phase
 * timings are recorded in the statistics only when debug logging is enabled.
 *
 * @param k  number of threads
 * @return this block if compression was applied; otherwise a shallow copy
 *         of this block (no compressible column was found, or the achieved
 *         compression ratio is below 1)
 * @throws DMLRuntimeException if this block is already compressed
 */
public MatrixBlock compress(int k) {
    // guard: compressing twice is treated as an error
    if (isCompressed()) {
        throw new DMLRuntimeException("Redundant compression, block already compressed.");
    }

    Timing timer = new Timing(true);
    _stats = new CompressionStatistics();

    // SAMPLE-BASED DECISIONS:
    // Whether a column is amenable to bitmap compression, and how much
    // co-coding potential a set of columns has, is decided on a subset of
    // the rows. Since sampling can be expensive on large datasets, a single
    // sample is drawn once and reused for the entire compression process.

    // basic meta data, plus a deep copy or a transposed copy of the input
    final int rlen = getNumRows();
    final int clen = getNumColumns();
    final boolean inputSparse = isInSparseFormat();
    final MatrixBlock rawblock;
    if (TRANSPOSE_INPUT) {
        rawblock = LibMatrixReorg.transpose(this, new MatrixBlock(clen, rlen, inputSparse), k);
    } else {
        rawblock = new MatrixBlock(this);
    }

    // sample-based size estimator shared by all phases
    CompressedSizeEstimator estimator = SizeEstimatorFactory.getSizeEstimator(rawblock, rlen);

    // PHASE 1: classify columns by compression type
    List<Integer> compressible = new ArrayList<>();
    List<Integer> uncompressible = new ArrayList<>();
    HashMap<Integer, Double> ratios = new HashMap<>();

    // per-column size estimates (multi-threaded variant only for k > 1)
    CompressedSizeInfo[] infos;
    if (k > 1) {
        infos = computeCompressedSizeInfos(estimator, clen, k);
    } else {
        infos = computeCompressedSizeInfos(estimator, clen);
    }

    // a column counts as compressible if (uncompressed size / compressed size) > 1
    long ucNnz = 0;
    for (int c = 0; c < clen; c++) {
        CompressedSizeInfo info = infos[c];
        double rawSize = getUncompressedSize(rlen, 1, OptimizerUtils.getSparsity(rlen, 1, info.getEstNnz()));
        double ratio = rawSize / info.getMinSize();
        if (ratio > 1) {
            compressible.add(c);
            ratios.put(c, ratio);
        } else {
            uncompressible.add(c);
            ucNnz += info.getEstNnz();
        }
    }

    // correction of the classification: if the uncompressible columns would
    // not be held in sparse format, re-evaluate them against a dense estimate
    boolean ucSparse = MatrixBlock.evalSparseFormatInMemory(rlen, uncompressible.size(), ucNnz);
    if (!(ucSparse || uncompressible.isEmpty())) {
        int pos = 0;
        // the position only advances when the current column stays in the list
        while (pos < uncompressible.size()) {
            int c = uncompressible.get(pos);
            double denseSize = getUncompressedSize(rlen, 1, 1.0);
            double ratio = denseSize / infos[c].getMinSize();
            if (ratio > 1) {
                compressible.add(c);
                uncompressible.remove(pos);
                ratios.put(c, ratio);
                ucNnz -= infos[c].getEstNnz();
            } else {
                pos++;
            }
        }
    }

    if (LOG.isTraceEnabled()) {
        LOG.trace("C: " + Arrays.toString(compressible.toArray(new Integer[0])));
        LOG.trace("-- compression ratios: " + Arrays.toString(compressible.stream().map(ratios::get).toArray()));
        LOG.trace("UC: " + Arrays.toString(uncompressible.toArray(new Integer[0])));
        LOG.trace("-- compression ratios: " + Arrays.toString(uncompressible.stream().map(ratios::get).toArray()));
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase1 = timer.stop();
        LOG.debug("Compression statistics:");
        LOG.debug("--compression phase 1: " + _stats.timePhase1);
    }

    // nothing to compress: hand back a shallow copy of the input
    if (compressible.isEmpty()) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Abort block compression because all columns are incompressible.");
        }
        return new MatrixBlock().copyShallow(this);
    }

    // PHASE 2: divide the compressible columns into column groups
    List<int[]> coCodedGroups = PlanningCoCoder.findCocodesByPartitioning(estimator, compressible, infos, rlen, k);
    if (LOG.isDebugEnabled()) {
        _stats.timePhase2 = timer.stop();
        LOG.debug("--compression phase 2: " + _stats.timePhase2);
    }

    // optional bookkeeping: estimated total size (groups first, then the uncompressed part)
    if (INVESTIGATE_ESTIMATES) {
        double estimate = 0;
        for (int[] grpCols : coCodedGroups) {
            estimate += estimator.estimateCompressedColGroupSize(grpCols).getMinSize();
        }
        estimate += MatrixBlock.estimateSizeInMemory(rlen, uncompressible.size(), OptimizerUtils.getSparsity(rlen, uncompressible.size(), ucNnz));
        _stats.estSize = estimate;
    }

    // PHASE 3: compress and correct sample-based decisions
    final boolean allCompressible = uncompressible.isEmpty();
    ColGroup[] groups;
    if (k > 1) {
        groups = compressColGroups(rawblock, estimator, ratios, rlen, coCodedGroups, allCompressible, k);
    } else {
        groups = compressColGroups(rawblock, estimator, ratios, rlen, coCodedGroups, allCompressible);
    }
    allocateColGroupList();

    // start from all column indexes and strike out those covered by a group
    HashSet<Integer> leftover = seq(0, clen - 1, 1);
    for (ColGroup grp : groups) {
        if (grp == null) {
            continue;
        }
        for (int c : grp.getColIndices()) {
            leftover.remove(c);
        }
        _colGroups.add(grp);
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase3 = timer.stop();
        LOG.debug("--compression phase 3: " + _stats.timePhase3);
    }

    // PHASE 4: best-effort dictionary sharing for DDC1 single-col groups
    double[] sharedDict = createSharedDDC1Dictionary(_colGroups);
    if (sharedDict != null) {
        applySharedDDC1Dictionary(_colGroups, sharedDict);
        _sharedDDC1Dict = true;
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase4 = timer.stop();
        LOG.debug("--compression phase 4: " + _stats.timePhase4);
    }

    // all columns not covered so far go into one big uncompressed column group
    if (!leftover.isEmpty()) {
        ArrayList<Integer> leftoverList = new ArrayList<>(leftover);
        _colGroups.add(new ColGroupUncompressed(leftoverList, rawblock));
    }

    // compare the compressed size against the uncompressed size
    _stats.size = estimateCompressedSizeInMemory();
    _stats.ratio = estimateSizeInMemory() / _stats.size;
    if (_stats.ratio < 1) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Abort block compression because compression ratio is less than 1.");
        }
        return new MatrixBlock().copyShallow(this);
    }

    // final cleanup (discard uncompressed block)
    rawblock.cleanupBlock(true, true);
    this.cleanupBlock(true, true);

    if (LOG.isDebugEnabled()) {
        _stats.timePhase5 = timer.stop();
        int[] counts = getColGroupCounts(_colGroups);
        LOG.debug("--compression phase 5: " + _stats.timePhase5);
        LOG.debug("--num col groups: " + _colGroups.size());
        LOG.debug("--col groups types (OLE,RLE,DDC1,DDC2,UC): " + counts[2] + "," + counts[1] + "," + counts[3] + "," + counts[4] + "," + counts[0]);
        LOG.debug("--col groups sizes (OLE,RLE,DDC1,DDC2,UC): " + counts[7] + "," + counts[6] + "," + counts[8] + "," + counts[9] + "," + counts[5]);
        LOG.debug("--compressed size: " + _stats.size);
        LOG.debug("--compression ratio: " + _stats.ratio);
    }
    return this;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CompressedSizeInfo(org.apache.sysml.runtime.compress.estim.CompressedSizeInfo) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CompressedSizeEstimator(org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Aggregations

ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)1 CompressedSizeEstimator (org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator)1 CompressedSizeInfo (org.apache.sysml.runtime.compress.estim.CompressedSizeInfo)1 Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)1 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)1