Search in sources :

Example 1 with CompressedSizeInfo

use of org.apache.sysml.runtime.compress.estim.CompressedSizeInfo in project incubator-systemml by apache.

the class CompressedMatrixBlock method compress.

/**
 * Compress this block in place using a sample-based, multi-phase procedure:
 * (1) classify columns as compressible or incompressible, (2) group the
 * compressible columns via co-coding, (3) compress the column groups,
 * (4) attempt best-effort dictionary sharing among the column groups, and
 * (5) store all remaining columns in one uncompressed column group and
 * clean up the uncompressed data.
 *
 * <p>Phase timings are recorded in the compression statistics only if debug
 * logging is enabled; the estimated size is recorded only if
 * INVESTIGATE_ESTIMATES is set.
 *
 * @param k  number of threads; values greater than 1 select the
 *           multi-threaded variants of size estimation and column group
 *           compression
 * @return this block if compression succeeded; otherwise a new MatrixBlock
 *         holding a shallow copy of this block (returned if no column is
 *         compressible or if the final compression ratio is less than 1)
 * @throws DMLRuntimeException if this block is already compressed
 */
public MatrixBlock compress(int k) {
    // check for redundant compression
    if (isCompressed()) {
        throw new DMLRuntimeException("Redundant compression, block already compressed.");
    }
    Timing time = new Timing(true);
    _stats = new CompressionStatistics();
    // SAMPLE-BASED DECISIONS:
    // Decisions such as testing if a column is amenable to bitmap
    // compression or evaluating co-coding potentials are made based on a
    // subset of the rows. For large datasets, sampling might take a
    // significant amount of time. So, we generate only one sample and use
    // it for the entire compression process.
    // prepare basic meta data and deep copy / transpose input
    final int numRows = getNumRows();
    final int numCols = getNumColumns();
    final boolean sparse = isInSparseFormat();
    // rawblock is either a copy of this block or, if TRANSPOSE_INPUT is set,
    // its transpose (numCols x numRows)
    MatrixBlock rawblock = !TRANSPOSE_INPUT ? new MatrixBlock(this) : LibMatrixReorg.transpose(this, new MatrixBlock(numCols, numRows, sparse), k);
    // construct sample-based size estimator
    CompressedSizeEstimator bitmapSizeEstimator = SizeEstimatorFactory.getSizeEstimator(rawblock, numRows);
    // PHASE 1: Classify columns by compression type
    // We start by determining which columns are amenable to compression
    // colsC: compressible column indexes, colsUC: incompressible column indexes,
    // compRatios: compression ratio per compressible column (no entries for colsUC)
    List<Integer> colsC = new ArrayList<>();
    List<Integer> colsUC = new ArrayList<>();
    HashMap<Integer, Double> compRatios = new HashMap<>();
    // Classify columns according to ratio (size uncompressed / size compressed),
    // where a column is compressible if ratio > 1.
    CompressedSizeInfo[] sizeInfos = (k > 1) ? computeCompressedSizeInfos(bitmapSizeEstimator, numCols, k) : computeCompressedSizeInfos(bitmapSizeEstimator, numCols);
    // running sum of estimated non-zeros over all incompressible columns
    long nnzUC = 0;
    for (int col = 0; col < numCols; col++) {
        // uncompressed size of a single column at its estimated sparsity
        double uncompSize = getUncompressedSize(numRows, 1, OptimizerUtils.getSparsity(numRows, 1, sizeInfos[col].getEstNnz()));
        double compRatio = uncompSize / sizeInfos[col].getMinSize();
        if (compRatio > 1) {
            colsC.add(col);
            compRatios.put(col, compRatio);
        } else {
            colsUC.add(col);
            nnzUC += sizeInfos[col].getEstNnz();
        }
    }
    // correction of column classification (reevaluate dense estimates if necessary):
    // if the incompressible columns together would not be held in sparse format,
    // re-test each of them against a fully dense (sparsity 1.0) uncompressed size
    boolean sparseUC = MatrixBlock.evalSparseFormatInMemory(numRows, colsUC.size(), nnzUC);
    if (!sparseUC && !colsUC.isEmpty()) {
        for (int i = 0; i < colsUC.size(); i++) {
            int col = colsUC.get(i);
            double uncompSize = getUncompressedSize(numRows, 1, 1.0);
            double compRatio = uncompSize / sizeInfos[col].getMinSize();
            if (compRatio > 1) {
                colsC.add(col);
                // i is a primitive int, so this removes by position (not by value);
                // the index is decremented to revisit the element shifted into slot i
                colsUC.remove(i);
                i--;
                compRatios.put(col, compRatio);
                nnzUC -= sizeInfos[col].getEstNnz();
            }
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("C: " + Arrays.toString(colsC.toArray(new Integer[0])));
        LOG.trace("-- compression ratios: " + Arrays.toString(colsC.stream().map(c -> compRatios.get(c)).toArray()));
        LOG.trace("UC: " + Arrays.toString(colsUC.toArray(new Integer[0])));
        // compRatios holds entries only for compressible columns, so the
        // lookups below yield null for every incompressible column
        LOG.trace("-- compression ratios: " + Arrays.toString(colsUC.stream().map(c -> compRatios.get(c)).toArray()));
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase1 = time.stop();
        LOG.debug("Compression statistics:");
        LOG.debug("--compression phase 1: " + _stats.timePhase1);
    }
    // early abort: nothing to compress, hand back a shallow copy of this block
    if (colsC.isEmpty()) {
        if (LOG.isDebugEnabled())
            LOG.debug("Abort block compression because all columns are incompressible.");
        return new MatrixBlock().copyShallow(this);
    }
    // PHASE 2: Grouping columns
    // Divide the bitmap columns into column groups.
    List<int[]> bitmapColGrps = PlanningCoCoder.findCocodesByPartitioning(bitmapSizeEstimator, colsC, sizeInfos, numRows, k);
    if (LOG.isDebugEnabled()) {
        _stats.timePhase2 = time.stop();
        LOG.debug("--compression phase 2: " + _stats.timePhase2);
    }
    if (INVESTIGATE_ESTIMATES) {
        // estimated total size: sum of estimated column group sizes plus the
        // estimated in-memory size of the incompressible columns
        double est = 0;
        for (int[] groupIndices : bitmapColGrps) est += bitmapSizeEstimator.estimateCompressedColGroupSize(groupIndices).getMinSize();
        est += MatrixBlock.estimateSizeInMemory(numRows, colsUC.size(), OptimizerUtils.getSparsity(numRows, colsUC.size(), nnzUC));
        _stats.estSize = est;
    }
    // PHASE 3: Compress and correct sample-based decisions
    ColGroup[] colGroups = (k > 1) ? compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty(), k) : compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty());
    allocateColGroupList();
    // start from all column indexes and drop every column covered by a
    // non-null compressed group; what is left goes into the uncompressed group
    HashSet<Integer> remainingCols = seq(0, numCols - 1, 1);
    for (int j = 0; j < colGroups.length; j++) {
        if (colGroups[j] != null) {
            for (int col : colGroups[j].getColIndices()) remainingCols.remove(col);
            _colGroups.add(colGroups[j]);
        }
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase3 = time.stop();
        LOG.debug("--compression phase 3: " + _stats.timePhase3);
    }
    // PHASE 4: Best-effort dictionary sharing for DDC1 single-col groups
    // (applied only if a shared dictionary could be created, i.e., dict != null)
    double[] dict = createSharedDDC1Dictionary(_colGroups);
    if (dict != null) {
        applySharedDDC1Dictionary(_colGroups, dict);
        _sharedDDC1Dict = true;
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase4 = time.stop();
        LOG.debug("--compression phase 4: " + _stats.timePhase4);
    }
    // The remaining columns are stored uncompressed as one big column group
    if (!remainingCols.isEmpty()) {
        ArrayList<Integer> list = new ArrayList<>(remainingCols);
        ColGroupUncompressed ucgroup = new ColGroupUncompressed(list, rawblock);
        _colGroups.add(ucgroup);
    }
    _stats.size = estimateCompressedSizeInMemory();
    _stats.ratio = estimateSizeInMemory() / _stats.size;
    // late abort: compression did not pay off overall
    // NOTE(review): at this point _colGroups has already been populated and
    // rawblock has not been cleaned up -- confirm that this abort path leaves
    // this block in the intended state for callers.
    if (_stats.ratio < 1) {
        if (LOG.isDebugEnabled())
            LOG.debug("Abort block compression because compression ratio is less than 1.");
        return new MatrixBlock().copyShallow(this);
    }
    // final cleanup (discard uncompressed block)
    rawblock.cleanupBlock(true, true);
    this.cleanupBlock(true, true);
    if (LOG.isDebugEnabled()) {
        _stats.timePhase5 = time.stop();
        int[] counts = getColGroupCounts(_colGroups);
        LOG.debug("--compression phase 5: " + _stats.timePhase5);
        LOG.debug("--num col groups: " + _colGroups.size());
        LOG.debug("--col groups types (OLE,RLE,DDC1,DDC2,UC): " + counts[2] + "," + counts[1] + "," + counts[3] + "," + counts[4] + "," + counts[0]);
        LOG.debug("--col groups sizes (OLE,RLE,DDC1,DDC2,UC): " + counts[7] + "," + counts[6] + "," + counts[8] + "," + counts[9] + "," + counts[5]);
        LOG.debug("--compressed size: " + _stats.size);
        LOG.debug("--compression ratio: " + _stats.ratio);
    }
    return this;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CompressedSizeInfo(org.apache.sysml.runtime.compress.estim.CompressedSizeInfo) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CompressedSizeEstimator(org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Example 2 with CompressedSizeInfo

use of org.apache.sysml.runtime.compress.estim.CompressedSizeInfo in project systemml by apache.

the class CompressedMatrixBlock method computeCompressedSizeInfos.

/**
 * Computes the compressed size information of all columns in parallel.
 * One size estimation task is created per column and all tasks are run
 * on a thread pool obtained for k threads; the results are returned in
 * column order.
 *
 * @param estim  compressed size estimator handed to every task
 * @param clen   number of columns, i.e., number of tasks to create
 * @param k      number of threads requested from the thread pool
 * @return array of compressed size infos, one entry per column
 * @throws DMLRuntimeException if task creation or execution fails; the
 *         original exception is preserved as the cause
 */
private static CompressedSizeInfo[] computeCompressedSizeInfos(CompressedSizeEstimator estim, int clen, int k) {
    ExecutorService pool = null;
    try {
        pool = CommonThreadPool.get(k);
        ArrayList<SizeEstimTask> tasks = new ArrayList<>();
        for (int col = 0; col < clen; col++) {
            tasks.add(new SizeEstimTask(estim, col));
        }
        // invokeAll returns the futures in the same order as the tasks,
        // so position i of the result corresponds to column i
        List<Future<CompressedSizeInfo>> rtask = pool.invokeAll(tasks);
        ArrayList<CompressedSizeInfo> ret = new ArrayList<>();
        for (Future<CompressedSizeInfo> lrtask : rtask) {
            ret.add(lrtask.get());
        }
        return ret.toArray(new CompressedSizeInfo[0]);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // always release the pool, also if a task failed or the wait was
        // interrupted; previously the pool was only shut down on success
        if (pool != null) {
            pool.shutdown();
        }
    }
}
Also used : CompressedSizeInfo(org.apache.sysml.runtime.compress.estim.CompressedSizeInfo) ExecutorService(java.util.concurrent.ExecutorService) ArrayList(java.util.ArrayList) Future(java.util.concurrent.Future) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 3 with CompressedSizeInfo

use of org.apache.sysml.runtime.compress.estim.CompressedSizeInfo in project incubator-systemml by apache.

the class CompressedMatrixBlock method compressColGroup.

/**
 * Compresses a single group of columns. The group is evaluated on its
 * exact bitmap; as long as its compression ratio is not greater than 1,
 * the column at the head of a priority queue built from the per-column
 * compression ratios is dropped and the shrunken group is re-evaluated.
 *
 * @param in          input matrix block the bitmap is extracted from
 * @param estim       estimator used to size the extracted bitmap
 * @param compRatios  per-column compression ratios, keyed by column index
 * @param rlen        number of rows
 * @param colIndexes  column indexes of the initial group
 * @param denseEst    if true, the uncompressed size assumes sparsity 1.0
 * @return the compressed column group, or null if every column was dropped
 */
private static ColGroup compressColGroup(MatrixBlock in, CompressedSizeEstimator estim, HashMap<Integer, Double> compRatios, int rlen, int[] colIndexes, boolean denseEst) {
    // snapshot of the initial group; dropped columns are overwritten with -1
    int[] initialGroup = null;
    // created lazily on the first rejected group
    PriorityQueue<CompressedColumn> dropQueue = null;
    int remaining = colIndexes.length;
    // the type decision is made on the full bitmap because that bitmap is
    // reused for the actual compression below
    UncompressedBitmap bitmap;
    CompressedSizeInfo info;
    for (;;) {
        // extract the exact bitmap and observe the compression ratio
        bitmap = BitmapEncoder.extractBitmap(colIndexes, in);
        info = estim.estimateCompressedColGroupSize(bitmap);
        double sparsity;
        if (denseEst) {
            sparsity = 1.0;
        } else {
            sparsity = OptimizerUtils.getSparsity(rlen, 1, bitmap.getNumOffsets());
        }
        double ratio = getUncompressedSize(rlen, colIndexes.length, sparsity) / info.getMinSize();
        if (ratio > 1) {
            // good group, keep it as is
            break;
        }
        if (dropQueue == null) {
            // first rejection: remember the initial group and queue its columns
            initialGroup = colIndexes.clone();
            dropQueue = new PriorityQueue<>();
            int pos = 0;
            for (int col : colIndexes) {
                dropQueue.add(new CompressedColumn(pos, compRatios.get(col)));
                pos++;
            }
        }
        // colIx is a position in initialGroup, not a column index
        int dropPos = dropQueue.poll().colIx;
        initialGroup[dropPos] = -1;
        remaining--;
        if (remaining == 0) {
            // nothing left: caller adds these columns to the uncompressed fallback
            return null;
        }
        // rebuild the group from all columns that have not been dropped
        int[] shrunk = new int[remaining];
        int next = 0;
        for (int col : initialGroup) {
            if (col == -1) {
                continue;
            }
            shrunk[next] = col;
            next++;
        }
        colIndexes = shrunk;
    }
    // pick the encoding from the estimated sizes
    long rle = info.getRLESize();
    long ole = info.getOLESize();
    long ddc = info.getDDCSize();
    boolean preferDDC = ALLOW_DDC_ENCODING && ddc < rle && ddc < ole;
    if (preferDDC) {
        if (bitmap.getNumValues() > 255) {
            return new ColGroupDDC2(colIndexes, rlen, bitmap);
        }
        return new ColGroupDDC1(colIndexes, rlen, bitmap);
    }
    if (rle < ole) {
        return new ColGroupRLE(colIndexes, rlen, bitmap);
    }
    return new ColGroupOLE(colIndexes, rlen, bitmap);
}
Also used : CompressedSizeInfo(org.apache.sysml.runtime.compress.estim.CompressedSizeInfo)

Example 4 with CompressedSizeInfo

use of org.apache.sysml.runtime.compress.estim.CompressedSizeInfo in project incubator-systemml by apache.

the class CompressedMatrixBlock method computeCompressedSizeInfos.

/**
 * Computes the compressed size information of all columns in parallel.
 * One size estimation task is created per column and all tasks are run
 * on a thread pool obtained for k threads; the results are returned in
 * column order.
 *
 * @param estim  compressed size estimator handed to every task
 * @param clen   number of columns, i.e., number of tasks to create
 * @param k      number of threads requested from the thread pool
 * @return array of compressed size infos, one entry per column
 * @throws DMLRuntimeException if task creation or execution fails; the
 *         original exception is preserved as the cause
 */
private static CompressedSizeInfo[] computeCompressedSizeInfos(CompressedSizeEstimator estim, int clen, int k) {
    ExecutorService pool = null;
    try {
        pool = CommonThreadPool.get(k);
        ArrayList<SizeEstimTask> tasks = new ArrayList<>();
        for (int col = 0; col < clen; col++) {
            tasks.add(new SizeEstimTask(estim, col));
        }
        // invokeAll returns the futures in the same order as the tasks,
        // so position i of the result corresponds to column i
        List<Future<CompressedSizeInfo>> rtask = pool.invokeAll(tasks);
        ArrayList<CompressedSizeInfo> ret = new ArrayList<>();
        for (Future<CompressedSizeInfo> lrtask : rtask) {
            ret.add(lrtask.get());
        }
        return ret.toArray(new CompressedSizeInfo[0]);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // always release the pool, also if a task failed or the wait was
        // interrupted; previously the pool was only shut down on success
        if (pool != null) {
            pool.shutdown();
        }
    }
}
Also used : CompressedSizeInfo(org.apache.sysml.runtime.compress.estim.CompressedSizeInfo) ExecutorService(java.util.concurrent.ExecutorService) ArrayList(java.util.ArrayList) Future(java.util.concurrent.Future) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 5 with CompressedSizeInfo

use of org.apache.sysml.runtime.compress.estim.CompressedSizeInfo in project systemml by apache.

the class CompressedMatrixBlock method compressColGroup.

/**
 * Compresses a single group of columns. The group is evaluated on its
 * exact bitmap; as long as its compression ratio is not greater than 1,
 * the column at the head of a priority queue built from the per-column
 * compression ratios is dropped and the shrunken group is re-evaluated.
 *
 * @param in          input matrix block the bitmap is extracted from
 * @param estim       estimator used to size the extracted bitmap
 * @param compRatios  per-column compression ratios, keyed by column index
 * @param rlen        number of rows
 * @param colIndexes  column indexes of the initial group
 * @param denseEst    if true, the uncompressed size assumes sparsity 1.0
 * @return the compressed column group, or null if every column was dropped
 */
private static ColGroup compressColGroup(MatrixBlock in, CompressedSizeEstimator estim, HashMap<Integer, Double> compRatios, int rlen, int[] colIndexes, boolean denseEst) {
    // snapshot of the initial group; dropped columns are overwritten with -1
    int[] initialGroup = null;
    // created lazily on the first rejected group
    PriorityQueue<CompressedColumn> dropQueue = null;
    int remaining = colIndexes.length;
    // the type decision is made on the full bitmap because that bitmap is
    // reused for the actual compression below
    UncompressedBitmap bitmap;
    CompressedSizeInfo info;
    for (;;) {
        // extract the exact bitmap and observe the compression ratio
        bitmap = BitmapEncoder.extractBitmap(colIndexes, in);
        info = estim.estimateCompressedColGroupSize(bitmap);
        double sparsity;
        if (denseEst) {
            sparsity = 1.0;
        } else {
            sparsity = OptimizerUtils.getSparsity(rlen, 1, bitmap.getNumOffsets());
        }
        double ratio = getUncompressedSize(rlen, colIndexes.length, sparsity) / info.getMinSize();
        if (ratio > 1) {
            // good group, keep it as is
            break;
        }
        if (dropQueue == null) {
            // first rejection: remember the initial group and queue its columns
            initialGroup = colIndexes.clone();
            dropQueue = new PriorityQueue<>();
            int pos = 0;
            for (int col : colIndexes) {
                dropQueue.add(new CompressedColumn(pos, compRatios.get(col)));
                pos++;
            }
        }
        // colIx is a position in initialGroup, not a column index
        int dropPos = dropQueue.poll().colIx;
        initialGroup[dropPos] = -1;
        remaining--;
        if (remaining == 0) {
            // nothing left: caller adds these columns to the uncompressed fallback
            return null;
        }
        // rebuild the group from all columns that have not been dropped
        int[] shrunk = new int[remaining];
        int next = 0;
        for (int col : initialGroup) {
            if (col == -1) {
                continue;
            }
            shrunk[next] = col;
            next++;
        }
        colIndexes = shrunk;
    }
    // pick the encoding from the estimated sizes
    long rle = info.getRLESize();
    long ole = info.getOLESize();
    long ddc = info.getDDCSize();
    boolean preferDDC = ALLOW_DDC_ENCODING && ddc < rle && ddc < ole;
    if (preferDDC) {
        if (bitmap.getNumValues() > 255) {
            return new ColGroupDDC2(colIndexes, rlen, bitmap);
        }
        return new ColGroupDDC1(colIndexes, rlen, bitmap);
    }
    if (rle < ole) {
        return new ColGroupRLE(colIndexes, rlen, bitmap);
    }
    return new ColGroupOLE(colIndexes, rlen, bitmap);
}
Also used : CompressedSizeInfo(org.apache.sysml.runtime.compress.estim.CompressedSizeInfo)

Aggregations

CompressedSizeInfo (org.apache.sysml.runtime.compress.estim.CompressedSizeInfo)6 ArrayList (java.util.ArrayList)4 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)4 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 ExecutorService (java.util.concurrent.ExecutorService)2 Future (java.util.concurrent.Future)2 CompressedSizeEstimator (org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator)2 Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)2 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)2