Search in sources :

Example 21 with SparseBlock

use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.

the class GPUObject method copyFromHostToDevice.

/**
 * Copies the host-side matrix block of {@code mat} to the GPU.
 * <p>
 * The matrix block is pinned with {@code mat.acquireRead()} while it is copied and is
 * released again in a {@code finally} block, so a failure during conversion or transfer
 * does not leave the matrix object pinned.
 * <ul>
 * <li>Sparse input: COO and MCSR blocks are converted to CSR on the host, the device
 * sparse matrix is allocated, and the CSR arrays are copied unless the block is empty.</li>
 * <li>Dense input: the device dense matrix is allocated and either zero-filled on the
 * device (no non-zeros) or filled from the host array.</li>
 * </ul>
 *
 * @param opcode instruction opcode under which fine-grained timings are recorded
 * @throws DMLRuntimeException if the sparse block is missing although non-zeros are
 *         reported, if the sparse format is unsupported, or if a dense block is
 *         unexpectedly unallocated
 */
void copyFromHostToDevice(String opcode) {
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
    }
    long start = 0;
    if (DMLScript.STATISTICS)
        start = System.nanoTime();
    long acqrTime = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
    MatrixBlock tmp = mat.acquireRead();
    try {
        if (DMLScript.FINEGRAINED_STATISTICS) {
            if (tmp.isInSparseFormat())
                GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_GET_SPARSE_MB, System.nanoTime() - acqrTime);
            else
                GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_GET_DENSE_MB, System.nanoTime() - acqrTime);
        }
        if (tmp.isInSparseFormat()) {
            int[] rowPtr = null;
            int[] colInd = null;
            double[] values = null;
            // Only recompute non-zero if unknown, else this will incur huge penalty !!
            if (tmp.getNonZeros() < 0) {
                tmp.recomputeNonZeros();
            }
            long nnz = tmp.getNonZeros();
            mat.getMatrixCharacteristics().setNonZeros(nnz);
            SparseBlock block = tmp.getSparseBlock();
            boolean copyToDevice = true;
            if (block == null && nnz == 0) {
                // Empty sparse matrix: nothing to transfer, the device allocation alone suffices.
                // To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest
                copyToDevice = false;
            } else if (block == null && nnz != 0) {
                throw new DMLRuntimeException("Expected CP sparse block to be not null.");
            } else {
                // CSR is the preferred format for cuSparse GEMM
                // Converts MCSR and COO to CSR
                SparseBlockCSR csrBlock = null;
                long t0 = 0;
                if (block instanceof SparseBlockCSR) {
                    csrBlock = (SparseBlockCSR) block;
                } else if (block instanceof SparseBlockCOO) {
                    // TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
                    if (DMLScript.STATISTICS)
                        t0 = System.nanoTime();
                    SparseBlockCOO cooBlock = (SparseBlockCOO) block;
                    csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
                    if (DMLScript.STATISTICS) {
                        GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
                        GPUStatistics.cudaSparseConversionCount.increment();
                    }
                } else if (block instanceof SparseBlockMCSR) {
                    if (DMLScript.STATISTICS)
                        t0 = System.nanoTime();
                    SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block;
                    csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
                    if (DMLScript.STATISTICS) {
                        GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
                        GPUStatistics.cudaSparseConversionCount.increment();
                    }
                } else {
                    throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
                }
                rowPtr = csrBlock.rowPointers();
                colInd = csrBlock.indexes();
                values = csrBlock.values();
            }
            allocateSparseMatrixOnDevice();
            if (copyToDevice) {
                long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(), tmp.getNumRows(), nnz, rowPtr, colInd, values);
                if (DMLScript.FINEGRAINED_STATISTICS)
                    GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
            }
        } else {
            double[] data = tmp.getDenseBlockValues();
            if (data == null && tmp.getSparseBlock() != null)
                throw new DMLRuntimeException("Incorrect sparsity calculation");
            else if (data == null && tmp.getNonZeros() != 0)
                throw new DMLRuntimeException("MatrixBlock is not allocated");
            allocateDenseMatrixOnDevice();
            if (tmp.getNonZeros() == 0) {
                // Minor optimization: zero the device buffer directly instead of
                // allocating an all-zero host array just to copy it over.
                long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                cudaMemset(getJcudaDenseMatrixPtr(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
                if (DMLScript.FINEGRAINED_STATISTICS)
                    GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1);
            } else {
                // Copy dense block; the opcode is handed to hostToDevice for its own timing.
                LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getJcudaDenseMatrixPtr(), opcode);
            }
        }
    } finally {
        // Always unpin the matrix object, even if conversion or transfer failed.
        mat.release();
    }
    if (DMLScript.STATISTICS) {
        GPUStatistics.cudaToDevTime.add(System.nanoTime() - start);
        GPUStatistics.cudaToDevCount.add(1);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) SparseBlockMCSR(org.apache.sysml.runtime.matrix.data.SparseBlockMCSR) SparseBlockCSR(org.apache.sysml.runtime.matrix.data.SparseBlockCSR) SparseBlock(org.apache.sysml.runtime.matrix.data.SparseBlock) SparseBlockCOO(org.apache.sysml.runtime.matrix.data.SparseBlockCOO) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 22 with SparseBlock

use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.

the class MatrixReader method createOutputMatrixBlock.

/**
 * Creates the output matrix block for a read, optionally pre-allocating its storage.
 * <p>
 * NOTE: mallocDense controls whether the dense output block is fully allocated up front;
 * this can be redundant for a binary block read of a single block.
 *
 * @param rlen number of rows
 * @param clen number of columns
 * @param bclen number of columns in a block
 * @param brlen number of rows in a block
 * @param estnnz estimated number of non-zeros
 * @param mallocDense if true and not sparse, allocate dense block unsafe
 * @param mallocSparse if true and sparse, allocate sparse rows block
 * @return matrix block
 * @throws IOException if IOException occurs
 */
protected static MatrixBlock createOutputMatrixBlock(long rlen, long clen, int bclen, int brlen, long estnnz, boolean mallocDense, boolean mallocSparse) throws IOException {
    // reject dimensions that are not valid for the CP runtime
    if (!OptimizerUtils.isValidCPDimensions(rlen, clen)) {
        throw new DMLRuntimeException("Matrix dimensions too large for CP runtime: " + rlen + " x " + clen);
    }

    // decide on the in-memory representation and gather read-parallelism facts
    final boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, estnnz);
    final int numThreads = OptimizerUtils.getParallelBinaryReadParallelism();
    final long numBlocks = (long) Math.ceil((double) rlen / brlen);

    final MatrixBlock ret = new MatrixBlock((int) rlen, (int) clen, sparse, estnnz);

    // dense target: optionally allocate, then done
    if (!sparse) {
        if (mallocDense) {
            ret.allocateDenseBlockUnsafe((int) rlen, (int) clen);
        }
        return ret;
    }

    // sparse target without requested pre-allocation: nothing else to do
    if (!mallocSparse) {
        return ret;
    }

    ret.allocateSparseRowsBlock();
    final SparseBlock sblock = ret.getSparseBlock();

    // create synchronization points for MCSR (start row per block row),
    // only needed when a row spans multiple column blocks
    final boolean multipleColBlocks = clen > bclen;
    final boolean validDims = clen >= 0 && bclen > 0 && rlen >= 0 && brlen > 0;
    if (sblock instanceof SparseBlockMCSR && multipleColBlocks && validDims) {
        // adaptive change from scalar to row could cause synchronization issues:
        // with enough block rows, pre-allocate the first row of each block row;
        // otherwise allocate all rows to avoid contention
        final int stride = (numThreads <= numBlocks) ? brlen : 1;
        for (int row = 0; row < rlen; row += stride) {
            sblock.allocate(row, Math.max((int) (estnnz / rlen), 2), (int) clen);
        }
    }
    return ret;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) SparseBlockMCSR(org.apache.sysml.runtime.matrix.data.SparseBlockMCSR) SparseBlock(org.apache.sysml.runtime.matrix.data.SparseBlock) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 23 with SparseBlock

use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.

the class DataConverter method convertToFrameBlock.

/**
 * Converts a matrix block into a frame block with the given schema.
 * <p>
 * Sparse inputs are appended row by row, with only the stored cells of each row converted.
 * Dense inputs take one of three paths: a shallow column append for an allocated
 * single-column double schema, a blocked transpose into double columns for an all-double
 * schema, or a generic cell-by-cell conversion otherwise.
 *
 * @param mb matrix block to convert
 * @param schema value types of the target frame columns
 * @return frame block holding the converted data
 */
public static FrameBlock convertToFrameBlock(MatrixBlock mb, ValueType[] schema) {
    final FrameBlock result = new FrameBlock(schema);
    final Object[] rowBuffer = new Object[mb.getNumColumns()];

    // SPARSE
    if (mb.isInSparseFormat()) {
        final SparseBlock sparse = mb.getSparseBlock();
        for (int r = 0; r < mb.getNumRows(); r++) {
            // reset the shared row buffer before filling in the stored cells
            Arrays.fill(rowBuffer, null);
            if (sparse != null && !sparse.isEmpty(r)) {
                final int begin = sparse.pos(r);
                final int count = sparse.size(r);
                final int[] colIdx = sparse.indexes(r);
                final double[] cellVals = sparse.values(r);
                final int end = begin + count;
                for (int k = begin; k < end; k++) {
                    final int col = colIdx[k];
                    rowBuffer[col] = UtilFunctions.doubleToObject(schema[col], cellVals[k]);
                }
            }
            result.appendRow(rowBuffer);
        }
        return result;
    }

    // DENSE
    final int numDoubleCols = UtilFunctions.frequency(schema, ValueType.DOUBLE);

    if (schema.length == 1 && numDoubleCols == 1 && mb.isAllocated()) {
        // special case double schema and single columns which
        // allows for a shallow copy since the physical representation
        // of row-major matrix and column-major frame match exactly
        result.reset();
        result.appendColumns(new double[][] { mb.getDenseBlockValues() });
        return result;
    }

    if (numDoubleCols == schema.length) {
        // special case double schema (without cell-object creation,
        // col pre-allocation, and cache-friendly row-column copy)
        final int numRows = mb.getNumRows();
        final int numCols = mb.getNumColumns();
        final double[] src = mb.getDenseBlockValues();
        final double[][] columns = new double[numCols][numRows];
        // tile size: blocks of src/columns+overhead in L1 cache
        final int tile = 16;
        if (!mb.isEmptyBlock(false)) {
            for (int rowStart = 0; rowStart < numRows; rowStart += tile) {
                for (int colStart = 0; colStart < numCols; colStart += tile) {
                    final int rowEnd = Math.min(rowStart + tile, numRows);
                    final int colEnd = Math.min(colStart + tile, numCols);
                    for (int r = rowStart; r < rowEnd; r++) {
                        final int rowOffset = r * numCols;
                        for (int c = colStart; c < colEnd; c++) {
                            columns[c][r] = src[rowOffset + c];
                        }
                    }
                }
            }
        }
        result.reset();
        result.appendColumns(columns);
        return result;
    }

    // general case
    for (int r = 0; r < mb.getNumRows(); r++) {
        for (int c = 0; c < mb.getNumColumns(); c++) {
            rowBuffer[c] = UtilFunctions.doubleToObject(schema[c], mb.quickGetValue(r, c));
        }
        result.appendRow(rowBuffer);
    }
    return result;
}
Also used : FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) SparseBlock(org.apache.sysml.runtime.matrix.data.SparseBlock)

Example 24 with SparseBlock

use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.

the class GPUObject method copyFromHostToDevice.

/**
 * Copies the host-side matrix block of {@code mat} to the GPU.
 * <p>
 * The matrix block is pinned with {@code mat.acquireRead()} while it is copied and is
 * released again in a {@code finally} block, so a failure during conversion or transfer
 * does not leave the matrix object pinned.
 * <ul>
 * <li>Sparse input: COO and MCSR blocks are converted to CSR on the host, the device
 * sparse matrix is allocated, and the CSR arrays are copied unless the block is empty.</li>
 * <li>Dense input: the device dense matrix is allocated and filled from the host array;
 * a block without non-zeros is transferred as an all-zero array.</li>
 * </ul>
 *
 * @throws DMLRuntimeException if the sparse block is missing although non-zeros are
 *         reported, if the sparse format is unsupported, or if a dense block is
 *         unexpectedly unallocated
 */
void copyFromHostToDevice() throws DMLRuntimeException {
    // Guard the trace call so the message is only concatenated when tracing is enabled.
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
    }
    long start = 0;
    if (DMLScript.STATISTICS)
        start = System.nanoTime();
    MatrixBlock tmp = mat.acquireRead();
    try {
        if (tmp.isInSparseFormat()) {
            int[] rowPtr = null;
            int[] colInd = null;
            double[] values = null;
            // Only recompute the non-zero count when it is unknown; a full recount on
            // every transfer is expensive.
            // NOTE(review): assumes a non-negative count is always accurate - confirm.
            if (tmp.getNonZeros() < 0) {
                tmp.recomputeNonZeros();
            }
            long nnz = tmp.getNonZeros();
            mat.getMatrixCharacteristics().setNonZeros(nnz);
            SparseBlock block = tmp.getSparseBlock();
            boolean copyToDevice = true;
            if (block == null && nnz == 0) {
                // Empty sparse matrix: nothing to transfer, the device allocation alone suffices.
                // To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest
                copyToDevice = false;
            } else if (block == null && nnz != 0) {
                throw new DMLRuntimeException("Expected CP sparse block to be not null.");
            } else {
                // CSR is the preferred format for cuSparse GEMM
                // Converts MCSR and COO to CSR
                SparseBlockCSR csrBlock = null;
                long t0 = 0;
                if (block instanceof SparseBlockCSR) {
                    csrBlock = (SparseBlockCSR) block;
                } else if (block instanceof SparseBlockCOO) {
                    // TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
                    if (DMLScript.STATISTICS)
                        t0 = System.nanoTime();
                    SparseBlockCOO cooBlock = (SparseBlockCOO) block;
                    csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
                    if (DMLScript.STATISTICS) {
                        GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
                        GPUStatistics.cudaSparseConversionCount.incrementAndGet();
                    }
                } else if (block instanceof SparseBlockMCSR) {
                    if (DMLScript.STATISTICS)
                        t0 = System.nanoTime();
                    SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block;
                    csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
                    if (DMLScript.STATISTICS) {
                        GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
                        GPUStatistics.cudaSparseConversionCount.incrementAndGet();
                    }
                } else {
                    throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
                }
                rowPtr = csrBlock.rowPointers();
                colInd = csrBlock.indexes();
                values = csrBlock.values();
            }
            allocateSparseMatrixOnDevice();
            if (copyToDevice) {
                CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), nnz, rowPtr, colInd, values);
            }
        } else {
            double[] data = tmp.getDenseBlock();
            if (data == null && tmp.getSparseBlock() != null)
                throw new DMLRuntimeException("Incorrect sparsity calculation");
            else if (data == null && tmp.getNonZeros() != 0)
                throw new DMLRuntimeException("MatrixBlock is not allocated");
            else if (tmp.getNonZeros() == 0)
                // multiplyExact fails loudly instead of silently wrapping the cell count
                data = new double[Math.multiplyExact(tmp.getNumRows(), tmp.getNumColumns())];
            // Copy dense block
            allocateDenseMatrixOnDevice();
            cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), cudaMemcpyHostToDevice);
        }
    } finally {
        // Always unpin the matrix object, even if conversion or transfer failed.
        mat.release();
    }
    if (DMLScript.STATISTICS) {
        GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - start);
        GPUStatistics.cudaToDevCount.addAndGet(1);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) SparseBlockMCSR(org.apache.sysml.runtime.matrix.data.SparseBlockMCSR) SparseBlockCSR(org.apache.sysml.runtime.matrix.data.SparseBlockCSR) SparseBlock(org.apache.sysml.runtime.matrix.data.SparseBlock) SparseBlockCOO(org.apache.sysml.runtime.matrix.data.SparseBlockCOO) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

SparseBlock (org.apache.sysml.runtime.matrix.data.SparseBlock)24 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)14 SparseBlockMCSR (org.apache.sysml.runtime.matrix.data.SparseBlockMCSR)12 SparseBlockCOO (org.apache.sysml.runtime.matrix.data.SparseBlockCOO)11 SparseBlockCSR (org.apache.sysml.runtime.matrix.data.SparseBlockCSR)11 IJV (org.apache.sysml.runtime.matrix.data.IJV)6 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)4 LongLongDoubleHashMap (org.apache.sysml.runtime.util.LongLongDoubleHashMap)2 ADoubleEntry (org.apache.sysml.runtime.util.LongLongDoubleHashMap.ADoubleEntry)2 BufferedWriter (java.io.BufferedWriter)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Iterator (java.util.Iterator)1 DoubleIntListHashMap (org.apache.sysml.runtime.compress.utils.DoubleIntListHashMap)1 IntArrayList (org.apache.sysml.runtime.compress.utils.IntArrayList)1 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)1 Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)1 KahanPlus (org.apache.sysml.runtime.functionobjects.KahanPlus)1 BooleanObject (org.apache.sysml.runtime.instructions.cp.BooleanObject)1 CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties)1 DenseBlock (org.apache.sysml.runtime.matrix.data.DenseBlock)1