Use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.
The class GPUObject, method copyFromHostToDevice.
void copyFromHostToDevice(String opcode) {
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
    }
    long start = 0;
    if (DMLScript.STATISTICS)
        start = System.nanoTime();
    long acqrTime = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
    MatrixBlock tmp = mat.acquireRead();
    if (DMLScript.FINEGRAINED_STATISTICS) {
        if (tmp.isInSparseFormat())
            GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_GET_SPARSE_MB, System.nanoTime() - acqrTime);
        else
            GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_GET_DENSE_MB, System.nanoTime() - acqrTime);
    }
    if (tmp.isInSparseFormat()) {
        int[] rowPtr = null;
        int[] colInd = null;
        double[] values = null;
        // Only recompute non-zeros if unknown, else this will incur a huge penalty!
        if (tmp.getNonZeros() < 0) {
            tmp.recomputeNonZeros();
        }
        long nnz = tmp.getNonZeros();
        mat.getMatrixCharacteristics().setNonZeros(nnz);
        SparseBlock block = tmp.getSparseBlock();
        boolean copyToDevice = true;
        if (block == null && tmp.getNonZeros() == 0) {
            // Allocating an empty block is not necessary; to reproduce this case,
            // see org.apache.sysml.test.integration.applications.dml.ID3DMLTest.
            // rowPtr = new int[0];
            // colInd = new int[0];
            // values = new double[0];
            copyToDevice = false;
        } else if (block == null && tmp.getNonZeros() != 0) {
            throw new DMLRuntimeException("Expected CP sparse block to be not null.");
        } else {
            // CSR is the preferred format for cuSparse GEMM;
            // convert MCSR and COO to CSR.
            SparseBlockCSR csrBlock = null;
            long t0 = 0;
            if (block instanceof SparseBlockCSR) {
                csrBlock = (SparseBlockCSR) block;
            } else if (block instanceof SparseBlockCOO) {
                // TODO - should we do this on the GPU using cusparse<t>coo2csr()?
                if (DMLScript.STATISTICS)
                    t0 = System.nanoTime();
                SparseBlockCOO cooBlock = (SparseBlockCOO) block;
                csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionCount.increment();
            } else if (block instanceof SparseBlockMCSR) {
                if (DMLScript.STATISTICS)
                    t0 = System.nanoTime();
                SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block;
                csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionCount.increment();
            } else {
                throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
            }
            rowPtr = csrBlock.rowPointers();
            colInd = csrBlock.indexes();
            values = csrBlock.values();
        }
        allocateSparseMatrixOnDevice();
        if (copyToDevice) {
            long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
            CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
        }
    } else {
        double[] data = tmp.getDenseBlockValues();
        if (data == null && tmp.getSparseBlock() != null)
            throw new DMLRuntimeException("Incorrect sparsity calculation");
        else if (data == null && tmp.getNonZeros() != 0)
            throw new DMLRuntimeException("MatrixBlock is not allocated");
        allocateDenseMatrixOnDevice();
        if (tmp.getNonZeros() == 0) {
            // Minor optimization: no need to allocate an empty array on the CPU
            // (data = new double[tmp.getNumRows() * tmp.getNumColumns()]);
            // a device-side memset is sufficient.
            long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
            cudaMemset(getJcudaDenseMatrixPtr(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
            if (DMLScript.FINEGRAINED_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1);
        } else {
            // Copy dense block. H2D now only measures the time taken to do the actual copy.
            LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getJcudaDenseMatrixPtr(), opcode);
        }
    }
    mat.release();
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaToDevTime.add(System.nanoTime() - start);
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaToDevCount.add(1);
}
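When the host block is COO, the SparseBlockCSR constructor above rebuilds CSR row pointers from the COO row indexes before the transfer. As a rough illustration of what that conversion involves, here is a minimal standalone sketch that derives CSR row pointers via per-row counting and a prefix sum; the class and method names are made up for illustration and are not SystemML API.

import java.util.Arrays;

/** Illustrative sketch: building CSR row pointers from COO row indexes,
 *  mirroring what a COO-to-CSR conversion does internally
 *  (assumption: COO entries are sorted by row). Not SystemML API. */
public class CooToCsrSketch {
    static int[] cooToCsrRowPointers(int numRows, int[] cooRowIndexes) {
        int[] rowPtr = new int[numRows + 1];
        // count non-zeros per row (shifted by one slot)
        for (int r : cooRowIndexes)
            rowPtr[r + 1]++;
        // prefix sum turns per-row counts into row offsets
        for (int i = 0; i < numRows; i++)
            rowPtr[i + 1] += rowPtr[i];
        return rowPtr;
    }

    public static void main(String[] args) {
        // 3x4 matrix with nnz=4: entries in rows {0, 0, 2, 2}
        int[] rowInd = {0, 0, 2, 2};
        System.out.println(Arrays.toString(cooToCsrRowPointers(3, rowInd)));
        // prints [0, 2, 2, 4]: row 0 holds entries [0,2), row 1 is empty, row 2 holds [2,4)
    }
}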
Use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.
The class MatrixReader, method createOutputMatrixBlock.
/**
 * NOTE: mallocDense controls whether the output matrix block is fully allocated;
 * this can be redundant if we read binary blocks and the matrix fits into a single block.
 *
 * @param rlen number of rows
 * @param clen number of columns
 * @param bclen number of columns per block
 * @param brlen number of rows per block
 * @param estnnz estimated number of non-zeros
 * @param mallocDense if true and the target is not sparse, allocate the dense block unsafe
 * @param mallocSparse if true and the target is sparse, allocate the sparse rows block
 * @return matrix block
 * @throws IOException if an IOException occurs
 */
protected static MatrixBlock createOutputMatrixBlock(long rlen, long clen, int bclen, int brlen,
        long estnnz, boolean mallocDense, boolean mallocSparse) throws IOException {
    // check input dimensions
    if (!OptimizerUtils.isValidCPDimensions(rlen, clen))
        throw new DMLRuntimeException("Matrix dimensions too large for CP runtime: " + rlen + " x " + clen);
    // determine target representation (sparse/dense)
    boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, estnnz);
    int numThreads = OptimizerUtils.getParallelBinaryReadParallelism();
    long numBlocks = (long) Math.ceil((double) rlen / brlen);
    // prepare result matrix block
    MatrixBlock ret = new MatrixBlock((int) rlen, (int) clen, sparse, estnnz);
    if (!sparse && mallocDense)
        ret.allocateDenseBlockUnsafe((int) rlen, (int) clen);
    else if (sparse && mallocSparse) {
        ret.allocateSparseRowsBlock();
        SparseBlock sblock = ret.getSparseBlock();
        // create synchronization points for MCSR (start row per block row)
        if (sblock instanceof SparseBlockMCSR
            && clen > bclen // multiple col blocks
            && clen >= 0 && bclen > 0 && rlen >= 0 && brlen > 0) {
            // an adaptive change from scalar to row could cause synchronization issues
            if (numThreads <= numBlocks)
                for (int i = 0; i < rlen; i += brlen)
                    sblock.allocate(i, Math.max((int) (estnnz / rlen), 2), (int) clen);
            else
                // allocate all rows to avoid contention
                for (int i = 0; i < rlen; i++)
                    sblock.allocate(i, Math.max((int) (estnnz / rlen), 2), (int) clen);
        }
    }
    return ret;
}
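The branch above pre-allocates MCSR rows so that parallel binary readers do not contend on row allocation: with numThreads <= numBlocks, each thread owns whole block rows, so one synchronization row per block row suffices; otherwise every row is allocated up front. A minimal sketch of that policy under those assumptions (class, method, and callback names are illustrative, not SystemML API):

import java.util.function.BiConsumer;

/** Illustrative sketch of the row pre-allocation policy in
 *  createOutputMatrixBlock. Not SystemML API. */
public class PreallocationPolicySketch {
    static void preallocate(int rlen, int brlen, long estnnz, int numThreads,
            BiConsumer<Integer, Integer> allocateRow) {
        long numBlocks = (long) Math.ceil((double) rlen / brlen);
        int estRowNnz = Math.max((int) (estnnz / rlen), 2); // expected nnz per row, min 2
        int step = (numThreads <= numBlocks) ? brlen : 1;   // per block row vs. per row
        for (int i = 0; i < rlen; i += step)
            allocateRow.accept(i, estRowNnz);
    }

    public static void main(String[] args) {
        // 8 rows, block height 2, ~16 non-zeros, 2 threads -> allocates rows 0, 2, 4, 6
        preallocate(8, 2, 16, 2,
            (r, sz) -> System.out.println("allocate row " + r + ", est size " + sz));
    }
}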
Use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.
The class DataConverter, method convertToFrameBlock.
public static FrameBlock convertToFrameBlock(MatrixBlock mb, ValueType[] schema) {
    FrameBlock frame = new FrameBlock(schema);
    Object[] row = new Object[mb.getNumColumns()];
    if (mb.isInSparseFormat()) { // SPARSE
        SparseBlock sblock = mb.getSparseBlock();
        for (int i = 0; i < mb.getNumRows(); i++) {
            Arrays.fill(row, null); // reset
            if (sblock != null && !sblock.isEmpty(i)) {
                int apos = sblock.pos(i);
                int alen = sblock.size(i);
                int[] aix = sblock.indexes(i);
                double[] aval = sblock.values(i);
                for (int j = apos; j < apos + alen; j++) {
                    row[aix[j]] = UtilFunctions.doubleToObject(schema[aix[j]], aval[j]);
                }
            }
            frame.appendRow(row);
        }
    } else { // DENSE
        int dFreq = UtilFunctions.frequency(schema, ValueType.DOUBLE);
        if (schema.length == 1 && dFreq == 1 && mb.isAllocated()) {
            // special case: double schema and a single column, which allows
            // a shallow copy since the physical representations of a
            // row-major matrix and a column-major frame match exactly
            frame.reset();
            frame.appendColumns(new double[][] { mb.getDenseBlockValues() });
        } else if (dFreq == schema.length) {
            // special case: all-double schema (without cell-object creation,
            // with column pre-allocation and a cache-friendly row-column copy)
            int m = mb.getNumRows();
            int n = mb.getNumColumns();
            double[] a = mb.getDenseBlockValues();
            double[][] c = new double[n][m];
            int blocksizeIJ = 16; // blocks of a/c + overhead fit in L1 cache
            if (!mb.isEmptyBlock(false))
                for (int bi = 0; bi < m; bi += blocksizeIJ)
                    for (int bj = 0; bj < n; bj += blocksizeIJ) {
                        int bimin = Math.min(bi + blocksizeIJ, m);
                        int bjmin = Math.min(bj + blocksizeIJ, n);
                        for (int i = bi, aix = bi * n; i < bimin; i++, aix += n)
                            for (int j = bj; j < bjmin; j++)
                                c[j][i] = a[aix + j];
                    }
            frame.reset();
            frame.appendColumns(c);
        } else {
            // general case
            for (int i = 0; i < mb.getNumRows(); i++) {
                for (int j = 0; j < mb.getNumColumns(); j++) {
                    row[j] = UtilFunctions.doubleToObject(schema[j], mb.quickGetValue(i, j));
                }
                frame.appendRow(row);
            }
        }
    }
    return frame;
}
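The all-double branch copies the row-major matrix into column-major frame arrays using 16 x 16 tiles so that the touched pieces of both arrays stay in L1 cache. A self-contained sketch of the same loop structure, extracted from the code above (the class and method names are illustrative):

/** Illustrative sketch of the cache-blocked row-major to column-major copy
 *  used in the all-DOUBLE dense path of convertToFrameBlock. */
public class BlockedTransposeSketch {
    static double[][] toColumnMajor(double[] a, int m, int n) {
        double[][] c = new double[n][m];
        final int blocksize = 16; // tile edge: 16*16 doubles = 2 KB per tile
        for (int bi = 0; bi < m; bi += blocksize)
            for (int bj = 0; bj < n; bj += blocksize) {
                int bimin = Math.min(bi + blocksize, m);
                int bjmin = Math.min(bj + blocksize, n);
                // copy one tile; aix tracks the row-major offset of row i
                for (int i = bi, aix = bi * n; i < bimin; i++, aix += n)
                    for (int j = bj; j < bjmin; j++)
                        c[j][i] = a[aix + j];
            }
        return c;
    }

    public static void main(String[] args) {
        double[] a = {1, 2, 3, 4, 5, 6}; // 2x3 row-major
        double[][] c = toColumnMajor(a, 2, 3);
        System.out.println(java.util.Arrays.deepToString(c));
        // prints [[1.0, 4.0], [2.0, 5.0], [3.0, 6.0]]
    }
}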
Use of org.apache.sysml.runtime.matrix.data.SparseBlock in project incubator-systemml by apache.
The class GPUObject, method copyFromHostToDevice (variant without the opcode parameter).
void copyFromHostToDevice() throws DMLRuntimeException {
    LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
    long start = 0;
    if (DMLScript.STATISTICS)
        start = System.nanoTime();
    MatrixBlock tmp = mat.acquireRead();
    if (tmp.isInSparseFormat()) {
        int[] rowPtr = null;
        int[] colInd = null;
        double[] values = null;
        tmp.recomputeNonZeros();
        long nnz = tmp.getNonZeros();
        mat.getMatrixCharacteristics().setNonZeros(nnz);
        SparseBlock block = tmp.getSparseBlock();
        boolean copyToDevice = true;
        if (block == null && tmp.getNonZeros() == 0) {
            // Allocating an empty block is not necessary; to reproduce this case,
            // see org.apache.sysml.test.integration.applications.dml.ID3DMLTest.
            // rowPtr = new int[0];
            // colInd = new int[0];
            // values = new double[0];
            copyToDevice = false;
        } else if (block == null && tmp.getNonZeros() != 0) {
            throw new DMLRuntimeException("Expected CP sparse block to be not null.");
        } else {
            // CSR is the preferred format for cuSparse GEMM;
            // convert MCSR and COO to CSR.
            SparseBlockCSR csrBlock = null;
            long t0 = 0;
            if (block instanceof SparseBlockCSR) {
                csrBlock = (SparseBlockCSR) block;
            } else if (block instanceof SparseBlockCOO) {
                // TODO - should we do this on the GPU using cusparse<t>coo2csr()?
                if (DMLScript.STATISTICS)
                    t0 = System.nanoTime();
                SparseBlockCOO cooBlock = (SparseBlockCOO) block;
                csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionCount.incrementAndGet();
            } else if (block instanceof SparseBlockMCSR) {
                if (DMLScript.STATISTICS)
                    t0 = System.nanoTime();
                SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block;
                csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
                if (DMLScript.STATISTICS)
                    GPUStatistics.cudaSparseConversionCount.incrementAndGet();
            } else {
                throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
            }
            rowPtr = csrBlock.rowPointers();
            colInd = csrBlock.indexes();
            values = csrBlock.values();
        }
        allocateSparseMatrixOnDevice();
        if (copyToDevice) {
            CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
        }
    } else {
        double[] data = tmp.getDenseBlock();
        if (data == null && tmp.getSparseBlock() != null)
            throw new DMLRuntimeException("Incorrect sparsity calculation");
        else if (data == null && tmp.getNonZeros() != 0)
            throw new DMLRuntimeException("MatrixBlock is not allocated");
        else if (tmp.getNonZeros() == 0)
            data = new double[tmp.getNumRows() * tmp.getNumColumns()];
        // Copy dense block
        allocateDenseMatrixOnDevice();
        cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), cudaMemcpyHostToDevice);
    }
    mat.release();
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - start);
    if (DMLScript.STATISTICS)
        GPUStatistics.cudaToDevCount.addAndGet(1);
}
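Both copyFromHostToDevice variants transfer the sparse matrix as three CSR arrays: row pointers (one int per row plus an end sentinel), column indexes (one int per non-zero), and values (one double per non-zero). A back-of-the-envelope sketch of the resulting device footprint, assuming the int-index/double-value layout used above (the class and method names are illustrative, not SystemML API):

/** Illustrative sketch: device memory needed for a CSR transfer like the one
 *  CSRPointer.copyToDevice performs. Assumes int indexes and double values. */
public class CsrSizeSketch {
    static long csrBytes(long rows, long nnz) {
        long rowPtrBytes = (rows + 1) * Integer.BYTES; // one offset per row, plus end sentinel
        long colIndBytes = nnz * Integer.BYTES;        // one column index per non-zero
        long valueBytes  = nnz * Double.BYTES;         // one value per non-zero
        return rowPtrBytes + colIndBytes + valueBytes;
    }

    public static void main(String[] args) {
        // 10,000 x 10,000 matrix with 100,000 non-zeros (0.1% sparsity)
        System.out.println(csrBytes(10_000, 100_000) + " bytes"); // prints 1240004 bytes
    }
}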