Use of org.apache.sysml.runtime.matrix.data.DenseBlock in the project incubator-systemml by apache: class ColGroupRLE, method computeRowSums.
// Computes row sums (with a Kahan-style correction term kept in the column
// right after each sum, at cix+1) for rows [rl, ru) of this RLE-compressed
// column group, accumulating into the dense output of 'result'.
@Override
protected final void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
// note: due to corrections the output might be a large dense block
DenseBlock c = result.getDenseBlock();
// reusable (sum, correction) buffer for every output cell update
KahanObject kbuff = new KahanObject(0, 0);
KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
final int numVals = getNumValues();
// cache-conscious path: only pays off with multiple distinct values and
// enough rows to make horizontal blocking worthwhile
if (ALLOW_CACHE_CONSCIOUS_ROWSUMS && LOW_LEVEL_OPT && numVals > 1 && _numRows > BitmapEncoder.BITMAP_BLOCK_SZ) {
final int blksz = ColGroupOffset.WRITE_CACHE_BLKSZ / 2;
// step 1: prepare position and value arrays
// current pos / values per RLE list
int[] astart = new int[numVals];
int[] apos = skipScan(numVals, rl, astart);
double[] aval = sumAllValues(kplus, kbuff, false);
// step 2: cache conscious matrix-vector via horizontal scans
for (int bi = rl; bi < ru; bi += blksz) {
int bimax = Math.min(bi + blksz, ru);
// horizontal segment scan, incl pos maintenance
for (int k = 0; k < numVals; k++) {
int boff = _ptr[k];
int blen = len(k);
double val = aval[k];
int bix = apos[k];
int start = astart[k];
// compute partial results, not aligned
while (bix < blen) {
// _data appears to hold alternating (start-offset, run-length)
// pairs per run — consistent with the bix += 2 stride below
int lstart = _data[boff + bix];
int llen = _data[boff + bix + 1];
// clip the current run to the row block [bi, bimax)
int from = Math.max(bi, start + lstart);
int to = Math.min(start + lstart + llen, bimax);
for (int rix = from; rix < to; rix++) {
double[] cvals = c.values(rix);
int cix = c.pos(rix);
// read (sum, correction), add val with compensation, write back
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, val);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;
}
// run reaches past this block: keep position, resume in next block
if (start + lstart + llen >= bimax)
break;
start += lstart + llen;
bix += 2;
}
// persist scan state for the next horizontal block
apos[k] = bix;
astart[k] = start;
}
}
} else {
// generic path: one full vertical scan per distinct value
for (int k = 0; k < numVals; k++) {
int boff = _ptr[k];
int blen = len(k);
double val = sumValues(k, kplus, kbuff);
// zero aggregate contributes nothing to the row sums
if (val != 0.0) {
// skip runs ending before rl; key = position in _data, value = row offset
Pair<Integer, Integer> tmp = skipScanVal(k, rl);
int bix = tmp.getKey();
int curRunStartOff = tmp.getValue();
int curRunEnd = tmp.getValue();
for (; bix < blen && curRunEnd < ru; bix += 2) {
curRunStartOff = curRunEnd + _data[boff + bix];
curRunEnd = curRunStartOff + _data[boff + bix + 1];
for (int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) {
double[] cvals = c.values(rix);
int cix = c.pos(rix);
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, val);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;
}
}
}
}
}
}
Use of org.apache.sysml.runtime.matrix.data.DenseBlock in the project systemml by apache: class SpoofCellwise, method executeDense.
// ///////
// function dispatch
// Dispatches dense cell-wise execution to the specialized kernel matching the
// configured cell type (_type) and, for aggregations, the aggregation op (_aggOp).
// Returns the kernel's result, or -1 if the cell type is not handled here.
private long executeDense(DenseBlock a, SideInput[] b, double[] scalars, MatrixBlock out, int m, int n, boolean sparseSafe, int rl, int ru) {
DenseBlock c = out.getDenseBlock();
SideInput[] sideInputs = createSparseSideInputs(b);
// SUM and SUM_SQ share the sum-based kernels; all others use min/max kernels
boolean sumAgg = (_aggOp == AggOp.SUM || _aggOp == AggOp.SUM_SQ);
switch (_type) {
case NO_AGG:
return executeDenseNoAgg(a, sideInputs, scalars, c, m, n, sparseSafe, rl, ru);
case ROW_AGG:
return sumAgg ? executeDenseRowAggSum(a, sideInputs, scalars, c, m, n, sparseSafe, rl, ru) : executeDenseRowAggMxx(a, sideInputs, scalars, c, m, n, sparseSafe, rl, ru);
case COL_AGG:
return sumAgg ? executeDenseColAggSum(a, sideInputs, scalars, c, m, n, sparseSafe, rl, ru) : executeDenseColAggMxx(a, sideInputs, scalars, c, m, n, sparseSafe, rl, ru);
default:
return -1;
}
}
Use of org.apache.sysml.runtime.matrix.data.DenseBlock in the project systemml by apache: class SpoofOuterProduct, method execute.
// Multi-threaded outer-product execution. Validates inputs, allocates the
// output according to the outer-product type, partitions the work across a
// thread pool, and aggregates per-task nnz counts into the output.
// Fixes vs. original: (1) non-short-circuit '&' replaced with '&&' in the
// task-creation loop conditions; (2) pool.shutdown() moved to a finally
// block so the executor is released even when a task fails.
@Override
public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int numThreads) {
// sanity check
if (inputs == null || inputs.size() < 3 || out == null)
throw new RuntimeException("Invalid input arguments.");
// check empty result
if (// U is empty
(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT && inputs.get(1).isEmptyBlock(false)) || // V is empty
(_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT && inputs.get(2).isEmptyBlock(false)) || inputs.get(0).isEmptyBlock(false)) {
// X is empty
// turn empty dense into sparse
out.examSparsity();
return out;
}
// input preparation and result allocation (Allocate the output that is set by Sigma2CPInstruction)
if (_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT) {
// assign it to the time and sparse representation of the major input matrix
out.reset(inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), inputs.get(0).isInSparseFormat());
out.allocateBlock();
} else {
// if left outerproduct gives a value of k*n instead of n*k, change it back to n*k and then transpose the output
if (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT)
// n*k
out.reset(inputs.get(0).getNumColumns(), inputs.get(1).getNumColumns(), false);
else if (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT)
// m*k
out.reset(inputs.get(0).getNumRows(), inputs.get(1).getNumColumns(), false);
out.allocateDenseBlock();
}
// fall back to sequential execution for small workloads
if (2 * inputs.get(0).getNonZeros() * inputs.get(1).getNumColumns() < PAR_MINFLOP_THRESHOLD)
// sequential
return execute(inputs, scalarObjects, out);
// input preparation
DenseBlock[] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
SideInput[] b = prepInputMatrices(inputs, 3, false);
double[] scalars = prepInputScalars(scalarObjects);
// core parallel execute
final int m = inputs.get(0).getNumRows();
final int n = inputs.get(0).getNumColumns();
// rank
final int k = inputs.get(1).getNumColumns();
final long nnz = inputs.get(0).getNonZeros();
MatrixBlock a = inputs.get(0);
ExecutorService pool = CommonThreadPool.get(numThreads);
try {
ArrayList<ParExecTask> tasks = new ArrayList<>();
if (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT) {
if (a instanceof CompressedMatrixBlock) {
// parallelize over column groups
int numCG = ((CompressedMatrixBlock) a).getNumColGroups();
int blklen = (int) (Math.ceil((double) numCG / numThreads));
for (int j = 0; j < numThreads && j * blklen < numCG; j++)
tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, 0, m, j * blklen, Math.min((j + 1) * blklen, numCG)));
} else {
// parallelize over column partitions
int blklen = (int) (Math.ceil((double) n / numThreads));
for (int j = 0; j < numThreads && j * blklen < n; j++)
tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, 0, m, j * blklen, Math.min((j + 1) * blklen, n)));
}
} else {
// right or cell-wise
// parallelize over row partitions
int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, k, numThreads);
int blklen = (int) (Math.ceil((double) m / numThreads2));
for (int i = 0; i < numThreads2 && i * blklen < m; i++)
tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, i * blklen, Math.min((i + 1) * blklen, m), 0, n));
}
List<Future<Long>> taskret = pool.invokeAll(tasks);
// aggregate per-task nnz counts
for (Future<Long> task : taskret)
out.setNonZeros(out.getNonZeros() + task.get());
} catch (Exception e) {
throw new DMLRuntimeException(e);
} finally {
// always release the pool, even when a task throws (original leaked it)
pool.shutdown();
}
// post-processing
if (a instanceof CompressedMatrixBlock) {
if (out.isInSparseFormat() && _outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT)
out.sortSparseRows();
else if (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT)
out.recomputeNonZeros();
}
out.examSparsity();
return out;
}
Use of org.apache.sysml.runtime.matrix.data.DenseBlock in the project systemml by apache: class SpoofRowwise, method execute (sequential variant).
// Single-threaded row-wise codegen execution: allocates the output (unless an
// incremental aggregation reuses it), prepares side inputs and scalars,
// dispatches on the main input's representation, and finalizes the result.
public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, boolean allocTmp, boolean aggIncr) {
// validate inputs
if (inputs == null || inputs.size() < 1 || out == null)
throw new RuntimeException("Invalid input arguments.");
// output allocation and shape preparation
final int rows = inputs.get(0).getNumRows();
final int cols = inputs.get(0).getNumColumns();
final int cols2 = _type.isConstDim2(_constDim2) ? (int) _constDim2 : _type.isRowTypeB1() || hasMatrixSideInput(inputs) ? getMinColsMatrixSideInputs(inputs) : -1;
if (!aggIncr || !out.isAllocated())
allocateOutputMatrix(rows, cols, cols2, out);
DenseBlock outBlock = out.getDenseBlock();
final boolean transposeOut = _type.isRowTypeB1ColumnAgg() && LibSpoofPrimitives.isFlipOuter(out.getNumRows(), out.getNumColumns());
// prepare side inputs and scalar arguments
SideInput[] sideInputs = prepInputMatrices(inputs, 1, inputs.size() - 1, false, _tB1);
double[] scalarVals = prepInputScalars(scalarObjects);
// allocate thread-local temporary vector memory if requested
if (allocTmp && _reqVectMem > 0)
LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, cols, cols2);
// core sequential execution, dispatched on the input representation
MatrixBlock in = inputs.get(0);
if (in instanceof CompressedMatrixBlock)
executeCompressed((CompressedMatrixBlock) in, sideInputs, scalarVals, outBlock, cols, 0, rows);
else if (in.isInSparseFormat())
executeSparse(in.getSparseBlock(), sideInputs, scalarVals, outBlock, cols, 0, rows);
else
executeDense(in.getDenseBlock(), sideInputs, scalarVals, outBlock, cols, 0, rows);
// cleanup and result finalization
if (allocTmp && _reqVectMem > 0)
LibSpoofPrimitives.cleanupThreadLocalMemory();
if (transposeOut) {
fixTransposeDimensions(out);
out = LibMatrixReorg.transpose(out, new MatrixBlock(out.getNumColumns(), out.getNumRows(), false));
}
if (!aggIncr) {
out.recomputeNonZeros();
out.examSparsity();
}
return out;
}
Use of org.apache.sysml.runtime.matrix.data.DenseBlock in the project systemml by apache: class SpoofRowwise, method execute (multi-threaded variant).
// Multi-threaded row-wise codegen execution with k worker threads. Falls back
// to serial execution for small inputs or when multi-threading constraints
// are not met; otherwise partitions rows into balanced (or bitmap-aligned)
// blocks and either aggregates partial column/full aggregates or collects
// per-task nnz counts.
// Fix vs. original: pool.shutdown() moved to a finally block so the executor
// is released even when invokeAll/task.get throws (original leaked the pool).
@Override
public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k) {
// redirect to serial execution
if (k <= 1 || (_type.isColumnAgg() && !LibMatrixMult.satisfiesMultiThreadingConstraints(inputs.get(0), k)) || getTotalInputSize(inputs) < PAR_NUMCELL_THRESHOLD) {
return execute(inputs, scalarObjects, out);
}
// sanity check
if (inputs == null || inputs.size() < 1 || out == null)
throw new RuntimeException("Invalid input arguments.");
// result allocation and preparations
final int m = inputs.get(0).getNumRows();
final int n = inputs.get(0).getNumColumns();
final int n2 = _type.isConstDim2(_constDim2) ? (int) _constDim2 : _type.isRowTypeB1() || hasMatrixSideInput(inputs) ? getMinColsMatrixSideInputs(inputs) : -1;
allocateOutputMatrix(m, n, n2, out);
final boolean flipOut = _type.isRowTypeB1ColumnAgg() && LibSpoofPrimitives.isFlipOuter(out.getNumRows(), out.getNumColumns());
// input preparation
MatrixBlock a = inputs.get(0);
SideInput[] b = prepInputMatrices(inputs, 1, inputs.size() - 1, false, _tB1);
double[] scalars = prepInputScalars(scalarObjects);
// core parallel execute
ExecutorService pool = CommonThreadPool.get(k);
// compressed inputs need bitmap-aligned blocks; otherwise balance by size
ArrayList<Integer> blklens = (a instanceof CompressedMatrixBlock) ? UtilFunctions.getAlignedBlockSizes(m, k, BitmapEncoder.BITMAP_BLOCK_SZ) : UtilFunctions.getBalancedBlockSizesDefault(m, k, (long) m * n < 16 * PAR_NUMCELL_THRESHOLD);
try {
if (_type.isColumnAgg() || _type == RowType.FULL_AGG) {
// execute tasks
ArrayList<ParColAggTask> tasks = new ArrayList<>();
int outLen = out.getNumRows() * out.getNumColumns();
for (int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++)
tasks.add(new ParColAggTask(a, b, scalars, n, n2, outLen, lb, lb + blklens.get(i)));
List<Future<DenseBlock>> taskret = pool.invokeAll(tasks);
// aggregate partial results into the shared output
int len = _type.isColumnAgg() ? out.getNumRows() * out.getNumColumns() : 1;
for (Future<DenseBlock> task : taskret)
LibMatrixMult.vectAdd(task.get().valuesAt(0), out.getDenseBlockValues(), 0, 0, len);
out.recomputeNonZeros();
} else {
// execute tasks
ArrayList<ParExecTask> tasks = new ArrayList<>();
for (int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++)
tasks.add(new ParExecTask(a, b, out, scalars, n, n2, lb, lb + blklens.get(i)));
List<Future<Long>> taskret = pool.invokeAll(tasks);
// aggregate nnz, no need to aggregate results
long nnz = 0;
for (Future<Long> task : taskret)
nnz += task.get();
out.setNonZeros(nnz);
}
if (flipOut) {
fixTransposeDimensions(out);
out = LibMatrixReorg.transpose(out, new MatrixBlock(out.getNumColumns(), out.getNumRows(), false));
}
out.examSparsity();
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
} finally {
// always release the pool, even on failure (original leaked it on exception)
pool.shutdown();
}
return out;
}
Aggregations