Usage example from the project systemml by Apache.
Class SpoofOuterProduct, method execute.
/**
 * Sequential entry point: validates the arguments, short-circuits on empty inputs,
 * resets {@code out} according to {@code _outerProductType}, prepares dense/side/scalar
 * inputs and dispatches to the compressed, dense or sparse kernel for the main input
 * {@code inputs.get(0)}.
 *
 * NOTE(review): this listing has no closing braces and appears to have lost several
 * stand-alone statements (see the inline notes). Confirm every flagged spot against the
 * upstream file before relying on the control flow shown here.
 *
 * @param inputs        matrix inputs; must be non-null with at least three entries.
 *                      Entry 0 is used as the main matrix, entries 1 and 2 are handed to
 *                      prepInputMatrices(inputs, 1, 2, true, false); presumably entries
 *                      from index 3 on are side inputs (prepInputMatrices(inputs, 3, false)).
 * @param scalarObjects scalar inputs, converted via prepInputScalars
 * @param out           pre-created result block; must be non-null, is reset in place
 * @return the {@code out} block that was passed in
 * @throws RuntimeException if {@code inputs} is null, has fewer than three entries,
 *                          or {@code out} is null
 */
public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) {
// sanity check
if (inputs == null || inputs.size() < 3 || out == null)
throw new RuntimeException("Invalid input arguments.");
// check empty result: an empty U (left product), an empty V (right product) or an
// empty X (any product type) means there is nothing to compute
if (// U is empty
(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT && inputs.get(1).isEmptyBlock(false)) || // V is empty
(_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT && inputs.get(2).isEmptyBlock(false)) || inputs.get(0).isEmptyBlock(false)) {
// X is empty
// turn empty dense into sparse
// NOTE(review): no statement performing the conversion announced above is visible in
// this listing, and the closing brace of this if-block is missing — TODO confirm.
return out;
// input preparation and result allocation (Allocate the output that is set by Sigma2CPInstruction)
if (_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT) {
// cell-wise result takes the dimensions and the sparse/dense representation of the main input matrix
out.reset(inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), inputs.get(0).isInSparseFormat());
} else {
// if left outerproduct gives a value of k*n instead of n*k, change it back to n*k and then transpose the output
if (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT)
// n*k, always dense (third argument false)
out.reset(inputs.get(0).getNumColumns(), inputs.get(1).getNumColumns(), false);
else if (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT)
// m*k, always dense (third argument false)
out.reset(inputs.get(0).getNumRows(), inputs.get(1).getNumColumns(), false);
// check for empty inputs; otherwise allocate result
// NOTE(review): inputs.get(0) was already tested for emptiness in the first check, and
// no allocation statement follows in this listing although the kernels below read
// out.getDenseBlock() — presumably a line was dropped; TODO confirm.
if (inputs.get(0).isEmptyBlock(false))
return out;
// input preparation: dense views of the two factor inputs, side inputs and scalars
DenseBlock[] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
SideInput[] b = prepInputMatrices(inputs, 3, false);
double[] scalars = prepInputScalars(scalarObjects);
// core sequential execute over the full row range [0, m)
final int m = inputs.get(0).getNumRows();
final int n = inputs.get(0).getNumColumns();
// rank
final int k = inputs.get(1).getNumColumns();
MatrixBlock a = inputs.get(0);
// NOTE(review): no case labels, else keywords or break statements are visible inside this
// switch. Presumably the first kernel group handles the left/right products, the second
// the cell-wise product, and the throw a remaining product type — TODO confirm.
switch(_outerProductType) {
if (a instanceof CompressedMatrixBlock)
// compressed input: the last range argument is the number of column groups, not n
executeCompressed((CompressedMatrixBlock) a, ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, ((CompressedMatrixBlock) a).getNumColGroups());
else if (!a.isInSparseFormat())
executeDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n);
// NOTE(review): presumably the else-branch for sparse input
executeSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n);
if (a instanceof CompressedMatrixBlock)
// the compressed and sparse cell-wise kernels receive the MatrixBlock itself; the dense kernel receives its dense block
executeCellwiseCompressed((CompressedMatrixBlock) a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, 0, m, 0, n);
else if (!a.isInSparseFormat())
executeCellwiseDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n);
// NOTE(review): presumably the else-branch for sparse input
executeCellwiseSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out, m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n);
// NOTE(review): which product type reaches this throw is not visible here — TODO confirm
throw new DMLRuntimeException("Wrong codepath for aggregate outer product.");
// post-processing
// NOTE(review): as listed, this condition guards the return itself; presumably the
// statement it originally guarded (and further post-processing) was dropped — TODO confirm.
if (a instanceof CompressedMatrixBlock && out.isInSparseFormat() && _outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT)
return out;
Usage example from the project systemml by Apache.
Class SpoofOuterProduct, method executeCellwiseSparse.
private void executeCellwiseSparse(SparseBlock sblock, DenseBlock u, DenseBlock v, SideInput[] b, double[] scalars, MatrixBlock out, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu) {
// NOTE: we don't create sparse side inputs w/ row-major cursors because
// cache blocking would lead to non-sequential access
final int blocksizeIJ = (int) (8L * m * n / nnz);
int[] curk = new int[Math.min(blocksizeIJ, ru - rl)];
if (// DENSE
!out.isInSparseFormat()) {
DenseBlock c = out.getDenseBlock();
double tmp = 0;
for (int bi = rl; bi < ru; bi += blocksizeIJ) {
int bimin = Math.min(ru, bi + blocksizeIJ);
// prepare starting indexes for block row
Arrays.fill(curk, 0);
// blocked execution over column blocks
for (int bj = 0; bj < n; bj += blocksizeIJ) {
int bjmin = Math.min(n, bj + blocksizeIJ);
for (int i = bi; i < bimin; i++) {
if (sblock.isEmpty(i))
int wpos = sblock.pos(i);
int wlen = sblock.size(i);
int[] wix = sblock.indexes(i);
double[] wvals = sblock.values(i);
double[] cvals = c.values(i);
double[] uvals = u.values(i);
int uix = u.pos(i);
int index = wpos + curk[i - bi];
if (type == OutProdType.CELLWISE_OUTER_PRODUCT)
for (; index < wpos + wlen && wix[index] < bjmin; index++) {
int jix = wix[index];
cvals[jix] = genexecCellwise(wvals[index], uvals, uix, v.values(jix), v.pos(jix), b, scalars, m, n, k, i, wix[index]);
for (; index < wpos + wlen && wix[index] < bjmin; index++) {
int jix = wix[index];
tmp += genexecCellwise(wvals[index], uvals, uix, v.values(jix), v.pos(jix), b, scalars, m, n, k, i, wix[index]);
curk[i - bi] = index - wpos;
if (type != OutProdType.CELLWISE_OUTER_PRODUCT)
c.set(0, 0, tmp);
} else // SPARSE
SparseBlock c = out.getSparseBlock();
for (int bi = rl; bi < ru; bi += blocksizeIJ) {
int bimin = Math.min(ru, bi + blocksizeIJ);
// prepare starting indexes for block row
Arrays.fill(curk, 0);
// blocked execution over column blocks
for (int bj = 0; bj < n; bj += blocksizeIJ) {
int bjmin = Math.min(n, bj + blocksizeIJ);
for (int i = bi; i < bimin; i++) {
if (sblock.isEmpty(i))
int wpos = sblock.pos(i);
int wlen = sblock.size(i);
int[] wix = sblock.indexes(i);
double[] wval = sblock.values(i);
double[] uvals = u.values(i);
int uix = u.pos(i);
int index = wpos + curk[i - bi];
for (; index < wpos + wlen && wix[index] < bjmin; index++) {
int jix = wix[index];
c.append(i, wix[index], genexecCellwise(wval[index], uvals, uix, v.values(jix), v.pos(jix), b, scalars, m, n, k, i, wix[index]));
curk[i - bi] = index - wpos;
Usage example from the project systemml by Apache.
Class ColGroupDDC2, method computeRowSums.
protected void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
// note: due to corrections the output might be a large dense block
DenseBlock c = result.getDenseBlock();
KahanObject kbuff = new KahanObject(0, 0);
KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
// pre-aggregate nnz per value tuple
double[] vals = sumAllValues(kplus, kbuff, false);
// for correctness in case of sqk+)
for (int i = rl; i < ru; i++) {
double[] cvals = c.values(i);
int cix = c.pos(i);
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, vals[_data[i]]);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;
Usage example from the project systemml by Apache.
Class ColGroupOLE, method computeRowSums.
protected final void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
// note: due to corrections the output might be a large dense block
DenseBlock c = result.getDenseBlock();
KahanObject kbuff = new KahanObject(0, 0);
KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
final int numVals = getNumValues();
if (ALLOW_CACHE_CONSCIOUS_ROWSUMS && LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz) {
final int blksz2 = ColGroupOffset.WRITE_CACHE_BLKSZ / 2;
// step 1: prepare position and value arrays
int[] apos = skipScan(numVals, rl);
double[] aval = sumAllValues(kplus, kbuff, false);
// step 2: cache conscious row sums via horizontal scans
for (int bi = rl; bi < ru; bi += blksz2) {
int bimax = Math.min(bi + blksz2, ru);
// horizontal segment scan, incl pos maintenance
for (int k = 0; k < numVals; k++) {
int boff = _ptr[k];
int blen = len(k);
double val = aval[k];
int bix = apos[k];
for (int ii = bi; ii < bimax && bix < blen; ii += blksz) {
// prepare length, start, and end pos
int len = _data[boff + bix];
int pos = boff + bix + 1;
// compute partial results
for (int i = 0; i < len; i++) {
int rix = ii + _data[pos + i];
double[] cvals = c.values(rix);
int cix = c.pos(rix);
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, val);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;
bix += len + 1;
apos[k] = bix;
} else {
// iterate over all values and their bitmaps
for (int k = 0; k < numVals; k++) {
// prepare value-to-add for entire value bitmap
int boff = _ptr[k];
int blen = len(k);
double val = sumValues(k, kplus, kbuff);
// iterate over bitmap blocks and add values
if (val != 0) {
int slen;
int bix = skipScanVal(k, rl);
for (int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) {
slen = _data[boff + bix];
for (int i = 1; i <= slen; i++) {
int rix = off + _data[boff + bix + i];
double[] cvals = c.values(rix);
int cix = c.pos(rix);
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, val);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;
Usage example from the project systemml by Apache.
Class ColGroupRLE, method computeRowSums.
protected final void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
// note: due to corrections the output might be a large dense block
DenseBlock c = result.getDenseBlock();
KahanObject kbuff = new KahanObject(0, 0);
KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
final int numVals = getNumValues();
if (ALLOW_CACHE_CONSCIOUS_ROWSUMS && LOW_LEVEL_OPT && numVals > 1 && _numRows > BitmapEncoder.BITMAP_BLOCK_SZ) {
final int blksz = ColGroupOffset.WRITE_CACHE_BLKSZ / 2;
// step 1: prepare position and value arrays
// current pos / values per RLE list
int[] astart = new int[numVals];
int[] apos = skipScan(numVals, rl, astart);
double[] aval = sumAllValues(kplus, kbuff, false);
// step 2: cache conscious matrix-vector via horizontal scans
for (int bi = rl; bi < ru; bi += blksz) {
int bimax = Math.min(bi + blksz, ru);
// horizontal segment scan, incl pos maintenance
for (int k = 0; k < numVals; k++) {
int boff = _ptr[k];
int blen = len(k);
double val = aval[k];
int bix = apos[k];
int start = astart[k];
// compute partial results, not aligned
while (bix < blen) {
int lstart = _data[boff + bix];
int llen = _data[boff + bix + 1];
int from = Math.max(bi, start + lstart);
int to = Math.min(start + lstart + llen, bimax);
for (int rix = from; rix < to; rix++) {
double[] cvals = c.values(rix);
int cix = c.pos(rix);
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, val);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;
if (start + lstart + llen >= bimax)
start += lstart + llen;
bix += 2;
apos[k] = bix;
astart[k] = start;
} else {
for (int k = 0; k < numVals; k++) {
int boff = _ptr[k];
int blen = len(k);
double val = sumValues(k, kplus, kbuff);
if (val != 0.0) {
Pair<Integer, Integer> tmp = skipScanVal(k, rl);
int bix = tmp.getKey();
int curRunStartOff = tmp.getValue();
int curRunEnd = tmp.getValue();
for (; bix < blen && curRunEnd < ru; bix += 2) {
curRunStartOff = curRunEnd + _data[boff + bix];
curRunEnd = curRunStartOff + _data[boff + bix + 1];
for (int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) {
double[] cvals = c.values(rix);
int cix = c.pos(rix);
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, val);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;