use of org.apache.sysml.runtime.functionobjects.KahanPlus in project incubator-systemml by apache.
the class LibMatrixCUDA method unaryAggregate.
// ********************************************************************/
// ******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
// ********************************************************************/
// ********************************************************************/
// **************** UNARY AGGREGATE Functions ************************/
// ********************************************************************/
/**
* Entry point to perform Unary aggregate operations on the GPU.
* The execution context object is used to allocate memory for the GPU.
*
* @param ec Instance of {@link ExecutionContext}, from which the output variable will be allocated
* @param gCtx a valid {@link GPUContext}
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param in1 input matrix
* @param output output matrix/scalar name
* @param op Instance of {@link AggregateUnaryOperator} which encapsulates the direction of reduction/aggregation and the reduction operation.
*/
public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if (LOG.isTraceEnabled()) {
LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
}
final int REDUCTION_ALL = 1;
final int REDUCTION_ROW = 2;
final int REDUCTION_COL = 3;
final int REDUCTION_DIAG = 4;
// A kahan sum implemention is not provided. is a "uak+" or other kahan operator is encountered,
// it just does regular summation reduction.
final int OP_PLUS = 1;
final int OP_PLUS_SQ = 2;
final int OP_MEAN = 3;
final int OP_VARIANCE = 4;
final int OP_MULTIPLY = 5;
final int OP_MAX = 6;
final int OP_MIN = 7;
final int OP_MAXINDEX = 8;
final int OP_MININDEX = 9;
// Sanity Checks
if (!in1.getGPUObject(gCtx).isAllocated())
throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated());
boolean isSparse = in1.getGPUObject(gCtx).isSparse();
IndexFunction indexFn = op.indexFn;
AggregateOperator aggOp = op.aggOp;
// Convert Reduction direction to a number
int reductionDirection = -1;
if (indexFn instanceof ReduceAll) {
reductionDirection = REDUCTION_ALL;
} else if (indexFn instanceof ReduceRow) {
reductionDirection = REDUCTION_ROW;
} else if (indexFn instanceof ReduceCol) {
reductionDirection = REDUCTION_COL;
} else if (indexFn instanceof ReduceDiag) {
reductionDirection = REDUCTION_DIAG;
} else {
throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
}
if (reductionDirection == -1)
throw new DMLRuntimeException("Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction");
// Convert function type to a number
int opIndex = -1;
if (aggOp.increOp.fn instanceof KahanPlus) {
opIndex = OP_PLUS;
} else if (aggOp.increOp.fn instanceof KahanPlusSq) {
opIndex = OP_PLUS_SQ;
} else if (aggOp.increOp.fn instanceof Mean) {
opIndex = OP_MEAN;
} else if (aggOp.increOp.fn instanceof CM) {
if (((CM) aggOp.increOp.fn).getAggOpType() != CMOperator.AggregateOperationTypes.VARIANCE)
throw new DMLRuntimeException("Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU");
opIndex = OP_VARIANCE;
} else if (aggOp.increOp.fn instanceof Plus) {
opIndex = OP_PLUS;
} else if (aggOp.increOp.fn instanceof Multiply) {
opIndex = OP_MULTIPLY;
} else if (aggOp.increOp.fn instanceof Builtin) {
Builtin b = (Builtin) aggOp.increOp.fn;
switch(b.bFunc) {
case MAX:
opIndex = OP_MAX;
break;
case MIN:
opIndex = OP_MIN;
break;
case MAXINDEX:
opIndex = OP_MAXINDEX;
break;
case MININDEX:
opIndex = OP_MININDEX;
break;
default:
new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
}
} else {
throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
}
if (opIndex == -1)
throw new DMLRuntimeException("Internal Error - Incorrect type of operation set for aggregate unary GPU instruction");
int rlen = (int) in1.getNumRows();
int clen = (int) in1.getNumColumns();
if (isSparse) {
// The strategy for the time being is to convert sparse to dense
// until a sparse specific kernel is written.
in1.getGPUObject(gCtx).sparseToDense(instName);
// long nnz = in1.getNnz();
// assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU";
// MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
// throw new DMLRuntimeException("Internal Error - Not implemented");
}
long outRLen = -1;
long outCLen = -1;
if (indexFn instanceof ReduceRow) {
// COL{SUM, MAX...}
outRLen = 1;
outCLen = clen;
} else if (indexFn instanceof ReduceCol) {
// ROW{SUM, MAX,...}
outRLen = rlen;
outCLen = 1;
}
Pointer out = null;
if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
// Matrix output
MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen);
out = getDensePointer(gCtx, out1, instName);
}
Pointer in = getDensePointer(gCtx, in1, instName);
int size = rlen * clen;
// For scalars, set the scalar output in the Execution Context object
switch(opIndex) {
case OP_PLUS:
{
switch(reductionDirection) {
case REDUCTION_ALL:
{
double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL:
{
// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
break;
}
case REDUCTION_ROW:
{
reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
break;
}
case REDUCTION_DIAG:
throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
}
break;
}
case OP_PLUS_SQ:
{
// Calculate the squares in a temporary object tmp
Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
squareMatrix(gCtx, instName, in, tmp, rlen, clen);
// Then do the sum on the temporary object and free it
switch(reductionDirection) {
case REDUCTION_ALL:
{
double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL:
{
// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
break;
}
case REDUCTION_ROW:
{
reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
}
gCtx.cudaFreeHelper(instName, tmp);
break;
}
case OP_MEAN:
{
switch(reductionDirection) {
case REDUCTION_ALL:
{
double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
double mean = result / size;
ec.setScalarOutput(output, new DoubleObject(mean));
break;
}
case REDUCTION_COL:
{
reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
break;
}
case REDUCTION_ROW:
{
reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
}
break;
}
case OP_MULTIPLY:
{
switch(reductionDirection) {
case REDUCTION_ALL:
{
double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
}
break;
}
case OP_MAX:
{
switch(reductionDirection) {
case REDUCTION_ALL:
{
double result = reduceAll(gCtx, instName, "reduce_max", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL:
{
reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
break;
}
case REDUCTION_ROW:
{
reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
}
break;
}
case OP_MIN:
{
switch(reductionDirection) {
case REDUCTION_ALL:
{
double result = reduceAll(gCtx, instName, "reduce_min", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL:
{
reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
break;
}
case REDUCTION_ROW:
{
reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
}
break;
}
case OP_VARIANCE:
{
// Temporary GPU array for
Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType);
switch(reductionDirection) {
case REDUCTION_ALL:
{
double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
double mean = result / size;
// Subtract mean from every element in the matrix
ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
double variance = result2 / (size - 1);
ec.setScalarOutput(output, new DoubleObject(variance));
break;
}
case REDUCTION_COL:
{
reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
// Subtract the row-wise mean from every element in the matrix
BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType);
reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
gCtx.cudaFreeHelper(instName, tmpRow);
break;
}
case REDUCTION_ROW:
{
reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
// Subtract the columns-wise mean from every element in the matrix
BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType);
reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
gCtx.cudaFreeHelper(instName, tmpCol);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
}
gCtx.cudaFreeHelper(instName, tmp);
gCtx.cudaFreeHelper(instName, tmp2);
break;
}
case OP_MAXINDEX:
{
switch(reductionDirection) {
case REDUCTION_COL:
throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
}
// break;
}
case OP_MININDEX:
{
switch(reductionDirection) {
case REDUCTION_COL:
throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
}
// break;
}
default:
throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
}
}
use of org.apache.sysml.runtime.functionobjects.KahanPlus in project incubator-systemml by apache.
the class SpoofCellwise method execute.
@Override
public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, int k) {
// sanity check
if (inputs == null || inputs.size() < 1)
throw new RuntimeException("Invalid input arguments.");
// input preparation
MatrixBlock a = inputs.get(0);
SideInput[] b = prepInputMatrices(inputs);
double[] scalars = prepInputScalars(scalarObjects);
final int m = a.getNumRows();
final int n = a.getNumColumns();
// sparse safe check
boolean sparseSafe = isSparseSafe() || (b.length == 0 && genexec(0, b, scalars, m, n, 0, 0) == 0);
long inputSize = sparseSafe ? getTotalInputNnz(inputs) : getTotalInputSize(inputs);
if (inputSize < PAR_NUMCELL_THRESHOLD) {
// serial execution
k = 1;
}
double ret = 0;
if (// SINGLE-THREADED
k <= 1) {
if (inputs.get(0) instanceof CompressedMatrixBlock)
ret = executeCompressedAndAgg((CompressedMatrixBlock) a, b, scalars, m, n, sparseSafe, 0, m);
else if (!inputs.get(0).isInSparseFormat())
ret = executeDenseAndAgg(a.getDenseBlock(), b, scalars, m, n, sparseSafe, 0, m);
else
ret = executeSparseAndAgg(a.getSparseBlock(), b, scalars, m, n, sparseSafe, 0, m);
} else // MULTI-THREADED
{
try {
ExecutorService pool = CommonThreadPool.get(k);
ArrayList<ParAggTask> tasks = new ArrayList<>();
int nk = (a instanceof CompressedMatrixBlock) ? k : UtilFunctions.roundToNext(Math.min(8 * k, m / 32), k);
int blklen = (int) (Math.ceil((double) m / nk));
if (a instanceof CompressedMatrixBlock)
blklen = BitmapEncoder.getAlignedBlocksize(blklen);
for (int i = 0; i < nk & i * blklen < m; i++) tasks.add(new ParAggTask(a, b, scalars, m, n, sparseSafe, i * blklen, Math.min((i + 1) * blklen, m)));
// execute tasks
List<Future<Double>> taskret = pool.invokeAll(tasks);
pool.shutdown();
// aggregate partial results
ValueFunction vfun = getAggFunction();
if (vfun instanceof KahanFunction) {
KahanObject kbuff = new KahanObject(0, 0);
KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
for (Future<Double> task : taskret) kplus.execute2(kbuff, task.get());
ret = kbuff._sum;
} else {
for (Future<Double> task : taskret) ret = vfun.execute(ret, task.get());
}
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
}
// correction for min/max
if ((_aggOp == AggOp.MIN || _aggOp == AggOp.MAX) && sparseSafe && a.getNonZeros() < a.getNumRows() * a.getNumColumns())
// unseen 0 might be max or min value
ret = getAggFunction().execute(ret, 0);
return new DoubleObject(ret);
}
use of org.apache.sysml.runtime.functionobjects.KahanPlus in project incubator-systemml by apache.
the class SpoofMultiAggregate method aggregatePartialResults.
public static void aggregatePartialResults(AggOp[] aggOps, MatrixBlock c, MatrixBlock b) {
ValueFunction[] vfun = getAggFunctions(aggOps);
for (int k = 0; k < aggOps.length; k++) {
if (vfun[k] instanceof KahanFunction) {
KahanObject kbuff = new KahanObject(c.quickGetValue(0, k), 0);
KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
kplus.execute2(kbuff, b.quickGetValue(0, k));
c.quickSetValue(0, k, kbuff._sum);
} else {
double cval = c.quickGetValue(0, k);
double bval = b.quickGetValue(0, k);
c.quickSetValue(0, k, vfun[k].execute(cval, bval));
}
}
}
use of org.apache.sysml.runtime.functionobjects.KahanPlus in project incubator-systemml by apache.
the class SpoofMultiAggregate method aggregatePartialResults.
private void aggregatePartialResults(double[] c, ArrayList<double[]> pret) {
ValueFunction[] vfun = getAggFunctions(_aggOps);
for (int k = 0; k < _aggOps.length; k++) {
if (vfun[k] instanceof KahanFunction) {
KahanObject kbuff = new KahanObject(0, 0);
KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
for (double[] tmp : pret) kplus.execute2(kbuff, tmp[k]);
c[k] = kbuff._sum;
} else {
for (double[] tmp : pret) c[k] = vfun[k].execute(c[k], tmp[k]);
}
}
}
use of org.apache.sysml.runtime.functionobjects.KahanPlus in project incubator-systemml by apache.
the class ColGroupDDC1 method computeRowSums.
public static void computeRowSums(ColGroupDDC1[] grps, MatrixBlock result, KahanFunction kplus, int rl, int ru) {
// note: due to corrections the output might be a large dense block
DenseBlock c = result.getDenseBlock();
KahanObject kbuff = new KahanObject(0, 0);
KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
// prepare distinct values once
double[][] vals = new double[grps.length][];
for (int i = 0; i < grps.length; i++) {
// pre-aggregate all distinct values (guaranteed <=255)
vals[i] = grps[i].sumAllValues(kplus, kbuff);
}
// cache-conscious row sums operations
// iterative over codes of all groups and add to output
// (use kahan plus not general KahanFunction for correctness in case of sqk+)
// 16KB
int blksz = 1024;
double[] tmpAgg = new double[blksz];
for (int bi = rl; bi < ru; bi += blksz) {
Arrays.fill(tmpAgg, 0);
// aggregate all groups
for (int j = 0; j < grps.length; j++) {
double[] valsj = vals[j];
byte[] dataj = grps[j]._data;
for (int i = bi; i < Math.min(bi + blksz, ru); i++) tmpAgg[i - bi] += valsj[dataj[i] & 0xFF];
}
// add partial results of all ddc groups
for (int i = bi; i < Math.min(bi + blksz, ru); i++) {
double[] cvals = c.values(i);
int cix = c.pos(i);
kbuff.set(cvals[cix], cvals[cix + 1]);
kplus2.execute2(kbuff, tmpAgg[i - bi]);
cvals[cix] = kbuff._sum;
cvals[cix + 1] = kbuff._correction;
}
}
}
Aggregations