Search in sources :

Example 86 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

Source: the class LibMatrixCUDA, method sliceOperations.

// ********************************************************************/
// **************** Matrix Manipulation Functions *********************/
// ********************************************************************/
/**
 * Method to perform rightIndex operation for a given lower and upper bounds in row and column dimensions.
 * The output is always allocated as a dense matrix of size (ru-rl+1) x (cu-cl+1).
 *
 * @param ec current execution context
 * @param gCtx current gpu context
 * @param instName name of the instruction for maintaining statistics
 * @param in1 input matrix object
 * @param ixrange index range (0-based)
 * @param outputName name of the output matrix variable
 */
public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, IndexRange ixrange, String outputName) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx);
    }
    int rl = (int) ixrange.rowStart;
    int ru = (int) ixrange.rowEnd;
    int cl = (int) ixrange.colStart;
    int cu = (int) ixrange.colEnd;
    // Validate row and column ranges symmetrically. (Previously the column part
    // tested "cu >= numColumns" twice instead of also testing "cl >= numColumns";
    // the set of accepted indexes is unchanged since cl <= cu is also required.)
    if (rl < 0 || rl >= in1.getNumRows() || ru < rl || ru >= in1.getNumRows() || cl < 0 || cl >= in1.getNumColumns() || cu < cl || cu >= in1.getNumColumns()) {
        throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + "," + (cl + 1) + ":" + (cu + 1) + "] " + "must be within matrix dimensions [" + in1.getNumRows() + "," + in1.getNumColumns() + "]");
    }
    // Number of columns of the input, needed by the slice kernels for row offsets
    int len1 = toInt(in1.getNumColumns());
    // The output is dense regardless of the input format; allocate it once here
    // instead of duplicating the identical call in both branches below.
    MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
    if (isInSparseFormat(gCtx, in1)) {
        // Input in1 is in sparse format and output is in dense format
        CSRPointer inPointer = getSparsePointer(gCtx, in1, instName);
        Pointer outPointer = getDensePointer(gCtx, out, instName);
        sliceSparseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1);
    } else {
        // Input in1 is in dense format (see inPointer)
        Pointer inPointer = getDensePointer(gCtx, in1, instName);
        Pointer outPointer = getDensePointer(gCtx, out, instName);
        sliceDenseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1);
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 87 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

Source: the class LibMatrixCUDA, method setOutputToConstant.

/**
 * Fills an array on the GPU with a given scalar value.
 * A constant of 0 produces an empty sparse output (no kernel launch needed);
 * any other constant produces a dense output filled via the "fill" CUDA kernel.
 * @param ec					currently active instance of the {@link ExecutionContext}
 * @param gCtx        a valid {@link GPUContext}
 * @param instName    name of the invoking instruction to record{@link Statistics}.
 * @param constant		scalar value with which to fill the matrix
 * @param outputName	(internal) name of the matrix that is to be filled
 * @param numRows number of rows of output matrix object
 * @param numCols number of columns of output matrix object
 */
private static void setOutputToConstant(ExecutionContext ec, GPUContext gCtx, String instName, double constant, String outputName, long numRows, long numCols) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (constant == 0) {
        // An all-zero matrix is represented as an empty sparse matrix
        getSparseMatrixOutputForGPUInstruction(ec, numRows, numCols, 0, instName, outputName);
    } else {
        // Allocated the dense output matrix
        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols);
        Pointer A = getDensePointer(gCtx, out, instName);
        int rlen = toInt(out.getNumRows());
        int clen = toInt(out.getNumColumns());
        long t0 = 0;
        if (DMLScript.FINEGRAINED_STATISTICS)
            t0 = System.nanoTime();
        // Compute the element count in long arithmetic: rlen and clen each fit in
        // an int, but their product may overflow, which would previously have
        // passed a silently-wrapped size to the kernel. toInt throws on overflow.
        int size = toInt((long) rlen * clen);
        getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size);
        if (DMLScript.FINEGRAINED_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0);
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 88 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

Source: the class LibMatrixCUDA, method matrixMatrixOp.

/**
 * Utility to launch binary cellwise matrix-matrix operations CUDA kernel.
 * Empty (sparse, zero-nnz) operands are special-cased: empty-op-empty is
 * materialized directly, and empty-op-matrix is rewritten as a scalar
 * operation with 0.0; otherwise the dense kernel is launched.
 *
 * @param gCtx              a valid {@link GPUContext}
 * @param ec                execution context
 * @param instName          the invoking instruction's name for record {@link Statistics}.
 * @param in1               left input matrix
 * @param in2               right input matrix
 * @param outputName        output variable name
 * @param isLeftTransposed  true if left matrix is transposed
 * @param isRightTransposed true if right matrix is transposed
 * @param op                operator
 */
private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    boolean leftSparseEmpty = isSparseAndEmpty(gCtx, in1);
    boolean rightSparseEmpty = isSparseAndEmpty(gCtx, in2);
    int rowsLeft = toInt(in1.getNumRows());
    int rowsRight = toInt(in2.getNumRows());
    int colsLeft = toInt(in1.getNumColumns());
    int colsRight = toInt(in2.getNumColumns());
    int vecStatusLeft = getVectorStatus(rowsLeft, colsLeft).code();
    int vecStatusRight = getVectorStatus(rowsRight, colsRight).code();
    if (isLeftTransposed || isRightTransposed) {
        throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed);
    }
    // Broadcasting semantics: each output dimension is the larger of the pair
    long outRows = Math.max(rowsLeft, rowsRight);
    long outCols = Math.max(colsLeft, colsRight);
    if (leftSparseEmpty && rightSparseEmpty) {
        MatrixObject out = ec.allocateGPUMatrixObject(outputName, outRows, outCols);
        // When both inputs are empty, the output is empty too (except in the case of division)
        if (op.fn instanceof Divide || op.fn instanceof IntegerDivide || op.fn instanceof Modulus) {
            out.getGPUObject(gCtx).allocateAndFillDense(Double.NaN);
        } else if (op.fn instanceof Minus1Multiply) {
            out.getGPUObject(gCtx).allocateAndFillDense(1.0);
        } else {
            out.getGPUObject(gCtx).allocateSparseAndEmpty();
        }
    } else if (leftSparseEmpty && colsRight != 1 && rowsRight != 1) {
        // M1 is empty and M2 is not a vector: C = empty_in1 op in2 ==> C = 0.0 op in2
        matrixScalarArithmetic(ec, gCtx, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
    } else if (rightSparseEmpty && colsLeft != 1 && rowsLeft != 1) {
        // M2 is empty and M1 is not a vector: C = in1 op empty_in2 ==> C = in1 op 0.0
        matrixScalarArithmetic(ec, gCtx, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
    } else {
        // General case: launch the dense cellwise kernel.
        // TODO: FIXME: Implement sparse binCellSparseOp kernel
        Pointer ptrLeft = getDensePointer(gCtx, in1, instName);
        Pointer ptrRight = getDensePointer(gCtx, in2, instName);
        MatrixObject out;
        try {
            // Allocate the dense output matrix
            out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRows, outCols);
        } catch (DMLRuntimeException e) {
            throw new DMLRuntimeException("Incorrect dimensions: dimA:[" + rowsLeft + "," + colsLeft + "]" + " dimB:[" + rowsRight + "," + colsRight + "] out:[" + outRows + "," + outCols + "]", e);
        }
        Pointer ptrOut = getDensePointer(gCtx, out, instName);
        matrixMatrixOp(gCtx, instName, ptrLeft, ptrRight, Math.max(rowsLeft, rowsRight), Math.max(colsLeft, colsRight), vecStatusLeft, vecStatusRight, ptrOut, op);
    }
}
Also used : IntegerDivide(org.apache.sysml.runtime.functionobjects.IntegerDivide) Divide(org.apache.sysml.runtime.functionobjects.Divide) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Modulus(org.apache.sysml.runtime.functionobjects.Modulus) LeftScalarOperator(org.apache.sysml.runtime.matrix.operators.LeftScalarOperator) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) RightScalarOperator(org.apache.sysml.runtime.matrix.operators.RightScalarOperator) IntegerDivide(org.apache.sysml.runtime.functionobjects.IntegerDivide) Minus1Multiply(org.apache.sysml.runtime.functionobjects.Minus1Multiply) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 89 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

Source: the class LibMatrixCUDA, method solve.

/**
 * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
 * via QR decomposition: A = QR, then x = R \ (Q^T * B).
 * The inputs are cloned before factorization because the cuSOLVER/cuBLAS
 * routines used here are destructive to their arguments, and are converted
 * to the column-major layout those routines expect.
 *
 * @param ec         a valid {@link ExecutionContext}
 * @param gCtx       a valid {@link GPUContext}
 * @param instName   the invoking instruction's name for record {@link Statistics}.
 * @param in1        input matrix A
 * @param in2        input matrix B
 * @param outputName name of the output matrix
 */
public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    // x = solve(A, b)
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
    }
    long t0 = -1;
    // step 1: ensure both inputs are dense on the device
    GPUObject Aobj = in1.getGPUObject(gCtx);
    if (isInSparseFormat(gCtx, in1))
        Aobj.sparseToDense(instName);
    GPUObject bobj = in2.getGPUObject(gCtx);
    if (isInSparseFormat(gCtx, in2))
        bobj.sparseToDense(instName);
    int m = (int) in1.getNumRows();
    int n = (int) in1.getNumColumns();
    if ((int) in2.getNumRows() != m)
        throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
    if ((int) in2.getNumColumns() != 1)
        throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
    // step 2: clone the inputs (geqrf/ormqr/trsm below overwrite their arguments)
    // and are destructive to the original input
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    GPUObject ATobj = (GPUObject) Aobj.clone();
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    // cuSOLVER expects column-major input
    ATobj.denseRowMajorToColumnMajor();
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
    Pointer A = ATobj.getJcudaDenseMatrixPtr();
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    GPUObject bTobj = (GPUObject) bobj.clone();
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    bTobj.denseRowMajorToColumnMajor();
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
    Pointer b = bTobj.getJcudaDenseMatrixPtr();
    // step 3: query working space of geqrf and ormqr
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    int[] lwork = { 0 };
    cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
    // step 4: compute QR factorization
    Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType);
    Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
    // NOTE(review): unlike the allocations above, this one does not pass instName
    // (untracked in per-instruction statistics) — confirm whether that is intended.
    Pointer devInfo = gCtx.allocate(Sizeof.INT);
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
    int[] qrError = { -1 };
    cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
    if (qrError[0] != 0) {
        throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
    }
    // step 5: compute Q^T*B
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
    cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
    if (qrError[0] != 0) {
        throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
    }
    // step 6: compute x = R \ Q^T*B
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, n, 1, dataTypePointerTo(1.0), A, m, b, m);
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
    if (DMLScript.FINEGRAINED_STATISTICS)
        t0 = System.nanoTime();
    // convert the solution vector back to the engine's row-major layout
    bTobj.denseColumnMajorToRowMajor();
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
    // TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
    // There is an avoidable copy happening here
    MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
    cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
    // release all device-side temporaries; devInfo was previously leaked here
    gCtx.cudaFreeHelper(instName, work);
    gCtx.cudaFreeHelper(instName, tau);
    gCtx.cudaFreeHelper(instName, devInfo);
    ATobj.clearData();
    bTobj.clearData();
// debugPrintMatrix(b, n, 1);
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) GPUObject(org.apache.sysml.runtime.instructions.gpu.context.GPUObject) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 90 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

Source: the class LibMatrixCUDA, method unaryAggregate.

// ********************************************************************/
// ******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
// ********************************************************************/
// ********************************************************************/
// ****************  UNARY AGGREGATE Functions ************************/
// ********************************************************************/
/**
 * Entry point to perform Unary aggregate operations on the GPU.
 * The execution context object is used to allocate memory for the GPU.
 *
 * @param ec       Instance of {@link ExecutionContext}, from which the output variable will be allocated
 * @param gCtx     a valid {@link GPUContext}
 * @param instName name of the invoking instruction to record{@link Statistics}.
 * @param in1      input matrix
 * @param output   output matrix/scalar name
 * @param op       Instance of {@link AggregateUnaryOperator} which encapsulates the direction of reduction/aggregation and the reduction operation.
 */
public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) {
    if (ec.getGPUContext(0) != gCtx)
        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
    }
    final int REDUCTION_ALL = 1;
    final int REDUCTION_ROW = 2;
    final int REDUCTION_COL = 3;
    final int REDUCTION_DIAG = 4;
    // A kahan sum implemention is not provided. is a "uak+" or other kahan operator is encountered,
    // it just does regular summation reduction.
    final int OP_PLUS = 1;
    final int OP_PLUS_SQ = 2;
    final int OP_MEAN = 3;
    final int OP_VARIANCE = 4;
    final int OP_MULTIPLY = 5;
    final int OP_MAX = 6;
    final int OP_MIN = 7;
    final int OP_MAXINDEX = 8;
    final int OP_MININDEX = 9;
    // Sanity Checks
    if (!in1.getGPUObject(gCtx).isAllocated())
        throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated());
    boolean isSparse = in1.getGPUObject(gCtx).isSparse();
    IndexFunction indexFn = op.indexFn;
    AggregateOperator aggOp = op.aggOp;
    // Convert Reduction direction to a number
    int reductionDirection = -1;
    if (indexFn instanceof ReduceAll) {
        reductionDirection = REDUCTION_ALL;
    } else if (indexFn instanceof ReduceRow) {
        reductionDirection = REDUCTION_ROW;
    } else if (indexFn instanceof ReduceCol) {
        reductionDirection = REDUCTION_COL;
    } else if (indexFn instanceof ReduceDiag) {
        reductionDirection = REDUCTION_DIAG;
    } else {
        throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
    }
    if (reductionDirection == -1)
        throw new DMLRuntimeException("Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction");
    // Convert function type to a number
    int opIndex = -1;
    if (aggOp.increOp.fn instanceof KahanPlus) {
        opIndex = OP_PLUS;
    } else if (aggOp.increOp.fn instanceof KahanPlusSq) {
        opIndex = OP_PLUS_SQ;
    } else if (aggOp.increOp.fn instanceof Mean) {
        opIndex = OP_MEAN;
    } else if (aggOp.increOp.fn instanceof CM) {
        if (((CM) aggOp.increOp.fn).getAggOpType() != CMOperator.AggregateOperationTypes.VARIANCE)
            throw new DMLRuntimeException("Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU");
        opIndex = OP_VARIANCE;
    } else if (aggOp.increOp.fn instanceof Plus) {
        opIndex = OP_PLUS;
    } else if (aggOp.increOp.fn instanceof Multiply) {
        opIndex = OP_MULTIPLY;
    } else if (aggOp.increOp.fn instanceof Builtin) {
        Builtin b = (Builtin) aggOp.increOp.fn;
        switch(b.bFunc) {
            case MAX:
                opIndex = OP_MAX;
                break;
            case MIN:
                opIndex = OP_MIN;
                break;
            case MAXINDEX:
                opIndex = OP_MAXINDEX;
                break;
            case MININDEX:
                opIndex = OP_MININDEX;
                break;
            default:
                new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
        }
    } else {
        throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
    }
    if (opIndex == -1)
        throw new DMLRuntimeException("Internal Error - Incorrect type of operation set for aggregate unary GPU instruction");
    int rlen = (int) in1.getNumRows();
    int clen = (int) in1.getNumColumns();
    if (isSparse) {
        // The strategy for the time being is to convert sparse to dense
        // until a sparse specific kernel is written.
        in1.getGPUObject(gCtx).sparseToDense(instName);
    // long nnz = in1.getNnz();
    // assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU";
    // MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
    // throw new DMLRuntimeException("Internal Error - Not implemented");
    }
    long outRLen = -1;
    long outCLen = -1;
    if (indexFn instanceof ReduceRow) {
        // COL{SUM, MAX...}
        outRLen = 1;
        outCLen = clen;
    } else if (indexFn instanceof ReduceCol) {
        // ROW{SUM, MAX,...}
        outRLen = rlen;
        outCLen = 1;
    }
    Pointer out = null;
    if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
        // Matrix output
        MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen);
        out = getDensePointer(gCtx, out1, instName);
    }
    Pointer in = getDensePointer(gCtx, in1, instName);
    int size = rlen * clen;
    // For scalars, set the scalar output in the Execution Context object
    switch(opIndex) {
        case OP_PLUS:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
                            reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_DIAG:
                        throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
                }
                break;
            }
        case OP_PLUS_SQ:
            {
                // Calculate the squares in a temporary object tmp
                Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
                squareMatrix(gCtx, instName, in, tmp, rlen, clen);
                // Then do the sum on the temporary object and free it
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
                            reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
                }
                gCtx.cudaFreeHelper(instName, tmp);
                break;
            }
        case OP_MEAN:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
                            double mean = result / size;
                            ec.setScalarOutput(output, new DoubleObject(mean));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
                }
                break;
            }
        case OP_MULTIPLY:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
                }
                break;
            }
        case OP_MAX:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_max", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
                }
                break;
            }
        case OP_MIN:
            {
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_min", in, size);
                            ec.setScalarOutput(output, new DoubleObject(result));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
                }
                break;
            }
        case OP_VARIANCE:
            {
                // Temporary GPU array for
                Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
                Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType);
                switch(reductionDirection) {
                    case REDUCTION_ALL:
                        {
                            double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
                            // Tail of the REDUCTION_ALL variance case: 'result' holds sum(in) and
                            // 'size' = rlen*clen — both computed above this excerpt (TODO confirm
                            // against the full method).
                            double mean = result / size;
                            // Subtract mean from every element in the matrix.
                            // NOTE(review): 'mean' is passed both as the raw scalar argument and
                            // inside minusOp; presumably the kernel uses the operator's constant —
                            // verify in matrixScalarOp.
                            ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
                            matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
                            // Square each deviation: tmp2[i] = (in[i] - mean)^2
                            squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                            // Sum of squared deviations over all cells.
                            double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
                            // Sample variance (Bessel's correction: divide by n-1).
                            // NOTE(review): size == 1 divides by zero -> Infinity/NaN; confirm
                            // callers guard against 1x1 inputs.
                            double variance = result2 / (size - 1);
                            ec.setScalarOutput(output, new DoubleObject(variance));
                            break;
                        }
                    case REDUCTION_COL:
                        {
                            // Row-wise variance: one sample variance per row, taken across that
                            // row's clen columns (divisor clen-1 below); written to 'out',
                            // presumably as an rlen x 1 vector — TODO confirm out's shape.
                            reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                            // Subtract the row-wise mean from every element in the matrix
                            // (VectorShape.COLUMN broadcasts the per-row mean vector across columns).
                            BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                            matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
                            // tmp2 = squared deviations.
                            squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                            // Per-row sum of squared deviations, then divide by clen-1.
                            // NOTE(review): tmpRow is leaked if any call below throws — no
                            // try/finally around the allocation.
                            Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType);
                            reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
                            // Same duplicated-scalar pattern as above: clen-1 appears both in
                            // divideOp and as the raw scalar argument.
                            ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
                            matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
                            gCtx.cudaFreeHelper(instName, tmpRow);
                            break;
                        }
                    case REDUCTION_ROW:
                        {
                            // Column-wise variance: one sample variance per column, taken across
                            // that column's rlen rows (divisor rlen-1 below); mirror image of the
                            // REDUCTION_COL branch above.
                            reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                            // Subtract the column-wise mean from every element in the matrix
                            // (VectorShape.ROW broadcasts the per-column mean vector across rows).
                            BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                            matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
                            // tmp2 = squared deviations.
                            squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                            // Per-column sum of squared deviations, then divide by rlen-1.
                            // NOTE(review): tmpCol is leaked if any call below throws — no
                            // try/finally around the allocation.
                            Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType);
                            reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
                            ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
                            matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
                            gCtx.cudaFreeHelper(instName, tmpCol);
                            break;
                        }
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
                }
                // Release the scratch buffers shared by all variance branches
                // (allocated above this excerpt — TODO confirm). Like tmpRow/tmpCol,
                // these leak if an earlier call throws.
                gCtx.cudaFreeHelper(instName, tmp);
                gCtx.cudaFreeHelper(instName, tmp2);
                break;
            }
        case OP_MAXINDEX:
            {
                // rowIndexMax on GPU: only the column-reduction direction is even
                // named, and it is itself unimplemented — every path throws.
                switch(reductionDirection) {
                    case REDUCTION_COL:
                        throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
                }
            // break; (unreachable: every case above throws)
            }
        case OP_MININDEX:
            {
                // rowIndexMin on GPU: same as OP_MAXINDEX — every path throws.
                switch(reductionDirection) {
                    case REDUCTION_COL:
                        throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
                    default:
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
                }
            // break; (unreachable: every case above throws)
            }
        default:
            throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
    }
}
Also used : ReduceCol(org.apache.sysml.runtime.functionobjects.ReduceCol) ScalarOperator(org.apache.sysml.runtime.matrix.operators.ScalarOperator) LeftScalarOperator(org.apache.sysml.runtime.matrix.operators.LeftScalarOperator) RightScalarOperator(org.apache.sysml.runtime.matrix.operators.RightScalarOperator) ReduceAll(org.apache.sysml.runtime.functionobjects.ReduceAll) Mean(org.apache.sysml.runtime.functionobjects.Mean) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ReduceDiag(org.apache.sysml.runtime.functionobjects.ReduceDiag) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) CM(org.apache.sysml.runtime.functionobjects.CM) CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) RightScalarOperator(org.apache.sysml.runtime.matrix.operators.RightScalarOperator) ReduceRow(org.apache.sysml.runtime.functionobjects.ReduceRow) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IndexFunction(org.apache.sysml.runtime.functionobjects.IndexFunction) Multiply(org.apache.sysml.runtime.functionobjects.Multiply) Minus1Multiply(org.apache.sysml.runtime.functionobjects.Minus1Multiply) AggregateOperator(org.apache.sysml.runtime.matrix.operators.AggregateOperator) KahanPlus(org.apache.sysml.runtime.functionobjects.KahanPlus) KahanPlusSq(org.apache.sysml.runtime.functionobjects.KahanPlusSq) KahanPlus(org.apache.sysml.runtime.functionobjects.KahanPlus) Plus(org.apache.sysml.runtime.functionobjects.Plus) BinaryOperator(org.apache.sysml.runtime.matrix.operators.BinaryOperator) Builtin(org.apache.sysml.runtime.functionobjects.Builtin)

Aggregations

MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)201 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)74 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)45 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)39 Data (org.apache.sysml.runtime.instructions.cp.Data)37 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)26 Pointer (jcuda.Pointer)20 CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)20 IOException (java.io.IOException)17 ArrayList (java.util.ArrayList)16 ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject)14 OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo)13 CacheableData (org.apache.sysml.runtime.controlprogram.caching.CacheableData)12 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)12 Hop (org.apache.sysml.hops.Hop)11 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)11 ParForProgramBlock (org.apache.sysml.runtime.controlprogram.ParForProgramBlock)10 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)10 Path (org.apache.hadoop.fs.Path)9 LongWritable (org.apache.hadoop.io.LongWritable)9