Search in sources :

Example 1 with ExecutionConfig

Use of org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig in the project incubator-systemml by Apache.

From the class LibMatrixCUDA, method reduceCol.

/**
 * Performs a column-wise reduction: the input matrix is reduced along each
 * column and the result is written out as a 1 x cols row vector.
 *
 * @param gCtx           a valid {@link GPUContext}
 * @param instName       name of the invoking instruction, used for statistics bookkeeping
 * @param kernelFunction name of the CUDA kernel function to invoke
 * @param in             {@link Pointer} to the input matrix in device memory (size: rows * cols)
 * @param out            {@link Pointer} to the output vector in device memory (size: 1 * cols)
 * @param rows           number of rows in the input matrix
 * @param cols           number of columns in the input matrix
 * @throws DMLRuntimeException if a DMLRuntimeException occurs
 */
private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
    LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
    // Launch configuration (grid size, block size, shared memory bytes) is
    // derived from the matrix dimensions by a sibling helper.
    final int[] launchParams = getKernelParamsForReduceByCol(gCtx, rows, cols);
    final int numBlocks = launchParams[0];
    final int numThreads = launchParams[1];
    final int sharedMemBytes = launchParams[2];
    long start = 0;
    if (GPUStatistics.DISPLAY_STATISTICS) {
        start = System.nanoTime();
    }
    getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(numBlocks, numThreads, sharedMemBytes), in, out, rows, cols);
    if (GPUStatistics.DISPLAY_STATISTICS) {
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - start);
    }
}
Also used : ExecutionConfig(org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig)

Example 2 with ExecutionConfig

Use of org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig in the project incubator-systemml by Apache.

From the class LibMatrixCUDA, method reduceRow.

/**
 * Performs a row-wise reduction: the input matrix is reduced along each row
 * and the result is written out as a rows x 1 column vector.
 *
 * @param gCtx           a valid {@link GPUContext}
 * @param instName       name of the invoking instruction, used for statistics bookkeeping
 * @param kernelFunction name of the CUDA kernel function to invoke
 * @param in             {@link Pointer} to the input matrix in device memory (size: rows * cols)
 * @param out            {@link Pointer} to the output vector in device memory (size: rows * 1)
 * @param rows           number of rows in the input matrix
 * @param cols           number of columns in the input matrix
 * @throws DMLRuntimeException if a DMLRuntimeException occurs
 */
private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
    LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
    // Launch configuration (grid size, block size, shared memory bytes) is
    // derived from the matrix dimensions by a sibling helper.
    final int[] launchParams = getKernelParamsForReduceByRow(gCtx, rows, cols);
    final int numBlocks = launchParams[0];
    final int numThreads = launchParams[1];
    final int sharedMemBytes = launchParams[2];
    long start = 0;
    if (GPUStatistics.DISPLAY_STATISTICS) {
        start = System.nanoTime();
    }
    getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(numBlocks, numThreads, sharedMemBytes), in, out, rows, cols);
    if (GPUStatistics.DISPLAY_STATISTICS) {
        GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - start);
    }
}
Also used : ExecutionConfig(org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig)

Example 3 with ExecutionConfig

Use of org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig in the project incubator-systemml by Apache.

From the class LibMatrixCUDA, method reduceAll.

/**
 * Do a simple full reduction over a device array; the output is a single scalar.
 * Launches the reduction kernel repeatedly (each pass shrinks the partial-result
 * count) until a single value remains, then copies it back to the host.
 *
 * @param gCtx           a valid {@link GPUContext}
 * @param instName       name of the invoking instruction, used for statistics bookkeeping
 * @param kernelFunction name of the CUDA kernel function to invoke
 * @param in             {@link Pointer} to the input array in device memory
 * @param n              number of elements in the input array
 * @return the reduced value
 * @throws DMLRuntimeException if a DMLRuntimeException occurs
 */
private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
    LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
    int[] tmp = getKernelParamsForReduceAll(gCtx, n);
    int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
    // NOTE(review): blocks * Sizeof.DOUBLE would suffice for the scratch buffer
    // (the first pass writes one partial per block); n * Sizeof.DOUBLE is kept
    // to preserve the existing allocation behavior — confirm before shrinking.
    Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
    // FIX: wrap all use of tempOut in try/finally so the device buffer is freed
    // even when a kernel launch or the memcpy throws (previously it leaked).
    try {
        long t1 = 0, t2 = 0, t3 = 0;
        if (GPUStatistics.DISPLAY_STATISTICS)
            t1 = System.nanoTime();
        // First pass: reduce the n inputs down to 'blocks' partial results.
        getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
        // Subsequent passes: keep reducing the partials in place until one remains.
        int s = blocks;
        while (s > 1) {
            tmp = getKernelParamsForReduceAll(gCtx, s);
            blocks = tmp[0];
            threads = tmp[1];
            sharedMem = tmp[2];
            if (GPUStatistics.DISPLAY_STATISTICS)
                t2 = System.nanoTime();
            getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s);
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
            // Each pass processes 2*threads elements per block.
            s = (s + (threads * 2 - 1)) / (threads * 2);
        }
        // Copy the single remaining value back to the host. Sentinel -1 is only
        // visible if cudaMemcpy fails to overwrite it.
        double[] result = { -1d };
        if (GPUStatistics.DISPLAY_STATISTICS)
            t3 = System.nanoTime();
        cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
        if (GPUStatistics.DISPLAY_STATISTICS)
            GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
        return result[0];
    } finally {
        // Always release the device scratch buffer.
        gCtx.cudaFreeHelper(instName, tempOut);
    }
}
Also used : CSRPointer(org.apache.sysml.runtime.instructions.gpu.context.CSRPointer) Pointer(jcuda.Pointer) ExecutionConfig(org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig)

Aggregations

ExecutionConfig (org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig)3 Pointer (jcuda.Pointer)1 CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)1