Usage of org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig in the project incubator-systemml by Apache.
Class LibMatrixCUDA, method reduceCol.
/**
 * Performs a column-wise reduction: every column of the input matrix is
 * collapsed into one value, producing an output vector of size 1 x cols.
 * @param gCtx a valid {@link GPUContext}
 * @param instName name of the invoking instruction, used for statistics bookkeeping
 * @param kernelFunction name of the kernel function to invoke
 * @param in {@link Pointer} to input matrix in device memory (size - rows * columns)
 * @param out {@link Pointer} to output matrix in device memory (size - 1 * cols)
 * @param rows number of rows in input matrix
 * @param cols number of columns in input matrix
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
	LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
	// Derive the launch geometry: {grid size, block size, shared memory bytes}.
	int[] launchParams = getKernelParamsForReduceByCol(gCtx, rows, cols);
	ExecutionConfig config = new ExecutionConfig(launchParams[0], launchParams[1], launchParams[2]);
	long start = 0;
	if (GPUStatistics.DISPLAY_STATISTICS)
		start = System.nanoTime();
	getCudaKernels(gCtx).launchKernel(kernelFunction, config, in, out, rows, cols);
	if (GPUStatistics.DISPLAY_STATISTICS)
		GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - start);
}
Usage of org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig in the project incubator-systemml by Apache.
Class LibMatrixCUDA, method reduceRow.
/**
 * Performs a row-wise reduction: every row of the input matrix is
 * collapsed into one value, producing an output vector of size rows x 1.
 * @param gCtx a valid {@link GPUContext}
 * @param instName name of the invoking instruction, used for statistics bookkeeping
 * @param kernelFunction name of the kernel function to invoke
 * @param in {@link Pointer} to input matrix in device memory (size - rows * columns)
 * @param out {@link Pointer} to output matrix in device memory (size - rows * 1)
 * @param rows number of rows in input matrix
 * @param cols number of columns in input matrix
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
	LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
	// Derive the launch geometry: {grid size, block size, shared memory bytes}.
	int[] launchParams = getKernelParamsForReduceByRow(gCtx, rows, cols);
	ExecutionConfig config = new ExecutionConfig(launchParams[0], launchParams[1], launchParams[2]);
	long start = 0;
	if (GPUStatistics.DISPLAY_STATISTICS)
		start = System.nanoTime();
	getCudaKernels(gCtx).launchKernel(kernelFunction, config, in, out, rows, cols);
	if (GPUStatistics.DISPLAY_STATISTICS)
		GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - start);
}
Usage of org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig in the project incubator-systemml by Apache.
Class LibMatrixCUDA, method reduceAll.
/**
 * Performs a full reduction of a device array to a single scalar value.
 * The reduction runs in multiple passes: the first pass reduces the input
 * into one partial result per thread block, and subsequent passes keep
 * reducing those partial results in-place until one value remains, which
 * is then copied back to the host.
 * @param gCtx a valid {@link GPUContext}
 * @param instName name of the invoking instruction, used for statistics bookkeeping
 * @param kernelFunction name of the kernel function to invoke
 * @param in {@link Pointer} to matrix in device memory
 * @param n size of array
 * @return the reduced value
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
	LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
	int[] tmp = getKernelParamsForReduceAll(gCtx, n);
	int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
	// Scratch buffer holding one partial result per block of the first pass.
	// NOTE(review): n * Sizeof.DOUBLE is computed in int arithmetic and could
	// overflow for very large n — confirm against allocate()'s parameter type.
	Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
	try {
		long t1 = 0, t2 = 0, t3 = 0;
		if (GPUStatistics.DISPLAY_STATISTICS)
			t1 = System.nanoTime();
		// First pass: reduce the n input elements into 'blocks' partial results.
		getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
		if (GPUStatistics.DISPLAY_STATISTICS)
			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
		// Subsequent passes: reduce the partial results in-place until one remains.
		int s = blocks;
		while (s > 1) {
			tmp = getKernelParamsForReduceAll(gCtx, s);
			blocks = tmp[0];
			threads = tmp[1];
			sharedMem = tmp[2];
			if (GPUStatistics.DISPLAY_STATISTICS)
				t2 = System.nanoTime();
			getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s);
			if (GPUStatistics.DISPLAY_STATISTICS)
				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
			// Each pass shrinks the element count by a factor of 2 * threads.
			s = (s + (threads * 2 - 1)) / (threads * 2);
		}
		double[] result = { -1.0 }; // sentinel; overwritten by the copy below
		if (GPUStatistics.DISPLAY_STATISTICS)
			t3 = System.nanoTime();
		cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
		if (GPUStatistics.DISPLAY_STATISTICS)
			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
		return result[0];
	} finally {
		// Always release the scratch buffer, even if a kernel launch or the
		// device-to-host copy throws — previously this leaked on failure.
		gCtx.cudaFreeHelper(instName, tempOut);
	}
}
Aggregations of usages end here.