【发布时间】:2019-09-12 04:29:19
【问题描述】:
我正在尝试使用 OpenCL 处理 640x480@25fps 的摄像头输入:由 OpenCV 负责捕获图像,由 GPU 进行图像处理。问题是内核的性能太差了。
内核:
/*
 * Adds the brightness offset B to each byte of A and stores the result in C,
 * saturating at 255.
 *
 * Each work-item processes the single element at its global id in dimension 0.
 * There is no bounds check, so the host must not launch more work-items than
 * there are elements in A and C.
 */
__kernel void brightness(__global const uchar *A, uchar B, __global uchar *C) {
    const size_t i = get_global_id(0);
    // add_sat clamps the uchar sum at 255 and reads A[i] from global memory once,
    // instead of evaluating A[i]+B twice as the hand-written ternary did.
    C[i] = add_sat(A[i], B);
}
设置GPU:
// Handles and status code shared by the OpenCL setup steps below.
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = 0;
// Load the OpenCL kernel source from disk into a heap buffer.
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("./OpenCLFiles/brightness.cl", "r");
if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
if (!source_str) {
    // Without this check fread below would write through a null pointer.
    fprintf(stderr, "Failed to allocate kernel source buffer.\n");
    fclose(fp);
    exit(1);
}
// source_size is the number of bytes actually read (at most MAX_SOURCE_SIZE).
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
if (source_size == 0) {
    // An empty or unreadable file cannot yield a usable program; fail early
    // with a clear message rather than at kernel creation.
    fprintf(stderr, "Kernel source file is empty or could not be read.\n");
    free(source_str);
    exit(1);
}
// Get platform and device information.
// Each status is checked before the next call overwrites ret; previously a
// failure here was silently lost and only surfaced later as a context error.
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1, &device_id, &ret_num_devices);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Create the context for the selected device.
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Create the command queue; profiling is enabled so event timestamps can be queried.
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Create a program object from the loaded kernel source.
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Build the program for the selected device.
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != 0) {
    std::cerr << getErrorString(ret) << std::endl;
    // The error code alone does not say what is wrong with the kernel source,
    // so also print the compiler's build log when one is available.
    size_t build_log_size = 0;
    if (clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size) == 0
        && build_log_size > 0) {
        char *build_log = (char*)malloc(build_log_size);
        if (build_log) {
            if (clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL) == 0)
                std::cerr << build_log << std::endl;
            free(build_log);
        }
    }
}
// Create the kernel object for the "brightness" entry point.
brightnessKernel = clCreateKernel(program, "brightness", &ret);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Create the device buffers: one the kernel reads from, one it writes to.
// Both hold listSize bytes.
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, listSize * sizeof(uchar), NULL, &ret);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, listSize * sizeof(uchar), NULL, &ret);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Convert the captured BGR image to a single-channel grey image.
Mat greyImage;
cvtColor(image, greyImage, COLOR_BGR2GRAY);
// Obtain a contiguous pixel buffer for the upload.
// The Mat owning the pixels is kept in a named variable: taking .data from a
// temporary clone() leaves the pointer dangling, because the temporary (and its
// pixel storage) is destroyed at the end of that statement.
Mat contiguousGrey = greyImage.isContinuous() ? greyImage : greyImage.clone();
uchar* input = contiguousGrey.data;
// Allocate host memory for the result read back from the device.
// NOTE(review): output is not freed in the visible code; release it with free()
// once the displayed image no longer needs it, or this leaks every frame.
uchar* output = (uchar*)malloc(sizeof(uchar)*listSize);
if (!output) {
    // A null destination would make the blocking read below write through NULL.
    fprintf(stderr, "Failed to allocate output buffer.\n");
    exit(1);
}
// Blocking write of the input pixels into the device input buffer.
ret = clEnqueueWriteBuffer(command_queue, inputBuffer, CL_TRUE, 0, listSize * sizeof(uchar), input, 0, NULL, &eventWrite);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Set the arguments of the kernel: input buffer, brightness offset, output buffer.
ret = clSetKernelArg(brightnessKernel, 0, sizeof(cl_mem), (void *) &inputBuffer);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// NOTE(review): the kernel declares this argument as uchar, so brightnessValue
// presumably must be a 1-byte type for the size to match — confirm its declaration.
ret = clSetKernelArg(brightnessKernel, 1, sizeof(brightnessValue), (void *) &brightnessValue);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
ret = clSetKernelArg(brightnessKernel, 2, sizeof(cl_mem), (void *) &outputBuffer);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Execute the OpenCL kernel with one work-item per byte.
size_t global_item_size = listSize; // Process the entire lists
// NOTE(review): with an explicit local size, OpenCL 1.x requires the global size
// to be a multiple of it — confirm listSize is divisible by 12.
size_t local_item_size = 12;
ret = clEnqueueNDRangeKernel(command_queue, brightnessKernel, 1, NULL,
    &global_item_size, &local_item_size, 0, NULL, &eventKernel);
if (ret != 0)
    std::cerr << getErrorString(ret) << std::endl;
// Blocking read of the device output buffer into the host array output.
ret = clEnqueueReadBuffer(command_queue, outputBuffer, CL_TRUE, 0,
    listSize * sizeof(uchar), output, 0, NULL, &eventRead);
if (ret != 0)
    printf("error writing to output buffer: %d\n\n\n", ret);
// Display the input and the result; both Mat headers wrap the existing host
// arrays rather than allocating new pixel storage.
Mat inputImage(image.rows, image.cols, CV_8UC1, input);
Mat test(image.rows, image.cols, CV_8UC1, output);
imshow("Input", inputImage);
imshow("Convertedx2", test);
使用 OpenCL 的分析事件,结果如下:
OpenCL clEnqueueWriteBuffer:1.792 毫秒;
OpenCL 内核执行时间:85.851 毫秒;
OpenCL clEnqueueReadBuffer:1.581 毫秒;
如果我改变内核行
C[i] = (A[i]+B) >= 255 ? 255 : (A[i]+B);
到
C[i] = A[i];
情况变得更糟:
OpenCL clEnqueueWriteBuffer:1.266 毫秒;
OpenCL 内核执行时间:177.103 毫秒;
OpenCL clEnqueueReadBuffer:1.656 毫秒;
由于 GPU 的理论性能为 24 GFLOPS,我预计会有更好的结果,大约不到 1 毫秒。
【问题讨论】:
-
尝试用 const 修饰输入参数,并用 restrict 关键字修饰输入和输出指针。
标签: raspberry-pi opencl