OpenCL 内存缓冲区未将正确的值传递给内核答案

【问题标题】：OpenCL Memory Buffer not passing the right values to kernel（OpenCL 内存缓冲区未将正确的值传递给内核）
【发布时间】：2020-05-02 20:14:43
【问题描述】：

我正在尝试通过编写一个简单的程序来学习 OpenCL：对每个点，计算两组数据在各个维度上差值的绝对值之和（SAD）。代码写完后，输出似乎是错误的，所以我决定在主机代码和内核中加入一些 printf，以验证所有变量是否都正确地传递给了内核。这样做之后我发现，输入变量没有被正确地发送到内核，因为在内核中打印它们得到的是不正确的数据（准确地说是全零）。我尝试将数据类型从 uint8 改为 int，但似乎没有任何效果。应该如何正确地将 uint8 变量发送到 OpenCL 的内存缓冲区？我实在无法确定自己在写入和发送内存缓冲区时哪里做错了，才导致它们显示不正确，希望能得到任何意见、建议或帮助。

提前谢谢你。

编辑：问题现已解决。我根据评论和答案部分提供的反馈更新了下面的代码。非常感谢！

代码如下：

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <iostream>
#include <vector>

#include <CL/cl.hpp>

using namespace std;
// Index of the OpenCL platform to use.
#define USE_PLATFORM_NR  0

// Generic size constant (100 Mi). Parenthesised so the macro expands as a
// single value inside larger expressions such as `x / SIZE`.
#define SIZE (100*1024*1024UL)

//SAD DEFINES
// Number of dimensions per point and number of points for the SAD data set.
#define NUM_DIM_SAD         5
#define NUM_POINTS_SAD      10
//#define NUM_LOOPS_SAD       20
// Seed passed to srand() when generating the SAD input data.
#define SAD_SEED            2014
//NUM_LOOPS * NUM_POINTS should be 75M

//SSD DEFINES
// Same parameters for the SSD variant (presumably "sum of squared
// differences"; no SSD code is visible in this file -- TODO confirm).
#define NUM_DIM_SSD         128
#define NUM_POINTS_SSD      150000
//#define NUM_LOOPS_SSD       1000
#define SSD_SEED            2048
//NUM_LOOPS * NUM_POINTS should be 150M


// Threadblock sizes (e.g. for kernels )
#define TS 5

// =================================================================================================

// OpenCL C source of the "SAD" kernel, kept as a single string.
//
// The adjacent string literals below are concatenated by the compiler without
// any newline between them, so the whole kernel is one logical line of OpenCL
// C. Do not put `//` comments inside the literals: they would comment out the
// rest of the kernel.
//
// Kernel arguments:
//   num_points_sad - accepted but not read by the kernel body.
//   num_dim_sad    - number of consecutive elements that make up one point.
//   m1_set, m2_set - input byte arrays, indexed as [Point*num_dim_sad + k].
//   sad_gpu        - output array; element [Point] receives the result.
//
// Each work-item takes Point = get_global_id(0), sums
// abs(m1_set[..] - m2_set[..]) over the point's num_dim_sad elements and
// stores the sum in sad_gpu[Point]. The accumulator is an unsigned char, so
// the stored sum is truncated to 8 bits. There is no bounds check on Point.
// The printf calls are debugging output emitted from the device.
const char* kernelstring =
"__kernel void SAD(const int num_points_sad, const int num_dim_sad,"
"                      const global unsigned char* m1_set,"
"                      const global unsigned char* m2_set,"
"                      global unsigned char* sad_gpu) {"
"    const int Point = get_global_id(0);"
"    unsigned char acc = 0;"
"    printf(\" POINT: %d \\n \", Point); "
// Debug dump of every input element this work-item is about to read.
"    for (int s=0; s<num_dim_sad ; s++) {"
"        printf(\"GPU: i = %d | m1_set = %d| m2_set = %d \\n \",Point*num_dim_sad + s,m1_set[Point*num_dim_sad+s],m2_set[Point*num_dim_sad+s]);}"
// Accumulate the absolute differences for this point.
"    for (int k=0; k<num_dim_sad; k++) {"
"        acc += abs( m1_set[Point*num_dim_sad + k] - m2_set[Point*num_dim_sad + k] );"
"    }"
"    printf(\"ACC: %d \\n \",acc);"
"    sad_gpu[Point] = acc;"
"}";


// =================================================================================================

// Matrix-multiplication using a custom OpenCL SGEMM kernel.
int main() {

    cout << "Computing naive SAD & SSD for result checking" << endl;
    //naive implementation on CPU for result checking
    uint8_t* m1_set;// [NUM_POINTS][NUM_DIM];
    uint8_t* m2_set;// [NUM_POINTS][NUM_DIM];

    m1_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);
    m2_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);

    uint8_t* sad;    //   [NUM_POINTS];
    uint8_t* sad_gpu;//   [NUM_POINTS];
    sad =     (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);
    sad_gpu = (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);

    srand(SAD_SEED);
    for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
    {
        sad[i/NUM_DIM_SAD] = 0;
        m1_set[i] = rand() / (uint8_t)RAND_MAX;
        m2_set[i] = rand() / (uint8_t)RAND_MAX;
        cout << "CPU: i = " << i << "| m1_set = " << (unsigned int)m1_set[i] << "| m2_set = " << (unsigned int)m2_set[i] << endl;
    }

    for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
         sad[i/NUM_DIM_SAD] += abs(m1_set[i] - m2_set[i]);

    cl_int err;

    // Configure the OpenCL environment
    printf(">>> Initializing OpenCL...\n");
    cl_platform_id platform = USE_PLATFORM_NR;
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetPlatformId"; return -1;}
    cl_device_id device = 0;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetDeviceIDs"; return -1; }
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateContext"; return -1; }
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateCommandQueue"; return -1; }
    char deviceName[1024];
    err = clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, deviceName, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetDeviceInfo"; return -1; }
    cl_event event = NULL;

    // Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &kernelstring_sad, NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateProgramWithSource"; return -1; }
    err = clBuildProgram(program, 0, NULL, "", NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clBuildProgram"; return -1; }


    // Check for compilation errors
    size_t logSize;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo"; return -1; }
    char* messages = (char*)malloc((1 + logSize) * sizeof(char));
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo2"; return -1; }
    messages[logSize] = '\0';
    if (logSize > 10) { printf(">>> Compiler message: %s\n", messages); }
    free(messages);


    // Prepare OpenCL memory objects
    cl_mem buf_m1 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m1"; return -1; }
    cl_mem buf_m2 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m2"; return -1; }
    cl_mem buf_sad = clCreateBuffer(context, CL_MEM_READ_WRITE, NUM_POINTS_SAD * sizeof(uint8_t), NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_sad"; return -1; }

    // Copy matrices to the GPU
    err = clEnqueueWriteBuffer(queue, buf_m1, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m1_set, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m1"; return -1; }
    err = clEnqueueWriteBuffer(queue, buf_m2, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m2_set, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m2"; return -1; }
    err = clEnqueueWriteBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_sad"; return -1; }

    // Configure the kernel and set its arguments
    int num_points_sad = NUM_POINTS_SAD;
    int num_dim_sad =    NUM_DIM_SAD;
    cl_kernel kernel = clCreateKernel(program, "SAD", &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel"; return -1; }
    err = clSetKernelArg(kernel, 0, sizeof(int), (void*)&num_points_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg0"; return -1; }
    err = clSetKernelArg(kernel, 1, sizeof(int), (void*)&num_dim_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg1"; return -1; }
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buf_m1);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg2"; return -1; }
    err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buf_m2);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg3"; return -1; }
    err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&buf_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel4"; return -1; }

    // Start the timed loop
    printf(">>> Starting SAD GPU run...\n");
    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

 //   const size_t local[1] = { TS };
    const size_t global[1] = { NUM_POINTS_SAD };
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, &event); //local
    if (err != CL_SUCCESS) { cout << err << "clEnqueueNDRangeKernel"; return -1; }
    // Wait for calculations to be finished
    clWaitForEvents(1, &event);

    // End the timed loop
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

    // Copy the output matrix C back to the CPU memory
    clEnqueueReadBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count();
    std::cout << "Time difference = " << us << " us " << std::endl;
    // Free the OpenCL memory objects
    clReleaseMemObject(buf_m1);
    clReleaseMemObject(buf_m2);
    clReleaseMemObject(buf_sad);

    // Clean-up OpenCL 
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    clReleaseProgram(program);
    clReleaseKernel(kernel);

    for (int i = 0; i < NUM_POINTS_SAD; i++)
    {
        cout << "i: " << i;
        cout << " | CPU: " << (unsigned int)sad[i];
        cout << " | GPU: " << (unsigned int)sad_gpu[i];
        cout << endl;
    }
    // Free the host memory objects
    free(m1_set);
    free(m2_set);
    free(sad);
    free(sad_gpu);

    // Exit
    return 0;
}

【问题讨论】：

查看cl*函数的返回码，答案很可能就在那里。
@doqtor 感谢您的回复。我检查了clEnqueueNDRangeKernel 之前使用的所有 cl* 函数的返回码，它们都返回 0。
你能告诉我们你是怎么做的吗？您可以使用固定代码更新您的问题。
@doqtor 我已经更新了上面的代码。检查输出流后，没有打印出任何“错误！”信息。
为什么在c++代码中使用malloc？为什么不对数组使用std::vector？

标签： c++ c opencl

【解决方案1】：

在创建上下文的函数中存在错误 - 参数之一在错误的位置传递。

原来的（错误）写法：

cl_context context = clCreateContext(NULL, 1, &device, NULL, &err, NULL);

应该是：

cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
                                                             ^^^^^^^^^^

此外，错误的输出方式仍然没有多大帮助。应该是这样的：

cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
    cout << err << "clCreateContext";
    return -1;
}

这样我们在错误发生时停止代码执行，并且我们知道它发生在哪个函数上。

======= 更新 =======

内核中使用了错误的类型：OpenCL 中的 uint8 是向量类型，表示由 8 个 uint（32 位无符号整数）值组成的向量，而不是 8 位无符号整数。

要解决此问题，请在 OpenCL 内核中使用 uchar/unsigned char 类型，它等效于 c++ 中的 uint8_t/unsigned char。

请参阅 OpenCL data types 和 Scalar data types。

【讨论】：

再次感谢您的建议。我根据您的建议更改了错误的输出方式，并修复了err 参数的位置，因此也更新了上面的代码。不幸的是，我仍然没有收到任何 OpenCL 错误，因此代码继续执行到最后，显示错误结果。（如果控制台输出有帮助，和以前一样，我就留在这里link）
是的，你是对的！更新的答案解决了我的问题！我已将问题标记为已解决（如果我做错了，请告诉我修复它）。再次感谢您抽出宝贵时间回答我的问题并提供持续的反馈，祝您一切顺利！