【问题标题】:Matrix multiplication returning incorrect result OpenCL矩阵乘法返回不正确的结果 OpenCL
【发布时间】:2018-08-16 14:31:37
【问题描述】:

下面的代码应该执行矩阵-矩阵乘法。使用 OpenCL。执行代码时,生成的矩阵 C 用零填充。这是不正确的,因为我将两个对角矩阵与非零对角条目相乘。

/*
Taken from http://gpgpu-computing4.blogspot.com/2009/09/matrix-multiplication-2-opencl.html

We have made some modifications, but need to understand what is going on.
*/

// Multiply two matrices A * B = C

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <CL\cl.h>
#include"OpenCLUtils.h"

/* WA : width of A, HA : height of A, WB : width of B, etc. */

#define WA 1024
#define HA 1024
#define WB 1024
#define LOCAL_WORK_SIZE 16
#define HB WA
#define WC WB
#define HC HA


/* Fill a width-by-width row-major matrix with coeff times the identity:
 * every element becomes zero except the main diagonal, which is set to coeff.
 *
 * data  : pointer to at least width*width floats (overwritten entirely)
 * width : matrix dimension (matrix is square)
 * coeff : value written on the diagonal
 */
void identityMultipliedByCoeff(float* data, int width, float coeff)
{
    int idx;

    /* Clear the whole matrix first... */
    for (idx = 0; idx < width * width; ++idx)
        data[idx] = 0.0f;

    /* ...then place coeff on the diagonal. */
    for (idx = 0; idx < width; ++idx)
        data[idx * width + idx] = coeff;
}

/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////

int main(int argc, char** argv)
{

    // set seed for rand()
    srand(2006);

    // 1. allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*)malloc(mem_size_A);

    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*)malloc(mem_size_B);

    // 2. initialize host memory
    identityMultipliedByCoeff(h_A, WA, 12.0);
    identityMultipliedByCoeff(h_B, WB, -9.0);

    // 4. allocate host memory for the result C
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*)malloc(mem_size_C);

    // 5. Initialize OpenCL
    // OpenCL specific variables
    cl_device_id device;
    cl_context clGPUContext;
    cl_command_queue clCommandQue;
    cl_program clProgram;
    cl_kernel clKernel;

    size_t dataBytes;
    size_t kernelLength;
    cl_int errcode;

    // OpenCL device memory for matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    /*****************************************/
    /* Initialize OpenCL */
    /*****************************************/
    /* Create a device and context */
    device = create_device();
    clGPUContext = clCreateContext(NULL, 1, &device, NULL, NULL, &errcode);
    if (errcode < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    // get the list of GPU devices associated 
    // with context
    errcode = clGetContextInfo(clGPUContext,
        CL_CONTEXT_DEVICES, 0, NULL,
        &dataBytes);
    cl_device_id *clDevices = (cl_device_id *)
        malloc(dataBytes);
    errcode |= clGetContextInfo(clGPUContext,
        CL_CONTEXT_DEVICES, dataBytes,
        clDevices, NULL);




    //Create a command-queue
    clCommandQue = clCreateCommandQueue(clGPUContext,clDevices[0], 0, &errcode);

    // Setup device memory
    /* We are passing the host memory as an argument. This is where
        the device memory obtains the data from the host memory. */
    d_C = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_C, NULL, &errcode);
    d_A = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,mem_size_A, h_A, &errcode);
    d_B = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,mem_size_B, h_B, &errcode);

    // 6. Load and build OpenCL kernel

    // Open the .cl file and load it
    // into a char* buffer
    FILE* fp = fopen("MatMul.cl", "r");
    fseek(fp, 0, SEEK_END);
    const size_t lSize = ftell(fp);
    rewind(fp);
    unsigned char* buffer;
    buffer = (unsigned char*)malloc(lSize);
    fread(buffer, 1, lSize, fp);
    fclose(fp);

    cl_int status;
    clProgram = clCreateProgramWithBinary(clGPUContext,
        1, (const cl_device_id *)clDevices,
        &lSize, (const unsigned char**)&buffer,
        &status, &errcode);

    errcode = clBuildProgram(clProgram, 0, NULL, NULL,
        NULL, NULL);

    errcode = clBuildProgram(clProgram, 0,
        NULL, NULL, NULL, NULL);

    clKernel = clCreateKernel(clProgram,
        "matrixMul", &errcode);

    // 7. Launch OpenCL kernel
    size_t localWorkSize[2], globalWorkSize[2];

    int wA = WA;
    int wC = WC;

    /* Set the arguments for the kernel. */
    errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_C);
    errcode |= clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_B);
    errcode |= clSetKernelArg(clKernel, 3,sizeof(int), (void *)&wA);
    errcode |= clSetKernelArg(clKernel, 4,sizeof(int), (void *)&wC);

    /* Rember that in OpenCL we need to express the
        globalWorkSize in terms of the total number of threads.
        The underlying OpenCL API will look at the 
        globalWorkSize and divide by the localWorkSize to 
        arrive at a 64 by 64 NDRange of 16 by 16 work groups. */

    localWorkSize[0] = LOCAL_WORK_SIZE;
    localWorkSize[1] = LOCAL_WORK_SIZE;
    globalWorkSize[0] = WA;
    globalWorkSize[1] = HA;

    errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 2, NULL, globalWorkSize,localWorkSize, 0, NULL, NULL);


    //
    // The calculation has now been carried out
    //


    // 8. Retrieve result from device
    errcode = clEnqueueReadBuffer(clCommandQue,
        d_C, CL_TRUE, 0, mem_size_C,
        h_C, 0, NULL, NULL);

    // We must check the result

    for (int i = 0; i < WA; i++)
    {
        for (int j = 0; j < WA; j++)
        {
            float prod = 0;
            for (int k = 0; k < WA;k++)
            {
                prod += h_A[i*WA + k] * h_B[k*WA + j];
            }
            if (fabs(h_C[i*WA+j] - prod) > 0.01)
            {
                printf("The indices where the comparison failed, i = %d, j = %d\n", i,j);
                printf("C[i*WA+j] should equal %f\n", prod);
                printf("C[i*WA+j] = %f\n", h_C[i*WA + j]);
                perror("The matrix check has failed");
                exit(1);
                break;
            }

        }
    }
    printf("The matrix check has been successfull!\n");


    // 10. clean up memory
    free(h_A);
    free(h_B);
    free(h_C);

    clReleaseMemObject(d_A);
    clReleaseMemObject(d_C);
    clReleaseMemObject(d_B);

    free(clDevices);
    clReleaseContext(clGPUContext);
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommandQue);

}

头文件 OpenCLUtils.h 对应的 .cpp 实现文件如下所示：

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include<CL\cl.h>

/* Enumerate the OpenCL platforms, pick one, and return the first GPU device
 * on it. Prints the name of every platform and of every GPU device on the
 * chosen platform so the user can see what was selected. Exits the process
 * on any failure.
 *
 * Returns: a valid cl_device_id for the first GPU of the chosen platform.
 */
cl_device_id create_device() {

    cl_platform_id* platforms;
    cl_device_id* devices;
    cl_device_id chosen;
    cl_uint num_platforms, num_devices, i, platformchoice;
    char platform_name_data[50];
    char name_data[50];
    cl_int err;

    /* Index of the platform to use; clamped below so a machine with a
       single platform doesn't read past the end of the array. */
    platformchoice = 1;

    /* Query only the platform count (num_entries = 0, list = NULL). */
    err = clGetPlatformIDs(0, NULL, &num_platforms);
    if (err < 0 || num_platforms == 0) {
        perror("Couldn't identify a platform");
        exit(1);
    }

    /* Reserve memory for the platform list and populate it. */
    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);

    /* List every platform by name so the user can make an informed
       choice for 'platformchoice'. */
    for (i = 0; i < num_platforms; i++)
    {
        err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME,
                                sizeof(platform_name_data), platform_name_data, NULL);
        if (err < 0)
        {
            perror("Unable to obtain information about platform");
        }
        printf("%s\n", platform_name_data);
    }

    /* Fall back to platform 0 if the preferred index doesn't exist. */
    if (platformchoice >= num_platforms)
        platformchoice = 0;

    /* Re-fetch the chosen platform's name: the loop above left the name of
       the LAST platform in the buffer, which may not be the one searched. */
    clGetPlatformInfo(platforms[platformchoice], CL_PLATFORM_NAME,
                      sizeof(platform_name_data), platform_name_data, NULL);
    printf("\nSearching %s for available devices...\n", platform_name_data);

    /* Obtain the number of GPUs available on this platform. */
    err = clGetDeviceIDs(platforms[platformchoice], CL_DEVICE_TYPE_GPU,
                         0, NULL, &num_devices);
    if (err == CL_DEVICE_NOT_FOUND)
    {
        perror("No GPU devices available");
        exit(1);
    }
    if (err < 0) {
        perror("Could not access any devices. Not as a result of the device not being found. Debug for error code");
        exit(1);
    }

    /* Reserve memory for the device list and populate it. */
    devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices);
    clGetDeviceIDs(platforms[platformchoice], CL_DEVICE_TYPE_GPU,
                   num_devices, devices, NULL);

    /* List every GPU on the chosen platform by name. */
    for (i = 0; i < num_devices; i++)
    {
        err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME,
                              sizeof(name_data), name_data, NULL);
        if (err < 0)
        {
            perror("Unable to obtain information about device");
        }
        printf("%s\n", name_data);
    }
    printf("\n");

    /* Return the first GPU; free the enumeration arrays (device IDs remain
       valid after the arrays holding them are freed). */
    chosen = devices[0];
    free(devices);
    free(platforms);
    return chosen;
}

我使用的是 AMD GPU。它是 R9 380。上面的“实用程序”代码允许我选择平台,然后选择支持该平台的设备。不管我选择intel的平台还是AMD的平台,结果都是一样的。

矩阵乘法内核在以下链接中给出：

http://gpgpu-computing4.blogspot.com/2009/10/matrix-multiplication-3-opencl.html

我的 VisualStudio 调试器的打印屏幕是

enter image description here

它显示状态等于-42,这意味着根据https://streamhpc.com/blog/2013-04-28/opencl-error-codes/ CL_INVALID_BINARY

编辑:

我试过了

...
// 6. Load and build OpenCL kernel

// Obtain size of source file
FILE* fp = fopen("MatMul.cl", "r");
fseek(fp, 0, SEEK_END);
const size_t lSize = ftell(fp);
rewind(fp);

// Read file content into buffer
unsigned char* buffer = (unsigned char*)malloc(lSize+1);
buffer[lSize] = '\0';
fread(buffer, sizeof(char), lSize, fp);
fclose(fp);

//create program from buffer
clProgram = clCreateProgramWithSource(clGPUContext,1,(const char**)&buffer,&lSize, &errcode);

errcode = clBuildProgram(clProgram, 1,&device, NULL,NULL, NULL);

clKernel = clCreateKernel(clProgram,
    "matrixMul", &errcode);
...

但是现在得到错误代码 -11，即 CL_BUILD_PROGRAM_FAILURE（程序构建失败）。

最终编辑:

我开始工作了

这是修正后的内核

/* Matrix multiplication: C = A * B.
 * Device code (OpenCL C), tiled through local memory.
 */

// Work-group (tile) edge length; must match LOCAL_WORK_SIZE on the host.
#define BLOCK_SIZE 16

//////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
//////////////////////////////////////////////////////
__kernel void
matrixMul(__global float* C, 
          __global float* A, 
          __global float* B, int wA, int wB)
{

    /* Accumulator for the single element of C this work-item computes.
       NOTE: this declaration was missing in the original blog post, which
       made the kernel fail to compile (the `<<<<<<<` annotation that marked
       it in the question was itself a syntax error and is now a comment). */
    float Csub = 0;

    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);

    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    // Index of the first sub-matrix of A processed 
    // by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed 
    // by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed 
    // by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) 
    {

        // Declaration of the local memory array As 
        // used to store the sub-matrix of A
        __local float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the local memory array Bs 
        // used to store the sub-matrix of B
        __local float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from global memory
        // to local memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices 
        // are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);

    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;

}

【问题讨论】:

    标签: matrix opencl


    【解决方案1】:

    我认为错误在于这一行:

     d_C = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_A, NULL, &errcode);
    

    第三个参数应该是mem_size_C

    编辑:另外,出于性能考虑,我建议您使用 clEnqueueWriteBuffer 复制矩阵 A 和 B。进一步阅读 here

    【讨论】:

    • 我试过了,但没有解决问题。我更新了代码,因为用于识别代码是否成功的代码也有点偏离。
    • 好的。您介意在内核代码中添加 printf 语句以查看 A 和 B 是否正确复制到设备上吗?在此期间,我会仔细阅读您的代码。
    • 我会这样做的。谢谢:)
    • 另一个有帮助的健全性检查是在您调用 cl 函数时检查 errcode 的值每次。事实上,在尝试其他任何事情之前,您可能应该这样做
    • 我终于让它工作了。我错过了构建日志中围绕内核的错误。我已经在描述中包含了最终版本。谢谢你的帮助。因为你,我今天学到了很多关于 OpenCL 的知识!谢谢!
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2013-04-15
    • 2021-11-14
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多