Cuda 矩阵乘法——不适用于某些非方阵答案

【问题标题】：Cuda Matrix Multiplication -- not working for some non-square matricesCuda 矩阵乘法——不适用于某些非方阵
【发布时间】：2013-12-11 01:30:32
【问题描述】：

我正在尝试使用 Cuda 编程。作为其中的一部分，我正在尝试开发一种在 GPU 上运行的矩阵乘法算法。该算法适用于方阵，但不适用于 非方阵 矩阵。这是我的内核

    float* multiply_gpu(float* matrix1 , float* matrix2);
    __global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a,
            float *b, float *result) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int result_size = rowsA*columnsB;
        int value = 0;//the final result
        //indices of values from input matrices
        if (index < result_size) {
            int index1 = (index/rowsA)*rowsA; //get nearest row
            int index2 = index%columnsB; //get start column
            int k = 0;
            while (k<columnsA) { //columnsA == rowsB
               value += a[index1]*b[index2]; //v = sum a_ik * b_kj
               index1 ++;
               index2 += columnsB;
               k++;
            }
            result[index] = value;
        }
    }

在与我的主管进行了简短的健全性检查后，他没有发现任何问题。我相信问题出在这个函数上：

float* multiply_gpu(float* matrix1 , float* matrix2) {
    //the dimensions of the matrices
    size_t available, total;
    cudaError_t error;
    cudaError err = cudaMemGetInfo(&available, &total);
    if(err != cudaSuccess){
        printf("There was an error: %s\n", cudaGetErrorString(err));
    }
    int height1 = matrix1[0];
    int width1 = matrix1[1];
    int height2 = matrix2[0];
    int width2 = matrix2[1];
    if (width1!=height2) {
        return NULL;
    }
    //this array contains the result of the operation
    float* result = (float *) malloc(height1*width2*sizeof(float));
    //pointers for device matrices
    float *d_matrix1;
    float *d_matrix2;
    float *d_result;
    //allocate memory for matrices
    error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //now copy matrices onto device -- note the offset of 2
    error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
    mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result); 
    printf("%d %d %d %d\n",height1,width1,height2,width2);
    //make the host block until mult is finished running
    //printf("finished multiplying\n");
    cudaDeviceSynchronize();
    //copy result back
    error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //free now unneeded cuda memory
    cudaFree(d_matrix1);
    cudaFree(d_matrix2);
    cudaFree(d_result);
    printf("GOT RESULT\n");
    for (int i=0;i<height1*width2;i++) {
        printf("%f ",result[i]);
    }
    printf("\n");
    //result ready to be returned
    return result;
}

请注意，作为 multiply_gpu 参数的矩阵的高度在索引 0 处，宽度在索引 1 处。结果矩阵没有此信息。

一个错误计算的例子：当我将以下数组输入 multiply_gpu - {2,3,1,2,3,4,5,6} ， {3,2,1,2,3,4 ,5,6} 答案应该是 {22,28,49,64} 但我的单元测试会生成 {22,28,40,52}。很近！请注意，对于 (1,2,3)*(1,2,3) （不是正方形）的点积，算法很满意......这里的错误可能是什么？感谢您的任何帮助。如果我独立找到解决方案，将发布解决方案。

【问题讨论】：

关于矩阵乘法的CUDA标签有不少问题。你有没有看过？如果您使用 cuda-memcheck 运行代码会发生什么？ SO 期望：“有关您编写的代码问题的问题必须在问题本身中描述具体问题 - 并包括有效的代码来重现它。请参阅 SSCCE.org 以获得指导。” 投票结束。您尚未提供 SSCCE.org 代码。
是的，矩阵乘法在 GPU 上很常见，并且有很多关于它的问题。我已经阅读了它们，但可能还不够彻底。我只是不知所措，来到这里进行理智检查。感谢您提供指向 SSCCE.org 的链接——我现在正在查看它。我也在学习 cuda-memcheck。总的来说，我面临的这个错误正在消耗我。我认为需要更多地关注我自己的代码，并且需要回顾一下其他矩阵乘法器。
我更新了我的答案，因为我的答案仍然不太正确。我认为现在是正确的 - 它适用于您提到的案例以及我尝试过的其他三个案例。

标签： matrix cuda gpu multiplication

【解决方案1】：

这一行是错误的：

        int index1 = (index/rowsA)*rowsA; //get nearest row

应该是这样的：

        int index1 = (index/columnsB)*columnsA; //get nearest row

为什么这个公式是正确的？ index1 用于索引A 中的行元素，这些元素对应于我们正在计算的输出矩阵位置指示的行。输出矩阵位置只是线程索引。如果我们（整数）将线程索引除以 output 矩阵中的 columns 的数量，即C，我们得到有问题的行号。然后，为了在A 中找到该行的第一个元素，我们乘以A 中的列数。这会正确地将我们索引到A 中相关行的第一个元素。

这是一个完整的应用程序以及我的测试用例 - 我对您的代码所做的唯一更改是上面指出的更改。

$ cat t290.cu
#include <stdio.h>

__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float *result) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int result_size = rowsA*columnsB;
        int value = 0;//the final result
        //indices of values from input matrices
        if (index < result_size) {
            int index1 = (index/columnsB)*columnsA; //get nearest row
            int index2 = index%columnsB; //get start column
            int k = 0;
            while (k<columnsA) { //columnsA == rowsB
               value += a[index1]*b[index2]; //v = sum a_ik * b_kj
               index1 ++;
               index2 += columnsB;
               k++;
            }
            result[index] = value;
        }
    }

float* multiply_gpu(float* matrix1 , float* matrix2) {
    //the dimensions of the matrices
    size_t available, total;
    cudaError_t error;
    cudaError err = cudaMemGetInfo(&available, &total);
    if(err != cudaSuccess){
        printf("There was an error: %s\n", cudaGetErrorString(err));
    }
    int height1 = matrix1[0];
    int width1 = matrix1[1];
    int height2 = matrix2[0];
    int width2 = matrix2[1];
    if (width1!=height2) {
        printf("fail!\n");
        return NULL;
    }
    //this array contains the result of the operation
    float* result = (float *) malloc(height1*width2*sizeof(float));
    //pointers for device matrices
    float *d_matrix1;
    float *d_matrix2;
    float *d_result;
    //allocate memory for matrices
    error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //now copy matrices onto device -- note the offset of 2
    error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
    mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
    printf("%d %d %d %d\n",height1,width1,height2,width2);
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //make the host block until mult is finished running
    //printf("finished multiplying\n");
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        fprintf(stderr, "kernel fail (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //copy result back
    error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //free now unneeded cuda memory
    cudaFree(d_matrix1);
    cudaFree(d_matrix2);
    cudaFree(d_result);
    printf("GOT RESULT\n");
    for (int i=0;i<height1*width2;i++) {
        printf("%f ",result[i]);
    }
    printf("\n");
    //result ready to be returned
    return result;
}

int main(){

  float m1[8] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  float m2[6] = {2.0, 2.0, 1.0, 1.0, 2.0, 2.0};
  float *my_result1 = multiply_gpu(m2, m1);
  float m3[8] = {2,3,1,2,3,4,5,6};
  float m4[8] = {3,2,1,2,3,4,5,6};
  float *my_result2 = multiply_gpu(m3, m4);
  float *my_result3 = multiply_gpu(m4, m3);
  float m5[12] = {2,5,1,1,1,1,1,1,1,1,1,1};
  float m6[22] = {5,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
  float *my_result4 = multiply_gpu(m5, m6);
  return 0;
}

$ nvcc -arch=sm_20 -o t290 t290.cu
t290.cu: In function âfloat* multiply_gpu(float*, float*)â:
t290.cu:30: warning: converting to âintâ from âfloatâ
t290.cu:31: warning: converting to âintâ from âfloatâ
t290.cu:32: warning: converting to âintâ from âfloatâ
t290.cu:33: warning: converting to âintâ from âfloatâ
$ cuda-memcheck ./t290
========= CUDA-MEMCHECK
2 2 2 3
GOT RESULT
5.000000 7.000000 9.000000 10.000000 14.000000 18.000000
2 3 3 2
GOT RESULT
22.000000 28.000000 49.000000 64.000000
3 2 2 3
GOT RESULT
9.000000 12.000000 15.000000 19.000000 26.000000 33.000000 29.000000 40.000000 51.000000
2 5 5 4
GOT RESULT
5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
========= ERROR SUMMARY: 0 errors
$

【讨论】：

嗯，不完全是。但你在正确的轨道上。我自己发现了正确的答案并发布了它。

【解决方案2】：

所以在仔细查看我的矩阵代码后，我发现了一个简单的问题我的操作的数学。

这句话确实是错的

 int index1 = (index/rowsA)*rowsA; //get nearest row

我注意到，由于我的矩阵是按行排序的，因此从 (i,j) 处的元素获取正确索引的公式是

index = i*rowLength + j

因此对 index1 的赋值应该是

int index1 = (index/rowsA)*columnsA

为什么？很明显，要导航到行 n 的索引，我们必须移动 n 行长度（这是矩阵中的列数）。我的代码适用于方形矩阵，但不适用于其他矩形矩阵，因为列数与此类矩阵中的行数不匹配。

【讨论】：

这是错误的。我提供了完整的代码。将您的公式插入我的完整代码中，它会给出不正确的结果。