为什么这个 cuda 内核会产生不确定的结果？答案

【问题标题】：Why does this cuda kernel yield non-deterministic results?为什么这个 cuda 内核会产生不确定的结果？
【发布时间】：2016-02-28 16:34:19
【问题描述】：

我在更大的代码示例中构建了一个我面临的问题的最小示例。在这个例子中，我想找到一些数据ys 到函数fs 的平方和误差，但我想一次在多个函数上做，所以我创建fs 作为矩阵。原始数据的长度为gridSize，我想一次对nGrids 函数执行此成本函数，因此fs 的大小为nGrids*gridSize。

我发现 CUDA 内核以不确定的方式给出了不可靠的结果，这让我相信我没有正确执行我的线程（这是我的第一个 CUDA 内核！）。我在这个程序上运行了cuda-memcheck，它没有显示任何错误。

为了测试这些错误的偶发性质，我编写了一个脚本来运行它 100 次，并比较结果随机关闭的频率。我发现当gridSize 增长时它关闭的可能性更大：

gridSize ...  Errors
     300 ...   0/100
     400 ...   0/100
     450 ...   4/100
     500 ...   5/100
     550 ...  55/100
     600 ...  59/100
     650 ... 100/100

这里的想法是让每个块在一个网格上工作，当我想提高并行度时只需调用多个 CUDA 块。因此，我在这里称其为 12 个块，因为有 12 个网格。对于这段代码，我永远不会拥有超过 1000 个的 gridSize，所以我将把 Nthreads 留在 1024（因为我的 NVIDIA GTX 770 上每个块有 1024 个线程）。

代码如下：

#include <stdio.h>

#define nGrids 12
#define gridSize 700

void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs);
void D_get_costs(float* h_xs, float* h_ys, float* h_fs, float* d_costs);

/**************\
 * cuda Costs *
\**************/
__global__ void cuCosts(float* d_xs, float* d_ys, float* d_fs, float* d_costs) {
    int ir = threadIdx.x;
    int ig = blockIdx.x;

    __shared__ float diff[1024];

    diff[ir] = 0.0;
    __syncthreads();

    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] =   (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
        __syncthreads();
        // reduction
        for(int s=1; s < blockDim.x; s*=2) {
            if( ir%(2*s) == 0 && ir+s < gridSize){
                diff[ir] += diff[ir+s];
            }
        }
        __syncthreads();
        d_costs[ig] = diff[0];
    }
    __syncthreads();
}


/****************\
 * Main routine *
\****************/
int main(int argc, char** argv) {

    float h_xs[gridSize];
    float h_ys[gridSize];
    float h_fs[gridSize*nGrids];

    for( int ir = 0; ir < gridSize; ir++) {
        h_xs[ir] = (float)ir/10.0;
        h_ys[ir] = (float)ir/10.0;
    }

    for(int ir = 0; ir < gridSize; ir++) {
        for(int jgrid = 0; jgrid < nGrids; jgrid++) {
            float trand = 2.0*((float)rand()/(float)RAND_MAX) - 1.0;
            h_fs[jgrid*gridSize + ir] = h_ys[ir] + trand;
        }
    }

    float h_costs[nGrids];
    float d_costs[nGrids];

    // get all of the costs (on the host)
    H_get_costs(h_xs, h_ys, h_fs, h_costs);

    // get all of the costs (on the device)
    D_get_costs(h_xs, h_ys, h_fs, d_costs);

    // Print the grids
    /*
    for(int ir = 0; ir < gridSize; ir++) {
        printf("%10.5e %15.5e", h_xs[ir], h_ys[ir]);
        for(int jg = 0; jg < nGrids; jg++) {
            printf("%15.5e", h_fs[jg*gridSize + ir]);
        }
        printf("\n");
    }
    */

   // print the results
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Host ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", h_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Device ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s  ", "Difference ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]-h_costs[ig]);
    }
    printf("\n");

    return 0;
}

/*******************************\
 * get the costs (on the host) *
\*******************************/
void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs) {
    for(int ig = 0; ig < nGrids; ig++) { h_costs[ig] = 0.0; }
    for(int ir = 0; ir < gridSize-1; ir++) {
        for(int ig = 0; ig < nGrids; ig++) {
            h_costs[ig] += (h_ys[ir] - h_fs[ig*gridSize + ir])*(h_ys[ir] - h_fs[ig*gridSize + ir]);
        }
    }
}

/**************************\
 * wrapper for cuda costs *
\**************************/
void D_get_costs(float* h_xs_p, float* h_ys_p, float* h_fs_p, float* r_costs) {
    float* d_xs;
    float* d_ys;
    float* d_fs;

    float* d_costs; // device costs
    float* t_costs; // temporary costs

    cudaMalloc( (void**)&d_xs, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_ys, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_fs, nGrids*gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_costs, nGrids*sizeof(float) );

    t_costs = (float*)malloc(nGrids*sizeof(float));

    cudaMemcpy( d_xs, h_xs_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_ys, h_ys_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_fs, h_fs_p, nGrids*gridSize*sizeof(float), cudaMemcpyHostToDevice);

    int Nthreads = 1024;
    int Nblocks = nGrids;

    cuCosts<<<Nblocks, Nthreads>>>(d_xs, d_ys, d_fs, d_costs);

    cudaMemcpy( t_costs, d_costs, nGrids*sizeof(float), cudaMemcpyDeviceToHost);

    for(int ig = 0; ig < nGrids; ig++) {
        r_costs[ig] = t_costs[ig];
    }

    cudaFree( d_xs );
    cudaFree( d_ys );
    cudaFree( d_fs );
}

如果重要的话，这里是我的硬件规格：

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 770"
  CUDA Driver Version / Runtime Version          6.0 / 5.5
  CUDA Capability Major/Minor version number:    3.0
  Total amount of global memory:                 2047 MBytes (2146762752 bytes)
  ( 8) Multiprocessors, (192) CUDA Cores/MP:     1536 CUDA Cores
  GPU Clock rate:                                1084 MHz (1.08 GHz)
  Memory Clock rate:                             3505 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 524288 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.0, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce GTX 770
Result = PASS

【问题讨论】：

__syncthreads() 应该被块中的所有线程调用，否则你会得到 UB。

标签： cuda

【解决方案1】：

您的内核代码有多个导致问题的同步问题。一方面，围绕 __syncthreads() 调用存在分支，这是 CUDA 中未定义的行为。那么你在减少循环中缺少一个同步点，这意味着扭曲到扭曲的累积是不正确的。像这样的：

__global__ void cuCosts(float* d_xs, float* d_ys, 
                        float* d_fs, float* d_costs)
{
    int ir = threadIdx.x;
    int ig = blockIdx.x;

    __shared__ float diff[1024];

    diff[ir] = 0.0;
    __syncthreads();

    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] =   (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
    }
    __syncthreads();

    // reduction
    for(int s=1; s < blockDim.x; s*=2) {
        if( ir%(2*s) == 0 && ir+s < gridSize){
            diff[ir] += diff[ir+s];
        }
        __syncthreads();
    }
    d_costs[ig] = diff[0];
}

应该可以正常工作[免责声明，在浏览器中编写，未经测试，使用风险自负]

【讨论】：