为什么在减少时使用寄存器内存比共享内存慢？答案

【问题标题】：Why is using register memory slower than shared memory when doing reduction?为什么在减少时使用寄存器内存比共享内存慢？
【发布时间】：2021-11-01 12:36:58
【问题描述】：

我评估了两个内核性能：

#include <chrono>
#include <cuda_runtime.h>
#include <stdio.h>

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}

__global__ void reduceShfl(int *in, int* out, int size)
{
    extern __shared__ int smem[];
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockIdx.x*blockDim.x*4;
    smem[tid] = 0;
    if (tid>=size) return;
    int tmp = 0; 
    if (idx + blockDim.x*3 <= size){
        int a = in[idx];
        int b = in[idx+blockDim.x];
        int c = in[idx+2*blockDim.x];
        int d = in[idx+3*blockDim.x];
        tmp = a + b + c + d;
    }
    smem[tid] = tmp;
    __syncthreads();

    if (blockDim.x >= 1024 && tid < 512){
        smem[tid] += smem[tid + 512];
    }
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256){
        smem[tid] += smem[tid + 256];
    }
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128){
        smem[tid] += smem[tid + 128];
    }
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64){
        smem[tid] += smem[tid + 64];
    }
    __syncthreads();

    if (blockDim.x >= 64 && tid < 32){
        smem[tid] += smem[tid + 32];
    }
    __syncthreads();
    
    int tmpsum = smem[tid]; 
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
    if (tid==0)
        out[blockIdx.x] = tmpsum;
}

__global__ void reduceShmUnroll(int *in, int *out, int num)
{
    extern __shared__ int smem[];
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockIdx.x*blockDim.x*4;
    
    if (tid >= num) return;
    int tmp=0;
    if(idx + blockDim.x*3 <= num)
    {
        int a = in[idx];
        int b = in[idx + blockDim.x];
        int c = in[idx + blockDim.x*2];
        int d = in[idx + blockDim.x*3];
        tmp = a + b + c + d;
    }   
    smem[tid] = tmp;
    __syncthreads();

    if (blockDim.x >= 1024 && tid < 512){
        smem[tid] += smem[tid + 512];
    }
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256){
        smem[tid] += smem[tid+256];
    }
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128){
        smem[tid] += smem[tid+128];
    }
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64){
        smem[tid] += smem[tid+64];
    }
    __syncthreads();
    if (tid < 32){
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid+32];
        vsmem[tid] += vsmem[tid+16];
        vsmem[tid] += vsmem[tid+8];
        vsmem[tid] += vsmem[tid+4];
        vsmem[tid] += vsmem[tid+2];
        vsmem[tid] += vsmem[tid+1];
    }

    if (tid == 0) out[blockIdx.x] = smem[0];
}

int main(int agrc, char **argv)
{
    int size = 1<<24;
    int nBytes = size*sizeof(int);
    int *a_h = (int*)malloc(nBytes);
    initData_int(a_h, size);

    int blocksize = 1024;
    int gridsize = (size-1)/blocksize+1;
    dim3 block(blocksize, 1);
    dim3 grid((size-1)/blocksize+1, 1);
    int *a_d, *b_d;
    cudaMalloc((int**)&a_d, nBytes);
    cudaMalloc((int**)&b_d, grid.x*sizeof(int));
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    int *tmp = (int*)malloc(gridsize*sizeof(int));
    memset(tmp, 0, grid.x/4);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    auto s_0 = std::chrono::system_clock::now();
    reduceShfl<<<grid, block, blocksize*sizeof(int)>>>(a_d, b_d, size);
    cudaMemcpy(tmp, b_d, grid.x/4*sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    int res_1 = 0;
    for (int i=0; i<grid.x/4; i++){
        res_1 += tmp[i];
    }
    auto e_0 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = e_0 - s_0;
    printf("Result from reduceShfl is: %d and time cost is %2f.\n", res_1, diff.count());

    memset(tmp, 0, grid.x/4);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    s_0 = std::chrono::system_clock::now();
    reduceShmUnroll<<<grid, block, blocksize*sizeof(int)>>>(a_d, b_d, size);
    cudaMemcpy(tmp, b_d, grid.x/4*sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    int res_0 = 0;
    for (int i=0; i<grid.x/4; i++){
        res_0 += tmp[i];
    }
    e_0 = std::chrono::system_clock::now();
    diff = e_0 - s_0;
    printf("Result from reduceShmUnroll is: %d and time cost is %2f.\n", res_0, diff.count());

    cudaFree(a_d);
    cudaFree(b_d);
    free(a_h);
    free(tmp);
    return 0;
}

主要区别在于最后的warp减少，reduceShmUnroll使用共享内存，reduceShfl进行warp shuffle，使用寄存器内存。但是我发现reduceShfl比reduceShmUnroll慢。

Result from reduceShfl is: 2139353471 and time cost is 0.000533.
Result from reduceShmUnroll is: 2139353471 and time cost is 0.000485.

我的代码有问题吗？

【问题讨论】：

要更准确地测量内核执行时间，最好使用CUDA Events。此外，在运行内核时序测量之前，最好对设备进行一些预热。在新上下文中执行的第一个内核调用可能会比其他情况下运行得稍微慢一些。即使那样，差异也很小，因此您可能希望查看 10 个或更多相同内核调用的最佳和平均时间。

标签： cuda

【解决方案1】：

我的代码有问题吗？

是的，我会说你的代码有问题。

我看到的主要问题是您进行了无效的比较。在您的共享内存内核中，您将最后一个 warp 减少活动限制为最后一个 warp。在 shuffle 内核中，你不是：

共享内存内核：

__syncthreads();
if (tid < 32){  // this is missing from your shuffle kernel
    volatile int *vsmem = smem;
    vsmem[tid] += vsmem[tid+32];
    vsmem[tid] += vsmem[tid+16];
    vsmem[tid] += vsmem[tid+8];
    vsmem[tid] += vsmem[tid+4];
    vsmem[tid] += vsmem[tid+2];
    vsmem[tid] += vsmem[tid+1];
}

洗牌内核：

__syncthreads();

int tmpsum = smem[tid]; 
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
if (tid==0)
    out[blockIdx.x] = tmpsum;

当我以与共享 mem 内核相同的方式限制您的 shuffle 内核时（这样不必要的扭曲不会做不必要的工作），然后我观察到两个内核之间的运行时间大致相等（大约 1% 的差异），当我在 V100 上使用 nvprof 的配置文件：

                0.38%  222.76us         1  222.76us  222.76us  222.76us  reduceShmUnroll(int*, int*, int)
                0.37%  220.55us         1  220.55us  220.55us  220.55us  reduceShfl(int*, int*, int)

这正是我所期望的。对于这种有限的使用，没有理由认为共享内存使用或随机播放会更快或更慢。

共享内存活动和 warp shuffle 活动都有吞吐量限制。因此试图预测哪个会更快是困难的，因为它取决于你的代码中发生的其他事情。如果您的代码受共享内存吞吐量的限制，并且您将其中的一些活动转换为 warp shuffle，那么您可能会看到 warp shuffle 的好处。可以在另一个方向上做出相同的陈述。对于此特定代码的此特定部分，当正确/可比较/等效地编写时，您不会受到共享内存吞吐量或 warp shuffle 吞吐量的不同约束，因此正确的期望是性能没有差异，替换一个为另一个。

【讨论】：

我添加了您推荐的行。 warp shuffle 比前一个更快。但仍然比共享内存慢。我相信这是由硬件（我正在使用的 2080 ti）和代码活动决定的。
谢谢罗伯特。您是否有推荐的相关材料来判断应用程序是否受共享内存吞吐量或 warp shuffle 吞吐量的约束？
学习使用分析器。这不是cmets可以涵盖的东西。您可以开始使用分析器 here 和 here。