通过增加占用率来提高内核性能？答案

【问题标题】：Improving kernel performance by increasing occupancy?通过增加占用率来提高内核性能？
【发布时间】：2011-12-05 23:17:25
【问题描述】：

这是我在 GT 440 上的内核的 Compute Visual Profiler 的输出：

内核详细信息：网格大小：[100 1 1]，块大小：[256 1 1]
寄存器比率：0.84375 (27648 / 32768) [每个线程 35 个寄存器]
共享内存比率：0.336914 ( 16560 / 49152 ) [5520 字节/ 阻止]
每个 SM 的活动块：3（每个 SM 的最大活动块：8）
每个 SM 的活动线程数：768（每个 SM 的最大活动线程数：1536）
潜在入住人数：0.5 (24 / 48)
占用限制因素：寄存器

请注意标为粗体的项目符号。内核执行时间为121195 us。

我通过将一些局部变量移动到共享内存来减少每个线程的寄存器数量。 Compute Visual Profiler 输出变为：

内核详细信息：网格大小：[100 1 1]，块大小：[256 1 1]
寄存器比率：1 (32768 / 32768) [每个线程 30 个寄存器]
共享内存比率：0.451823 ( 22208 / 49152 ) [5552 bytes per Block]
每个 SM 的活动块数：4（每个 SM 的最大活动块数：8）
每个 SM 的活动线程数：1024（每个 SM 的最大活动线程数：1536）
潜在入住人数：0.666667 (32 / 48)
占用限制因素：寄存器

因此，现在 4 块在单个 SM 上同时执行，而之前版本中的 3 块。但是，执行时间是115756 us，几乎是一样的！为什么？块不是完全独立地在不同的 CUDA 内核上执行吗？

【问题讨论】：

标签： cuda

【解决方案1】：

您隐含地假设更高的入住率会自动转化为更高的性能。大多数情况下并非如此。

NVIDIA 架构需要每个 MP 一定数量的活动扭曲，以隐藏 GPU 的指令流水线延迟。在您的基于 Fermi 的卡上，该要求转化为至少 30% 的入住率。争取比最小值更高的占用率不一定会导致更高的吞吐量，因为延迟瓶颈可能已经转移到 GPU 的另一部分。您的入门级 GPU 的内存带宽并不多，而且很有可能每个 MP 3 个块足以使您的代码内存带宽受到限制，在这种情况下，增加块的数量不会对性能产生任何影响（它甚至可能因为内存控制器争用和缓存未命中增加而下降）。此外，您说您将变量溢出到共享内存以减少内核的寄存器占用空间。在 Fermi 上，共享内存只有大约 1000 Gb/s 的带宽，而寄存器的带宽约为 8000 Gb/s（请参阅下面的链接以获取证明这一点的微基准测试结果）。因此，您将变量移到了较慢的内存中，这也可能对性能产生负面影响，抵消了高占用率带来的任何好处。

如果您还没有看过它，我强烈推荐 GTC 2010 中 Vasily Volkov 的演讲“在低占用率下实现更好的性能”(pdf)。这里展示了利用指令级并行性如何在非常非常低的占用率下将 GPU 吞吐量提高到非常高的水平。

【讨论】：

好答案。占用只是隐藏全局内存访问延迟的一个严重问题；对于计算密集型线程，每个 SP 几个活动线程就足够了。你也这么理解吗？
我不这么认为，帕特里克。这不适用于所有类型的内核。对于计算密集型内核，更高的占用率可能仍会提高性能。隐藏算术延迟需要多少活动扭曲并不是那么简单。这取决于操作的类型以及它们如何相互交错。

【解决方案2】：

talonmies 已经回答了你的问题，所以我只想分享一个受上述答案中提到的 V. Volkov 演示文稿第一部分启发的代码。

这是代码：

#include<stdio.h>

#define N_ITERATIONS 8192

//#define DEBUG

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x ;

    if (tid < N) {

        int a = d_a[tid];
        int b = d_b[tid];
        int c = d_c[tid];

        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a = a * b + c;
        }

        d_a[tid] = a;
    }

}

/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/2];
        int b2 = d_b[tid+N/2];
        int c2 = d_c[tid+N/2];

        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
        }

        d_a[tid]        = a1;
        d_a[tid+N/2]    = a2;
    }

}

/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/4) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/4];
        int b2 = d_b[tid+N/4];
        int c2 = d_c[tid+N/4];

        int a3 = d_a[tid+N/2];
        int b3 = d_b[tid+N/2];
        int c3 = d_c[tid+N/2];

        int a4 = d_a[tid+3*N/4];
        int b4 = d_b[tid+3*N/4];
        int c4 = d_c[tid+3*N/4];

        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
            a3 = a3 * b3 + c3;
            a4 = a4 * b4 + c4;
        }

        d_a[tid]        = a1;
        d_a[tid+N/4]    = a2;
        d_a[tid+N/2]    = a3;
        d_a[tid+3*N/4]  = a4;
    }

}

/********/
/* MAIN */
/********/
void main() {

    const int N = 1024;

    int *h_a                = (int*)malloc(N*sizeof(int));
    int *h_a_result_host    = (int*)malloc(N*sizeof(int));
    int *h_a_result_device  = (int*)malloc(N*sizeof(int));
    int *h_b                = (int*)malloc(N*sizeof(int));
    int *h_c                = (int*)malloc(N*sizeof(int));

    for (int i=0; i<N; i++) {
        h_a[i] = 2;
        h_b[i] = 1;
        h_c[i] = 2;
        h_a_result_host[i] = h_a[i];
        for(unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
    int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
    int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));

    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /***********/
    /* KERNEL0 */
    /***********/
    cudaEventRecord(start, 0);
    kernel0<<<1, N>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }

    /***********/
    /* KERNEL1 */
    /***********/
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    cudaEventRecord(start, 0);
    kernel1<<<1, N/2>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }

    /***********/
    /* KERNEL2 */
    /***********/
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    cudaEventRecord(start, 0);
    kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }

    cudaDeviceReset();

}

在我的 GeForce GT540M 上，结果是

kernel0   GFlops = 21.069281    Occupancy = 66%
kernel1   GFlops = 21.183354    Occupancy = 33%
kernel2   GFlops = 21.224517    Occupancy = 16.7%

这意味着如果 指令级并行 (ILP) 被利用，占用率较低的内核仍然可以表现出高性能。

【讨论】：