CUDA 中的每计数指令 (IPC) 和指令级并行 (ILP)答案

Profiler 测量

结果是在 GTX Titan 上测量的。
检查cubin 输出以确保在优化期间没有消除任何指令。
Executed IPC 与发布的IPC 几乎相同，所以我只列出其中一个。

添加指令（XORs 具有相同的行为）

             | ILP 1  | ILP 2   | ILP 4  | ILP 8
--------------------------------------------------
 IPC         |   4.00 |   3.32  |   2.72 |   3.44
--------------------------------------------------
 Issue Slot  | 99.17% | 59.34%  | 48.61% | 61.71%
 Utilization |        |         |        |

我预计 ILP 2、4 和 8 会提供更好的性能，但不是。
回想整数吞吐量是160。每个 SM 的 4 warp 调度程序应该在每个周期双重发出高达 5 的指令，因此 IPC 应该上升到 5。我该如何解释我所观察到的？为什么IPC = 4 时问题槽被利用了 99%？

Float / Int ADD 指令组合

如果我将ILP 4 的代码修改为两个int ADDs 和两个float ADDs：

IPC: 5.1
Issue slot utilization: 99.12%

奇怪的是，warp 调度器似乎在发出浮动操作方面做得更好。

讨论

现有文献建议使用ILP 有助于达到浮点运算的峰值性能。为什么ILP 不适用于整数？我该如何对整数运算执行此操作？
理论上，我的内核应该对每个候选者进行2.25 整数运算。这与我在cuobjdump 中观察到的一致。有 2^48 候选者，所以 GTX Titan 上的最小运行时间应该是 2.25 * 2^48 / (2688 * 160/192) / 876 MHz = 322.75s。这个估计合理吗？
我的内核的测量性能是523s。这确实意味着整数吞吐量仅约为 160 * 3.28 (measure IPC) / 5 (max IPC)。

ILP测试代码

__device__ int x[10];
__global__ void test(int flag = 0)
{
    int a = x[0], b = x[1], c = x[2], d = x[3];
    int _a = x[4], _b = x[5], _c = x[6], _d = x[7];

#pragma unroll 128
    for (int i = 0; i < 51200; ++i)
    {
        asm volatile("add.u32 %0, %0, %1;": "+r"(a): "r"(_a));
        asm volatile("add.u32 %0, %0, %1;": "+r"(b): "r"(_b));
        asm volatile("add.u32 %0, %0, %1;": "+r"(c): "r"(_c));
        asm volatile("add.u32 %0, %0, %1;": "+r"(d): "r"(_d));
    }

    int v = a + b + c + d;
    if (flag * v == 1)
        x[0] = v;
}

4 个候选人的代码片段

每个候选人都接受9 / 4 = 2.25 操作。 Cuobjdump 也验证了这一点。

d ^= d2(1, 3);                 // d2 is located in constant memory
s ^= d;
t ^= d2(1, 16);
u ^= d2(1, 17);
v ^= some_const;
flag_s = min(flag_s, s);       // int min has throughput of 160
flag_t = flag_t || (s == t);   // setp.or should be the same
flag_u = flag_u || (s == u);
flag_v = flag_v || (s == v);

【问题讨论】：

也许 ILP 增益适用于多流水线 ALUd 和多流水线 FPUd 内核，例如 fermi（似乎更重）和 vliw5（用于特殊功能单元）。
混合所有 32bitInt 32bitfp 64bitInt 64bitfp 线程 shuffle/bittwiddling 可以提供完全流水线计算，重叠异步内核可以利用 GPU @ %100。
谢谢！我同意混合说明可以充分利用。但是对于只涉及 int ops 并且完全不受内存限制的内核，有没有可以实现（接近）100% 性能的示例？

标签： cuda gpgpu

【解决方案1】：

我正在提供一个答案，以便将此问题从未回答列表中删除。

我没有观察到 executed Instructions Per Count (IPC) 与 Instruction Level Parallelism 的变化。总体而言，除了 OP 自己提供的任何其他信息（例如，启动配置）之外，很难在不知道任何进一步信息的情况下争论 OP 观察到的效果的原因。

在下面的代码中，我正在考虑使用floats 的示例，尽管我已经使用ints 测试了相同的代码，但并未更改概念结果。该代码使用ILP=1、ILP=2 和ILP=4 实现循环Multiply Add (MAD) 操作。

executed IPC 一直如下

ILP         IPC            FLOPs
1           3.924          67108864
2           4.323          67108864
4           4.016          67108864

为N=8192。该代码已使用CUDA 8.0 编译并在NVIDIA GT920M 上运行。可以看出，IPC 对于ILP 的不同考虑值几乎保持不变。 Floating Point Operations (FLOPs) 由代码估计，假设 2 FLOPs per MAD 与 Visual Profiler 测量的一致。

代码

#include<stdio.h>

#define N_ITERATIONS 8192

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCKSIZE   512

//#define DEBUG

/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        float a = d_a[tid];
        float b = d_b[tid];
        float c = d_c[tid];

        for (unsigned int i = 0; i < N_ITERATIONS; i++) {
            a = a * b + c;
        }

        d_a[tid] = a;
    }

}

/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N / 2) {

        float a1 = d_a[tid];
        float b1 = d_b[tid];
        float c1 = d_c[tid];

        float a2 = d_a[tid + N / 2];
        float b2 = d_b[tid + N / 2];
        float c2 = d_c[tid + N / 2];

        for (unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
        }

        d_a[tid] = a1;
        d_a[tid + N / 2] = a2;
    }

}

/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N / 4) {

        float a1 = d_a[tid];
        float b1 = d_b[tid];
        float c1 = d_c[tid];

        float a2 = d_a[tid + N / 4];
        float b2 = d_b[tid + N / 4];
        float c2 = d_c[tid + N / 4];

        float a3 = d_a[tid + N / 2];
        float b3 = d_b[tid + N / 2];
        float c3 = d_c[tid + N / 2];

        float a4 = d_a[tid + 3 * N / 4];
        float b4 = d_b[tid + 3 * N / 4];
        float c4 = d_c[tid + 3 * N / 4];

        for (unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
            a3 = a3 * b3 + c3;
            a4 = a4 * b4 + c4;
        }

        d_a[tid] = a1;
        d_a[tid + N / 4] = a2;
        d_a[tid + N / 2] = a3;
        d_a[tid + 3 * N / 4] = a4;
    }

}

/********/
/* MAIN */
/********/
int main() {

    //const int N = 8192 * 64;
    const int N = 8192;
    //const int N = 1024;

    TimingGPU timerGPU;

    float *h_a = (float*)malloc(N*sizeof(float));
    float *h_a_result_host = (float*)malloc(N*sizeof(float));
    float *h_a_result_device = (float*)malloc(N*sizeof(float));
    float *h_b = (float*)malloc(N*sizeof(float));
    float *h_c = (float*)malloc(N*sizeof(float));

    for (int i = 0; i<N; i++) {
        h_a[i] = 2.;
        h_b[i] = 1.;
        h_c[i] = 2.;
        h_a_result_host[i] = h_a[i];
        for (unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    float *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(float)));
    float *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(float)));
    float *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(float)));

    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(float), cudaMemcpyHostToDevice));

    /***********/
    /* KERNEL0 */
    /***********/
    timerGPU.StartCounter();
    kernel0 << <iDivUp(N, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    // --- Remember: timing is in ms
    printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }

    /***********/
    /* KERNEL1 */
    /***********/
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
    timerGPU.StartCounter();
    kernel1 << <iDivUp(N / 2, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    // --- Remember: timing is in ms
    printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }

    /***********/
    /* KERNEL2 */
    /***********/
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
    timerGPU.StartCounter();
    kernel2 << <iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    // --- Remember: timing is in ms
    printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }

    cudaDeviceReset();

    return 0;

}

【讨论】：