如何合并大小与块大小相同的块中的全局读取？答案

【问题标题】：How do I coalesce global reads in chunks with size the same as block size?如何合并大小与块大小相同的块中的全局读取？
【发布时间】：2021-08-09 01:39:10
【问题描述】：

我在全局内存中有一个很长的 float2 数组，我的内核的每个块从中读取每个块中的 512 个连续值（但是对于任何给定块访问的特定块没有可识别的模式）。所以我在想我应该能够从全局内存中合并那些 512 个 float2 负载。为了做到这一点，我将指向 512 中第一个元素的指针加载到共享内存中，然后让每个线程将指针增加其 threadID 以从该地址读取 float2。但是，当我检查 Nsight Compute 时，这似乎并没有合并读取。我已经标记了下面的行，它说我有未合并的全局访问。

除了顺序访问之外，最佳实践指南还提到了对齐要求，但我使用的是矢量类型，指南说它是自动对齐的（除了它们对应于每个值 8 个字节之外）。所以我不确定为什么我的阅读没有合并。顶部读取的实际全局 L2 扇区与理想全局 L2 扇区的比率约为 1.12，底部读取为 1.09，我认为这还不错，但我仍然希望尽可能优化这些读取。

我上面没有提到的代码中的一个小问题是，这 512 次读取中的每一次都在循环中完成 nsegs 次，但我不确定这是否会影响读取模式。另一个奇怪的事情是，在我真正的完整代码中，Nsight 说我在第二次读取中已经取消合并读取（我通过 threadID 增加指针并读取该地址），但没有将从全局到共享内存的读取标记为未合并。与此不同的是，在下面的示例代码中，Nsight 将它们都标记为未合并读取。此外，我的完整代码使用 __shfl_down_sync warpReduce 方法（由 blockReduceSum 调用），但我在下面进行原子求和以缩短示例代码。

#include <stdio.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
   if (code != cudaSuccess) {
      fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

#define nbins 512
#define nsegs 340
#define ntemplates 100

__device__ float2 * d;

__global__ void kernel(float * d_all_sums, int * d_template_indices) {

    float final = 0.0;

    // blockIdx.x gives template index, threadIdx.x normally gets bin index but use it to mean segment index for loading global memory to shared.

    __shared__ float2 * power_at_0_pointers[nsegs];

    if (threadIdx.x < nsegs)
        power_at_0_pointers[threadIdx.x] = &d[ __ldg(&d_template_indices[blockIdx.x * nsegs + threadIdx.x]) ];  // Uncoalesced here. Real to ideal ratio ~ 1.12.

    __syncthreads();

    __shared__ float power_sum;

    for (int i = 0; i < nsegs; i++) {
        __shared__ float powers[nbins];

        float2 * pow_first_bin = power_at_0_pointers[i];
        float2 input_power_c = *(pow_first_bin + threadIdx.x);       // Uncoalesced here. Real to ideal ratio ~ 1.09.
        
        float power = input_power_c.x * input_power_c.x + input_power_c.y * input_power_c.y;

        power = (2 * power - 1.0) / 2.0;
        powers[threadIdx.x] = power;

        atomicAdd(&power_sum, powers[threadIdx.x]);

        final += power / power_sum;
    }

    if (threadIdx.x == 0)
        d_all_sums[blockIdx.x] = final;
}

int random(int min, int max){
   return min + rand() / (RAND_MAX / (max - min + 1) + 1);
}

int main(){

    float2 * h;
    float * h_all_sums;
    float2 * d_ptr;
    float * d_all_sums;
    int * h_template_indices, * d_template_indices;

    h_template_indices = (int *) malloc(ntemplates * nsegs * sizeof(int));
    h = (float2 *) malloc(nbins * nsegs * ntemplates * sizeof(float2));
    h_all_sums = (float *) malloc(ntemplates * sizeof(float));
    memset(h_all_sums, 0.0, ntemplates);

    for (int k = 0; k < ntemplates; k++) {
        for (int i = 0; i < nsegs; i++) {
            h_template_indices[k * nsegs + i] = random(0, ntemplates * nsegs - nbins);
            for (int j = 0; j < nbins; j++)
                h[k * nbins * nsegs + i * nbins + j] = make_float2(100 * (float) rand() / (float)(RAND_MAX), 100 * (float) rand() / (float)(RAND_MAX));
        }
    }

    gpuErrchk( cudaMalloc((void**) &d_ptr, nbins * nsegs * ntemplates * sizeof(float2)) );
    gpuErrchk( cudaMemcpy(d_ptr, h, nbins * nsegs * ntemplates * sizeof(float2), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpyToSymbol(d, &d_ptr, sizeof(float2*)) );

    gpuErrchk( cudaMalloc( (void**) &d_all_sums, ntemplates * sizeof(float) ) );

    gpuErrchk( cudaMalloc((void**) &d_template_indices, nsegs * ntemplates * sizeof(int)) );
    gpuErrchk( cudaMemcpy(d_template_indices, h_template_indices, nsegs * ntemplates * sizeof(int), cudaMemcpyHostToDevice) );

    kernel<<<ntemplates, nbins>>>(d_all_sums, d_template_indices);

    gpuErrchk( cudaMemcpy(h_all_sums, d_all_sums, ntemplates * sizeof(float), cudaMemcpyDeviceToHost) );

    FILE *f = fopen("test_output.txt", "w");
    if (f != NULL) {
        for (int k = 0; k < ntemplates; k++)
            fprintf(f, "k = %d; power = %f.\n", k, h_all_sums[k]);
    }

    fclose( f );

    gpuErrchk( cudaFree(d_ptr) );
    gpuErrchk( cudaFree(d_all_sums) );
    gpuErrchk( cudaFree(d_template_indices) );

    free( h );
    free( h_all_sums );
    free( h_template_indices );

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    printf("All done.\n");
}

编辑：我在下面包含了 Nsight Compute 关于合并的结果。我还包含了与我的完整代码更接近的代码，就如何完成块减少（随机播放而不是原子）而言。随着这一变化，现在 Nsight 说我只能在一个站点进行未合并的访问（图书馆中的另一个站点，但这显然超出了我的控制范围）。由于某种原因，求和/归约方法的差异似乎使从全局到共享内存的读取合并？

#include <stdio.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
   if (code != cudaSuccess) {
      fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__inline__ __device__ float warpReduceSum(float val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;
}

__inline__ __device__ float blockReduceSum(float val) {
  static __shared__ float shared[32];
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;
  val = warpReduceSum(val);
  if (lane == 0) shared[wid] = val;
  __syncthreads();
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : float(0.0);
  if (wid == 0) val = warpReduceSum(val);
  return val;
}

#define nbins 512
#define nsegs 340
#define ntemplates 100

__device__ float2 * d;

__global__ void kernel(float * d_all_sums, int * d_template_indices) {

    float final = 0.0;

    // blockIdx.x gives template index, threadIdx.x normally gets bin index but use it to mean segment index for loading global memory to shared.

    __shared__ float2 * power_at_0_pointers[nsegs];

    if (threadIdx.x < nsegs)
        power_at_0_pointers[threadIdx.x] = &d[ __ldg(&d_template_indices[blockIdx.x * nsegs + threadIdx.x]) ];

    __syncthreads();

    for (int i = 0; i < nsegs; i++) {
        __shared__ float powers[nbins];

        float2 * pow_first_bin = power_at_0_pointers[i];
        float2 input_power_c = *(pow_first_bin + threadIdx.x);       // Uncoalesced here.
        
        float power = input_power_c.x * input_power_c.x + input_power_c.y * input_power_c.y;

        power = (2 * power - 1.0) / 2.0;
        powers[threadIdx.x] = 1.0;

        float power_sum = blockReduceSum(powers[threadIdx.x]);

        final += power / power_sum;
    }

    if (threadIdx.x == 0)
        d_all_sums[blockIdx.x] = final;
}

int random(int min, int max){
   return min + rand() / (RAND_MAX / (max - min + 1) + 1);
}

int main(){

    float2 * h;
    float * h_all_sums;
    float2 * d_ptr;
    float * d_all_sums;
    int * h_template_indices, * d_template_indices;

    h_template_indices = (int *) malloc(ntemplates * nsegs * sizeof(int));
    h = (float2 *) malloc(nbins * nsegs * ntemplates * sizeof(float2));
    h_all_sums = (float *) malloc(ntemplates * sizeof(float));
    memset(h_all_sums, 0.0, ntemplates);

    for (int k = 0; k < ntemplates; k++) {
        for (int i = 0; i < nsegs; i++) {
            h_template_indices[k * nsegs + i] = random(0, ntemplates * nsegs - nbins);
            for (int j = 0; j < nbins; j++)
                h[k * nbins * nsegs + i * nbins + j] = make_float2(100 * (float) rand() / (float)(RAND_MAX), 100 * (float) rand() / (float)(RAND_MAX));
        }
    }

    gpuErrchk( cudaMalloc((void**) &d_ptr, nbins * nsegs * ntemplates * sizeof(float2)) );
    gpuErrchk( cudaMemcpy(d_ptr, h, nbins * nsegs * ntemplates * sizeof(float2), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpyToSymbol(d, &d_ptr, sizeof(float2*)) );

    gpuErrchk( cudaMalloc( (void**) &d_all_sums, ntemplates * sizeof(float) ) );

    gpuErrchk( cudaMalloc((void**) &d_template_indices, nsegs * ntemplates * sizeof(int)) );
    gpuErrchk( cudaMemcpy(d_template_indices, h_template_indices, nsegs * ntemplates * sizeof(int), cudaMemcpyHostToDevice) );

    kernel<<<ntemplates, nbins>>>(d_all_sums, d_template_indices);

    gpuErrchk( cudaMemcpy(h_all_sums, d_all_sums, ntemplates * sizeof(float), cudaMemcpyDeviceToHost) );

    FILE *f = fopen("test_output.txt", "w");
    if (f != NULL) {
        for (int k = 0; k < ntemplates; k++)
            fprintf(f, "k = %d; power = %f.\n", k, h_all_sums[k]);
    }

    fclose( f );

    gpuErrchk( cudaFree(d_ptr) );
    gpuErrchk( cudaFree(d_all_sums) );
    gpuErrchk( cudaFree(d_template_indices) );

    free( h );
    free( h_all_sums );
    free( h_template_indices );

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    printf("All done.\n");
}

EDIT2：按要求添加内存图表和表格。

【问题讨论】：

能否附上 nsight 内存图表（可能还有表格）的屏幕截图，好吗？那么我们可以更好地说，如果有真正的优化潜力。 1.09 和 1.12 已经相当理想了。
确定索引后（在代码中随机），您可以将 512 个 float2 段复制到与 DRAM 段对齐的单独位置。那么您的效率将为 1，但对于最初的重叠，现在单独的 segs 您的缓存效率会更差。
谢谢！我在上面添加了内存图表和表格。
总的来说，您的内存性能非常好（瓶颈在其他地方）。几乎所有访问都由 L1 和 L2 缓存提供服务。由于未对齐而导致的 546200 次全局内存访问（循环中的指令中的 544000 次）被“吹”成 650501 个波前，并且它们大部分（> 70% 的 32 字节宽扇区是 L1 缓存未命中）被转发到 L2 缓存. L2 缓存线宽 128 字节，访问 (32 * float2) 256 字节。您最终会收到 1173730 个 L2 请求，几乎所有请求都得到了服务。加上几家商店，L2 缓存占用率为 12.84%。
内存性能不是问题。您可以优化其他瓶颈以将性能提高近 8 倍。之后，L2 缓存将受到限制。为了进一步优化，您应该尝试为 L1 缓存获得更好的命中率或使用共享内存进行数据重用。这将限于 85 倍的性能提升（您的 SM 多处理器的加载/存储管道占用 1.16%）。只有这样，您才能通过尝试优化单个内存指令（对齐、使用 float4 等）获得一些东西。但在此之前，您会遇到 GPU 的算术峰值性能。

标签： cuda

【解决方案1】：

所以我不确定为什么我的读取没有合并。

我相信您将两种不同的东西混为一谈。您实际上并没有展示您用来做出判断的 nsight 计算输出，但我相信您将合并与效率混为一谈。本声明：

实际全局 L2 扇区与理想全局 L2 扇区的比率在顶部读取时约为 1.12，在底部读取时约为 1.09，

真的是关于效率。

让我们回顾一些定义：

coalescing - 这是指根据请求所指地址的位置将请求组合在一起的能力。当一个 warp 中每个线程的请求地址可以组合在一起时，GPU 内存控制器将合并这些请求以创建一些较少数量的事务，以服务这些请求
效率 - 对于这里的用法，我们指的是为这些请求提供服务的字节必要，除以为这些请求提供服务的实际检索到的字节 p>

降低效率的一种方法是减少合并。但是，还有其他方法可以降低效率，而您的代码正在执行其中一种方法，就对齐而言。

当内存控制器需要数据时（比如说来自 DRAM，但缓存也有细粒度的细分），不可能请求任意数量的字节。 DRAM 内存（或缓存）以称为 segments 的组（缓存：lines 或 sectors）的形式提供数据。内存控制器将从 DRAM 内存中请求一个或多个段，以满足特定读取请求的需要。

当我们有 unaligned 读取时会发生什么？这是一个图形示例：

segments:      |  1  |  2  |  3  |  4  |  5  | ...
aligned read:  ^^^^^^^^^^^^^^^^^^^^^^^^

在对齐读取的情况下，我们看到它既是 coalesced 又是 aligned。这意味着所需的所有数据都彼此相邻（因此可以很好地合并），并且请求的数据区域的起点和终点与段边界对齐。因此，内存控制器将检索 4 个段来为上述对齐的读取请求提供服务，而这正是所需要的（以字节计），因此效率为 100% 或 1.0 比率。

segments:      |  1  |  2  |  3  |  4  |  5  | ...
unaligned read:  ^^^^^^^^^^^^^^^^^^^^^^^^

在未对齐读取的情况下，请求的字节数与对齐读取相同，但现在由于我们不与段边界对齐，内存控制器必须检索段 1-5（总共 5段而不是 4) 来满足读取请求的需求。这仍然是“高度”合并的——只有一个地址组，所有请求的字节都彼此相邻。但是效率降低了，因为现在内存控制器必须检索 5 个段，而不是 4 个，以提供与 warp 请求请求的相同数量的字节。此处实际与必要的比率为 5/4 或 1.25。

请注意，上述讨论没有考虑缓存中可能已经存在的内容，分析器在进行此类测量时也没有考虑。上面的讨论同样适用于缓存行为/测量，但是，只需将“行”或“扇区”替换为“段”。

您的代码行：

for (int i = 0; i < nsegs; i++) {
    __shared__ float powers[nbins];

    float2 * pow_first_bin = power_at_0_pointers[i];
    float2 input_power_c = *(pow_first_bin + threadIdx.x);       // Uncoalesced here

在分析器测量时正在经历这种“效率损失”。从合并的角度来看，我会说该代码“很好地”合并，因为索引构造中的+ threadIdx.x 几乎可以保证：相邻线程（在 x 中）将从内存中的相邻位置拉出。这就是“理想”合并的秘诀。但是，for 循环将指针偏移了一个索引值i，这意味着随着循环的进行，读取的“模式”（即合并的地址组）将“遍历”L2 扇区（或内存段）的模式.除了i 索引偏移量实际上与扇区/段边界对齐的位置之外，在特定i 偏移量处的读取请求将需要二级缓存中的额外扇区。

这是个问题吗？

我通常会说不，有几个原因：

这是一种常见的模式，难以重构。
仍然存在我所说的“近乎理想”的合并。
对效率的担忧大多忽略了实际的缓存行为。

让我们更深入地研究上面的第 3 项。

缓存的一般目的是减少与进入内存层次结构中的下一个级别以检索数据相关的惩罚。对于 L2 高速缓存，内存层次结构中的下一级是 DRAM 内存（一般来说）。对于您拥有的“行走”模式，该模式将很快用所需数据填充 L2 缓存，因此i 中的后续迭代通常不必从 DRAM 内存中检索太多（如果有的话），以便在该循环进行时为其提供服务。因此，L2 缓存正在发挥作用。随着它的进行，它主要是将该循环与 DRAM 提取的成本隔离开来。随着循环的进行，您有时必须从 L2 中检索额外的扇区这一事实远不如每次循环迭代都必须进入 DRAM 以获取所有数据的情况那么紧迫。

除了顺序访问，最佳实践指南提到了对齐要求，但我使用的是矢量类型，指南说它是自动对齐的。

当我们指的对齐是扇区/段对齐时，向量类型不会自动对齐。希望从前面的讨论中可以明显看出这一点。

为了使向量类型有用，它们必须是naturally aligned。这意味着元素本身的地址必须与内存边界对齐，该内存边界是向量类型字节大小的整数倍。由 8 个字节组成的向量类型必须位于地址 0、8、16、24、32 或 40 等处。这与此处和其他地方讨论的对齐类型不同。

我的观点是，如果您只关注分析器报告的效率损失行，您将难以“优化”此代码。在我看来，问为什么那里会出现效率损失（比如我在这里试图描述的）并不是优化这个特定代码的一个非常有效的途径。相反，我会尝试在你正在尝试做的事情的上下文中重构整个代码，但我都不清楚。

在不知道该代码的更高级别目的的情况下，我将关注的两个领域是：

寻求摆脱原子使用
只要有数据重用，看看共享内存的使用是否能带来好处。

摆脱原子使用通常需要大量重构算法。现在，代码专注于从输入角度处理数据。摆脱原子的一种可能方法是专注于询问“每个位置的最终结果是什么”，然后编写产生该结果的代码，可能每个输出一个线程或每个输出点一个扭曲或块。通过这种方式重组代码，您通常可以摆脱原子使用。另一种选择是查看canonical parallel reduction methods 以摆脱原子。

存在跨线程数据重用的共享内存重构是最常见的 CUDA 教程主题之一。您会找到其中的many treatments。

为了有条不紊地介绍我在这里讨论的一些概念，您不妨回顾一下（至少前 4 节）this CUDA tutorial series。

【讨论】：

这很有启发性。谢谢！我在上面的编辑中包含了来自 Nsight 的屏幕截图。我的实际代码不使用原子（我添加了一个使用归约的版本，与我的主代码相同）。我会仔细考虑我的代码，以寻找利用共享内存的方法，就像你的观点 (2) 建议的那样。
但第一个屏幕截图确实明确表示“Uncoalesced global accesses: [Warning] Uncoalesced global access, ...”。这不准确吗？ nsight 是否更宽松地表示“Uncoalesced global access”？正如您所指出的，我知道理想的扇区与负载扇区是关于效率的。我只是对第一个屏幕截图上的消息感到困惑。