【Question Title】: GPU reduction code only runs one time
【Posted】: 2016-06-28 09:59:24
【Question】:

I have been using the code sample provided by Robert Crovella in:

thrust::max_element slow in comparison cublasIsamax - More efficient implementation?

It is a very fast reduction code. I modified it to return the index of the maximum value in an input array of floats. When I use it in my code, it only executes correctly once. If I try to call the routine again, it does not find the new maximum; it just returns the previous maximum. Does the volatile global memory the routine uses need to be reset before it can be called again?

#include <cuda.h>  
#include <cublas_v2.h>  
#include <thrust/extrema.h>  
#include <thrust/device_ptr.h>  
#include <thrust/device_vector.h>  
#include <stdio.h>  
#include <stdlib.h>  

#define DSIZE 4096*4  // nTPB should be a power-of-2  
#define nTPB 512  
#define MAX_KERNEL_BLOCKS 30  
#define MAX_BLOCKS ((DSIZE/nTPB)+1)  
#define MIN(a,b) ((a>b)?b:a)  
#define FLOAT_MIN -1.0f  

#include <helper_functions.h>  
#include <helper_cuda.h>  

// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];  
__device__ volatile int   blk_idxs[MAX_BLOCKS];  
__device__ int   blk_num = 0;  

//template <typename T>  
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){  

  __shared__ volatile float   vals[nTPB];  
  __shared__ volatile int idxs[nTPB];  
  __shared__ volatile int last_block;  
  int idx = threadIdx.x+blockDim.x*blockIdx.x;  
  last_block = 0;  
  float   my_val = FLOAT_MIN;  
  int my_idx = -1;  
  // sweep from global memory  
  while (idx < dsize){  
    if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}  
    idx += blockDim.x*gridDim.x;}  
  // populate shared memory  
  vals[threadIdx.x] = my_val;  
  idxs[threadIdx.x] = my_idx;  
  __syncthreads();  
  // sweep in shared memory  
  for (int i = (nTPB>>1); i > 0; i>>=1){  
    if (threadIdx.x < i)  
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }  
    __syncthreads();}  
  // perform block-level reduction  
  if (!threadIdx.x){  
    blk_vals[blockIdx.x] = vals[0];  
    blk_idxs[blockIdx.x] = idxs[0];  
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block  
      last_block = 1;}  
  __syncthreads();  
  if (last_block){  
    idx = threadIdx.x;  
    my_val = FLOAT_MIN;  
    my_idx = -1;  
    while (idx < gridDim.x){  
      if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }  
      idx += blockDim.x;}  
  // populate shared memory  
    vals[threadIdx.x] = my_val;  
    idxs[threadIdx.x] = my_idx;  
    __syncthreads();  
  // sweep in shared memory  
    for (int i = (nTPB>>1); i > 0; i>>=1){  
      if (threadIdx.x < i)  
        if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }  
      __syncthreads();}  
    if (!threadIdx.x)  
      *result = idxs[0];  
    }  
}  



int main(){  

  int nrElements = DSIZE;  
  float *d_vector, *h_vector;  

  StopWatchInterface *hTimer = NULL;  
  sdkCreateTimer(&hTimer);  
  double gpuTime;  
  int k;  
  int max_index;  
  int *d_max_index;  
  cudaMalloc(&d_max_index, sizeof(int));  


  h_vector = new float[DSIZE];  
  for(k=0; k < 5; k++){  
    for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;  
       h_vector[10+k] = 10;  // create definite max element that changes with each loop iteration   
   cublasHandle_t my_handle;  
   cublasStatus_t my_status = cublasCreate(&my_handle);  
   cudaMalloc(&d_vector, DSIZE*sizeof(float));  
   cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);  

       max_index = 0;  
       sdkResetTimer(&hTimer);  
       sdkStartTimer(&hTimer);  
       //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.  
       thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);  
       thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);  
       max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;  
       cudaDeviceSynchronize();  
       gpuTime = sdkGetTimerValue(&hTimer);  
       std::cout << "loop: " << k << "  thrust time:   " << gpuTime << " max index: " << max_index << std::endl;  

       max_index = 0;  
       sdkResetTimer(&hTimer);  
       sdkStartTimer(&hTimer);  
       my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);  
       cudaDeviceSynchronize();  
       gpuTime = sdkGetTimerValue(&hTimer);   
       std::cout << "loop: " << k << "  cublas time:   " << gpuTime << " max index: " << max_index-1 << std::endl;  

       max_index = 0;  
       sdkResetTimer(&hTimer);  
       sdkStartTimer(&hTimer);  
       max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);  
       cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);  
       gpuTime = sdkGetTimerValue(&hTimer);  
       std::cout << "loop: " << k << "  idx kern time: " << gpuTime << " max index: " << max_index << std::endl;  
       std::cout <<  std::endl;  

  } // end for loop on k  

   cudaFree(d_max_index);  
   cudaFree(d_vector);  

  return 0;  
}  

【Comments】:

  • Can you provide a complete code example that reproduces your problem?

Tags: cuda reduction


【Solution 1】:

The chief problem with re-using this code as-is across multiple loops is this static initialization of a device (global) variable:

__device__ int   blk_num = 0;  

That is fine if you only intend to run the routine once. But if you intend to reuse it, this variable must be re-initialized to zero before each invocation of the kernel.

We can fix that by adding an explicit initialization of this variable to zero before each call of the reduction kernel:

cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));

(I am using max_index here simply because it is a convenient host int variable that has just been set to zero.)

That is the only change needed to make the code "work".
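An equivalent way to do the per-launch reset, if you would rather not reuse max_index as the zero source, is to look up the device address behind the symbol once and cudaMemset it before each launch. This is a hedged sketch, not part of the code below; d_blk_num is a name introduced here for illustration:

```cuda
// One-time setup: obtain the device address of the __device__ symbol blk_num.
int *d_blk_num = NULL;
cudaGetSymbolAddress((void **)&d_blk_num, blk_num);

// Before each launch of the reduction kernel, zero the block counter:
cudaMemset(d_blk_num, 0, sizeof(int));
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
```

The effect is the same as the cudaMemcpyToSymbol call above; cudaMemset avoids the need for a host-side zero variable.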

However, I would point out that introducing the loop created some other "issues". These three lines of code:

cublasHandle_t my_handle;  
cublasStatus_t my_status = cublasCreate(&my_handle);  
cudaMalloc(&d_vector, DSIZE*sizeof(float));  

do not belong inside the for-loop on k. Leaving them there effectively creates a memory leak and needlessly re-initializes the cublas library.

The following code has those changes, and seems to work for me:

$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>

#define DSIZE 4096*4  // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f

#include <helper_functions.h>
#include <helper_cuda.h>

// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int   blk_idxs[MAX_BLOCKS];
__device__ int   blk_num;

//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){

  __shared__ volatile float   vals[nTPB];
  __shared__ volatile int idxs[nTPB];
  __shared__ volatile int last_block;
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  last_block = 0;
  float   my_val = FLOAT_MIN;
  int my_idx = -1;
  // sweep from global memory
  while (idx < dsize){
    if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
    idx += blockDim.x*gridDim.x;}
  // populate shared memory
  vals[threadIdx.x] = my_val;
  idxs[threadIdx.x] = my_idx;
  __syncthreads();
  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
    __syncthreads();}
  // perform block-level reduction
  if (!threadIdx.x){
    blk_vals[blockIdx.x] = vals[0];
    blk_idxs[blockIdx.x] = idxs[0];
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
      last_block = 1;}
  __syncthreads();
  if (last_block){
    idx = threadIdx.x;
    my_val = FLOAT_MIN;
    my_idx = -1;
    while (idx < gridDim.x){
      if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
      idx += blockDim.x;}
  // populate shared memory
    vals[threadIdx.x] = my_val;
    idxs[threadIdx.x] = my_idx;
    __syncthreads();
  // sweep in shared memory
    for (int i = (nTPB>>1); i > 0; i>>=1){
      if (threadIdx.x < i)
        if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
      __syncthreads();}
    if (!threadIdx.x)
      *result = idxs[0];
    }
}



int main(){

  int nrElements = DSIZE;
  float *d_vector, *h_vector;

  StopWatchInterface *hTimer = NULL;
  sdkCreateTimer(&hTimer);
  double gpuTime;
  int k;
  int max_index;
  int *d_max_index;
  cudaMalloc(&d_max_index, sizeof(int));


  h_vector = new float[DSIZE];
  cublasHandle_t my_handle;
  cublasStatus_t my_status = cublasCreate(&my_handle);
  cudaMalloc(&d_vector, DSIZE*sizeof(float));
  for(k=0; k < 5; k++){
    for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
       h_vector[10+k] = 10;  // create definite max element that changes with each loop iteration
    cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);

    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
       //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
    thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
    thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
    max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << "  thrust time:   " << gpuTime << " max index: " << max_index << std::endl;

    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << "  cublas time:   " << gpuTime << " max index: " << max_index-1 << std::endl;

    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
    max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
    cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << "  idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
    std::cout <<  std::endl;

  } // end for loop on k

   cudaFree(d_max_index);
   cudaFree(d_vector);

  return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0  thrust time:   2.806 max index: 10
loop: 0  cublas time:   0.441 max index: 10
loop: 0  idx kern time: 0.395 max index: 10

loop: 1  thrust time:   1.298 max index: 11
loop: 1  cublas time:   0.419 max index: 11
loop: 1  idx kern time: 0.424 max index: 11

loop: 2  thrust time:   1.303 max index: 12
loop: 2  cublas time:   0.43 max index: 12
loop: 2  idx kern time: 0.419 max index: 12

loop: 3  thrust time:   1.291 max index: 13
loop: 3  cublas time:   0.423 max index: 13
loop: 3  idx kern time: 0.415 max index: 13

loop: 4  thrust time:   1.299 max index: 14
loop: 4  cublas time:   0.423 max index: 14
loop: 4  idx kern time: 0.417 max index: 14

========= ERROR SUMMARY: 0 errors
$

【Comments】:

  • Is there overhead to calling cudaMemcpyToSymbol(), since it is a host-to-device copy? Would it be better to reset blk_num to zero when we reach the if (last_block){ } section of the kernel code?
  • Yes, you should be able to do that as well. In that case you would need to re-add the static initialization of blk_num (= 0;) that I removed in the code above.
  • @RobertCrovella Hello Robert. Thanks for providing this kernel. I do have one concern. You check atomicAdd(&blk_num, 1) == gridDim.x - 1. It seems to me it should check against gridDim.x rather than gridDim.x - 1. For example, if I have 256 blocks and each block adds 1 to a running count starting at 0, then after all blocks finish, the running count should be 256, which equals gridDim.x, not 255, which would be gridDim.x - 1. If I am mistaken, I would greatly appreciate any insight into that line of code.
  • Never mind, I figured it out. I read the documentation for atomicAdd and realized that even though it computes old + val, it returns old. Sorry for assuming things :)
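The in-kernel reset discussed in the first two comments can be sketched as follows (a fragment, not a complete listing; as noted, the static = 0 initializer must then be restored so the very first launch sees zero):

```cuda
__device__ int blk_num = 0;   // static init covers the first launch only

__global__ void max_idx_kernel(const float *data, const int dsize, int *result){
  // ... grid-stride sweep and shared-memory reduction as in the code above ...
  if (last_block){
    // By the time last_block is set, every block has already performed its
    // atomicAdd, so thread 0 can safely re-zero the counter for the next launch.
    if (!threadIdx.x) blk_num = 0;
    // ... final reduction over blk_vals/blk_idxs and write of *result ...
  }
}
```

With this variant, no host-side cudaMemcpyToSymbol is needed between launches.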