【问题标题】:Latency of shuffle instructions in CUDA（CUDA 中 shuffle 指令的延迟）
【发布时间】:2013-08-16 23:06:00
【问题描述】:

关于__shfl()指令的延迟:

执行以下指令

c=__shfl(c, indi);

/*
where indi is any integer number(may be random (<32)), 
and is different for different LaneID.
*/

是否与以下指令具有相同的延迟，比如说：

c=__shfl_down(c,1);

【问题讨论】:

    标签: cuda gpu nvidia


    【解决方案1】:

    所有 warp-shuffle 指令都有same performance

    【讨论】:

      【解决方案2】:

为了对 Robert 的回答提供一个“定量”的后续回答,让我们考虑一下 Mark Harris 使用 CUDA shuffle 操作的归约方法,详情请参阅 Faster Parallel Reductions on Kepler

在这种方法中,通过使用__shfl_down 对 warp 进行归约。另一种对 warp 进行归约的方法是根据Lecture 4: warp shuffles, and reduction / scan operations 使用__shfl_xor。下面,我将报告实现这两种方法的完整代码。如果在 Kepler K20c 上进行测试,两者都采用 0.044ms 来归约 N=200000 个 float 元素的数组。相关地,这两种方法的性能都比 Thrust reduce 高两个数量级,因为对于同一测试,Thrust 案例的执行时间是 1.06ms

      这里是完整的代码:

      #include <cstdio>

      #include <thrust/device_vector.h>
      #include <thrust/host_vector.h>
      #include <thrust/reduce.h>
      
      #define warpSize 32
      
      /***********************************************/
      /* warpReduceSum PERFORMING REDUCTION PER WARP */
      /***********************************************/
      // Sums "val" across the 32 lanes of the calling warp via shuffle-down
      // steps (offsets 16, 8, 4, 2, 1). On return, lane 0 holds the full warp
      // sum; the other lanes hold partial sums only.
      // Assumes all 32 lanes of the warp call this function (full-warp
      // participation) — true here because blockDim.x is a multiple of warpSize.
      // NOTE: requires CUDA 9+ (__shfl_down_sync); the legacy mask-less
      // __shfl_down used originally was removed for Volta and later GPUs.
      __forceinline__ __device__ float warpReduceSum(float val) {

          for (int offset = warpSize / 2; offset > 0; offset /= 2)
              val += __shfl_down_sync(0xffffffff, val, offset);
          // Alternative butterfly reduction (every lane ends with the total):
          //for (int i = 1; i < warpSize; i *= 2) val += __shfl_xor_sync(0xffffffff, val, i);
          return val;

      }
      
      /*************************************************/
      /* blockReduceSum PERFORMING REDUCTION PER BLOCK */
      /*************************************************/
      // Sums "val" across all threads of the calling block.
      // On return, thread 0 of the block holds the block-wide sum; all other
      // threads hold undefined partials. Must be called by every thread of the
      // block (it contains a __syncthreads()).
      __forceinline__ __device__ float blockReduceSum(float val) {

          // --- Shared scratch for the per-warp partial sums. A block has at
          //     most 1024 threads, hence at most 32 warps.
          static __shared__ float shared[32];

          int lane = threadIdx.x % warpSize;   // Lane index within the warp
          int wid  = threadIdx.x / warpSize;   // Warp index within the block

          // --- Warp-level reduction: afterwards lane 0 of each warp holds that
          //     warp's partial sum.
          val = warpReduceSum(val);

          // --- Lane 0 of each warp publishes its partial sum to shared memory.
          if (lane == 0) shared[wid] = val;

          // --- Wait until every warp has written its partial sum.
          __syncthreads();

          // --- Number of warps actually present in this block. Ceil-division:
          //     the original truncating blockDim.x / warpSize silently dropped
          //     the last (partial) warp when blockDim.x was not a multiple of 32.
          int nWarps = (blockDim.x + warpSize - 1) / warpSize;

          // --- The first nWarps threads load the per-warp partials (for
          //     wid == 0 threads, lane == threadIdx.x); everyone else
          //     contributes zero to the final reduction.
          val = (threadIdx.x < nWarps) ? shared[lane] : 0.0f;

          // --- The first warp reduces the per-warp partials; thread 0 ends up
          //     with the block total.
          if (wid == 0) val = warpReduceSum(val);

          return val;
      }
      
      /********************/
      /* REDUCTION KERNEL */
      /********************/
      // Reduces in[0..N) to one partial sum per block: out[b] receives the sum
      // of the elements visited by block b. Launch a second single-block pass
      // over the per-block partials to obtain the grand total.
      __global__ void deviceReduceKernel(float *in, float* out, int N) {

          // Grid-stride loop: each thread privately accumulates
          // in[idx], in[idx + gridSize], in[idx + 2*gridSize], ...
          const int gridSize = blockDim.x * gridDim.x;
          float partial = 0.f;
          for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += gridSize)
              partial += in[idx];

          // Collapse the per-thread partials across the block; thread 0
          // receives the block total and writes exactly one output element.
          partial = blockReduceSum(partial);

          if (threadIdx.x == 0) out[blockIdx.x] = partial;
      }
      
      /********/
      /* MAIN */
      /********/
      // Times a two-pass shuffle-based reduction of 200000 floats against
      // thrust::reduce and prints both results for comparison.
      // Fixed vs. the original: standard "int main" (void main is ill-formed
      // C++), d_out/h_out sized to "blocks" instead of a wasteful N, an error
      // check after the kernel launches, event cleanup, and a return value.
      int main() {

          const int N = 200000;

          // --- Launch configuration: grid-stride kernel, capped at 1024 blocks.
          const int threads = 512;
          const int blocks  = min((N + threads - 1) / threads, 1024);

          // --- Input: N elements, all 3.f. Output: one partial sum per block.
          thrust::device_vector<float> d_in(N, 3.f);
          thrust::device_vector<float> d_out(blocks);

          thrust::host_vector<float> h_out(blocks, 0.f);

          float time;
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);

          // --- Pass 1: per-block reduction; d_out[b] holds block b's sum.
          cudaEventRecord(start, 0);
          deviceReduceKernel<<<blocks, threads>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
          // --- Pass 2: one 1024-thread block reduces the "blocks" partial sums
          //     in place; the grand total lands in d_out[0].
          deviceReduceKernel<<<1, 1024>>>(thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_out.data()), blocks);
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          // --- Surface launch/execution errors instead of printing garbage.
          cudaError_t err = cudaGetLastError();
          if (err != cudaSuccess) {
              fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
              return 1;
          }
          cudaEventElapsedTime(&time, start, stop);
          printf("CUDA Shuffle - elapsed time:  %3.5f ms \n", time);
          h_out = d_out;      // blocking device->host copy (synchronizes)

          // --- Reference: same reduction via thrust::reduce, for timing
          //     comparison and result validation.
          cudaEventRecord(start, 0);
          float sum = thrust::reduce(d_in.begin(), d_in.end(), 0.f, thrust::plus<float>());
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&time, start, stop);
          printf("CUDA Thrust - elapsed time:  %3.5f ms \n", time);

          printf("Shuffle result = %f\n", h_out[0]);
          printf("Thrust result = %f\n", sum);

          cudaEventDestroy(start);
          cudaEventDestroy(stop);

          getchar();

          return 0;
      }
      

      【讨论】:

        猜你喜欢
        • 2020-02-22
        • 2018-09-06
        • 1970-01-01
        • 2020-01-01
        • 2013-08-29
        • 1970-01-01
        • 2012-08-10
        • 2015-10-19
        • 1970-01-01
        相关资源
        最近更新 更多