【发布时间】:2014-04-17 12:22:02
【问题描述】:
当我在 SO 上遇到 this question 时,我很想知道答案。所以我写了下面一段代码来测试不同场景下的原子操作性能。操作系统是带有 CUDA 5.5 的 Ubuntu 12.04,设备是 GeForce GTX780(Kepler 架构)。我使用 -O3 标志和 CC=3.5 编译了代码。
#include <stdio.h>
#include <stdlib.h>
// Abort the program with a diagnostic if a CUDA runtime call failed.
// err  - status returned by the CUDA runtime call
// file - source file of the call site (__FILE__)
// line - source line of the call site (__LINE__)
static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        // Fix: error diagnostics go to stderr so they are not lost when
        // stdout is redirected (e.g. when piping benchmark output).
        fprintf( stderr, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define BLOCK_SIZE 256
#define RESTRICTION_SIZE 32
// Each thread atomically adds to its own distinct global element, so the 32
// lanes of a warp touch 32 consecutive addresses: coalesced, no contention.
__global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        atomicAdd(&data[idx], 6); // arbitrary number to add
        idx += stride;
    }
}
// All atomic traffic is folded onto the first RESTRICTION_SIZE (32) ints of
// `data`, so every thread in the grid contends for one of 32 addresses.
__global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        // mask with 31 == modulo 32 (RESTRICTION_SIZE is a power of two)
        atomicAdd(&data[idx & (RESTRICTION_SIZE - 1)], 6); // arbitrary number to add
        idx += stride;
    }
}
// idx >> 5 is identical for the 32 contiguous indices of one warp's first
// pass, so lanes of a warp hit the same global address on each iteration.
__global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        atomicAdd(&data[idx >> 5], 6); // arbitrary number to add
        idx += stride;
    }
}
// Worst case: every thread in the whole grid hammers the single global
// address data[0]; all atomics serialize on one location.
__global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        atomicAdd(&data[0], 6); // arbitrary number to add
        idx += stride;
    }
}
// Shared-memory counterpart of the coalesced case: each thread owns one
// shared slot, so there is no intra-block contention. The accumulator is
// never written back — the kernel exists purely for timing.
// NOTE(review): smem_data is read-modify-written without initialization;
// harmless here since the results are discarded.
__global__ void CoalescedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        atomicAdd(&smem_data[threadIdx.x], data[idx]);
        idx += stride;
    }
}
// Shared-memory counterpart of the address-restricted case: all threads of
// a block contend for the first RESTRICTION_SIZE (32) shared slots, and the
// matching global read is restricted the same way to keep the load fair.
__global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        atomicAdd(&smem_data[threadIdx.x & (RESTRICTION_SIZE - 1)],
                  data[idx & (RESTRICTION_SIZE - 1)]);
        idx += stride;
    }
}
// Shared-memory counterpart of the warp-restricted case: threadIdx.x >> 5 is
// the same for all 32 lanes of a warp, so each warp serializes on one slot.
__global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        atomicAdd(&smem_data[threadIdx.x >> 5], data[idx >> 5]);
        idx += stride;
    }
}
// Shared-memory counterpart of the same-address case: every thread of a
// block serializes on the single shared slot smem_data[0].
__global__ void SameAddressAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    const unsigned int stride = blockDim.x * gridDim.x;
    unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    while (idx < nElem) {
        atomicAdd(&smem_data[0], data[0]);
        idx += stride;
    }
}
// Launch `kernel` 50 times with the fixed benchmark configuration, then
// drain the device so nvprof attributes the time before the next batch.
// The original code repeated this loop verbatim for all eight kernels.
static void benchmark(void (*kernel)(int*, int), int* dev_data, int n)
{
    dim3 blocksize(BLOCK_SIZE);
    dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX780, each can have 2048 threads
    for (int rep = 0; rep < 50; rep++) {
        kernel<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );
}

// Driver: prepares n ints on the device and times the eight atomic-add
// access patterns (global/shared × coalesced/address-restricted/
// warp-restricted/same-address) via nvprof.
int main(void)
{
    const int n = 2 << 24; // 2^25 = 33,554,432 ints (128 MiB)
    int* data = new int[n];
    for (int i = 0; i < n; i++) {
        data[i] = i % 1024 + 1; // nonzero values in [1, 1024]
    }

    int* dev_data;
    HANDLE_ERROR( cudaMalloc((void **)&dev_data, sizeof(int) * size_t(n)) );
    // NOTE(review): this memset is immediately overwritten by the memcpy
    // below; kept for parity with the original measurement setup.
    HANDLE_ERROR( cudaMemset(dev_data, 0, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemcpy( dev_data, data, n * sizeof(int), cudaMemcpyHostToDevice) );

    benchmark(CoalescedAtomicOnGlobalMem,         dev_data, n);
    benchmark(AddressRestrictedAtomicOnGlobalMem, dev_data, n);
    benchmark(WarpRestrictedAtomicOnGlobalMem,    dev_data, n);
    benchmark(SameAddressAtomicOnGlobalMem,       dev_data, n);
    benchmark(CoalescedAtomicOnSharedMem,         dev_data, n);
    benchmark(AddressRestrictedAtomicOnSharedMem, dev_data, n);
    benchmark(WarpRestrictedAtomicOnSharedMem,    dev_data, n);
    benchmark(SameAddressAtomicOnSharedMem,       dev_data, n);

    // Fix: release resources explicitly (original leaked both until exit).
    HANDLE_ERROR( cudaFree(dev_data) );
    delete[] data;

    HANDLE_ERROR( cudaDeviceReset() );
    printf("Program finished without error.\n");
    return 0;
}
基本上在上面的代码中有 8 个内核,其中所有线程对所有数据执行atomicAdd。
- 在全局内存上合并原子添加。
- 全局内存中受限地址空间的原子添加。 (代码中为 32)
- warp 内各通道（lane）对全局内存中同一地址的原子加法。
- 全局内存中同一地址上的所有线程的原子添加。
通过将上述项目中的 global 替换为 shared 可以找到项目 5 到 8。选择的块大小为 256。
我使用nvprof 来分析程序。输出是:
Time(%) Time Calls Avg Min Max Name
44.33% 2.35113s 50 47.023ms 46.987ms 47.062ms SameAddressAtomicOnSharedMem(int*, int)
31.89% 1.69104s 50 33.821ms 33.818ms 33.826ms SameAddressAtomicOnGlobalMem(int*, int)
10.10% 535.88ms 50 10.718ms 10.707ms 10.738ms WarpRestrictedAtomicOnSharedMem(int*, int)
3.96% 209.95ms 50 4.1990ms 4.1895ms 4.2103ms AddressRestrictedAtomicOnSharedMem(int*, int)
3.95% 209.47ms 50 4.1895ms 4.1893ms 4.1900ms AddressRestrictedAtomicOnGlobalMem(int*, int)
3.33% 176.48ms 50 3.5296ms 3.5050ms 3.5498ms WarpRestrictedAtomicOnGlobalMem(int*, int)
1.08% 57.428ms 50 1.1486ms 1.1460ms 1.1510ms CoalescedAtomicOnGlobalMem(int*, int)
0.84% 44.784ms 50 895.68us 888.65us 905.77us CoalescedAtomicOnSharedMem(int*, int)
0.51% 26.805ms 1 26.805ms 26.805ms 26.805ms [CUDA memcpy HtoD]
0.01% 543.61us 1 543.61us 543.61us 543.61us [CUDA memset]
显然,合并的无冲突原子操作具有最佳性能,而相同地址的性能最差。我无法解释的一件事是,为什么共享内存(块内)上的相同原子地址与全局内存(所有线程之间通用)相比要慢。
当一个 warp 的所有通道（lane）访问 共享内存 中的同一位置时，性能非常糟糕，但（令人惊讶的是）在 全局内存 上执行同样的操作时却并非如此，我无法解释原因。另一个令人困惑的情况是：全局内存上地址受限的原子操作，比 warp 内所有线程对同一地址执行原子操作时性能更差，而前者的内存争用看起来应该更低。
无论如何,如果有人能解释上述分析结果,我会很高兴。
【问题讨论】:
-
为什么要在 SameAddressAtomicOnSharedMem 中添加数据 [0],而不是像在 SameAddressAtomicOnGlobalMem 中那样添加直接值?它会导致一次额外的全局读取。不能保证它在缓存中的存在。我认为所有内核的共享版本与全局版本都是这种情况。我不认为我理解其背后的原因。
-
在所有情况下比较共享内存和全局内存时，我希望尽可能公平。虽然全局内存上的
atomicAdd 涉及受保护的读取-修改-写入，但我也希望共享内存版本同样包含一次（全局）读取。即使我们用立即数替换这次全局读取，结果也几乎相同。例如 SameAddressAtomicOnSharedMem 平均只减少了 2.5 毫秒。 -
很公平。后续问:我们怎么知道立即添加没有优化?你可能会说; “即使两者都是立即添加,全局仍然表现更好”。但是,假设全局添加可能比共享添加更积极的优化是否过于牵强?只是头脑风暴..
标签: cuda gpu gpgpu nvidia atomic