CUDA内核比CPU慢答案

【问题标题】：CUDA kernel is slower than CPUCUDA内核比CPU慢
【发布时间】：2012-05-27 19:24:19
【问题描述】：

我是 CUDA 的新手，我可能做错了什么。我只需要对两个二进制向量进行逻辑运算。向量的长度为 2048000。我比较了 Matlab 的 C mex 文件和 CUDA 内核中的逻辑 and 之间的速度。 CPU 上的 C 比 CUDA 快约 5%。请注意，我只测量了内核执行（没有内存传输）。我有 i7 930 和 9800GT。

##MEX file testCPU.c:##

#include "mex.h"
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {
    
    int i, varLen;
    unsigned char *vars, *output;
            
    vars = mxGetPr(prhs[0]);
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = mxGetPr(plhs[0]);
    for (i=0;i<2048000;i++){
        output[i] = vars[i] & vars[2048000+i];
    }
}

编译

mex testCPU.c

创建向量

vars = ~~(randi(2,2048000,2)-1);

测量速度：

tic;testCPU(vars);toc;

CUDA：

#CUDA file testGPU.cu#
#include "mex.h"
#include "cuda.h"

__global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx] = in[idx] && in[idx+N];
}


void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {
    
    int i;
    unsigned char *vars, *output, *gpu, *gpures;
    
    vars = (unsigned char*)mxGetData(prhs[0]);
    
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = (unsigned char*)mxGetData(plhs[0]);       
       
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float dt_ms;
    
    // input GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU input malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    // output GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU output malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    // copy from CPU to GPU
    cudaEventRecord(start, 0);
    cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy input from CPU to GPU: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    dim3 dimBlock(32);
    printf("thread count: %i\n", dimBlock.x);
    dim3 dimGrid(2048000/dimBlock.x);
    printf("block count: %i\n", dimGrid.x);
    
    // --- KERNEL ---
    cudaEventRecord(start, 0);
    logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU kernel: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    // result from GPU to CPU
    cudaEventRecord(start, 0);
    cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy output from GPU to CPU: %f ms, %i\n", dt_ms, cudaGetLastError());
    
    
    cudaFree(gpu);
    cudaFree(gpures);
    
}

编译：

 nvmex -f nvmexopts_9.bat testGPU.cu 
-I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include" 
-L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\lib\x64" -lcudart -lcufft

输出：

GPU input malloc: 0.772160 ms, 0
GPU output malloc: 0.041728 ms, 0
copy input from CPU to GPU: 1.494784 ms, 0
thread count: 32
block count: 64000
*** GPU kernel: 3.761216 ms, 0 ***
copy output from GPU to CPU: 1.203488 ms, 0

该代码可以吗？ CPU 比 CUDA 内核快约 0.1 毫秒。我尝试了高达 512 的不同线程数（32 的乘数），32 是最快的。运算符 & 而不是 && 几乎慢了 1 毫秒。

9800GT真的那么弱吗？使用今天的主流显卡（即 GTX460,560），我可以期待什么加速？

谢谢

编辑：根据 talonmies 的评论，我做了以下修改：

内核函数：

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}

主要功能：

uchar4 *gpu, *gpures;

// 32 was worst, 64,128,256,512 were similar
dim3 dimBlock(128);
// block count is now 4xtimes smaller
dim3 dimGrid(512000/dimBlock.x);

输出：

GPU input malloc: 0.043360 ms, 0
GPU output malloc: 0.038592 ms, 0
copy input from CPU to GPU: 1.499584 ms, 0
thread count: 128
block count: 4000
*** GPU kernel: 0.131296 ms, 0 ***
copy output from GPU to CPU: 1.281120 ms, 0

正确吗？几乎 30 倍的加速！这似乎好得令人难以置信，但结果是正确的:) GTX560 在这项特定任务上的速度有多快？谢谢

编辑 2：

这是代码

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}

自动转换为：

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;  
    uchar4 buff;

    buff.x = in[idx].x;
    buff.y = in[idx].y;
    buff.z = in[idx].z;
    buff.w = in[idx].w;

    buff.x &= in[idx+N].x;
    buff.y &= in[idx+N].y;
    buff.z &= in[idx+N].z;
    buff.w &= in[idx+N].w;

    out[idx].x = buff.x;
    out[idx].y = buff.y;
    out[idx].z = buff.z;
    out[idx].w = buff.w;
}

通过编译器？

如果它是正确的，它解释了我对合并访问的困惑。我认为in[idx] & in[idx+N] 会导致非合并访问，因为访问的是非连续内存。但实际上，in[idx] 和 in[idx+N] 是在两个合并的步骤中加载的。 N 可以是 16 的任意倍数，因为 uchar4 是 4 字节长，并且对于合并的访问地址必须对齐到 64 字节（在 1.1 设备上）。我说的对吗？

【问题讨论】：

您的代码正在使用字节大小的内存事务，这在计算 1.1 设备上非常不理想。尝试改用 32 位内存事务（例如 uchar4）并让每个线程处理 4 个输入而不是 1 个。
尝试将块大小从 32 更改为至少 256
在修改后的代码中，128 效果最好。 256 几乎相同（慢 1-2%），512 慢约 15%
@eel：如果您打算在 Fermi 或更新的卡上运行它，请注意您的 uchar4 版本。虽然看起来 nvopencc 已将内核编译为使用 32 位加载和存储，但您确实有点“幸运”。您应该将完整的uchar4 输入显式加载到局部变量，然后在局部变量中计算结果并将局部uchar4 结果存储回全局内存。在这种情况下，编译器更有可能生成您想要的代码。但是对于计算 1.1 卡来说，30 倍的内存带宽提升可能是正确的。
@talonmies：这对我来说很重要。请检查edit2

标签： cuda mex

【解决方案1】：

正如talonmies 指出的那样，您正在按字节访问和处理数据，这远非最佳。 Vasily Volkov 在 nVidia 网络研讨会Better Performance at Lower Occupancy 中总结了您可能需要考虑的一系列技术，例如指令级并行和缓冲读/写。

简而言之，您要做的是，在每个线程中，以合并的方式读取几个 uint4，处理它们，然后再存储它们。

更新

如果按照以下方式重新编写代码，会有什么不同吗？

__global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
    int idx = blockIdx.x*blockDim.x*chunksize+threadIdx.x;
    unsigned int buff[chunksize];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] = in[ blockDim.x*k + idx ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] &= in[ blockDim.x*k + idx + N ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        out[ blockDim.x*k + idx ] = buff[k];
}

请注意，我假设 chunksize 是您在某处拥有 #defined 的变量，例如

#define chunksize 4

你必须将你启动的块数和N除以那个数字。我还使用了unsigned int，它只有四个包装uchar。在您的调用函数中，您可能必须相应地转换您的指针。

【讨论】：

我在每个线程中尝试了进程2 uchar4，但速度保持不变
使用 uint4 代替 uchar4 速度慢了约 15%
@eel：您是否以合并的方式访问它们？这是一个重要的细节。
我阅读了很多关于合并内存访问主题的材料，但我仍然不确定我是否理解它。当内核中的索引确定为：idx = blockIdx.x*blockDim.x+threadIdx.x 时，对 array[idx] 的内存访问总是合并（因为 cudaMalloc 与 256 字节对齐）。当我将结果计算为 (array[idx] & array[idx+N]) 时，访问不会合并，因为我正在访问非连续的内存区域。我认为我的问题是第二个向量位于第一个向量之后。它们应该是“交错的”（以合并的方式访问）。我对吗？我怎样才能做到这一点？谢谢
@eel：您能否发布最新版本的代码作为对您问题的更新，然后我可以看一下？

【解决方案2】：

我认为它的发生称为false sharing。我认为问题在于您尝试从线程写入的字节大小的区域正在产生大量的竞争条件，因为不同的线程正在尝试写入相同的字对齐地址。我不确定 GPU 中的细节，但在 CPU 中，当不同的线程尝试在同一个 256 字节对齐区域（称为缓存线）中写入内存时，它们会不断地相互阻塞，从而降低全局性能。

【讨论】：