【Title】: Improved reduce0 CUDA reduction code not working
【Posted】: 2013-02-17 01:41:37
【Question】:

I have written a CUDA code that basically sums up an array for me. The array size N should be a power of 2, i.e. 2^x. However, my code is not working correctly. For example, if the output should be 150177410, my code outputs 150177408. I have been trying to debug it for the past 5 hours. Any help would be appreciated. The code is below:

//only works for array sizes of 2^x and TPB of 2^y, since godata has one entry per block; the number of blocks is then also a power of 2 if the above holds
//Works for an arbitrary-size array of the form 2^x


#include<stdio.h>

__global__ void computeAddShared(int *in , int *out, int sizeInput){
    //parameters are not named gidata and godata, to emphasize that parameters receive a copy of the address and are distinct from the pointers in host code
    extern __shared__ float temp[];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int ltid = threadIdx.x;
    temp[ltid] = 0;
    while(tid < sizeInput){
        temp[ltid] += in[tid];
        tid+=gridDim.x * blockDim.x; // to handle array of any size
    }
    __syncthreads();
    int offset = 1;
    while(offset < blockDim.x){
        if(ltid % (offset * 2) == 0){
            temp[ltid] = temp[ltid] + temp[ltid + offset];
        }
        __syncthreads();
        offset*=2;
    }
    if(ltid == 0){
        out[blockIdx.x] = temp[0];
    }

}

int main(){



    int N = 8192;//should be 2^sth
    int size = N;
    int *a = (int*)malloc(N * sizeof(int));
    /* TO create random number
    FILE *f;
        f = fopen("invertedList.txt" , "w");
        a[0] = 1 + (rand() % 8);
        fprintf(f, "%d,",a[0]);
        for( int i = 1 ; i< N; i++){
            a[i] = a[i-1] + (rand() % 8) + 1;
            fprintf(f, "%d,",a[i]);
        }
        fclose(f);
        return 0;*/
    FILE *f;
    f = fopen("invertedList.txt","r");
    if( f == NULL){
            printf("File not found\n");
            system("pause");
            exit(1);
    }
    int count = 0 ;
    long actualSum = 0;
    for( int i =0 ; i < N ; i++){
        fscanf(f, "%d,", &a[count]);
        actualSum+=a[count];
        count++;
    }
    fclose(f);
    printf("The actual sum is %d\n",actualSum);
    int* gidata;
    int* godata;
    cudaMalloc((void**)&gidata, N* sizeof(int));
    cudaMemcpy(gidata,a, size * sizeof(int), cudaMemcpyHostToDevice);
    int TPB  = 256;
    int blocks = 10; //to get things kicked off
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    while(blocks != 1 ){
        if(size < TPB){
            TPB  = size; // size is 2^sth
        }
        blocks  = (size+ TPB -1 ) / TPB;
        cudaMalloc((void**)&godata, blocks * sizeof(int));
        computeAddShared<<<blocks, TPB,TPB*sizeof(int)>>>(gidata, godata,size);
        //cudaFree(gidata);
        gidata = godata;
        size = blocks;
    }
    //printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));


    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop);
    printf("time is %f ms", elapsedTime);
    int *output = (int*)malloc(sizeof(int));
    cudaMemcpy(output, gidata,size *  sizeof(int), cudaMemcpyDeviceToHost);
    //Cant free either earlier as both point to same location
    cudaError_t chk = cudaFree(godata);
    if(chk!=0){
        printf("First chk also printed error. Maybe error in my logic\n");
    }

    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
    printf("The sum of the array is %d\n", output[0]);
    getchar();

    return 0;
}

【Comments】:

  • What happens if you shrink the input domain? The error pattern might help with the diagnosis.
  • Is it really necessary to post code that depends on an external file we have no access to, including 10 lines of redundant commented-out code and 130+ column-width lines containing nothing but meaningless comments?
  • I'm surprised it gives any answer at all! You are checking offset < blockDim.x, whereas it should be ltid+offset < blockDim.x. In the second while loop you are using shared memory that does not belong to you. Edit: a dynamic __shared__ size won't do any good, since you are using temp[ltid] = 0; so only shared memory of the block size is valid.
  • I think the reduction kernel itself, while unorthodox, is actually OK, as is the rather odd-looking host loop that calls the kernel (you only ever need to call any reduction like this twice, so why use a loop?). But there is at least a host-side buffer overflow in cudaMemcpy(output, gidata, size * sizeof(int), cudaMemcpyDeviceToHost), and possibly other bugs; since nobody can actually run this code, though, I'm not sure how you expect anyone to really be able to answer the question...

Tags: cuda gpu


【Solution 1】:

As talonmies already said, the kernel itself is fine. It is basically the reduce0 kernel of the CUDA SDK, improved by algorithm cascading in the sense of Brent's optimization, the same approach used when improving the reduce5 kernel of the same SDK into reduce6.

That the kernel works correctly is demonstrated by the test code below, which also compares the performance of reduce0 against that of the OP's kernel, named reduce0_stackoverflow in the code. The reduce0_stackoverflow kernel also reports, commented out, the corresponding code lines of reduce0.

For the test case below, reduce0_stackoverflow executes in 0.030ms, against 0.049ms for reduce0, on a GeForce GT540M card.

Note that the code below does not require the array size to be a power of 2.

#include <thrust/device_vector.h>

#define BLOCKSIZE 256

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

/*************************************/
/* CHECK IF A NUMBER IS A POWER OF 2 */
/*************************************/
bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}

/******************/
/* REDUCE0 KERNEL */
/******************/
/* This reduction interleaves which threads are active by using the modulo
    operator.  This operator is very expensive on GPUs, and the interleaved
    inactivity means that no whole warps are active, which is also very
    inefficient */
template <class T>
__global__ void reduce0(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid    = threadIdx.x;                              // Local thread index
    unsigned int i      = blockIdx.x * blockDim.x + threadIdx.x;    // Global thread index

    // --- Loading data to shared memory
    sdata[tid] = (i < N) ? g_idata[i] : 0;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory
    for (unsigned int s=1; s < blockDim.x; s *= 2)
    {
        // --- Only the threads with index multiple of 2*s perform additions. Furthermore, modulo arithmetic is slow.       
        if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/***************************/
/* REDUCE0 - STACKOVERFLOW */
/***************************/
template <class T>
__global__ void reduce0_stackoverflow(T *g_idata , T *g_odata, unsigned int N) {

    extern __shared__ T sdata[];

    unsigned int tid    = threadIdx.x;                              // Local thread index
    unsigned int i      = blockIdx.x * blockDim.x + threadIdx.x;    // Global thread index

    // --- Loading data to shared memory
    //sdata[tid] = (i < N) ? g_idata[i] : 0;                        // CUDA SDK
    sdata[tid] = 0;
    while(i < N){
        sdata[tid] += g_idata[i];
        i+=gridDim.x * blockDim.x; // to handle array of any size
    }

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory
    //  for (unsigned int s=1; s < blockDim.x; s *= 2)                  // CUDA SDK
    //  {
    //     if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }
    //     __syncthreads();
    //  }
    unsigned int s = 1;
    while(s < blockDim.x) 
    {
        // --- Only the threads with index multiple of 2*s perform additions. Furthermore, modulo arithmetic is slow.       
        if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }

        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();

        s*=2;
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 15336;

    thrust::device_vector<int> d_vec(N,3);

    int NumThreads  = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
    int NumBlocks   = (N + NumThreads - 1) / NumThreads;

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    thrust::device_vector<int> d_vec_block(NumBlocks);

    /***********/
    /* REDUCE0 */
    /***********/
    cudaEventRecord(start, 0);
    reduce0<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("reduce0 - Elapsed time:  %3.3f ms \n", time);   

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    thrust::host_vector<int> h_vec_block(d_vec_block);
    int sum_reduce0 = 0;
    for (int i=0; i<NumBlocks; i++) sum_reduce0 = sum_reduce0 + h_vec_block[i];
    printf("Result for reduce0 = %i\n",sum_reduce0);

    /**********************************/
    /* REDUCE0 KERNEL - STACKOVERFLOW */
    /**********************************/
    cudaEventRecord(start, 0);
    reduce0_stackoverflow<<<NumBlocks/2, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("reduce0 - stackoverflow - Elapsed time:  %3.3f ms \n", time);   

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    h_vec_block = d_vec_block;
    int sum_reduce0_stackoverflow = 0;
    for (int i=0; i<NumBlocks; i++) sum_reduce0_stackoverflow = sum_reduce0_stackoverflow + h_vec_block[i];
    printf("Result for reduce0_stackoverflow = %i\n",sum_reduce0_stackoverflow);

    getchar();
}

【Discussion】:
