CUDA缩减以找到数组的最大值答案

【问题标题】：CUDA reduction to find the maximum of an arrayCUDA缩减以找到数组的最大值
【发布时间】：2014-06-29 11:59:51
【问题描述】：

我正在学习有关并行编程的 Udacity 课程（作业 3），但无法弄清楚为什么我无法使用并行归约来获得数组中的最大值（Udacity 论坛尚未提供解决方案）。我很确定我已经正确设置了数组并且算法是正确的。我怀疑我的内存管理有问题（访问越界、数组大小不正确、相互复制）。请帮忙！我在 Udacity 环境中运行它，而不是在本地运行。下面是我目前正在使用的代码。出于某种原因，当我将 fmaxf 更改为 fminf 时，它确实找到了最小值。

#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>

__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {

    // Reduce log Lum with Max Operator
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    extern __shared__ float temp[];

    if (myId < size) {
        temp[tid] = d_logLum[myId];
    }
    else {
        temp[tid] = d_logLum[tid];
    }

    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            if (myId < size) {
                temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
            } else {
                temp[tid] = d_logLum[tid];
            }
        }
        __syncthreads(); 
    }

    if (tid == 0) {
        d_out[blockIdx.x] = temp[0];
    }
}

__global__ void reduce_max_kernel2(float *d_out, float *d_in) {

    // Reduce log Lum with Max Operator
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
        if (tid < s) {
            d_in[myId] = fmaxf(d_in[myId + s], d_in[myId]);
        }
        __syncthreads();   
    }

    if (tid == 0) {
        d_out[0] = d_in[0];
    }

}


void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  unsigned int* const d_cdf,
                                  float &min_logLum,
                                  float &max_logLum,
                                  const size_t numRows,
                                  const size_t numCols,
                                  const size_t numBins)
{
  //TODO
  /*Here are the steps you need to implement
    1) find the minimum and maximum value in the input logLuminance channel
       store in min_logLum and max_logLum
    2) subtract them to find the range
    3) generate a histogram of all the values in the logLuminance channel using
       the formula: bin = (lum[i] - lumMin) / lumRange * numBins
    4) Perform an exclusive scan (prefix sum) on the histogram to get
       the cumulative distribution of luminance values (this should go in the
       incoming d_cdf pointer which already has been allocated for you)       */
    //int size = 1 << 18;
    int points = numRows * numCols;
    int logPoints = ceil(log(points)/log(2));
    int sizePow = logPoints;
    int size = pow(2, sizePow);
    int numThreads = 1024;
    int numBlocks = size / numThreads;

    float *d_out;
    float *d_max_out;

    checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));

    cudaDeviceSynchronize();
    reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);

    cudaDeviceSynchronize();
    reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);

    float h_out_max;
    checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));

    printf("%f\n", h_out_max);

    checkCudaErrors(cudaFree(d_max_out));
    checkCudaErrors(cudaFree(d_out));

}

【问题讨论】：

如果您需要帮助调试您的代码，我们需要查看其他人可以编译和运行的最短完整示例。
在您的reduce_max_kernel 中，通常建议在填充或修改共享内存后使用__syncthreads()。您在 reduce for 循环中拥有它，但在初始加载共享内存后（在 for 循环之前）您似乎没有。在其他线程有机会进行共享内存的初始加载之前，一些线程可能会提前进入 for 循环。此外，使用共享内存的目的是在您的 reduce-for-loop 中实际使用它。现在，您的 for 循环正在从全局而不是共享内存中加载值。那是坏的。

标签： c arrays cuda parallel-processing

【解决方案1】：

您正在尝试重现 CUDA SDK 缩减示例的 reduce2 缩减内核。 Robert Crovella 已经发现了您在代码中犯的两个错误。除此之外，我认为您还错误地初始化了共享内存。

请在下面找到一个围绕您的尝试构建的完整工作示例。我留下了错误的方法说明。

#include <thrust\device_vector.h>

#define BLOCKSIZE 256

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {

    int tid         = threadIdx.x;                              // Local thread index
    int myId        = blockIdx.x * blockDim.x + threadIdx.x;    // Global thread index

    extern __shared__ float temp[];

    // --- Loading data to shared memory. All the threads contribute to loading the data to shared memory.
    temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;

    // --- Your solution
    // if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s=blockDim.x/2; s>0; s>>=1)
    {
        if (tid < s) { temp[tid] = fmaxf(temp[tid], temp[tid + s]); }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Your solution
    //for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    //    if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
    //    __syncthreads(); 
    //}

    if (tid == 0) {
        d_out[blockIdx.x] = temp[0];
    }
}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 10;

    thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;

    int NumThreads  = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
    int NumBlocks   = (N + NumThreads - 1) / NumThreads;

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);

    // --- reduce2
    thrust::device_vector<float> d_vec_block(NumBlocks);
    reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    thrust::host_vector<float> h_vec_block(d_vec_block);
    float result_reduce0 = -FLT_MAX;
    for (int i=0; i<NumBlocks; i++) result_reduce0 = fmax(h_vec_block[i], result_reduce0);
    printf("Result = %f\n",result_reduce0);

}

【讨论】：