【问题标题】:CUDA memory allocation issues with large images大图像的 CUDA 内存分配问题
【发布时间】:2013-02-04 15:22:54
【问题描述】:

我有一个从图像制作直方图的功能(给定的顺序版本(家庭作业))

// Host-side setup for the GPU histogram (fragment: gpuImage, cuda_err,
// gridSize, blockSize and `total` are defined earlier, outside this excerpt).
CImg< unsigned char > histogramImage = CImg< unsigned char >(BAR_WIDTH * HISTOGRAM_SIZE, HISTOGRAM_SIZE, 1, 1);
// Host-side result buffer: HISTOGRAM_SIZE bins, zero-initialized.
unsigned int *histogram;
histogram = (unsigned int *)malloc(HISTOGRAM_SIZE * sizeof(unsigned int));
 memset(reinterpret_cast< void * >(histogram), 0, HISTOGRAM_SIZE * sizeof(unsigned int));

// NOTE(review): return value discarded here (and on the second cudaMemset
// below) — every CUDA call should be checked, since a sticky earlier error
// makes later calls fail mysteriously.
cudaMemset(gpuImage, 0, grayImage.width() * grayImage.height() * sizeof(unsigned char));

// Upload the grayscale image to the device.
// NOTE(review): `grayImage` is a CImg object passed where cudaMemcpy expects a
// raw host pointer — presumably an implicit conversion to its pixel buffer is
// involved; confirm, or pass the data pointer explicitly.
cuda_err = cudaMemcpy(gpuImage, grayImage, grayImage.width() * grayImage.height() * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemcpy" << std::endl;
   return -1;
}

// Device-side histogram bins, zeroed before the kernel accumulates into them.
unsigned int *gpuhistogram;
cuda_err = cudaMalloc((void **)(&gpuhistogram), HISTOGRAM_SIZE * sizeof(unsigned int));
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMalloc" << std::endl;
}
// NOTE(review): execution continues even if the cudaMalloc above failed.
cudaMemset (gpuhistogram, 0, HISTOGRAM_SIZE * sizeof(unsigned int));

// Launch the kernel via the wrapper defined below.
// NOTE(review): there is no cudaGetLastError()/synchronize check after the
// launch, so a launch rejected by the driver (e.g. a grid dimension over the
// device limit) goes unnoticed and the copy below just reads back zeroed bins.
histogram1D(gpuImage, histogramImage, grayImage.width(), grayImage.height(), gpuhistogram, HISTOGRAM_SIZE, BAR_WIDTH, total, gridSize, blockSize);

// Copy the accumulated bins back to the host (blocking, so it also serves as
// a synchronization point for the preceding kernel).
cuda_err = cudaMemcpy(histogram, gpuhistogram, HISTOGRAM_SIZE * sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemcpy" << std::endl;
}

上面的代码调用了下面这个包装函数:

// Launches the naive 2D histogram kernel with the caller-supplied launch
// configuration and prints its wall-clock runtime.
// grayImage / histogram are device pointers; histogramImage, HISTOGRAM_SIZE,
// BAR_WIDTH and `timer` are not used in this path.
void histogram1D(unsigned char *grayImage, unsigned char *histogramImage, const int width, const int height, unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int BAR_WIDTH, NSTimer &timer, dim3 grid_size, dim3 block_size) {

NSTimer kernelTime = NSTimer("kernelTime", false, false);

kernelTime.start();
// NOTE(review): no cudaGetLastError() after the launch, and the status
// returned by cudaDeviceSynchronize() is discarded — a launch rejected for
// exceeding the device's maximum grid dimension therefore fails silently,
// leaving the output histogram untouched (all zeros).
histo <<< grid_size, block_size >>> (grayImage, histogram,width);
cudaDeviceSynchronize();
kernelTime.stop();

cout << fixed << setprecision(6);
cout << "histogram1D (kernel): \t\t" << kernelTime.getElapsed() << " seconds." << endl;
}

核函数是

// Naive histogram kernel: one thread per pixel, each accumulating directly
// into the global bins with an atomic increment. Assumes a 2D launch whose
// grid exactly tiles the image.
__global__ void histo(unsigned char *inputImage, unsigned int *histogram, int width)
{

int x = threadIdx.x + (blockIdx.x * blockDim.x);
int y = threadIdx.y + (blockIdx.y * blockDim.y);

// NOTE(review): no bounds guard — any thread with x >= width or y >= height
// reads past the end of the image (the kernel is not even given `height`),
// so the grid must cover the image exactly. Separately, a grid dimension
// above the device limit (65535 per dimension on compute capability 2.x)
// makes the launch fail outright; per the discussion below, that is the
// likely cause of the all-zero result for the 8192x8192 image.
unsigned int index = static_cast< unsigned int >(inputImage[(y * width) + x]);
atomicAdd(&histogram[index],1);
}

我遇到的问题是,当我用 1024x1024 到 3543x2480 的图像调用它时,它可以工作。但是,我有一张 8192x8192 的图像,当函数返回时,* histogram 中的值仍然为 0。我的试验似乎表明它与 *gpuhistogram 的内存分配有关(unsigned int 不应该足够大?)因为这个工作的顺序版本。如何解决这个问题?有什么想法吗?

【问题讨论】:

  • 将每个命令包装在 cudaSafeCall 中
  • 查看cudaDeviceSynchronize()的返回值
  • cudaDeviceSynchronize() 返回 cudaSuccess。立即实施 cudaSafeCall
  • @Mikhail,实现了安全调用。任何一行都没有错误。在内核调用后也在做 cudaGetLastError() 。仍然没有错误
  • @Mikhail 以防万一,“没有错误”,我的意思是问题仍然存在,没有报告任何来自调用的错误

标签: c++ cuda


【解决方案1】:
  1. 检查您的卡。来自维基百科:

    技术规格(按计算能力版本 1.0 / 1.1 / 1.2 / 1.3 / 2.x / 3.0 / 3.5):线程块网格的最大维数在 1.x 上为 2,在 2.x 及以上为 3;网格单个维度(x、y 或 z)的最大值在计算能力 3.0 之前为 65535,从 3.0 起 x 维提升至 2³¹−1。

  2. 我怀疑您的直方图的性能会比 CPU 代码差,尝试使用共享内存之类的东西并假设 256 个值。诀窍是每个块使用 bin# of threads(每个块 256 个线程)。我不想破坏作者的收入,请参阅CUDA by Example 2010

【讨论】:

  • GTX480,版本 - 2.x,最大 DIM - 3,最大 xyz - 65535
  • @masoftheund 无论如何我认为你的代码很糟糕,我会在继续之前修复它。
  • 好的。现在尝试这样做。不过真的没有太多时间来写“完美”的代码
  • 好书!实现了使用共享内存。不能说它的性能比我已经拥有的更好,而且对于大图像我仍然得到错误的结果(共享内存实现需要超过 16 秒!!)
  • 问题出在网格和块的尺寸上。我在我的网格中使用(宽度*高度/线程),对于那个文件,它正好是 65536,因为我每个块只能使用 1024 个线程。现在使用dim3 blockSize(16, 16, 1); dim3 gridSize((inputImage.width() + blockSize.x -1)/blockSize.x,(inputImage.height() + blockSize.y - 1) / blockSize.y, 1); 注意:虽然仍然不完全理解它
【解决方案2】:

只是想补充;按照米哈伊尔的回答,这就是我现在正在做的事情;

// Host wrapper: launches the shared-memory histogram kernel and reports its
// runtime in milliseconds.
// grayImage / histogram are device pointers; `histogram` must hold 256 bins
// zeroed by the caller. histogramImage, HISTOGRAM_SIZE, BAR_WIDTH, `timer`,
// grid_size and block_size are unused by this path (kept so the signature
// matches the original wrapper).
void histogram1D(unsigned char *grayImage, unsigned char *histogramImage, const int width, const int height, unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int BAR_WIDTH, NSTimer &timer, dim3 grid_size, dim3 block_size) {

NSTimer kernelTime = NSTimer("kernelTime", false, false);


kernelTime.start();
// Kernel: 30 blocks of 256 threads. The kernel uses a grid-stride loop, so
// the grid need not cover every pixel; 256 threads per block is mandatory
// because the kernel indexes a 256-entry shared array with threadIdx.x.
histo <<< 15*2, 256 >>> (grayImage, histogram,width,height);//15 is the number of blocks for my device
// Kernel launches are asynchronous: without this barrier the timer measures
// only the launch overhead, not the kernel's actual execution time.
cudaDeviceSynchronize();
kernelTime.stop();

cout << fixed << setprecision(6);
cout << "histogram1D (kernel): \t\t" << kernelTime.getElapsed()*1000 << " milliseconds." << endl;
}

内核代码;

// 256-bin histogram using per-block shared-memory bins.
// Requires a 1D launch with blockDim.x == 256 (one thread per bin);
// `histogram` must be zero-initialized before the launch.
__global__ void histo(unsigned char *inputImage, unsigned int *histogram, int width, int height)
{
// Per-block private bins: contention on shared-memory atomics is far cheaper
// than every thread hammering the same global addresses.
__shared__ unsigned int temp[256];
temp[threadIdx.x] = 0;

__syncthreads();

int i = threadIdx.x + blockIdx.x * blockDim.x;
// Grid-stride loop: the stride must be the TOTAL number of launched threads
// (blockDim.x * gridDim.x) so each pixel is visited exactly once. The
// original used blockDim.y * gridDim.x, which for a 1D block
// (blockDim.y == 1) makes the stride far too small and counts most pixels
// many times, inflating the bins.
int offset = blockDim.x * gridDim.x;
while(i<width*height)
{
    atomicAdd(&temp[inputImage[i]],1);
    i += offset;
}

__syncthreads();
// Fold this block's private bins into the global result: one global atomic
// per bin per block instead of one per pixel.
atomicAdd(&(histogram[threadIdx.x]),temp[threadIdx.x]);
}

【讨论】:

    猜你喜欢
    • 2011-08-24
    • 2013-07-26
    • 1970-01-01
    • 1970-01-01
    • 2010-12-30
    • 2010-09-22
    • 1970-01-01
    • 2012-11-30
    • 1970-01-01
    相关资源
    最近更新 更多