【发布时间】:2013-02-04 15:22:54
【问题描述】:
我有一个根据图像生成直方图的函数(作业中给出的是顺序版本,这是我改写的 CUDA 版本):
// Host-side image that will receive the rendered histogram bars.
CImg< unsigned char > histogramImage = CImg< unsigned char >(BAR_WIDTH * HISTOGRAM_SIZE, HISTOGRAM_SIZE, 1, 1);

// Byte counts are computed once in size_t so that width * height cannot
// overflow a 32-bit int for very large images.
const size_t imageBytes = static_cast< size_t >(grayImage.width()) * static_cast< size_t >(grayImage.height()) * sizeof(unsigned char);
const size_t histogramBytes = static_cast< size_t >(HISTOGRAM_SIZE) * sizeof(unsigned int);

// Host histogram, zero-initialised (calloc replaces the malloc + memset pair).
unsigned int *histogram = static_cast< unsigned int * >(calloc(HISTOGRAM_SIZE, sizeof(unsigned int)));
if (histogram == NULL)
{
    std::cout << "ERROR: Failed malloc" << std::endl;
    return -1;
}

// Clear the device image buffer. The copy below overwrites the whole buffer,
// so this is not strictly required, but its status is checked so that an
// earlier sticky CUDA error is reported here and not later.
cuda_err = cudaMemset(gpuImage, 0, imageBytes);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemset" << std::endl;
    free(histogram);
    return -1;
}

// Upload the grey-scale image to the device.
cuda_err = cudaMemcpy(gpuImage, grayImage, imageBytes, cudaMemcpyHostToDevice);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemcpy" << std::endl;
    free(histogram);
    return -1;
}

// Device histogram: one unsigned int counter per bin.
unsigned int *gpuhistogram = NULL;
cuda_err = cudaMalloc((void **)(&gpuhistogram), histogramBytes);
if (cuda_err != cudaSuccess)
{
    // Do not continue with an invalid device pointer.
    std::cout << "ERROR: Failed cudaMalloc" << std::endl;
    free(histogram);
    return -1;
}

// The kernel only increments bins, so every counter must start at zero.
cuda_err = cudaMemset(gpuhistogram, 0, histogramBytes);
if (cuda_err != cudaSuccess)
{
    std::cout << "ERROR: Failed cudaMemset" << std::endl;
    cudaFree(gpuhistogram);
    free(histogram);
    return -1;
}

// Launch the histogram kernel through its host wrapper.
histogram1D(gpuImage, histogramImage, grayImage.width(), grayImage.height(), gpuhistogram, HISTOGRAM_SIZE, BAR_WIDTH, total, gridSize, blockSize);

// Download the bin counts. This blocking copy also surfaces any asynchronous
// error left behind by the kernel.
cuda_err = cudaMemcpy(histogram, gpuhistogram, histogramBytes, cudaMemcpyDeviceToHost);
if (cuda_err != cudaSuccess)
{
    // The host histogram would still hold zeros; stop instead of using it.
    std::cout << "ERROR: Failed cudaMemcpy" << std::endl;
    cudaFree(gpuhistogram);
    free(histogram);
    return -1;
}
上面的代码调用了下面这个函数:
/**
 * Host wrapper: launches the histo kernel on a device image and reports the
 * kernel's wall-clock time on stdout.
 *
 * @param grayImage       Device pointer to the grey-scale image; forwarded to the kernel.
 * @param histogramImage  Not used by this function.
 * @param width           Image width in pixels; forwarded to the kernel.
 * @param height          Not used by this function (the kernel takes no height).
 * @param histogram       Device pointer to the bin counters; forwarded to the kernel.
 * @param HISTOGRAM_SIZE  Not used by this function.
 * @param BAR_WIDTH       Not used by this function.
 * @param timer           Not used by this function; a local timer is used instead.
 * @param grid_size       Grid dimensions for the launch.
 * @param block_size      Block dimensions for the launch.
 *
 * The kernel performs no bounds checking, so grid_size and block_size must
 * describe exactly the set of pixels to be counted.
 *
 * Launch and execution errors are written to stderr. The function returns
 * void, so callers cannot observe them programmatically.
 */
void histogram1D(unsigned char *grayImage, unsigned char *histogramImage, const int width, const int height, unsigned int *histogram, const unsigned int HISTOGRAM_SIZE, const unsigned int BAR_WIDTH, NSTimer &timer, dim3 grid_size, dim3 block_size) {
    NSTimer kernelTime = NSTimer("kernelTime", false, false);

    kernelTime.start();
    histo <<< grid_size, block_size >>> (grayImage, histogram, width);

    // A kernel launch returns nothing; an invalid configuration (for example a
    // grid dimension above the device limit) is only visible through
    // cudaGetLastError(). Without this check the kernel silently never runs
    // and the histogram stays all zeros.
    const cudaError_t launchStatus = cudaGetLastError();

    // Faults raised while the kernel executes are reported by the next
    // synchronising call, so its return value must not be discarded.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    kernelTime.stop();

    if (launchStatus != cudaSuccess) {
        std::cerr << "ERROR: histo kernel launch failed: " << cudaGetErrorString(launchStatus) << std::endl;
        return;
    }
    if (syncStatus != cudaSuccess) {
        std::cerr << "ERROR: histo kernel execution failed: " << cudaGetErrorString(syncStatus) << std::endl;
        return;
    }

    std::cout << std::fixed << std::setprecision(6);
    std::cout << "histogram1D (kernel): \t\t" << kernelTime.getElapsed() << " seconds." << std::endl;
}
核函数是
/**
 * Histogram kernel: each thread reads one pixel and atomically increments the
 * bin whose index equals that pixel's value.
 *
 * @param inputImage  Row-major 8-bit image in device memory.
 * @param histogram   Bin counters in device memory. A pixel value is used
 *                    directly as the bin index, so there must be one counter
 *                    for every possible unsigned char value. Counters must be
 *                    zeroed before the launch.
 * @param width       Row stride of the image in pixels.
 *
 * Launch layout: thread (x, y) of the global grid reads element
 * y * width + x. There is NO bounds check (the image height is not a
 * parameter), so the launch must cover exactly the pixels of the image; any
 * extra thread reads outside the buffer.
 */
__global__ void histo(unsigned char *inputImage, unsigned int *histogram, int width)
{
    // Global coordinates, widened to size_t so that the flattened offset below
    // cannot overflow 32-bit arithmetic on very large images.
    const size_t x = threadIdx.x + (static_cast< size_t >(blockIdx.x) * blockDim.x);
    const size_t y = threadIdx.y + (static_cast< size_t >(blockIdx.y) * blockDim.y);
    const size_t pixel = (y * static_cast< size_t >(width)) + x;

    const unsigned int bin = static_cast< unsigned int >(inputImage[pixel]);

    // Many threads may hit the same bin concurrently, hence the atomic update.
    atomicAdd(&histogram[bin], 1u);
}
我遇到的问题是:用 1024x1024 到 3543x2480 的图像调用它时,一切正常。但是对于一张 8192x8192 的图像,函数返回后 *histogram 中的值仍然全为 0。我的试验似乎表明这与 *gpuhistogram 的内存分配有关(unsigned int 难道不够大吗?),因为顺序版本可以正常工作。该如何解决这个问题?有什么想法吗?
【问题讨论】:
-
将每个命令包装在 cudaSafeCall 中
-
查看cudaDeviceSynchronize()的返回值
-
cudaDeviceSynchronize() 返回 cudaSuccess。我马上加上 cudaSafeCall。
-
@Mikhail,实现了安全调用。任何一行都没有错误。在内核调用后也在做 cudaGetLastError() 。仍然没有错误
-
@Mikhail 补充说明一下:我说的“没有错误”是指问题依然存在,但所有调用都没有报告任何错误。