【发布时间】:2017-10-26 22:47:21
【问题描述】:
我的目标是将一组用 MATLAB 编写的代码转换为 CUDA C++,以便在 GPU 上进行并行处理。 这是我一直在尝试转换的 MATLAB 代码:
function [M] = iqm_czekanowski(img1, img2)
img1 = double(img1);
img2 = double(img2);
[R,C,K] = size(img1);
N2 = R*C;
SUM1 = zeros(R,C);
SUM2 = zeros(R,C);
MIN = min(img1,img2);
%display(size(MIN));
SUM = img1 + img2;
for k=1:K
SUM1 = SUM1 + MIN(:,:,k);
SUM2 = SUM2 + SUM(:,:,k);
end
SUM = (2 .* SUM1) ./ SUM2;
SUM(isnan(SUM))=0;
SUM = 1 - (SUM);
M = sum(sum(SUM)) / N2;
img1 和 img2 是从另一个脚本作为参数传递的 2 个 rgb 图像。为简化起见,我将 rgb 图像分成 3 个不同的通道 img_r、img_b 和 img_g,分别代表 r、b 和 g 平面。 这是引发错误的一段 CUDA C++ 代码:
__global__ void iqm(int *img_r, int *img_g, int *img_b, int *f_img_r, int *f_img_g, int *f_img_b, int *x, int *y, int *z, double *iqm_res){
int n = x[0] * y[0];
//mae and mse
double mae = 0, m1 = 0, m2 = 0, m3 = 0;
double mse = 0, mse1 = 0, mse2 = 0, mse3 = 0;
for (int i = 0; i < n; ++i){
m1 = m1 + abs(img_r[i] - f_img_r[i]);
mse1 = mse1 + pow((double)abs(img_r[i] - f_img_r[i]),2.0);
}
m1 = m1 / n;
mse1 = sqrt(mse1 / n);
for (int i = 0; i < n; ++i){
m2 = m2 + abs(img_g[i] - f_img_g[i]);
mse2 = mse2 + pow((double)abs(img_g[i] - f_img_g[i]),2.0);
}
m2 = m2 / n;
mse2 = sqrt(mse2 / n);
for (int i = 0; i < n; ++i){
m3 = m3 + abs(img_b[i] - f_img_b[i]);
mse3 = mse3 + pow((double)abs(img_b[i] - f_img_b[i]),2.0);
}
m3 = m3 / n;
mse3 = sqrt(mse3 / n);
mae = (m1 + m2 + m3) / z[0];
mse = (mse1 + mse2 + mse3) / z[0];
//iqm_res[0] = mae;
//iqm_res[1] = mse;
//czekanowski
int min_r[26730], min_g[26730], min_b[26730];
int sum_r[26730], sum_g[26730], sum_b[26730];
for (int i = 0; i < n; ++i){
if (img_r[i] <= f_img_r[i]){
min_r[i] = img_r[i];
}
else{
min_r[i] = f_img_r[i];
}
sum_r[i] = img_r[i] + f_img_r[i];
if (img_g[i] <= f_img_g[i]){
min_g[i] = img_g[i];
}
else{
min_g[i] = f_img_g[i];
}
sum_g[i] = img_g[i] + f_img_g[i];
if (img_b[i] <= f_img_b[i]){
min_b[i] = img_b[i];
}
else{
min_b[i] = f_img_b[i];
}
sum_b[i] = img_b[i] + f_img_b[i];
}
int sum1[26730], sum2[26730];
for (int i = 0; i < n; ++i){
sum1[i] = min_r[i] + min_g[i] + min_b[i];
sum2[i] = sum_r[i] + sum_g[i] + sum_b[i];
}
double sum[26730];
for (int i = 0; i < n; ++i){
if (sum2[i] == 0){
sum[i] = 1.0;
}
else{
sum[i] = 1 - (2 * sum1[i] / sum2[i]);
}
}
double czekanowski = 0;
for (int i = 0; i < n; ++i){
czekanowski += sum[i];
}
czekanowski /= (double)n;
//printf("%f",czekanowski);
iqm_res[0] = mae;
iqm_res[1] = mse;
iqm_res[2] = czekanowski;
}
前三个参数代表第一张图像的 r,g,b 通道,接下来的 3 个参数代表第二张图像的相同。代码的最后一行
iqm_res[2] = czekanowski;
是导致错误的那个。 这是我评论最后一行后得到的结果
iqm =
1.0595 1.9781 0.0065 0.9972 0.9995 0.2892 3.9219 1.3211
iqm_res =
1.0595 1.9781 0 0 0 0 0 0 0 0
以及我取消注释后得到的错误:
使用 parallel.gpu.CUDAKernel/feval 时出错 尝试启动内核时发生意外错误。 CUDA 错误是: CUDA_ERROR_INVALID_VALUE
iqm_main_demo 中的错误(第 59 行) [t1,t2,t3,t4,t5,t6,t7,t8,t9,iqm_res] = feval(k,img1_r,img1_g,img1_b,img2_r,img2_g,img2_b,x,y,z,iqm_res);
mse 和 mae 部分工作正常并且给出了正确的结果。 另外我想问一下,可用于上述情况的图像大小是否有任何限制。我拍摄了一张 1500x1200 的大图,结果出现了硬件错误。
编辑:包括内核调用在内的整个代码。
%iqm_main_demo
clear all;
img_sample=imread('onion.png');
gfilt = fspecial('gaussian');
filt_img = imfilter(img_sample, gfilt, 'replicate');
cnt=0;
iqm(cnt+1) = iqm_mae(img_sample, filt_img);
iqm(cnt+2) = iqm_mse(img_sample, filt_img);
iqm(cnt+3) = iqm_czekanowski(img_sample, filt_img);
iqm(cnt+4) = iqm_crosscorr(img_sample, filt_img);
iqm(cnt+5) = iqm_normcrosscorr(img_sample, filt_img);
iqm(cnt+6) = iqm_mas(img_sample, filt_img);
iqm(cnt+7) = iqm_spectralmagnitude(img_sample, filt_img);
iqm(cnt+8) = iqm_spectralphase(img_sample, filt_img);
%iqm(cnt+9) = iqm_hvs(img_sample, filt_img);
%iqm(cnt+10) = iqm_laplacianmse(img_sample, filt_img);
%cnt = cnt + 10;
iqm
k = parallel.gpu.CUDAKernel('demo.ptx','demo.cu');
k.ThreadBlockSize = [1 1 1];
img1_r = img_sample(:,:,1);
img1_g = img_sample(:,:,2);
img1_b = img_sample(:,:,3);
img2_r = filt_img(:,:,1);
img2_g = filt_img(:,:,2);
img2_b = filt_img(:,:,3);
[x,y,z]=size(img_sample);
img1_r = reshape(img1_r',[1 x*y]);
img1_g = reshape(img1_g',[1 x*y]);
img1_b = reshape(img1_b',[1 x*y]);
img2_r = reshape(img2_r',[1 x*y]);
img2_g = reshape(img2_g',[1 x*y]);
img2_b = reshape(img2_b',[1 x*y]);
img1_r = gpuArray(int32(img1_r));
img1_g = gpuArray(int32(img1_g));
img1_b = gpuArray(int32(img1_b));
img2_r = gpuArray(int32(img2_r));
img2_g = gpuArray(int32(img2_g));
img2_b = gpuArray(int32(img2_b));
x = gpuArray(int32(x));
y = gpuArray(int32(y));
z = gpuArray(int32(z));
iqm_res = gpuArray(zeros(1,10));
[t1,t2,t3,t4,t5,t6,t7,t8,t9,iqm_res] =
feval(k,img1_r,img1_g,img1_b,img2_r,img2_g,img2_b,x,y,z,iqm_res);
iqm_res
【问题讨论】:
-
没有minimal reproducible example 就无法回答您的问题。
-
那段 cuda 代码是整个内核吗?如果是这样的话,在很多层面上都是错误的。还请放 mex 代码和内核调用
-
是的,它是整个内核。我是 CUDA 新手,对它没有太多经验。请指出问题所在。
-
我真的可以使用技巧来改进我的程序。我只通过观看网络教程来做到这一点。
-
我不知道如何编写 mex 代码。我按照这个使用 cu 和 ptx 文件的示例 in.mathworks.com/help/distcomp/run-cuda-or-ptx-code-on-gpu.html 编写了上面的代码