尝试使用 cuda 进行图像模糊答案

【问题标题】：Trying image blurring with cuda尝试使用 cuda 进行图像模糊
【发布时间】：2021-05-01 22:12:53
【问题描述】：

我正在尝试使用 stbi_image 作为库来使用 cuda 模糊图像以加载和保存图像。编译代码时没有错误，但是当我尝试查看结果时，它只是一个空白图像。那是代码。

#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2

__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel) {

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if(col < width && row < height) {
    int pixVal = 0;
    int pixels = 0;

    for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE + 1; ++blurRow) {
      for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE + 1; ++blurCol) {
        int curRow = row + blurRow;
        int curCol = col + blurCol;
        if(curRow > -1 && curRow < height && curCol > -1 && curCol < width) {
          pixVal += in[curRow * width * num_channel + curCol * num_channel + channel];
          pixels++;
        }
      }
    }
    out[row * width * num_channel + col * num_channel + channel] = (unsigned char)(pixVal/pixels);
  }
}

int main() {

  int width, height,n;
  unsigned char *image = stbi_load("image4.png",&width,&height,&n,0);
  unsigned char *output = (unsigned char*)malloc(width * height * n *sizeof(unsigned char));
  
  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;
  cudaMalloc((void**)&Dev_Input_Image, sizeof(unsigned char)* height * width * n);
  cudaMalloc((void**)&Dev_Output_Image, sizeof(unsigned char)* height * width * n);

  cudaMemcpy(Dev_Input_Image, image, sizeof(unsigned char) * height * width * n, cudaMemcpyHostToDevice);

  //kernel call
  dim3 blockSize(16, 16, 1);
  dim3 gridSize(width/blockSize.x, height/blockSize.y,1);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,R);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,G);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,B);
  
  cudaDeviceSynchronize();

    cudaMemcpy(image, Dev_Output_Image, sizeof(unsigned char) * height * width * n, cudaMemcpyDeviceToHost);

  stbi_write_png("output_stbimage.png", width, height, n, image, width * n);

  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);

  return 0;
}

我尝试了所有可能的方法，但我无法找到我卡在的地方。我尝试以串行方式执行此操作，它完美地使用相同的逻辑（我的意思是处理每个通道上的模糊）。希望有人可以帮助我

【问题讨论】：

你的内核有运行时错误吗？
如果您的原始图像有 4 个通道，您可能需要复制您的 Alpha 通道。
我没有运行时错误，我还尝试使用 alpha 通道将其设置为最大值，以便为图像提供非透明背景
看起来不错，如果出现小错误，它至少应该产生一些东西。尝试将刚刚加载的图像保存到新文件并检查它。然后只在 CUDA 中复制输入输出...

标签： c++ cuda gpu gpgpu

【解决方案1】：

您的代码的一个问题是您没有在内核代码（或其他任何地方）中将 alpha 通道从输入图像复制（或设置）到输出图像。 Alpha 通道实际上是未初始化的。如果它恰好为零，那么无论其他通道如何，您都不会在输出图片中看到任何有趣的内容。

当我这样修复你的代码时：

#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2
#define A 3

__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel, int copy_A) {

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if(col < width && row < height) {
    int pixVal = 0;
    int pixels = 0;
    if (copy_A)
      out[row*width*num_channel+col*num_channel+A] = in[row*width*num_channel+col*num_channel+A];
    for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE + 1; ++blurRow) {
      for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE + 1; ++blurCol) {
        int curRow = row + blurRow;
        int curCol = col + blurCol;
        if(curRow > -1 && curRow < height && curCol > -1 && curCol < width) {
          pixVal += in[curRow * width * num_channel + curCol * num_channel + channel];
          pixels++;
        }
      }
    }
    out[row * width * num_channel + col * num_channel + channel] = (unsigned char)(pixVal/pixels);
  }
}

int main() {

  int width, height,n;
  unsigned char *image = stbi_load("image4.png",&width,&height,&n,0);
  unsigned char *output = (unsigned char*)malloc(width * height * n *sizeof(unsigned char));
  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;
  cudaMalloc((void**)&Dev_Input_Image, sizeof(unsigned char)* height * width * n);
  cudaMalloc((void**)&Dev_Output_Image, sizeof(unsigned char)* height * width * n);

  cudaMemcpy(Dev_Input_Image, image, sizeof(unsigned char) * height * width * n, cudaMemcpyHostToDevice);

  //kernel call
  dim3 blockSize(16, 16, 1);
  dim3 gridSize(width/blockSize.x, height/blockSize.y,1);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,R,0);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,G,0);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,B,1);
  
  cudaDeviceSynchronize();

    cudaMemcpy(image, Dev_Output_Image, sizeof(unsigned char) * height * width * n, cudaMemcpyDeviceToHost);
  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);
  stbi_write_png("output_stbimage.png", width, height, n, image, width * n);


  return 0;
}

并使用this picture 编译并运行它：

我得到一个如下所示的输出图片：

【讨论】：