如何针对 GPU 优化我的 OpenCL 内核？答案

【问题标题】：How can I optimize my OpenCL kernel for the GPU?（如何针对 GPU 优化我的 OpenCL 内核？）
【发布时间】：2017-09-04 08:22:36
【问题描述】：

这个问题是 this question 的后续。从那时起，我一直在改进我的 IDCT 内核，以使其更高效。

这个版本的内核产生正确的输出但速度很慢：

void idct_1D(__local int *Y);

 __kernel void IDCT(__global int* input, __global uchar* output)
{
    /* Fetched but not referenced by name anywhere else in this kernel body. */
    unsigned int kid = get_global_id(0);

    __local int Y[64];  /* 64-entry working block, addressed via Y(row, col) */
    __local int Yc[8];  /* scratch holding one column at a time */
    int row, col;

    /* First pass: scale the eight inputs of each row into Y, then hand
     * that row to idct_1D, which works on it in place. */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
        }
        idct_1D(&Y(row, 0));
    }

    /* Second pass: gather each column into Yc, transform it, then
     * descale, add the 128 offset, clamp to 0..255 and store through X(). */
    for (col = 0; col < 8; col++) {
        for (row = 0; row < 8; row++) {
            Yc[row] = Y(row, col);
        }

        idct_1D(Yc);

        for (row = 0; row < 8; row++) {
            int r = 128 + DESCALE(Yc[row], S_BITS + 3);

            if (r <= 0) {
                r = 0;
            } else if (r >= 255) {
                r = 255;
            }
            X(row, col) = r;
        }
    }
}

我试图通过以这种方式修改它的结构来使这个内核更加并行：

/*
 * Inverse DCT of a single 8x8 block using a 2-D NDRange.
 *
 * input  : 64 coefficients, read as input[(row << 3) + col].
 * output : written through the X(row, col) macro.
 *
 * Y and Yc live in __local memory, so every work-item that takes part must
 * belong to the same work-group.
 * NOTE(review): the host should therefore launch this with an explicit 8x8
 * local size rather than leaving the choice to the runtime -- confirm.
 *
 * Previously every work-item sharing the same get_global_id(0) rewrote and
 * transformed the same row of Y concurrently, all columns shared one
 * Yc[8] scratch array, and nothing ordered the row pass before the column
 * pass.  Each row and each column now has exactly one owning work-item,
 * each column has its own scratch row, and a barrier separates the passes.
 */
__kernel void IDCT(__global int* input, __global uchar* output)
{
    __local int Y[64];
    __local int Yc[8][8];   /* one scratch row per column */

    int k = get_global_id(0);
    int lid = get_global_id(1);
    int i;

    /* Row pass: only the work-item with lid == 0 touches row k. */
    if (k < 8 && lid == 0)
    {
        for (i = 0; i < 8; i++)
        {
            Y(k, i) = SCALE(input[(k << 3) + i], S_BITS);
        }
        idct_1D(&Y(k, 0));
    }

    /* Every work-item reaches this barrier (it is outside the guards), so
     * all rows are complete before any column is read. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Column pass: only the work-item with k == 0 touches column lid. */
    if (lid < 8 && k == 0)
    {
        for (i = 0; i < 8; i++)
        {
            Yc[lid][i] = Y(i, lid);
        }

        idct_1D(Yc[lid]);

        for (i = 0; i < 8; i++)
        {
            /* Descale, add the 128 offset and clamp to 0..255. */
            int r = 128 + DESCALE(Yc[lid][i], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(i, lid) = r;
        }
    }
}

上面的内核给了我正确的输出，但我发现我的代码处理速度没有变化。

调用该内核的 main.c 如下所示：

  for (index_X = 0; index_X < nb_MCU_X; index_X++) {

    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {

    for (index = 0; index < SOS_section.n; index++) {

     uint32_t component_index = component_order[index];

     int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

     for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {

         // Byte count shared by the output buffer creation and the read-back.
         const size_t out_bytes = (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4;

         // Input buffer: 64 coefficients copied from unZZ_MCU.
         cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);
         chk(ret, "clCreateBuffer");

         //Output buffer
         cl_mem DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, out_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
         chk(ret, "clCreateBuffer");

         ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
         ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
         chk(ret, "clSetKernelArg");

         const size_t globalForInverseDCT[2] = {8, 8};

         // Pass the array itself (it decays to const size_t *); taking its
         // address yields a pointer-to-array, which is the wrong type.
         ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 2, NULL, globalForInverseDCT, NULL, 0, NULL, NULL);
         chk(ret, "clEnqueueNDRangeKernel");

         //Timing-End..

         // Blocking read (CL_TRUE): the result is on the host when this returns.
         ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, out_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
         chk(ret, "clEnqueueReadBuffer");

         // The buffers are created once per iteration; release them here so
         // they do not accumulate over the whole image.
         clReleaseMemObject(DCT_Input);
         clReleaseMemObject(DCT_Output);

     }

 //other function

 }

 //code continues...

如何进一步优化这个内核？

编辑：

理想情况下，我希望将 IDCT 分解成更小的内核。感谢我在 this question 中得到的帮助，原来的普通函数已被分解成更小的子函数，如下所示：

/*
 * Scaling step of the split IDCT: stores SCALE(input[row * 8 + col], S_BITS)
 * into the local block Y for every row and column in 0..7.
 * `output` and `Yc` are not referenced by name in this function.
 */
void IDCTforX(int32_t *input, uint8_t *output) {

    int32_t Y[64];
    int32_t Yc[8];
    int32_t row, col;

    row = 0;
    while (row < 8) {
        col = 0;
        while (col < 8) {
            Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
            col++;
        }
        row++;
    }
}

/*
 * Row step of the split IDCT: calls idct_1d on the start of each of the
 * eight rows of Y.
 * NOTE(review): Y is declared locally here and nothing in this function
 * stores to it before idct_1d is called; `input`, `output`, `l` and `Yc`
 * are not referenced by name. Presumably the snippet is abridged, or the
 * Y() macro resolves to shared storage -- confirm against the full source.
 */
void IDCTfor1dim(int32_t *input, uint8_t *output)
{
int32_t Y[64];
    int32_t k, l;
    int32_t Yc[8];

    /* One idct_1d call per row; &Y(k, 0) is the address of row k's first entry. */
    for (k= 0; k < 8; k++)
    {
        idct_1d(&Y(k, 0));
    }
}

在 Y 方向：

/*
 * Column step of the split IDCT: copies column l of Y into Yc[l] and calls
 * idct_1d on that row of Yc, for each l in 0..7.
 * NOTE(review): Y is declared locally and is not written in this function
 * before being read; `input` and `output` are not referenced by name.
 * Presumably the snippet is abridged -- confirm against the full source.
 */
void IDCTforY(int32_t *input, uint8_t *output) 
{
  int32_t Y[64];
  int32_t k, l;
  int32_t Yc[8][8];

  for (l = 0; l < 8; l++) 
  {
      /* Gather column l into Yc[l] so idct_1d sees it as eight contiguous values. */
      for (k = 0; k < 8; k++)
          Yc[l][k] = Y(k, l);
      idct_1d(Yc[l]);
   }
}

/*
 * Final step of the split IDCT: descales each Yc[l][k], adds 128, clamps the
 * result to 0..255 and stores it through the X(k, l) macro.
 * NOTE(review): Yc is declared locally and is not written in this function
 * before being read; `input` and `Y` are not referenced by name. Presumably
 * the snippet is abridged -- confirm against the full source.
 */
void IDCT_DescaleY(int32_t *input, uint8_t *output) 
{
  int32_t Y[64];
  int32_t k, l;
  int32_t Yc[8][8];

for (l = 0; l < 8; l++) 
   {
       for (k = 0; k < 8; k++) 
       {   
           int32_t r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
           /* Clamp: values <= 0 become 0, values >= 255 become 255. */
           r = r > 0 ? (r < 255 ? r : 255) : 0;
           X(k, l) = r;
       }
   }
}

在main.c 中，当我按此顺序调用函数（代替原始函数）时，我得到了正确的输出：

/* The four split steps, called in order; each call receives the same input
 * pointer (unZZ_MCU) and the same output pointer for this component/block. */
IDCTforX(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCT_DescaleY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));

我将这些函数翻译成内核，但我只得到一个灰屏。我的新内核如下所示：

/*---------------IDCTForX----------------------------*/

/*
 * Scaling step as a kernel: the work-item at (row, col) stores
 * SCALE(input[row * 8 + col], S_BITS) into entry (row, col) of Y.
 * Y is declared inside this kernel and `output` is not referenced by name,
 * so nothing visible here hands the scaled values to another kernel.
 */
__kernel void IDCTforX(__global int *input, __global uchar *output) {

    int Y[64];
    unsigned int row = get_global_id(0);
    unsigned int col = get_global_id(1);

    /* Work-items outside the 8x8 block have nothing to do. */
    if ((row >= 8) || (col >= 8)) {
        return;
    }

    Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
}

/*---------------IDCTfor1Dim-------------------------*/

/*
 * Row step as a kernel: calls idct_1D on the start of each of the eight
 * rows of Y.
 * NOTE(review): Y is declared inside this kernel and nothing here stores to
 * it before idct_1D reads it; values another kernel wrote into its own Y
 * are not visible through this declaration. `input` and `output` are not
 * referenced by name.
 */
__kernel void IDCTfor1dim(__global int *input, __global uchar *output){

int Y[64];
/* k starts as the work-item id but is overwritten by the loop below, so
 * every work-item walks all eight rows. */
unsigned int k= get_global_id(0);

    for (k= 0; k < 8; k++)
    {
        idct_1D(&Y(k, 0));
    }

}

/*---------------IDCTYSplit----------------------------*/

/*
 * Transpose step as a kernel: the work-item at (k, l) copies entry (k, l)
 * of Y into Yc[l][k].
 * NOTE(review): both Y and Yc are declared inside this kernel; Y is read
 * without being written here, and Yc is not passed anywhere after the
 * store. `input` and `output` are not referenced by name.
 */
__kernel void IDCTYSplit(__global int *input, __global uchar *output) {

int Y[64];
      int k= get_global_id(0);
      int l= get_global_id(1);
      int Yc[8][8];

      /* Only work-items inside the 8x8 block do anything. */
      if ((k < 8) && (l < 8))
      {
              Yc[l][k] = Y(k, l);

      }
}

/*---------------IDCTY_Inverse----------------------------*/

/*
 * Column step as a kernel: work-item l (for l < 8) calls idct_1D on Yc[l].
 * NOTE(review): Yc is declared inside this kernel and nothing here stores
 * to it before idct_1D reads it. `input`, `output` and `Y` are not
 * referenced by name.
 */
__kernel void IDCTY_Inverse(__global int *input, __global uchar *output) {

int Y[64];
      int l= get_global_id(0);
      int Yc[8][8];

 if (l < 8)
          {
          idct_1D(Yc[l]);
       }
}

/*---------------IDCTY_Descale----------------------------*/

/*
 * Final step as a kernel: the work-item at (l, k) descales Yc[l][k], adds
 * 128, clamps the result to 0..255 and stores it through the X(k, l) macro.
 * NOTE(review): Yc is declared inside this kernel and nothing here stores
 * to it before it is read. `input` and `Y` are not referenced by name.
 */
__kernel void IDCTY_Descale(__global int *input, __global uchar *output) {

int Y[64];

    /* Note the order: dimension 0 supplies l and dimension 1 supplies k. */
    int l= get_global_id(0);

    int k= get_global_id(1);

      int Yc[8][8];

       if ((l < 8) && (k < 8))
           {
               int r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
               /* Clamp: values <= 0 become 0, values >= 255 become 255. */
               r = r > 0 ? (r < 255 ? r : 255) : 0;
               X(k, l) = r;
           }

}

【问题讨论】：

标签： c opencl signal-processing

【解决方案1】：

仅有大约 64 次运算和 64 个元素的缓冲区，这样的规模太小了。光是发起与 GPU 之间的数据传输（传入/传出）的开销，就占据了绝大部分时间。您不仅需要可并行化的任务，还需要每次 GPU 内核调用包含数百万到数十亿次运算。

【讨论】：

“您不仅需要可并行化的东西，而且还需要每个 GPU 内核数百万到数十亿次操作。” 我该如何实现呢？我正在移植 MJPEG 解码的代码，据我所知，其中没有那么多的运算量。
我可以将mjpeg文件（它的数据类型是FILE）加载到GPU的内存中吗？这可能会减少通信开销。