【发布时间】:2017-09-04 08:22:36
【问题描述】:
这个问题源自 this question。从那时起，我一直在改进我的 IDCT 内核，以使其更高效。
这个版本的内核产生正确的输出但速度很慢:
/* In-place 1-D inverse DCT over 8 consecutive ints (defined elsewhere). */
void idct_1D(__local int *Y);

/*
 * Serial 8x8 inverse DCT kernel.
 *
 * input  : 64 coefficients, read as input[(row << 3) + col].
 * output : destination samples, written through the X() macro.
 *
 * Y(), X(), SCALE(), DESCALE() and S_BITS are defined outside this
 * snippet; presumably Y()/X() index the 8x8 arrays row-major -- confirm
 * against their definitions.
 *
 * The work-item id is fetched but never used, so every work-item that is
 * launched performs the complete transform on the same __local buffers.
 */
__kernel void IDCT(__global int* input, __global uchar* output)
{
    unsigned int kid = get_global_id(0);  /* fetched but unused */
    __local int Y[64];
    __local int Yc[8];
    int row, col;

    /* Pass 1: scale each row of coefficients and transform it in place. */
    for (row = 0; row < 8; ++row) {
        for (col = 0; col < 8; ++col) {
            Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
        }
        idct_1D(&Y(row, 0));
    }

    /* Pass 2: gather each column, transform it, then descale and clamp. */
    for (col = 0; col < 8; ++col) {
        for (row = 0; row < 8; ++row) {
            Yc[row] = Y(row, col);
        }
        idct_1D(Yc);

        for (row = 0; row < 8; ++row) {
            int r = 128 + DESCALE(Yc[row], S_BITS + 3);
            /* Saturate to the 0..255 range before storing. */
            if (r < 0) {
                r = 0;
            } else if (r > 255) {
                r = 255;
            }
            X(row, col) = r;
        }
    }
}
我试图通过以这种方式修改它的结构来使这个内核更加并行:
/*
 * 8x8 inverse DCT computed cooperatively by a 2-D NDRange.
 *
 * input  : 64 coefficients, read as input[(row << 3) + col].
 * output : destination samples, written through the X() macro.
 *
 * Dimension 0 of the NDRange selects a row, dimension 1 selects a column.
 * Y(), X(), SCALE(), DESCALE(), S_BITS and idct_1D() are defined outside
 * this snippet.
 *
 * Fixes relative to the previous version:
 *  - Each row is scaled/transformed by exactly one work-item (column id 0).
 *    Previously every work-item sharing a row id rewrote the same __local
 *    row in place at the same time.
 *  - A work-group barrier separates the row pass from the column pass, so
 *    no column is read before all rows are finished.
 *  - Each column gets its own __local scratch line instead of all columns
 *    sharing a single Yc[8].
 *
 * NOTE(review): Y/Yc are __local, so this assumes the whole 8x8 range runs
 * as one work-group; the host should request a local size of {8, 8}
 * explicitly rather than passing NULL -- confirm against the launch code.
 */
__kernel void IDCT(__global int* input, __global uchar* output)
{
    const int k = get_global_id(0);    /* row handled by this work-item */
    const int lid = get_global_id(1);  /* column handled by this work-item */
    __local int Y[64];
    __local int Yc[8][8];
    int i;

    /* Row pass: one owner per row. */
    if (k < 8 && lid == 0)
    {
        for (i = 0; i < 8; i++)
        {
            Y(k, i) = SCALE(input[(k << 3) + i], S_BITS);
        }
        idct_1D(&Y(k, 0));
    }

    /* Must be reached by every work-item, hence outside the conditionals. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Column pass: one owner per column, each with private scratch space. */
    if (lid < 8 && k == 0)
    {
        __local int *column = Yc[lid];

        for (i = 0; i < 8; i++)
        {
            column[i] = Y(i, lid);
        }
        idct_1D(column);

        for (i = 0; i < 8; i++)
        {
            int r = 128 + DESCALE(column[i], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;  /* clamp to 0..255 */
            X(i, lid) = r;
        }
    }
}
上面的内核给出了正确的输出，但我发现代码的处理速度没有任何变化。
调用该内核的 main.c 代码如下所示：
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
for (index = 0; index < SOS_section.n; index++) {
uint32_t component_index = component_order[index];
int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);
/*
 * Run the IDCT kernel once per sub-block of the current component.
 *
 * Each iteration creates an input and an output buffer, launches the
 * kernel on an 8x8 range and reads the result back with a blocking read.
 *
 * Fixes relative to the previous version:
 *  - Both cl_mem objects are released at the end of the iteration; they
 *    were previously leaked on every pass through the loop.
 *  - The global work size is passed as a `const size_t *` (the array
 *    itself), not as a pointer-to-array.
 *  - Every OpenCL status code is passed to chk() instead of being
 *    overwritten by the next call. NOTE(review): assumes chk() reports or
 *    aborts on a non-CL_SUCCESS status, as its existing use suggests.
 */
for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
    /* Byte count used for both the output buffer and the read-back. */
    const size_t out_bytes = (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4;
    const size_t globalForInverseDCT[2] = {8, 8};

    cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);
    chk(ret, "clCreateBuffer");
    //Output buffer
    cl_mem DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, out_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
    chk(ret, "clCreateBuffer");

    ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
    ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
    chk(ret, "clSetKernelArg");

    ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 2, NULL, globalForInverseDCT, NULL, 0, NULL, NULL);
    chk(ret, "clEnqueueNDRangeKernel");
    //Timing-End..

    /* Blocking read: returns only after the data is in host memory. */
    ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, out_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
    chk(ret, "clEnqueueReadBuffer");

    /* Drop this iteration's buffers so they are not leaked. */
    clReleaseMemObject(DCT_Input);
    clReleaseMemObject(DCT_Output);
}
//other function
}
//code continues...
如何进一步优化这个内核?
编辑:
理想情况下，我希望将 IDCT 分解成更小的内核。得益于我在 this question 中得到的帮助，原来的普通（非内核）函数已被拆分成更小的子函数，如下所示：
/*
 * Stage 1 of the split IDCT: scale the 64 input coefficients into Y.
 *
 * input  : 64 coefficients, read as input[(row << 3) + col].
 * output : not used by this stage.
 *
 * Y() and SCALE()/S_BITS are macros defined outside this snippet.
 * NOTE(review): as shown, Y is a function-local array, so the scaled
 * values are gone once the function returns -- confirm where the shared
 * Y really lives in the full program.
 */
void IDCTforX(int32_t *input, uint8_t *output) {
    int32_t Y[64];
    int32_t Yc[8];  /* declared but unused in this stage */
    int32_t row, col;

    for (row = 0; row < 8; ++row) {
        for (col = 0; col < 8; ++col) {
            Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
        }
    }
}
/*
 * Stage 2 of the split IDCT: run the 1-D inverse DCT on each of the
 * 8 rows of Y, in place.
 *
 * Neither `input` nor `output` is touched here; `l` and `Yc` are unused.
 * NOTE(review): as shown, Y is a fresh function-local array that is never
 * filled before idct_1d() reads it -- presumably the real program shares
 * Y between the stages; confirm.
 */
void IDCTfor1dim(int32_t *input, uint8_t *output)
{
int32_t Y[64];
int32_t k, l;
int32_t Yc[8];
for (k= 0; k < 8; k++)
{
/* Transform row k; &Y(k, 0) is the address of its first element. */
idct_1d(&Y(k, 0));
}
}
在 Y 方向:
/*
 * Stage 3 of the split IDCT: transpose Y into Yc column by column and run
 * the 1-D inverse DCT on each gathered column.
 *
 * Neither `input` nor `output` is touched by this stage.
 * Fix: the function's closing brace was missing, which made the following
 * definition nest inside this one.
 * NOTE(review): as shown, Y and Yc are function-local, so Y is read
 * uninitialised and Yc is lost on return -- presumably both are shared
 * between the stages in the real program; confirm.
 */
void IDCTforY(int32_t *input, uint8_t *output)
{
    int32_t Y[64];
    int32_t k, l;
    int32_t Yc[8][8];

    for (l = 0; l < 8; l++)
    {
        /* Yc[l] holds column l of Y as a contiguous line. */
        for (k = 0; k < 8; k++)
        {
            Yc[l][k] = Y(k, l);
        }
        idct_1d(Yc[l]);
    }
}
/*
 * Stage 4 of the split IDCT: descale every transformed sample, add the
 * 128 level shift, clamp to 0..255 and store it through the X() macro.
 *
 * `input` and the local `Y` are not used here.
 * NOTE(review): as shown, Yc is a function-local array that is read
 * without being filled -- presumably the real program shares Yc with the
 * previous stage; confirm.
 */
void IDCT_DescaleY(int32_t *input, uint8_t *output)
{
int32_t Y[64];
int32_t k, l;
int32_t Yc[8][8];
for (l = 0; l < 8; l++)
{
for (k = 0; k < 8; k++)
{
int32_t r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
/* Clamp r to the 0..255 range. */
r = r > 0 ? (r < 255 ? r : 255) : 0;
/* X() is defined elsewhere; presumably it indexes `output` -- confirm. */
X(k, l) = r;
}
}
}
在main.c 中,当我按此顺序调用函数(代替原始函数)时,我得到了正确的输出:
/* The four stages are called in order, each with the same coefficient
 * pointer and the same destination pointer for the current sub-block. */
IDCTforX(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCT_DescaleY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
我将这些函数改写成内核，但只得到一片灰色的画面。我的新内核如下所示：
/*---------------IDCTForX----------------------------*/
/*
 * Kernel stage 1: each work-item (k, l) inside the 8x8 range scales one
 * input coefficient into Y(k, l). `output` is not used.
 *
 * NOTE(review): Y is declared inside the kernel without an address-space
 * qualifier, which makes it private to each work-item in OpenCL C. The
 * value written here is therefore not visible to other work-items or to
 * later kernels; this looks like the reason the kernel chain produces a
 * gray image. Passing the intermediate data through a __global buffer
 * would be needed -- confirm.
 */
__kernel void IDCTforX(__global int *input, __global uchar *output) {
int Y[64];
unsigned int k= get_global_id(0);
unsigned int l= get_global_id(1);
if ((k < 8) && (l < 8))
{
Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
}
}
/*---------------IDCTfor1Dim-------------------------*/
/*
 * Kernel stage 2: runs the 1-D inverse DCT on all 8 rows of Y.
 * Neither `input` nor `output` is used.
 *
 * NOTE(review): k is initialised from get_global_id(0) but the for loop
 * immediately resets it to 0, so every work-item processes all 8 rows.
 * NOTE(review): Y is a private array that is never filled in this kernel,
 * so idct_1D() operates on uninitialised data.
 * NOTE(review): if idct_1D() still takes a `__local int *` as in the
 * prototype shown earlier, passing a pointer into private Y does not
 * match its address space -- confirm which declaration is in use.
 */
__kernel void IDCTfor1dim(__global int *input, __global uchar *output){
int Y[64];
unsigned int k= get_global_id(0);
for (k= 0; k < 8; k++)
{
idct_1D(&Y(k, 0));
}
}
/*---------------IDCTYSplit----------------------------*/
/*
 * Kernel stage 3a: each work-item (k, l) inside the 8x8 range copies
 * Y(k, l) into the transposed position Yc[l][k].
 * Neither `input` nor `output` is used.
 *
 * NOTE(review): Y and Yc are both private arrays declared in this kernel;
 * Y is read without being filled here and Yc is not visible outside the
 * work-item, so nothing produced by this kernel reaches later stages.
 */
__kernel void IDCTYSplit(__global int *input, __global uchar *output) {
int Y[64];
int k= get_global_id(0);
int l= get_global_id(1);
int Yc[8][8];
if ((k < 8) && (l < 8))
{
Yc[l][k] = Y(k, l);
}
}
/*---------------IDCTY_Inverse----------------------------*/
/*
 * Kernel stage 3b: work-item l (for l < 8) runs the 1-D inverse DCT on
 * line l of Yc. `input`, `output` and the local Y are not used.
 *
 * NOTE(review): Yc is a private array that is never filled in this
 * kernel, so the transform runs on uninitialised data and its result is
 * discarded when the work-item finishes.
 */
__kernel void IDCTY_Inverse(__global int *input, __global uchar *output) {
int Y[64];
int l= get_global_id(0);
int Yc[8][8];
if (l < 8)
{
idct_1D(Yc[l]);
}
}
/*---------------IDCTY_Descale----------------------------*/
/*
 * Kernel stage 4: each work-item (l, k) inside the 8x8 range descales
 * Yc[l][k], adds the 128 level shift, clamps to 0..255 and stores the
 * sample through the X() macro. `input` and the local Y are not used.
 *
 * Note that here dimension 0 supplies l and dimension 1 supplies k, the
 * reverse of the earlier stages.
 * NOTE(review): Yc is a private array that is never filled in this
 * kernel, so the stored samples come from uninitialised data.
 */
__kernel void IDCTY_Descale(__global int *input, __global uchar *output) {
int Y[64];
int l= get_global_id(0);
int k= get_global_id(1);
int Yc[8][8];
if ((l < 8) && (k < 8))
{
int r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
/* Clamp r to the 0..255 range. */
r = r > 0 ? (r < 255 ? r : 255) : 0;
/* X() is defined elsewhere; presumably it indexes `output` -- confirm. */
X(k, l) = r;
}
}
【问题讨论】:
标签: c opencl signal-processing