【发布时间】:2014-08-11 21:43:45
【问题描述】:
在OpenCL中,如果我想添加两个N维向量,全局工作组大小(globalSize)应该满足globalSize = ceil(N/localSize) * localSize,其中localSize是本地工作组大小。这个对吗?如果N = 1000,和localSize = 128,globalSize应该是1024?我们是否可以始终将globalSize 设置为localSize 的倍数并且大于需要的值?
我尝试了很多次,对一维问题效果很好。
但是,当涉及到 2d 问题时,例如,将两个维度为 m*n 和 n*p 的矩阵相乘,结果矩阵的阶数为 m*p,事情变得更加复杂。
我的设备上的最大工作组大小是128,所以我设置了localSize [2] = {16,8} 和
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
类似于一维的情况,但结果是错误的!
如果我设置localSize [2] = {1,128} 并相应地更改globalSize,我可以获得正确的结果。那么问题出在哪里?谁能告诉我为什么?
另外,我找出矩阵元素错误的索引。
在(i,j)i*p + j = n * some constant (n = 1,2,3...)处的结果似乎是错误的
为什么?
这是我的内核函数:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
global float *A, global float *B, global float *C)
{
int i = get_global_id(1);
int j = get_global_id(0);
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
else
{
float tmp = 0;
for(int k = 0; k < Ndim; k++)
tmp += A[i*Ndim+k] * B[k*Pdim+j];
C[i*Pdim + j] = tmp;
}
}
然后就是宿主程序:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;
int main()
{
// Create the two input matrices
int m = 1000;
int n = 1000;
int p = 1000;
float *A = new float[m*n];
float *B = new float[n*p];
for(int i = 0; i < m*n; i++)
{
A[i] = i;
}
for(int i = 0; i < n*p; i++)
{
B[i] = i;
}
try
{
// Get available platforms
vector<Platform> platforms;
Platform::get(&platforms);
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
Context context( CL_DEVICE_TYPE_GPU, cps);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0]);
// Read source file
std::ifstream sourceFile("mmul.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source);
// Build program for these specific devices
program.build(devices);
// Make kernel
Kernel kernel(program, "mmult");
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));
// Copy lists A and B to the memory buffers
queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);
// Set arguments to kernel
kernel.setArg(0, m);
kernel.setArg(1, n);
kernel.setArg(2, p);
kernel.setArg(3, bufferA);
kernel.setArg(4, bufferB);
kernel.setArg(5, bufferC);
// Run the kernel on specific ND range
NDRange global((ceil((float)(p)/16))*16,(ceil((float)(m)/8))*8);
NDRange local(16,8);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local);
// Read buffer C into a local list
float *C = new float[m*p];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);
// check the correctness of the result
float *c = new float[m*p];
for(int i = 0; i < m; i++)
for(int j = 0; j < p; j++)
{
float z = 0.0;
for(int k = 0; k < n; k++)
{
z += A[i*n+k] * B[k*p+j];
}
c[i*p+j] = z;
}
for(int i = 0; i < m*p; i++)
{
if(fabs(c[i]-C[i])>0.001)
std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
}
delete []A;
delete []B;
delete []C;
}
catch(Error error)
{
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
}
return 0;
}
【问题讨论】:
-
您的内核是否检查全局索引以确保它们在正确的范围内?
-
我有条件检查,如果索引超出适当的范围,那么我只是让函数返回而不进行任何计算。像 x = get_global_id(0), y = get_global_id(1);并且只有 x 在 0 和 m 之间,y 在 0 和 n 之间,然后函数进行计算,否则它只是返回。
标签: opencl